diff --git a/tree/ntuple/CMakeLists.txt b/tree/ntuple/CMakeLists.txt index b126ee0faccc23..791c6a228a0954 100644 --- a/tree/ntuple/CMakeLists.txt +++ b/tree/ntuple/CMakeLists.txt @@ -30,6 +30,7 @@ HEADERS ROOT/RNTupleFillContext.hxx ROOT/RNTupleFillStatus.hxx ROOT/RNTupleImtTaskScheduler.hxx + ROOT/RNTupleIndex.hxx ROOT/RNTupleMerger.hxx ROOT/RNTupleMetrics.hxx ROOT/RNTupleModel.hxx @@ -64,6 +65,7 @@ SOURCES v7/src/RNTupleDescriptor.cxx v7/src/RNTupleDescriptorFmt.cxx v7/src/RNTupleFillContext.cxx + v7/src/RNTupleIndex.cxx v7/src/RNTupleMerger.cxx v7/src/RNTupleMetrics.cxx v7/src/RNTupleModel.cxx diff --git a/tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx new file mode 100644 index 00000000000000..817ac115ea44c1 --- /dev/null +++ b/tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx @@ -0,0 +1,141 @@ +/// \file ROOT/RNTupleIndex.hxx +/// \ingroup NTuple ROOT7 +/// \author Florine de Geus +/// \date 2024-04-02 +/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback +/// is welcome! + +/************************************************************************* + * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef ROOT7_RNTupleIndex +#define ROOT7_RNTupleIndex + +#include +#include + +#include +#include +#include +#include +#include + +namespace ROOT { +namespace Experimental { +namespace Internal { +///////////////////////////////////////////////////////////////////////////// +/// Container for the combined hash of the indexed fields. Uses the implementation from `boost::hash_combine` (see +/// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine). 
+struct RIndexValue { + std::size_t fValue = 0; + + void operator+=(std::size_t other) { fValue ^= other + 0x9e3779b9 + (fValue << 6) + (fValue >> 2); } + inline bool operator==(const RIndexValue &other) const { return other.fValue == fValue; } + inline size_t operator()(const ROOT::Experimental::Internal::RIndexValue &val) const { return val.fValue; } +}; + +// clang-format off +/** +\class ROOT::Experimental::Internal::RNTupleIndex +\ingroup NTuple +\brief Build an index for an RNTuple so it can be joined onto other RNTuples. +*/ +// clang-format on +class RNTupleIndex { + friend std::unique_ptr + CreateRNTupleIndex(std::vector fieldNames, RPageSource &pageSource); + +private: + std::vector> fFields; + std::unordered_map, RIndexValue> fIndex; + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Create an RNTupleIndex for an existing RNTuple. + /// + /// \param[in] fields The fields that will make up the index. + /// \param[in] pageSource The page source of the RNTuple to build the index for. + /// + /// \note The page source is assumed to be attached already. + RNTupleIndex(std::vector> fields, RPageSource &pageSource); + +public: + RNTupleIndex(const RNTupleIndex &other) = delete; + RNTupleIndex &operator=(const RNTupleIndex &other) = delete; + RNTupleIndex(RNTupleIndex &&other) = delete; + RNTupleIndex &operator=(RNTupleIndex &&other) = delete; + + std::size_t GetNElems() const { return fIndex.size(); } + + void Add(std::vector valuePtrs, NTupleSize_t entry); + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Get the entry number containing the given index value. + /// + /// \param[in] valuePtrs Pointers to the values of the indexed fields, one per field and in field order. + /// \return The entry number containing the specified index value. 
When no such entry exists, return + /// `kInvalidNTupleIndex`. + /// + /// Note that in case multiple entries corresponding to the provided index value exist, the first occurrence is + /// returned. 
Use RNTupleIndex::GetEntryIndices to get all entries. + NTupleSize_t GetEntryIndex(std::vector valuePtrs) const; + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Get the entry number containing the given index value. + /// + /// \sa GetEntryIndex(std::vector valuePtrs) + template + NTupleSize_t GetEntryIndex(Ts... values) const + { + if (sizeof...(Ts) != fFields.size()) + throw RException(R__FAIL("number of value pointers must match number of indexed fields")); + + std::vector valuePtrs; + ([&] { valuePtrs.push_back(&values); }(), ...); + + return GetEntryIndex(valuePtrs); + } + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Get all entry numbers for the given index value. + /// + /// \param[in] valuePtrs Pointers to the values of the indexed fields, one per field and in field order. + /// \return The entry numbers containing the specified index value. When no such entries exist, return an empty vector. + std::vector GetEntryIndices(std::vector valuePtrs) const; + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Get all entry numbers for the given index value. + /// + /// \sa GetEntryIndices(std::vector valuePtrs) + template + std::vector GetEntryIndices(Ts... values) const + { + if (sizeof...(Ts) != fFields.size()) + throw RException(R__FAIL("number of value pointers must match number of indexed fields")); + + std::vector valuePtrs; + ([&] { valuePtrs.push_back(&values); }(), ...); + + return GetEntryIndices(valuePtrs); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// \brief Create an RNTupleIndex from an existing RNTuple. +/// +/// \param[in] fieldNames The names of the fields to index. +/// \param pageSource The page source. +/// +/// \return A pointer to the newly-created index. 
+/// +std::unique_ptr CreateRNTupleIndex(std::vector fieldNames, RPageSource &pageSource); + +} // namespace Internal +} // namespace Experimental +} // namespace ROOT + +#endif // ROOT7_RNTupleIndex diff --git a/tree/ntuple/v7/src/RNTupleIndex.cxx b/tree/ntuple/v7/src/RNTupleIndex.cxx new file mode 100644 index 00000000000000..21dd9c91ceb7c1 --- /dev/null +++ b/tree/ntuple/v7/src/RNTupleIndex.cxx @@ -0,0 +1,98 @@ +/// \file RNTupleIndex.cxx +/// \ingroup NTuple ROOT7 +/// \author Florine de Geus +/// \date 2024-04-02 +/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback +/// is welcome! + +/************************************************************************* + * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include + +ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector> fields, + RPageSource &pageSource) + : fFields(std::move(fields)) +{ + std::vector fieldValues; // one value object per indexed field, reused for every entry + for (const auto &field : fFields) { + fieldValues.emplace_back(field->CreateValue()); + } + + for (std::uint64_t i = 0; i < pageSource.GetNEntries(); ++i) { // index every entry of the page source + std::vector ptrs; + for (auto &fieldValue : fieldValues) { + fieldValue.Read(i); + ptrs.push_back(fieldValue.GetPtr().get()); + } + Add(ptrs, i); + } +} + +void ROOT::Experimental::Internal::RNTupleIndex::Add(std::vector valuePtrs, NTupleSize_t entry) +{ + RIndexValue indexValue; // combined hash of all indexed fields, folded in field order + for (unsigned i = 0; i < fFields.size(); ++i) { + indexValue += fFields[i]->GetHash(valuePtrs[i]); + } + fIndex[indexValue].push_back(entry); // entries with the same combined hash share one list +} + 
+ROOT::Experimental::NTupleSize_t +ROOT::Experimental::Internal::RNTupleIndex::GetEntryIndex(std::vector valuePtrs) const +{ + auto entryIndices = GetEntryIndices(valuePtrs); + if 
(entryIndices.empty()) + return kInvalidNTupleIndex; + return entryIndices.front(); +} + +std::vector +ROOT::Experimental::Internal::RNTupleIndex::GetEntryIndices(std::vector valuePtrs) const +{ + RIndexValue indexValue; // must be combined exactly as in Add() so lookups hit the same key + for (unsigned i = 0; i < fFields.size(); ++i) { + indexValue += fFields[i]->GetHash(valuePtrs[i]); + } + + const auto it = fIndex.find(indexValue); // one lookup instead of count() followed by at() + if (it == fIndex.end()) + return {}; + return it->second; +} + +//------------------------------------------------------------------------------ + +std::unique_ptr +ROOT::Experimental::Internal::CreateRNTupleIndex(std::vector fieldNames, RPageSource &pageSource) +{ + pageSource.Attach(); + auto desc = pageSource.GetSharedDescriptorGuard(); + + std::vector> fields; + + for (const auto &fieldName : fieldNames) { + auto fieldId = desc->FindFieldId(fieldName); + if (fieldId == kInvalidDescriptorId) + throw RException(R__FAIL("could not find field \"" + std::string(fieldName) + "\"")); + + const auto &fieldDesc = desc->GetFieldDescriptor(fieldId); + auto fieldOrException = RFieldBase::Create(fieldDesc.GetFieldName(), fieldDesc.GetTypeName()); + if (!fieldOrException) { + throw RException(R__FAIL("could not construct field \"" + std::string(fieldName) + "\"")); + } + auto field = fieldOrException.Unwrap(); + field->SetOnDiskId(fieldDesc.GetId()); + + CallConnectPageSourceOnField(*field, pageSource); + + fields.push_back(std::move(field)); + } + + return std::unique_ptr(new RNTupleIndex(std::move(fields), pageSource)); // private constructor: reachable only through this friend +} diff --git a/tree/ntuple/v7/test/CMakeLists.txt b/tree/ntuple/v7/test/CMakeLists.txt index 32ad6a74da34c6..babb12254b5417 100644 --- a/tree/ntuple/v7/test/CMakeLists.txt +++ b/tree/ntuple/v7/test/CMakeLists.txt @@ -35,6 +35,7 @@ ROOT_GENERATE_DICTIONARY(RXTupleDict ${CMAKE_CURRENT_SOURCE_DIR}/RXTuple.hxx ROOT_ADD_GTEST(ntuple_descriptor 
ntuple_descriptor.cxx LIBRARIES ROOTNTuple CustomStruct) ROOT_ADD_GTEST(ntuple_endian ntuple_endian.cxx LIBRARIES ROOTNTuple) ROOT_ADD_GTEST(ntuple_friends ntuple_friends.cxx 
LIBRARIES ROOTNTuple CustomStruct) +ROOT_ADD_GTEST(ntuple_index ntuple_index.cxx LIBRARIES ROOTNTuple CustomStruct) ROOT_ADD_GTEST(ntuple_merger ntuple_merger.cxx LIBRARIES ROOTNTuple CustomStruct) ROOT_ADD_GTEST(ntuple_metrics ntuple_metrics.cxx LIBRARIES ROOTNTuple CustomStruct) ROOT_ADD_GTEST(ntuple_model ntuple_model.cxx LIBRARIES ROOTNTuple CustomStruct) diff --git a/tree/ntuple/v7/test/ntuple_index.cxx b/tree/ntuple/v7/test/ntuple_index.cxx new file mode 100644 index 00000000000000..a96065d3c71a56 --- /dev/null +++ b/tree/ntuple/v7/test/ntuple_index.cxx @@ -0,0 +1,161 @@ +#include "ntuple_test.hxx" + +TEST(RNTupleIndex, Basic) +{ + FileRaii fileGuard("test_ntuple_index_basic.root"); + { + auto model = RNTupleModel::Create(); + auto fld = model->MakeField("fld"); + + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + + for (int i = 0; i < 10; ++i) { + *fld = i * 2; + ntuple->Fill(); + } + } + + auto pageSource = RPageSource::Create("ntuple", fileGuard.GetPath()); + auto index = ROOT::Experimental::Internal::CreateRNTupleIndex({"fld"}, *pageSource); + + auto ntuple = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + auto fld = ntuple->GetView("fld"); + + for (unsigned i = 0; i < ntuple->GetNEntries(); ++i) { + auto fldValue = fld(i); + EXPECT_EQ(fldValue, i * 2); + EXPECT_EQ(index->GetEntryIndex({&fldValue}), i); + } +} + +TEST(RNTupleIndex, SparseSecondary) +{ + FileRaii fileGuardMain("test_ntuple_index_sparse_secondary1.root"); + { + auto model = RNTupleModel::Create(); + auto fldEvent = model->MakeField("event"); + + auto ntuple = RNTupleWriter::Recreate(std::move(model), "primary", fileGuardMain.GetPath()); + + for (int i = 0; i < 10; ++i) { + *fldEvent = i; + ntuple->Fill(); + } + } + + FileRaii fileGuardSecondary("test_ntuple_index_sparse_secondary2.root"); + { + auto model = RNTupleModel::Create(); + auto fldEvent = model->MakeField("event"); + auto fldX = model->MakeField("x"); + + auto ntuple = 
RNTupleWriter::Recreate(std::move(model), "secondary", fileGuardSecondary.GetPath()); + + for (int i = 0; i < 5; ++i) { + *fldEvent = i * 2; + *fldX = static_cast(i) / 3.14; + ntuple->Fill(); + } + } + + auto mainNtuple = RNTupleReader::Open("primary", fileGuardMain.GetPath()); + auto fldEvent = mainNtuple->GetView("event"); + + auto secondaryPageSource = RPageSource::Create("secondary", fileGuardSecondary.GetPath()); + auto index = ROOT::Experimental::Internal::CreateRNTupleIndex({"event"}, *secondaryPageSource); + auto secondaryNTuple = RNTupleReader::Open("secondary", fileGuardSecondary.GetPath()); + auto fldX = secondaryNTuple->GetView("x"); + + for (unsigned i = 0; i < mainNtuple->GetNEntries(); ++i) { + auto event = fldEvent(i); + + if (i % 2 == 1) { + EXPECT_EQ(index->GetEntryIndex(event), ROOT::Experimental::kInvalidNTupleIndex) + << "entry should not be present in the index"; + } else { + auto idx = index->GetEntryIndex(event); + EXPECT_EQ(idx, i / 2); + EXPECT_FLOAT_EQ(fldX(idx), static_cast(idx) / 3.14); + } + } +} + +TEST(RNTupleIndex, MultipleFields) +{ + FileRaii fileGuard("test_ntuple_index_multiple_fields.root"); + { + auto model = RNTupleModel::Create(); + auto fldRun = model->MakeField("run"); + auto fldEvent = model->MakeField("event"); + auto fldX = model->MakeField("x"); + + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + + for (int i = 0; i < 3; ++i) { + *fldRun = i; + for (int j = 0; j < 5; ++j) { + *fldEvent = j; + *fldX = static_cast(i + j) / 3.14; + ntuple->Fill(); + } + } + } + + auto pageSource = RPageSource::Create("ntuple", fileGuard.GetPath()); + pageSource->Attach(); + + auto index = ROOT::Experimental::Internal::CreateRNTupleIndex({"run", "event"}, *pageSource); + + auto ntuple = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + auto fld = ntuple->GetView("x"); + + std::uint64_t event, run; + for (std::uint64_t i = 0; i < pageSource->GetNEntries(); ++i) { + run = i / 5; + event = i % 5; 
+ auto entryIdx = index->GetEntryIndex(run, event); + EXPECT_EQ(fld(entryIdx), fld(i)); + } + + auto idx1 = index->GetEntryIndex(2, 1); + auto idx2 = index->GetEntryIndex(1, 2); + EXPECT_NE(idx1, idx2); +} + +TEST(RNTupleIndex, MultipleMatches) +{ + FileRaii fileGuard("test_ntuple_index_multiple_fields.root"); + { + auto model = RNTupleModel::Create(); + auto fldRun = model->MakeField("run"); + + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + + *fldRun = 1; + for (int i = 0; i < 10; ++i) { + if (i > 4) + *fldRun = 2; + if (i > 7) + *fldRun = 3; + ntuple->Fill(); + } + } + + auto pageSource = RPageSource::Create("ntuple", fileGuard.GetPath()); + pageSource->Attach(); + + auto index = ROOT::Experimental::Internal::CreateRNTupleIndex({"run"}, *pageSource); + + auto entryIdxs = index->GetEntryIndices(1); + auto expected = std::vector{0, 1, 2, 3, 4}; + EXPECT_EQ(expected, entryIdxs); + entryIdxs = index->GetEntryIndices(2); + expected = {5, 6, 7}; + EXPECT_EQ(expected, entryIdxs); + entryIdxs = index->GetEntryIndices(3); + expected = {8, 9}; + EXPECT_EQ(expected, entryIdxs); + entryIdxs = index->GetEntryIndices(4); + expected = {}; + EXPECT_EQ(expected, entryIdxs); +} diff --git a/tree/ntuple/v7/test/ntuple_test.hxx b/tree/ntuple/v7/test/ntuple_test.hxx index a2baf9d9b1a25f..e1fb63ae8e5709 100644 --- a/tree/ntuple/v7/test/ntuple_test.hxx +++ b/tree/ntuple/v7/test/ntuple_test.hxx @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include