Skip to content

Commit

Permalink
[ntuple] Add initial in-memory index prototype
Browse files Browse the repository at this point in the history
This PR adds (a first version of) the `RNTupleIndex`, which is an
in-memory structure that maps RNTuple field values (or combinations
thereof) to an entry index in the RNTuple for which the index was
built. At this point, the index only resides in memory and thus has
to be (re)built each time.

`RNTupleIndex` will be used by the `RNTupleProcessor` to enable
dataset joins and will be as transparent as possible to users.
Currently, no public interface is foreseen.
  • Loading branch information
enirolf committed Jun 28, 2024
1 parent 5ff9fc5 commit 68b602a
Show file tree
Hide file tree
Showing 6 changed files with 404 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tree/ntuple/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ HEADERS
ROOT/RNTupleFillContext.hxx
ROOT/RNTupleFillStatus.hxx
ROOT/RNTupleImtTaskScheduler.hxx
ROOT/RNTupleIndex.hxx
ROOT/RNTupleMerger.hxx
ROOT/RNTupleMetrics.hxx
ROOT/RNTupleModel.hxx
Expand Down Expand Up @@ -64,6 +65,7 @@ SOURCES
v7/src/RNTupleDescriptor.cxx
v7/src/RNTupleDescriptorFmt.cxx
v7/src/RNTupleFillContext.cxx
v7/src/RNTupleIndex.cxx
v7/src/RNTupleMerger.cxx
v7/src/RNTupleMetrics.cxx
v7/src/RNTupleModel.cxx
Expand Down
141 changes: 141 additions & 0 deletions tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/// \file ROOT/RNTupleIndex.hxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <florine.de.geus@cern.ch>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#ifndef ROOT7_RNTupleIndex
#define ROOT7_RNTupleIndex

#include <ROOT/RField.hxx>
#include <ROOT/RNTupleUtil.hxx>

#include <unordered_map>
#include <memory>
#include <set>
#include <string>
#include <vector>

namespace ROOT {
namespace Experimental {
namespace Internal {
/////////////////////////////////////////////////////////////////////////////
/// Container for the combined hash of the indexed fields. Uses the implementation from `boost::hash_combine` (see
/// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine).
///
/// The struct also provides `operator()`, so the same type can serve as both the key type and the hash functor of a
/// `std::unordered_map`.
///
/// \note Equality is decided on the combined hash alone: two different sets of field values whose hashes combine to
/// the same number compare equal.
struct RIndexValue {
   /// The combined hash; 0 as long as nothing has been folded in.
   std::size_t fValue = 0;

   /// Fold the hash `other` into the current value. The result depends on the order in which hashes are folded in.
   /// \return A reference to this object, so that calls can be chained.
   RIndexValue &operator+=(std::size_t other) noexcept
   {
      fValue ^= other + 0x9e3779b9 + (fValue << 6) + (fValue >> 2);
      return *this;
   }

   /// Two index values are equal when their combined hashes are equal.
   bool operator==(const RIndexValue &other) const noexcept { return other.fValue == fValue; }

   /// Hash functor: the stored value already is a hash, so it is returned unchanged.
   std::size_t operator()(const RIndexValue &val) const noexcept { return val.fValue; }
};

// clang-format off
/**
\class ROOT::Experimental::Internal::RNTupleIndex
\ingroup NTuple
\brief In-memory index over one or more fields of an RNTuple, so that it can be joined onto other RNTuples.

The index maps the combined hash of the indexed field values to the entry numbers that were added for it.
*/
// clang-format on
class RNTupleIndex {
   // Instances are created through this factory function; it needs access to the private constructor.
   friend std::unique_ptr<RNTupleIndex>
   CreateRNTupleIndex(std::vector<std::string_view> fieldNames, RPageSource &pageSource);

private:
   /// The fields whose values make up the index.
   std::vector<std::unique_ptr<RFieldBase>> fFields;
   /// For every combined index value, the entry numbers that were added for it.
   std::unordered_map<RIndexValue, std::vector<NTupleSize_t>, RIndexValue> fIndex;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Create an RNTupleIndex for an existing RNTuple.
   ///
   /// \param[in] fields The fields that will make up the index.
   /// \param[in] pageSource The page source of the RNTuple to build the index for.
   ///
   /// \note The page source is assumed to be attached already.
   RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> fields, RPageSource &pageSource);

public:
   // The index can be neither copied nor moved.
   RNTupleIndex(const RNTupleIndex &) = delete;
   RNTupleIndex(RNTupleIndex &&) = delete;
   RNTupleIndex &operator=(const RNTupleIndex &) = delete;
   RNTupleIndex &operator=(RNTupleIndex &&) = delete;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the number of distinct index values currently stored.
   std::size_t GetNElems() const { return fIndex.size(); }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Add an entry to the index.
   ///
   /// \param[in] valuePtrs Pointers to the values of the indexed fields, one per field and in field order.
   /// \param[in] entry The entry number to store for the index value formed by `valuePtrs`.
   void Add(std::vector<void *> valuePtrs, NTupleSize_t entry);

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the entry number containing the given index value.
   ///
   /// \param[in] valuePtrs Pointers to the values of the indexed fields, one per field and in field order.
   /// \return The entry number containing the specified index value. When no such entry exists, return
   /// `kInvalidNTupleIndex`.
   ///
   /// Note that in case multiple entries corresponding to the provided index value exist, the first occurrence is
   /// returned. Use RNTupleIndex::GetEntryIndices to get all entries.
   NTupleSize_t GetEntryIndex(std::vector<void *> valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the entry number containing the given index value.
   ///
   /// Convenience overload taking the values themselves instead of pointers to them.
   ///
   /// \param[in] values The values of the indexed fields, one per field and in field order.
   /// \throws RException if the number of values differs from the number of indexed fields.
   ///
   /// \sa GetEntryIndex(std::vector<void *> valuePtrs)
   template <typename... Ts>
   NTupleSize_t GetEntryIndex(Ts... values) const
   {
      if (fFields.size() != sizeof...(Ts))
         throw RException(R__FAIL("number of value pointers must match number of indexed fields"));

      // The arguments are local copies, so their addresses stay valid for the duration of the lookup.
      std::vector<void *> ptrs;
      (ptrs.push_back(&values), ...);

      return GetEntryIndex(ptrs);
   }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index value.
   ///
   /// \param[in] valuePtrs Pointers to the values of the indexed fields, one per field and in field order.
   /// \return The entry numbers containing the specified index value. When no entries exist, return an empty vector.
   std::vector<NTupleSize_t> GetEntryIndices(std::vector<void *> valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index value.
   ///
   /// Convenience overload taking the values themselves instead of pointers to them.
   ///
   /// \param[in] values The values of the indexed fields, one per field and in field order.
   /// \throws RException if the number of values differs from the number of indexed fields.
   ///
   /// \sa GetEntryIndices(std::vector<void *> valuePtrs)
   template <typename... Ts>
   std::vector<NTupleSize_t> GetEntryIndices(Ts... values) const
   {
      if (fFields.size() != sizeof...(Ts))
         throw RException(R__FAIL("number of value pointers must match number of indexed fields"));

      // The arguments are local copies, so their addresses stay valid for the duration of the lookup.
      std::vector<void *> ptrs;
      (ptrs.push_back(&values), ...);

      return GetEntryIndices(ptrs);
   }
};

////////////////////////////////////////////////////////////////////////////////
/// \brief Create an RNTupleIndex from an existing RNTuple.
///
/// \param[in] fieldNames The names of the fields to index.
/// \param[in] pageSource The page source of the RNTuple to build the index for. It is attached by this function.
///
/// \return A pointer to the newly-created index.
///
/// \throws RException if one of the requested fields cannot be found or constructed.
///
std::unique_ptr<RNTupleIndex> CreateRNTupleIndex(std::vector<std::string_view> fieldNames, RPageSource &pageSource);

} // namespace Internal
} // namespace Experimental
} // namespace ROOT

#endif // ROOT7_RNTupleIndex
98 changes: 98 additions & 0 deletions tree/ntuple/v7/src/RNTupleIndex.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/// \file RNTupleIndex.cxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <florine.de.geus@cern.ch>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#include <ROOT/RNTupleIndex.hxx>

/// Build the index by reading the indexed fields for every entry of the page source and adding each entry under the
/// index value formed by its field values.
ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> fields,
                                                         RPageSource &pageSource)
   : fFields(std::move(fields))
{
   // One value object per indexed field; each one is re-read for every entry.
   std::vector<RFieldBase::RValue> values;
   for (std::size_t iField = 0; iField < fFields.size(); ++iField)
      values.emplace_back(fFields[iField]->CreateValue());

   std::uint64_t entry = 0;
   while (entry < pageSource.GetNEntries()) {
      // Read the current entry field by field and collect type-erased pointers to the values just read.
      std::vector<void *> rawPtrs;
      for (auto &value : values) {
         value.Read(entry);
         rawPtrs.push_back(value.GetPtr<void>().get());
      }
      Add(rawPtrs, entry);
      ++entry;
   }
}

void ROOT::Experimental::Internal::RNTupleIndex::Add(std::vector<void *> valuePtrs, NTupleSize_t entry)
{
RIndexValue indexValue;
for (unsigned i = 0; i < fFields.size(); ++i) {
indexValue += fFields[i]->GetHash(valuePtrs[i]);
}
fIndex[indexValue].push_back(entry);
}

/// Look up the first entry stored for the index value formed by `valuePtrs`; `kInvalidNTupleIndex` is returned when
/// GetEntryIndices() finds nothing.
ROOT::Experimental::NTupleSize_t
ROOT::Experimental::Internal::RNTupleIndex::GetEntryIndex(std::vector<void *> valuePtrs) const
{
   const auto matches = GetEntryIndices(valuePtrs);
   if (!matches.empty())
      return matches.front();
   return kInvalidNTupleIndex;
}

std::vector<ROOT::Experimental::NTupleSize_t>
ROOT::Experimental::Internal::RNTupleIndex::GetEntryIndices(std::vector<void *> valuePtrs) const
{
RIndexValue indexValue;
for (unsigned i = 0; i < fFields.size(); ++i) {
indexValue += fFields[i]->GetHash(valuePtrs[i]);
}

if (!fIndex.count(indexValue))
return {};

return fIndex.at(indexValue);
}

//------------------------------------------------------------------------------

std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex>
ROOT::Experimental::Internal::CreateRNTupleIndex(std::vector<std::string_view> fieldNames, RPageSource &pageSource)
{
pageSource.Attach();
auto desc = pageSource.GetSharedDescriptorGuard();

std::vector<std::unique_ptr<RFieldBase>> fields;

for (const auto &fieldName : fieldNames) {
auto fieldId = desc->FindFieldId(fieldName);
if (fieldId == kInvalidDescriptorId)
throw RException(R__FAIL("could not find field \"" + std::string(fieldName) + ""));

const auto &fieldDesc = desc->GetFieldDescriptor(fieldId);
auto fieldOrException = RFieldBase::Create(fieldDesc.GetFieldName(), fieldDesc.GetTypeName());
if (!fieldOrException) {
throw RException(R__FAIL("could not construct field \"" + std::string(fieldName) + "\""));
}
auto field = fieldOrException.Unwrap();
field->SetOnDiskId(fieldDesc.GetId());

CallConnectPageSourceOnField(*field, pageSource);

fields.push_back(std::move(field));
}

return std::unique_ptr<RNTupleIndex>(new RNTupleIndex(std::move(fields), pageSource));
}
1 change: 1 addition & 0 deletions tree/ntuple/v7/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ROOT_GENERATE_DICTIONARY(RXTupleDict ${CMAKE_CURRENT_SOURCE_DIR}/RXTuple.hxx
ROOT_ADD_GTEST(ntuple_descriptor ntuple_descriptor.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_endian ntuple_endian.cxx LIBRARIES ROOTNTuple)
ROOT_ADD_GTEST(ntuple_friends ntuple_friends.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_index ntuple_index.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_merger ntuple_merger.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_metrics ntuple_metrics.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_model ntuple_model.cxx LIBRARIES ROOTNTuple CustomStruct)
Expand Down
Loading

0 comments on commit 68b602a

Please sign in to comment.