-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This adds (a first version of) the `RNTupleIndex`, which is an in-memory structure that maps RNTuple field values (or combinations thereof) to an entry index in the RNTuple for which the index was built. At this point, the index only resides in memory and thus has to be (re)built each time. `RNTupleIndex` will be used by the `RNTupleProcessor` to enable dataset joins and will be as transparent as possible to users. Currently, no public interface is foreseen.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
/// \file ROOT/RNTupleIndex.hxx | ||
/// \ingroup NTuple ROOT7 | ||
/// \author Florine de Geus <florine.de.geus@cern.ch> | ||
/// \date 2024-04-02 | ||
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback | ||
/// is welcome! | ||
|
||
/************************************************************************* | ||
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. * | ||
* All rights reserved. * | ||
* * | ||
* For the licensing terms see $ROOTSYS/LICENSE. * | ||
* For the list of contributors see $ROOTSYS/README/CREDITS. * | ||
*************************************************************************/ | ||
|
||
#ifndef ROOT7_RNTupleIndex | ||
#define ROOT7_RNTupleIndex | ||
|
||
#include <ROOT/RField.hxx> | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <vector> | ||
|
||
namespace ROOT { | ||
namespace Experimental { | ||
namespace Internal { | ||
// clang-format off | ||
/** | ||
\class ROOT::Experimental::Internal::RNTupleIndex | ||
\ingroup NTuple | ||
\brief Builds an index on one or several fields of an RNTuple so it can be joined onto other RNTuples. | ||
*/ | ||
// clang-format on | ||
class RNTupleIndex { | ||
public: | ||
using NTupleIndexValue_t = std::uint64_t; | ||
|
||
private: | ||
///////////////////////////////////////////////////////////////////////////// | ||
/// Container for the hashes of the indexed fields. | ||
class RIndexValue { | ||
public: | ||
std::vector<NTupleIndexValue_t> fFieldValues; | ||
RIndexValue(const std::vector<NTupleIndexValue_t> &fieldValues) | ||
{ | ||
fFieldValues.reserve(fieldValues.size()); | ||
fFieldValues = fieldValues; | ||
} | ||
inline bool operator==(const RIndexValue &other) const { return other.fFieldValues == fFieldValues; } | ||
}; | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// Hash combinining the individual index value hashes from RIndexValue. Uses the implementation from | ||
/// `boost::hash_combine` (see | ||
/// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine). | ||
struct RIndexValueHash { | ||
inline std::size_t operator()(const RIndexValue &indexValue) const | ||
{ | ||
std::size_t combinedHash = 0; | ||
for (const auto &fieldVal : indexValue.fFieldValues) { | ||
combinedHash ^= fieldVal + 0x9e3779b9 + (fieldVal << 6) + (fieldVal >> 2); | ||
} | ||
return combinedHash; | ||
} | ||
}; | ||
|
||
/// The fields for which the index is built. Used to compute the hashes for each entry value. | ||
const std::vector<std::unique_ptr<RFieldBase>> fFields; | ||
|
||
/// The index itself. Maps field values (or combinations thereof in case the index is defined for multiple fields) to | ||
/// their respsective entry numbers. | ||
std::unordered_map<RIndexValue, std::vector<NTupleSize_t>, RIndexValueHash> fIndex; | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Create an RNTupleIndex for an existing RNTuple. | ||
/// | ||
/// \param[in] The fields that will make up the index. | ||
/// \param[in] The number of entries to index. | ||
/// | ||
/// \note The page source is assumed be attached already. | ||
RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields, NTupleSize_t nEntries); | ||
|
||
public: | ||
RNTupleIndex(const RNTupleIndex &other) = delete; | ||
RNTupleIndex &operator=(const RNTupleIndex &other) = delete; | ||
RNTupleIndex(RNTupleIndex &&other) = delete; | ||
RNTupleIndex &operator=(RNTupleIndex &&other) = delete; | ||
~RNTupleIndex() = default; | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Create an RNTupleIndex from an existing RNTuple. | ||
/// | ||
/// \param[in] fieldNames The names of the fields to index. | ||
/// \param pageSource The page source. | ||
/// | ||
/// \return A pointer to the newly-created index. | ||
/// | ||
/// \note Only integral-type fields are allowed to be used as index fields. | ||
static std::unique_ptr<RNTupleIndex> Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource); | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Get the number of elements currently indexed. | ||
/// | ||
/// \return The number of elements currently indexed. | ||
std::size_t GetNElements() const { return fIndex.size(); } | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Get the first entry number containing the given index value. | ||
/// | ||
/// \param[in] valuePtrs A vector of pointers to the index values to look up. | ||
/// | ||
/// \return The first entry number that corresponds to `valuePtrs`. When no such entry exists, `kInvalidNTupleIndex` | ||
/// is returned. | ||
/// | ||
/// Note that in case multiple entries corresponding to the provided index value exist, the first occurrence is | ||
/// returned. Use RNTupleIndex::GetAllEntryNumbers to get all entries. | ||
NTupleSize_t GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const; | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Get the entry number containing the given index value. | ||
/// | ||
/// \sa GetFirstEntryNumber(std::vector<void *> valuePtrs) | ||
template <typename... Ts> | ||
NTupleSize_t GetFirstEntryNumber(Ts... values) const | ||
{ | ||
if (sizeof...(Ts) != fFields.size()) | ||
throw RException(R__FAIL("Number of values must match number of indexed fields.")); | ||
|
||
std::vector<void *> valuePtrs; | ||
valuePtrs.reserve(sizeof...(Ts)); | ||
([&] { valuePtrs.push_back(&values); }(), ...); | ||
|
||
return GetFirstEntryNumber(valuePtrs); | ||
} | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Get all entry numbers for the given index. | ||
/// | ||
/// \param[in] valuePtrs A vector of pointers to the index values to look up. | ||
/// | ||
/// \return The entry numbers that corresponds to `valuePtrs`. When no such entry exists, an empty vector is | ||
/// returned. | ||
const std::vector<NTupleSize_t> *GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const; | ||
|
||
///////////////////////////////////////////////////////////////////////////// | ||
/// \brief Get all entry numbers for the given index. | ||
/// | ||
/// \sa GetAllEntryNumbers(std::vector<void *> valuePtrs) | ||
template <typename... Ts> | ||
const std::vector<NTupleSize_t> *GetAllEntryNumbers(Ts... values) const | ||
{ | ||
if (sizeof...(Ts) != fFields.size()) | ||
throw RException(R__FAIL("Number of values must match number of indexed fields.")); | ||
|
||
std::vector<void *> valuePtrs; | ||
valuePtrs.reserve(sizeof...(Ts)); | ||
([&] { valuePtrs.push_back(&values); }(), ...); | ||
|
||
return GetAllEntryNumbers(valuePtrs); | ||
} | ||
}; | ||
} // namespace Internal | ||
} // namespace Experimental | ||
} // namespace ROOT | ||
|
||
#endif // ROOT7_RNTupleIndex |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
/// \file RNTupleIndex.cxx | ||
/// \ingroup NTuple ROOT7 | ||
/// \author Florine de Geus <florine.de.geus@cern.ch> | ||
/// \date 2024-04-02 | ||
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback | ||
/// is welcome! | ||
|
||
/************************************************************************* | ||
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. * | ||
* All rights reserved. * | ||
* * | ||
* For the licensing terms see $ROOTSYS/LICENSE. * | ||
* For the list of contributors see $ROOTSYS/README/CREDITS. * | ||
*************************************************************************/ | ||
|
||
#include <ROOT/RNTupleIndex.hxx> | ||
|
||
namespace { | ||
/// Reads the integer stored at `valuePtr` and widens it to the index value type.
///
/// The number of bytes read is selected by `field.GetValueSize()`; the stored bytes are interpreted as an unsigned
/// integer of that width.
///
/// \param[in] valuePtr Pointer to the value to read. Must point to at least `field.GetValueSize()` readable bytes.
/// \param[in] field The field describing the value; only its value size (and its name, for error reporting) is used.
///
/// \return The value, zero-extended to `NTupleIndexValue_t`.
///
/// Throws an RException when the value size is not 1, 2, 4 or 8 bytes, rather than returning an indeterminate value.
ROOT::Experimental::Internal::RNTupleIndex::NTupleIndexValue_t
CastValuePtr(void *valuePtr, const ROOT::Experimental::RFieldBase &field)
{
   switch (field.GetValueSize()) {
   case 1: return *static_cast<std::uint8_t *>(valuePtr);
   case 2: return *static_cast<std::uint16_t *>(valuePtr);
   case 4: return *static_cast<std::uint32_t *>(valuePtr);
   case 8: return *static_cast<std::uint64_t *>(valuePtr);
   default: break;
   }

   throw ROOT::Experimental::RException(
      R__FAIL("Unsupported value size for index field \"" + field.GetFieldName() + "\"."));
}
} // anonymous namespace | ||
|
||
/// Builds the in-memory index by reading every indexed field for each of the first `nEntries` entries.
///
/// \param[in] fields The fields that make up the index. The vector is moved into the index and must not be used by
/// the caller afterwards.
/// \param[in] nEntries The number of entries to index.
///
/// Throws an RException when one of the fields is not a simple field or has type "float" or "double".
ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields,
                                                         NTupleSize_t nEntries)
   : fFields(std::move(fields))
{
   // `fields` has been moved from at this point; only `fFields` may be consulted from here on.
   std::vector<RFieldBase::RValue> fieldValues;
   fieldValues.reserve(fFields.size());
   for (const auto &field : fFields) {
      if (!field->IsSimple() || field->GetTypeName() == "float" || field->GetTypeName() == "double") {
         throw RException(R__FAIL("Cannot use field \"" + field->GetFieldName() + "\" with type \"" +
                                  field->GetTypeName() + "\" for indexing. Only integral types are allowed."));
      }
      fieldValues.emplace_back(field->CreateValue());
   }

   // Scratch buffer for the key of the current entry; reused across entries to avoid reallocations.
   std::vector<NTupleIndexValue_t> indexValues;
   indexValues.reserve(fFields.size());

   // The counter has the same type as `nEntries` so that it cannot wrap around before reaching the bound.
   for (NTupleSize_t entry = 0; entry < nEntries; ++entry) {
      indexValues.clear();
      for (auto &fieldValue : fieldValues) {
         // TODO(fdegeus): use bulk reading
         fieldValue.Read(entry);

         auto valuePtr = fieldValue.GetPtr<void>();
         indexValues.push_back(CastValuePtr(valuePtr.get(), fieldValue.GetField()));
      }
      // Entries sharing the same key are appended in increasing entry order.
      fIndex[RIndexValue(indexValues)].push_back(entry);
   }
}
|
||
std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex> | ||
ROOT::Experimental::Internal::RNTupleIndex::Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource) | ||
{ | ||
pageSource.Attach(); | ||
auto desc = pageSource.GetSharedDescriptorGuard(); | ||
|
||
std::vector<std::unique_ptr<RFieldBase>> fields; | ||
fields.reserve(fieldNames.size()); | ||
|
||
for (const auto &fieldName : fieldNames) { | ||
auto fieldId = desc->FindFieldId(fieldName); | ||
if (fieldId == kInvalidDescriptorId) | ||
throw RException(R__FAIL("Could not find field \"" + std::string(fieldName) + ".")); | ||
|
||
const auto &fieldDesc = desc->GetFieldDescriptor(fieldId); | ||
auto field = fieldDesc.CreateField(desc.GetRef()); | ||
|
||
CallConnectPageSourceOnField(*field, pageSource); | ||
|
||
fields.push_back(std::move(field)); | ||
} | ||
|
||
return std::unique_ptr<RNTupleIndex>(new RNTupleIndex(fields, pageSource.GetNEntries())); | ||
} | ||
|
||
/// Looks up the index value given by `valuePtrs` and returns the first entry number recorded for it, or
/// `kInvalidNTupleIndex` when the value is not present in the index.
ROOT::Experimental::NTupleSize_t
ROOT::Experimental::Internal::RNTupleIndex::GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const
{
   // A null result from the full lookup means the value is not indexed.
   if (const auto *matchingEntries = GetAllEntryNumbers(valuePtrs))
      return matchingEntries->front();

   return kInvalidNTupleIndex;
}
|
||
const std::vector<ROOT::Experimental::NTupleSize_t> * | ||
ROOT::Experimental::Internal::RNTupleIndex::GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const | ||
{ | ||
if (valuePtrs.size() != fFields.size()) | ||
throw RException(R__FAIL("Number of value pointers must match number of indexed fields.")); | ||
|
||
std::vector<NTupleIndexValue_t> indexValues; | ||
indexValues.reserve(fFields.size()); | ||
|
||
for (unsigned i = 0; i < valuePtrs.size(); ++i) { | ||
indexValues.push_back(CastValuePtr(valuePtrs[i], *fFields[i])); | ||
} | ||
|
||
auto entryNumber = fIndex.find(RIndexValue(indexValues)); | ||
|
||
if (entryNumber == fIndex.end()) | ||
return nullptr; | ||
|
||
return &(entryNumber->second); | ||
} |