Skip to content

Commit

Permalink
[ntuple] Add initial in-memory index prototype
Browse files Browse the repository at this point in the history
This adds (a first version of) the `RNTupleIndex`, which is an
in-memory structure that maps RNTuple field values (or combinations
thereof) to an entry index in the RNTuple for which the index was
built. At this point, the index only resides in memory and thus has
to be (re)built each time.

`RNTupleIndex` will be used by the `RNTupleProcessor` to enable
dataset joins and will be as transparent as possible to users.
Currently, no public interface is foreseen.
  • Loading branch information
enirolf committed Sep 10, 2024
1 parent f7aab70 commit eeacbc6
Show file tree
Hide file tree
Showing 6 changed files with 514 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tree/ntuple/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ HEADERS
ROOT/RNTupleFillContext.hxx
ROOT/RNTupleFillStatus.hxx
ROOT/RNTupleImtTaskScheduler.hxx
ROOT/RNTupleIndex.hxx
ROOT/RNTupleMerger.hxx
ROOT/RNTupleMetrics.hxx
ROOT/RNTupleModel.hxx
Expand Down Expand Up @@ -64,6 +65,7 @@ SOURCES
v7/src/RNTupleDescriptor.cxx
v7/src/RNTupleDescriptorFmt.cxx
v7/src/RNTupleFillContext.cxx
v7/src/RNTupleIndex.cxx
v7/src/RNTupleMerger.cxx
v7/src/RNTupleMetrics.cxx
v7/src/RNTupleModel.cxx
Expand Down
168 changes: 168 additions & 0 deletions tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/// \file ROOT/RNTupleIndex.hxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <florine.de.geus@cern.ch>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#ifndef ROOT7_RNTupleIndex
#define ROOT7_RNTupleIndex

#include <ROOT/RField.hxx>

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

namespace ROOT {
namespace Experimental {
namespace Internal {
// clang-format off
/**
\class ROOT::Experimental::Internal::RNTupleIndex
\ingroup NTuple
\brief Builds an index on one or several fields of an RNTuple so it can be joined onto other RNTuples.
*/
// clang-format on
class RNTupleIndex {
public:
   /// Integral type to which every indexed field value is widened before it is stored in or looked up from the index.
   using NTupleIndexValue_t = std::uint64_t;

private:
   /////////////////////////////////////////////////////////////////////////////
   /// Container for the values of the indexed fields of a single entry, one element per indexed field. Acts as the key
   /// type of the index.
   class RIndexValue {
   public:
      std::vector<NTupleIndexValue_t> fFieldValues;
      RIndexValue(const std::vector<NTupleIndexValue_t> &fieldValues) : fFieldValues(fieldValues) {}
      inline bool operator==(const RIndexValue &other) const { return other.fFieldValues == fFieldValues; }
   };

   /////////////////////////////////////////////////////////////////////////////
   /// Hash combining the individual index values from RIndexValue. Uses the implementation from
   /// `boost::hash_combine` (see
   /// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine).
   struct RIndexValueHash {
      inline std::size_t operator()(const RIndexValue &indexValue) const
      {
         std::size_t combinedHash = 0;
         for (const auto &fieldVal : indexValue.fFieldValues) {
            // The shift terms must operate on the running seed, not on the incoming value. Otherwise the result
            // degenerates into a plain XOR of per-field terms, which is insensitive to the order of the field values
            // and cancels out for repeated values (e.g. (a, b) and (b, a) would always collide).
            combinedHash ^=
               static_cast<std::size_t>(fieldVal) + 0x9e3779b9 + (combinedHash << 6) + (combinedHash >> 2);
         }
         return combinedHash;
      }
   };

   /// The fields for which the index is built. Used to interpret the values passed to the lookup methods.
   const std::vector<std::unique_ptr<RFieldBase>> fFields;

   /// The index itself. Maps field values (or combinations thereof in case the index is defined for multiple fields) to
   /// their respective entry numbers.
   std::unordered_map<RIndexValue, std::vector<NTupleSize_t>, RIndexValueHash> fIndex;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Create an RNTupleIndex for an existing RNTuple.
   ///
   /// \param[in] fields The fields that will make up the index. The vector is moved from.
   /// \param[in] nEntries The number of entries to index.
   ///
   /// \note The page source is assumed to be attached already.
   RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields, NTupleSize_t nEntries);

public:
   RNTupleIndex(const RNTupleIndex &other) = delete;
   RNTupleIndex &operator=(const RNTupleIndex &other) = delete;
   RNTupleIndex(RNTupleIndex &&other) = delete;
   RNTupleIndex &operator=(RNTupleIndex &&other) = delete;
   ~RNTupleIndex() = default;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Create an RNTupleIndex from an existing RNTuple.
   ///
   /// \param[in] fieldNames The names of the fields to index.
   /// \param pageSource The page source.
   ///
   /// \return A pointer to the newly-created index.
   ///
   /// \note Only integral-type fields are allowed to be used as index fields.
   static std::unique_ptr<RNTupleIndex> Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource);

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the number of elements currently indexed.
   ///
   /// \return The number of distinct index values (i.e., keys) currently stored in the index.
   std::size_t GetNElements() const { return fIndex.size(); }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the first entry number containing the given index value.
   ///
   /// \param[in] valuePtrs A vector of pointers to the index values to look up, one per indexed field and in the same
   /// order as the indexed fields.
   ///
   /// \return The first entry number that corresponds to `valuePtrs`. When no such entry exists, `kInvalidNTupleIndex`
   /// is returned.
   ///
   /// Note that in case multiple entries corresponding to the provided index value exist, the first occurrence is
   /// returned. Use RNTupleIndex::GetAllEntryNumbers to get all entries.
   NTupleSize_t GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the entry number containing the given index value.
   ///
   /// \param[in] values The index values to look up, one per indexed field and in the same order as the indexed
   /// fields.
   ///
   /// \note The values are handed on by address and read back using the value size of the corresponding field. The
   /// argument types therefore have to be as wide as the types of the indexed fields.
   ///
   /// \sa GetFirstEntryNumber(std::vector<void *> valuePtrs)
   template <typename... Ts>
   NTupleSize_t GetFirstEntryNumber(Ts... values) const
   {
      if (sizeof...(Ts) != fFields.size())
         throw RException(R__FAIL("Number of values must match number of indexed fields."));

      std::vector<void *> valuePtrs;
      valuePtrs.reserve(sizeof...(Ts));
      // `values` are by-value parameters, so their addresses stay valid for the duration of the lookup below.
      ([&] { valuePtrs.push_back(&values); }(), ...);

      return GetFirstEntryNumber(valuePtrs);
   }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index.
   ///
   /// \param[in] valuePtrs A vector of pointers to the index values to look up, one per indexed field and in the same
   /// order as the indexed fields.
   ///
   /// \return A pointer to the entry numbers that correspond to `valuePtrs`. The pointee is owned by the index. When
   /// no such entry exists, a null pointer is returned.
   const std::vector<NTupleSize_t> *GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index.
   ///
   /// \param[in] values The index values to look up, one per indexed field and in the same order as the indexed
   /// fields.
   ///
   /// \note The values are handed on by address and read back using the value size of the corresponding field. The
   /// argument types therefore have to be as wide as the types of the indexed fields.
   ///
   /// \sa GetAllEntryNumbers(std::vector<void *> valuePtrs)
   template <typename... Ts>
   const std::vector<NTupleSize_t> *GetAllEntryNumbers(Ts... values) const
   {
      if (sizeof...(Ts) != fFields.size())
         throw RException(R__FAIL("Number of values must match number of indexed fields."));

      std::vector<void *> valuePtrs;
      valuePtrs.reserve(sizeof...(Ts));
      // `values` are by-value parameters, so their addresses stay valid for the duration of the lookup below.
      ([&] { valuePtrs.push_back(&values); }(), ...);

      return GetAllEntryNumbers(valuePtrs);
   }
};
} // namespace Internal
} // namespace Experimental
} // namespace ROOT

#endif // ROOT7_RNTupleIndex
119 changes: 119 additions & 0 deletions tree/ntuple/v7/src/RNTupleIndex.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/// \file RNTupleIndex.cxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <florine.de.geus@cern.ch>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#include <ROOT/RNTupleIndex.hxx>

namespace {
/// Read the integral value stored at `valuePtr` and widen it to the value type used by the index.
///
/// \param[in] valuePtr Pointer to the value to read. As many bytes as `field.GetValueSize()` are read from it.
/// \param[in] field The field the value belongs to; its value size selects the width of the read.
///
/// \return The value, read as an unsigned integer of the field's value size and widened to NTupleIndexValue_t.
///
/// \throws RException when the value size of the field is not 1, 2, 4 or 8 bytes.
ROOT::Experimental::Internal::RNTupleIndex::NTupleIndexValue_t
CastValuePtr(void *valuePtr, const ROOT::Experimental::RFieldBase &field)
{
   switch (field.GetValueSize()) {
   case 1: return *static_cast<const std::uint8_t *>(valuePtr);
   case 2: return *static_cast<const std::uint16_t *>(valuePtr);
   case 4: return *static_cast<const std::uint32_t *>(valuePtr);
   case 8: return *static_cast<const std::uint64_t *>(valuePtr);
   default:
      // An `assert(false)` is compiled out in release builds, which would leave the result uninitialized (flagged by
      // -Wsometimes-uninitialized / -Wmaybe-uninitialized). Fail loudly in every build type instead.
      throw ROOT::Experimental::RException(
         R__FAIL("Cannot index field \"" + field.GetFieldName() + "\": unsupported value size."));
   }
}
} // anonymous namespace

/// Build the index by reading the values of all index fields for each of the first `nEntries` entries.
///
/// \param[in] fields The fields that make up the index; the vector is moved into the index.
/// \param[in] nEntries The number of entries to index.
///
/// \throws RException when one of the fields is not a simple field or is of type "float" or "double".
ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields,
                                                         NTupleSize_t nEntries)
   : fFields(std::move(fields))
{
   std::vector<RFieldBase::RValue> fieldValues;
   // `fields` has been moved from at this point, so the size has to be taken from `fFields`.
   fieldValues.reserve(fFields.size());
   for (const auto &field : fFields) {
      if (!field->IsSimple() || field->GetTypeName() == "float" || field->GetTypeName() == "double") {
         throw RException(R__FAIL("Cannot use field \"" + field->GetFieldName() + "\" with type \"" +
                                  field->GetTypeName() + "\" for indexing. Only integral types are allowed."));
      }
      fieldValues.emplace_back(field->CreateValue());
   }

   // Scratch buffer holding the index values of the entry currently being processed; reused across entries.
   std::vector<NTupleIndexValue_t> indexValues;
   indexValues.reserve(fFields.size());

   // The counter must be as wide as `nEntries`: a 32-bit counter would wrap around and never reach the bound for
   // RNTuples with more than 2^32 entries.
   for (NTupleSize_t i = 0; i < nEntries; ++i) {
      indexValues.clear();
      for (auto &fieldValue : fieldValues) {
         // TODO(fdegeus): use bulk reading
         fieldValue.Read(i);

         auto valuePtr = fieldValue.GetPtr<void>();
         indexValues.push_back(CastValuePtr(valuePtr.get(), fieldValue.GetField()));
      }
      // Entries are appended in increasing order, so the first element of each list is the first matching entry.
      fIndex[RIndexValue(indexValues)].push_back(i);
   }
}

/// Create an index on the given fields of the RNTuple served by `pageSource`.
///
/// The page source is attached, a field object is created and connected to the page source for each name in
/// `fieldNames`, and the index is built over all entries reported by the page source.
///
/// \param[in] fieldNames The names of the fields to index.
/// \param pageSource The page source to read the field values from.
///
/// \return A pointer to the newly-created index.
///
/// \throws RException when one of the field names cannot be found in the descriptor of the page source.
std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex>
ROOT::Experimental::Internal::RNTupleIndex::Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource)
{
   pageSource.Attach();
   auto desc = pageSource.GetSharedDescriptorGuard();

   std::vector<std::unique_ptr<RFieldBase>> fields;
   fields.reserve(fieldNames.size());

   for (const auto &fieldName : fieldNames) {
      auto fieldId = desc->FindFieldId(fieldName);
      if (fieldId == kInvalidDescriptorId)
         throw RException(R__FAIL("Could not find field \"" + fieldName + "\"."));

      const auto &fieldDesc = desc->GetFieldDescriptor(fieldId);
      auto field = fieldDesc.CreateField(desc.GetRef());

      CallConnectPageSourceOnField(*field, pageSource);

      fields.push_back(std::move(field));
   }

   // The constructor is private, so std::make_unique cannot be used here.
   return std::unique_ptr<RNTupleIndex>(new RNTupleIndex(fields, pageSource.GetNEntries()));
}

/// Look up the first entry number stored for the index value pointed to by `valuePtrs`.
///
/// \param[in] valuePtrs Pointers to the index values to look up, one per indexed field.
///
/// \return The first entry number in the list stored for the index value, or `kInvalidNTupleIndex` when the index
/// value is not present.
ROOT::Experimental::NTupleSize_t
ROOT::Experimental::Internal::RNTupleIndex::GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const
{
   if (const auto matches = GetAllEntryNumbers(valuePtrs))
      return matches->front();

   return kInvalidNTupleIndex;
}

const std::vector<ROOT::Experimental::NTupleSize_t> *
ROOT::Experimental::Internal::RNTupleIndex::GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const
{
if (valuePtrs.size() != fFields.size())
throw RException(R__FAIL("Number of value pointers must match number of indexed fields."));

std::vector<NTupleIndexValue_t> indexValues;
indexValues.reserve(fFields.size());

for (unsigned i = 0; i < valuePtrs.size(); ++i) {
indexValues.push_back(CastValuePtr(valuePtrs[i], *fFields[i]));
}

auto entryNumber = fIndex.find(RIndexValue(indexValues));

if (entryNumber == fIndex.end())
return nullptr;

return &(entryNumber->second);
}
1 change: 1 addition & 0 deletions tree/ntuple/v7/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ ROOT_GENERATE_DICTIONARY(RNTupleDescriptorDict ${CMAKE_CURRENT_SOURCE_DIR}/RNTup
DEPENDENCIES RIO CustomStruct)
ROOT_ADD_GTEST(ntuple_endian ntuple_endian.cxx LIBRARIES ROOTNTuple)
ROOT_ADD_GTEST(ntuple_friends ntuple_friends.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_index ntuple_index.cxx LIBRARIES ROOTNTuple)
ROOT_ADD_GTEST(ntuple_merger ntuple_merger.cxx LIBRARIES ROOTNTuple CustomStruct ZLIB::ZLIB)
ROOT_ADD_GTEST(ntuple_metrics ntuple_metrics.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_model ntuple_model.cxx LIBRARIES ROOTNTuple CustomStruct)
Expand Down
Loading

0 comments on commit eeacbc6

Please sign in to comment.