Skip to content

SQlite data source for RDataFrame #2322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Aug 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
420485d
[DF] Add skeleton for sqlite data source
jblomer Jul 12, 2018
8844e7a
[DF] Build sqlite data source only if sqlite support is on
jblomer Jul 12, 2018
d9b2b8f
[DF] Draft of sqlite data source
jblomer Jul 14, 2018
87fe2ec
[DF] Fix memory bug in sqlite data source
jblomer Jul 14, 2018
50cc6f8
[DF] Sqlite data source featuree complete, single-threaded
jblomer Jul 15, 2018
64a01a2
[DF] Avoid 'unused parameter' warnings in sqlite ds
jblomer Jul 16, 2018
9405c43
[DF] revert accidental changes in base class
jblomer Jul 16, 2018
7480fc9
[DF] Fix expression columns in SQlite data source
jblomer Jul 16, 2018
12cc5b5
[DF] Fix code sytle in sqlite ds
jblomer Jul 16, 2018
d54f954
[DF] Make SQlite ds thread safe through full serialization
jblomer Jul 16, 2018
03a2b33
[DF] Add unit tests for sqlite data source
jblomer Jul 18, 2018
98d9837
[DF] Fix compiler warning in sqlite ds
jblomer Jul 18, 2018
fdd083a
[DF] Test sqlite queries with duplicated columns
jblomer Jul 19, 2018
7eb3431
[DF] Minor fixes to unit tests
jblomer Aug 6, 2018
87db91b
[DF] Add missing include
jblomer Aug 6, 2018
ce9f16f
[DF] Adjust virtual inheritance in sqlite ds
jblomer Aug 6, 2018
df2efed
[DF] Cosmetics in sqlite ds
jblomer Aug 6, 2018
11d428e
[DF] Fix fNSlots initialization in sqlite ds
jblomer Aug 6, 2018
2c1f86d
[DF] Replace NULL by nullptr in sqlite ds
jblomer Aug 6, 2018
e9b7841
[DF] Cosmetics in sqlite ds
jblomer Aug 6, 2018
bbf4283
[DF] Add missing includes in sqlite ds
jblomer Aug 6, 2018
dc0311d
[DF] Improve initialization of RSqliteDS::fValues
jblomer Aug 6, 2018
03e0c58
[DF] Remove unnecessary lock guards in RSqliteDS
jblomer Aug 6, 2018
d05c2fb
[DF] Use static array for type names in RSqliteDS
jblomer Aug 6, 2018
56c35dd
[DF] Code simplification in RSqliteDS
jblomer Aug 6, 2018
1891e6d
[DF] Fix identification of sqlite column type
jblomer Aug 7, 2018
87a5d43
[DF] test snapshot for sqlite ds
jblomer Aug 13, 2018
1699c5d
[DF] add warning for IMT with RSqliteDS
jblomer Aug 16, 2018
7a84ebe
[DF] formatting fixes according to clang-format
jblomer Aug 29, 2018
5a7cc7c
[DF] fix includes for building RSqliteDS
jblomer Aug 29, 2018
ac276b4
[DF] More comments for sqlite ds
jblomer Aug 29, 2018
07a32c1
[DF] Don't link against SQlite libraries if not found
jblomer Aug 29, 2018
cbaf713
[DF] fix comment line
jblomer Aug 30, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion tree/dataframe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ if (ARROW_FOUND)
include_directories(${ARROW_INCLUDE_DIR})
endif()

# SQlite support is optional: only when `sqlite` evaluates to true are the
# SQlite headers added to the include path and the SQlite libraries recorded
# in DATAFRAME_SQLITE_LIBRARIES.
if (sqlite)
include_directories(${SQLITE_INCLUDE_DIR})
# This is the only place in this excerpt where DATAFRAME_SQLITE_LIBRARIES is set.
set (DATAFRAME_SQLITE_LIBRARIES ${SQLITE_LIBRARIES})
endif()

ROOT_GLOB_HEADERS(dictHeaders inc/*.h inc/ROOT/*.hxx)

# these headers are deprecated
Expand All @@ -22,11 +27,16 @@ if(NOT ARROW_FOUND)
list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/src/RArrowDS.cxx)
endif()

if(NOT sqlite)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you may need to use ${ROOT_sqlite_FOUND} or equivalent here and elsewhere.

list(REMOVE_ITEM dictHeaders ${CMAKE_CURRENT_SOURCE_DIR}/inc/ROOT/RSqliteDS.hxx)
list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/src/RSqliteDS.cxx)
endif()

ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
HEADERS ${dictHeaders}
SOURCES ${sources}
DICTIONARY_OPTIONS "-writeEmptyRootPCM"
LIBRARIES ${TBB_LIBRARIES} ${ARROW_SHARED_LIB}
LIBRARIES ${TBB_LIBRARIES} ${ARROW_SHARED_LIB} ${DATAFRAME_SQLITE_LIBRARIES}
DEPENDENCIES Tree TreePlayer Hist RIO ROOTVecOps Imt
${TREEPLAYER_DEPENDENCIES})

Expand Down
120 changes: 120 additions & 0 deletions tree/dataframe/inc/ROOT/RSqliteDS.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Author: Jakob Blomer CERN 07/2018

/*************************************************************************
* Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#ifndef ROOT_RSQLITEDS
#define ROOT_RSQLITEDS

#include "ROOT/RDataFrame.hxx"
#include "ROOT/RDataSource.hxx"
#include "ROOT/RStringView.hxx"

#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <sqlite3.h>

namespace ROOT {

namespace RDF {

// clang-format off
/**
\class ROOT::RDF::RSqliteDS
\ingroup dataframe
\brief RSqliteDS is an RDF data source implementation for SQL result sets from sqlite3 files.

The RSqliteDS is able to feed an RDataFrame with data from a SQlite SELECT query. One can use it like

auto rdf = ROOT::RDF::MakeSqliteDataFrame("/path/to/file.sqlite", "select name from table");
auto h = rdf.Define("lName", "name.length()").Histo1D("lName");

The data source has to provide column types for all the columns. Determining column types in SQlite is tricky
as it is dynamically typed and in principle each row can have different column types. The following heuristics
are used:

- If a table column is queried as is ("SELECT colname FROM table"), the default/declared column type is taken.
- For expressions ("SELECT 1+1 FROM table"), the type of the first row of the result set determines the column type.
That can result in a column being assumed to be of type NULL even though subsequent rows actually have meaningful values.
The provided SELECT query can be used to avoid such ambiguities.
*/
class RSqliteDS final : public ROOT::RDF::RDataSource {
private:
// clang-format off
/// All the types known to SQlite. Changes require changing fgTypeNames, too.
enum class ETypes {
kInteger,
kReal,
kText,
kBlob,
kNull
};
// clang-format on

/// Used to hold a single "cell" of the SELECT query's result table. Can be changed to std::variant once available.
struct Value_t {
explicit Value_t(ETypes type);

ETypes fType;
bool fIsActive; ///< Not all columns of the query are necessarily used by the RDF. Allows for skipping them.
Long64_t fInteger;
double fReal;
std::string fText;
std::vector<unsigned char> fBlob;
void *fNull;
void *fPtr; ///< Points to one of the values; an address to this pointer is returned by GetColumnReadersImpl.
};

void SqliteError(int errcode);

sqlite3 *fDb;
sqlite3_stmt *fQuery;
unsigned int fNSlots;
ULong64_t fNRow;
std::vector<std::string> fColumnNames;
std::vector<ETypes> fColumnTypes;
/// The data source is inherently single-threaded and returns only one row at a time. This vector holds the results.
std::vector<Value_t> fValues;

// clang-format off
/// Corresponds to the types defined in ETypes.
static constexpr char const *fgTypeNames[] = {
"Long64_t",
"double",
"std::string",
"std::vector<unsigned char>",
"void *"
};
// clang-format on

public:
RSqliteDS(std::string_view fileName, std::string_view query);
~RSqliteDS();
void SetNSlots(unsigned int nSlots) final;
const std::vector<std::string> &GetColumnNames() const final;
bool HasColumn(std::string_view colName) const final;
std::string GetTypeName(std::string_view colName) const final;
std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
bool SetEntry(unsigned int slot, ULong64_t entry) final;
void Initialise() final;

protected:
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final;
};

/// Factory for an RDataFrame fed from a SQlite query.
/// \param fileName  the sqlite3 file to read from, e.g. "/path/to/file.sqlite"
/// \param query     the SQL SELECT query, e.g. "select name from table"
/// \return an RDataFrame (defined in the implementation file; presumably backed by an RSqliteDS -- verify there)
RDataFrame MakeSqliteDataFrame(std::string_view fileName, std::string_view query);

} // namespace RDF

} // namespace ROOT

#endif
Loading