From cfe4836fdd0fc760928d400533329b2706d66285 Mon Sep 17 00:00:00 2001 From: Juraj Smiesko <34742917+kjvbrt@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:56:12 +0200 Subject: [PATCH] Add an RDataSource for podio files and collections (#593) * Moving RDataSource closer to Podio/EDM4hep * Adding podio::ROOTDataSource class to the rootmap * Separating datasource into standalone library * Adding ON flag to all tests * The headers should install now * Other suggested adjustment * Installing also utilities directory * Cleanup setup code slightly to avoid unnecessary copies * Adding missing podioDataSourceDict target --------- Co-authored-by: Thomas Madlener --- .github/workflows/key4hep.yml | 1 + .github/workflows/pre-commit.yml | 1 + .github/workflows/publish-docs.yml | 2 +- .github/workflows/test.yml | 1 + .github/workflows/ubuntu.yml | 1 + CMakeLists.txt | 16 +- include/podio/DataSource.h | 160 ++++++++++++++++ src/CMakeLists.txt | 69 ++++++- src/DataSource.cc | 187 +++++++++++++++++++ src/rds_selection.xml | 5 + tests/root_io/CMakeLists.txt | 13 ++ tests/root_io/read_with_rdatasource_root.cpp | 35 ++++ 12 files changed, 476 insertions(+), 15 deletions(-) create mode 100644 include/podio/DataSource.h create mode 100644 src/DataSource.cc create mode 100644 src/rds_selection.xml create mode 100644 tests/root_io/read_with_rdatasource_root.cpp diff --git a/.github/workflows/key4hep.yml b/.github/workflows/key4hep.yml index b9f540a43..8e015323f 100644 --- a/.github/workflows/key4hep.yml +++ b/.github/workflows/key4hep.yml @@ -34,6 +34,7 @@ jobs: -DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \ -DUSE_EXTERNAL_CATCH2=AUTO \ -DENABLE_RNTUPLE=ON \ + -DENABLE_DATASOURCE=ON \ -G Ninja .. 
echo "::endgroup::" echo "::group::Build" diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 1f2e9275a..4e2784bff 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -35,6 +35,7 @@ jobs: cmake .. -DENABLE_SIO=ON \ -DENABLE_JULIA=ON \ -DENABLE_RNTUPLE=ON \ + -DENABLE_DATASOURCE=ON \ -DCMAKE_CXX_STANDARD=20 \ -DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror "\ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 4c38d1038..52e4e620f 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -32,7 +32,7 @@ jobs: echo -e "::endgroup::\n::group::Build podio" cmake -B build . --install-prefix=$(pwd)/install \ -GNinja -DENABLE_SIO=ON -DENABLE_RNTUPLE=ON \ - -DBUILD_TESTING=OFF \ + -DENABLE_DATASOURCE=ON -DBUILD_TESTING=OFF \ -DCMAKE_CXX_STANDARD=20 cmake --build build --target install source ./init.sh && source ./env.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ec62e79fe..5b27058f9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,6 +31,7 @@ jobs: cmake -DENABLE_SIO=ON \ -DENABLE_JULIA=ON \ -DENABLE_RNTUPLE=ON \ + -DENABLE_DATASOURCE=ON \ -DCMAKE_INSTALL_PREFIX=../install \ -DCMAKE_CXX_STANDARD=20 \ -DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \ diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 3ed25cb27..2ef4d8af8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -27,6 +27,7 @@ jobs: cd build cmake -DENABLE_SIO=ON \ -DENABLE_JULIA=ON \ + -DENABLE_DATASOURCE=ON \ -DCMAKE_INSTALL_PREFIX=../install \ -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \ diff --git a/CMakeLists.txt b/CMakeLists.txt index dfd6f98db..1f4517076 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -69,6 +69,7 @@ option(CREATE_DOC "Whether or not to create doxygen doc target." OFF) option(ENABLE_SIO "Build SIO I/O support" OFF) option(PODIO_RELAX_PYVER "Do not require exact python version match with ROOT" OFF) option(ENABLE_RNTUPLE "Build with support for the new ROOT NTtuple format" OFF) +option(ENABLE_DATASOURCE "Build podio's ROOT DataSource" OFF) option(PODIO_USE_CLANG_FORMAT "Use clang-format to format the code" OFF) option(ENABLE_JULIA "Enable Julia support. When enabled, Julia datamodels will be generated, and Julia tests will run." OFF) @@ -76,13 +77,16 @@ option(ENABLE_JULIA "Enable Julia support. When enabled, Julia datamodels w #--- Declare ROOT dependency --------------------------------------------------- list(APPEND CMAKE_PREFIX_PATH $ENV{ROOTSYS}) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -if(NOT ENABLE_RNTUPLE) - find_package(ROOT REQUIRED COMPONENTS RIO Tree) -else() - find_package(ROOT REQUIRED COMPONENTS RIO Tree ROOTNTuple) - if(${ROOT_VERSION} VERSION_LESS 6.28.02) +set(root_components_needed RIO Tree) +if(ENABLE_RNTUPLE) + list(APPEND root_components_needed ROOTNTuple) +endif() +if(ENABLE_DATASOURCE) + list(APPEND root_components_needed ROOTDataFrame) +endif() +find_package(ROOT REQUIRED COMPONENTS ${root_components_needed}) +if((ENABLE_RNTUPLE) AND (${ROOT_VERSION} VERSION_LESS 6.28.02)) message(FATAL_ERROR "You are trying to build podio with support for the new ROOT NTuple format, but your ROOT version is too old. 
Please update ROOT to at least version 6.28.02")
-  endif()
 endif()
 
 # ROOT_CXX_STANDARD was introduced in https://github.com/root-project/root/pull/6466
diff --git a/include/podio/DataSource.h b/include/podio/DataSource.h
new file mode 100644
index 000000000..17485c124
--- /dev/null
+++ b/include/podio/DataSource.h
@@ -0,0 +1,170 @@
+#ifndef PODIO_DATASOURCE_H
+#define PODIO_DATASOURCE_H
+
+// Podio
+#include <podio/CollectionBase.h>
+#include <podio/Frame.h>
+#include <podio/Reader.h>
+
+// ROOT
+#include <ROOT/RDataFrame.hxx>
+#include <ROOT/RDataSource.hxx>
+
+// STL
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace podio {
+///
+/// @brief RDataSource exposing the collections stored in the events of podio
+///        files as RDataFrame columns.
+///
+class DataSource : public ROOT::RDF::RDataSource {
+public:
+  ///
+  /// @brief Construct the podio::DataSource from the provided file.
+  ///
+  /// @param[in] filePath Path of the input file.
+  /// @param[in] nEvents  Number of events to run over, all events if negative.
+  ///
+  explicit DataSource(const std::string& filePath, int nEvents = -1);
+
+  ///
+  /// @brief Construct the podio::DataSource from the provided file list.
+  ///
+  /// @param[in] filePathList Paths of the input files.
+  /// @param[in] nEvents      Number of events to run over, all events if negative.
+  ///
+  explicit DataSource(const std::vector<std::string>& filePathList, int nEvents = -1);
+
+  ///
+  /// @brief Inform the podio::DataSource of the desired level of parallelism.
+  ///
+  void SetNSlots(unsigned int nSlots) override;
+
+  ///
+  /// @brief Inform podio::DataSource that an event-loop is about to start.
+  ///
+  void Initialize() override;
+
+  ///
+  /// @brief Retrieve from podio::DataSource a set of ranges of entries that
+  ///        can be processed concurrently.
+  ///
+  std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() override;
+
+  ///
+  /// @brief Inform podio::DataSource that a certain thread is about to start
+  ///        working on a certain range of entries.
+  ///
+  void InitSlot(unsigned int slot, ULong64_t firstEntry) override;
+
+  ///
+  /// @brief Inform podio::DataSource that a certain thread is about to start
+  ///        working on a certain entry.
+  ///
+  bool SetEntry(unsigned int slot, ULong64_t entry) override;
+
+  ///
+  /// @brief Inform podio::DataSource that a certain thread finished working
+  ///        on a certain range of entries.
+  ///
+  void FinalizeSlot(unsigned int slot) override;
+
+  ///
+  /// @brief Inform podio::DataSource that an event-loop finished.
+  ///
+  void Finalize() override;
+
+  ///
+  /// @brief Returns a reference to the collection of the dataset's column
+  ///        names
+  ///
+  const std::vector<std::string>& GetColumnNames() const override;
+
+  ///
+  /// @brief Checks if the dataset has a certain column.
+  ///
+  bool HasColumn(std::string_view columnName) const override;
+
+  ///
+  /// @brief Type of a column as a string. Required for JITting.
+  ///
+  std::string GetTypeName(std::string_view columnName) const override;
+
+protected:
+  ///
+  /// @brief Type-erased vector of pointers to pointers to column
+  ///        values --- one per slot.
+  ///
+  std::vector<void*> GetColumnReadersImpl(std::string_view name, const std::type_info& typeInfo) override;
+
+  std::string AsString() override {
+    return "Podio data source";
+  }
+
+private:
+  /// Number of slots/threads
+  unsigned int m_nSlots = 1;
+
+  /// Input file paths
+  std::vector<std::string> m_filePathList = {};
+
+  /// Number of events to run over
+  ULong64_t m_nEvents = 0;
+
+  /// Ranges of events available to be processed
+  std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAvailable = {};
+
+  /// All ranges of events created in SetNSlots
+  std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAll = {};
+
+  /// Column names
+  std::vector<std::string> m_columnNames{};
+
+  /// Column types
+  std::vector<std::string> m_columnTypes = {};
+
+  /// Collections, m_Collections[columnIndex][slotIndex]
+  std::vector<std::vector<const podio::CollectionBase*>> m_Collections = {};
+
+  /// Indices of the columns requested via GetColumnReadersImpl
+  std::vector<unsigned int> m_activeCollections = {};
+
+  /// Root podio readers
+  std::vector<std::unique_ptr<podio::Reader>> m_podioReaders = {};
+
+  /// Podio frames
+  std::vector<std::unique_ptr<podio::Frame>> m_frames = {};
+
+  ///
+  /// @brief Setup input for the podio::DataSource.
+  ///
+  /// @param[in] nEvents Number of events to run over, all events if negative.
+  /// @return void.
+  ///
+  void SetupInput(int nEvents);
+};
+
+///
+/// @brief Create RDataFrame from multiple Podio files.
+///
+/// @param[in] filePathList  List of file paths from which the RDataFrame
+///                          will be created.
+/// @return                  RDataFrame created from input file list.
+///
+ROOT::RDataFrame CreateDataFrame(const std::vector<std::string>& filePathList);
+
+///
+/// @brief Create RDataFrame from a Podio file.
+///
+/// @param[in] filePath  File path from which the RDataFrame will be created.
+/// @return              RDataFrame created from the input file.
+///
+ROOT::RDataFrame CreateDataFrame(const std::string& filePath);
+} // namespace podio
+
+#endif /* PODIO_DATASOURCE_H */
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 119de39ab..6f425fe8f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -139,6 +139,7 @@ if(ENABLE_SIO)
   LIST(APPEND INSTALL_LIBRARIES podioSioIO podioSioIODict)
 endif()
 
+
 # --- IO
 set(io_sources
   Writer.cc
@@ -160,19 +161,63 @@ if(ENABLE_SIO)
   target_link_libraries(podioIO PUBLIC podio::podioSioIO)
 endif()
 
+
+# --- DataSource
+if(ENABLE_DATASOURCE)
+  set(rds_sources
+    DataSource.cc
+  )
+
+  set(rds_headers
+    ${PROJECT_SOURCE_DIR}/include/podio/DataSource.h
+  )
+
+  podio_add_lib_and_dict(podioDataSource "${rds_headers}" "${rds_sources}" rds_selection.xml)
+  target_link_libraries(podioDataSource PUBLIC podio::podio
+                                               podio::podioIO
+                                               podio::podioRootIO
+                                               ROOT::Core
+                                               ROOT::RIO
+                                               ROOT::Tree
+                                               ROOT::ROOTVecOps
+                                               ROOT::ROOTDataFrame
+  )
+  target_compile_definitions(podioDataSource PUBLIC PODIO_ENABLE_DATASOURCE=1)
+endif()
+
+
 # --- Install everything
-install(TARGETS podio podioDict podioRootIO podioRootIODict podioIO ${INSTALL_LIBRARIES}
-  EXPORT podioTargets
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+if (NOT ENABLE_DATASOURCE)
+  install(TARGETS podio podioDict podioRootIO podioRootIODict podioIO ${INSTALL_LIBRARIES}
+    EXPORT podioTargets
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+else()
+  install(TARGETS podio podioDict podioRootIO podioRootIODict podioIO podioDataSource podioDataSourceDict ${INSTALL_LIBRARIES}
+    EXPORT podioTargets
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+endif()
 
 # Only install the necessary headers
-if (ENABLE_SIO)
-  install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/podio DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
-else()
-  install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/podio DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
-    REGEX SIO.*\\.h$ EXCLUDE )
+file(GLOB headers_necessary
+  "${PROJECT_SOURCE_DIR}/include/podio/*.h")
+
+if (NOT ENABLE_SIO)
+  list(FILTER headers_necessary EXCLUDE REGEX SIO.*\\.h$)
+endif()
+if (NOT ENABLE_RNTUPLE)
+  list(FILTER headers_necessary EXCLUDE REGEX RNTuple.*\\.h$)
+endif()
+if (NOT ENABLE_DATASOURCE)
+  list(FILTER headers_necessary EXCLUDE REGEX DataSource.h)
 endif()
 
+install(FILES ${headers_necessary}
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/podio
+)
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/podio/utilities
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/podio
+)
+
 install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/podioDictDict.rootmap
   ${CMAKE_CURRENT_BINARY_DIR}/libpodioDict_rdict.pcm
@@ -188,6 +233,14 @@ if (ENABLE_SIO)
   )
 endif()
 
+if (ENABLE_DATASOURCE)
+  install(FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/podioDataSourceDictDict.rootmap
+    ${CMAKE_CURRENT_BINARY_DIR}/libpodioDataSourceDict_rdict.pcm
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  )
+endif()
+
 add_executable(podio_test_hashes test_hashes.cpp)
 target_link_libraries(podio_test_hashes PRIVATE podio::podio)
 install(TARGETS podio_test_hashes
diff --git a/src/DataSource.cc b/src/DataSource.cc
new file mode 100644
index 000000000..cdc678a98
--- /dev/null
+++ b/src/DataSource.cc
@@ -0,0 +1,201 @@
+#include "podio/DataSource.h"
+#include "podio/Reader.h"
+
+// podio
+#include <podio/FrameCategories.h>
+
+// ROOT
+#include <ROOT/RDataFrame.hxx>
+
+// STL
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+
+namespace podio {
+DataSource::DataSource(const std::string& filePath, int nEvents) : m_nSlots{1} {
+  m_filePathList.emplace_back(filePath);
+  SetupInput(nEvents);
+}
+
+DataSource::DataSource(const std::vector<std::string>& filePathList, int nEvents) :
+    m_nSlots{1}, m_filePathList{filePathList} {
+  SetupInput(nEvents);
+}
+
+// Probe the input: fix the number of events to run over and collect the names
+// and types of the valid collections found in the first event
+void DataSource::SetupInput(int nEvents) {
+  if (m_filePathList.empty()) {
+    throw std::runtime_error("podio::DataSource: No input files provided!");
+  }
+
+  // Check if the provided file(s) exists and contain required metadata is done
+  // by podio::Reader
+  auto podioReader = podio::makeReader(m_filePathList);
+
+  // The number of events has to be checked before the first event is read
+  const ULong64_t nEventsInFiles = podioReader.getEntries(podio::Category::Event);
+  if (nEventsInFiles == 0) {
+    throw std::runtime_error("podio::DataSource: No events found!");
+  }
+
+  // Determine over how many events to run: all of them for a negative request,
+  // otherwise the request capped at what the files contain
+  if (nEvents == 0) {
+    throw std::runtime_error("podio::DataSource: Requested to run over zero events!");
+  }
+  m_nEvents = nEventsInFiles;
+  if (nEvents > 0) {
+    m_nEvents = std::min(nEventsInFiles, static_cast<ULong64_t>(nEvents));
+  }
+
+  // Get collections stored in the files, using the first event as a probe
+  podio::Frame frame = podioReader.readFrame(podio::Category::Event, 0);
+  std::vector<std::string> collNames = frame.getAvailableCollections();
+  for (auto&& collName : collNames) {
+    const podio::CollectionBase* coll = frame.get(collName);
+    if (coll && coll->isValid()) {
+      m_columnNames.emplace_back(std::move(collName));
+      m_columnTypes.emplace_back(coll->getTypeName());
+    }
+  }
+}
+
+// Split the events into one range per slot and create the per-slot readers,
+// frames and collection pointers
+void DataSource::SetNSlots(unsigned int nSlots) {
+  if (nSlots == 0) {
+    throw std::runtime_error("podio::DataSource: Number of slots has to be positive!");
+  }
+  m_nSlots = nSlots;
+
+  if (m_nSlots > m_nEvents) {
+    throw std::runtime_error("podio::DataSource: Number of events too small!");
+  }
+
+  // Drop the state of a possible earlier call instead of appending to it
+  m_rangesAll.clear();
+  m_podioReaders.clear();
+  m_frames.clear();
+
+  // The last slot takes the remainder of the division in addition
+  const ULong64_t eventsPerSlot = m_nEvents / m_nSlots;
+  for (ULong64_t i = 0; i < (m_nSlots - 1); ++i) {
+    m_rangesAll.emplace_back(eventsPerSlot * i, eventsPerSlot * (i + 1));
+  }
+  m_rangesAll.emplace_back(eventsPerSlot * (m_nSlots - 1), m_nEvents);
+  m_rangesAvailable = m_rangesAll;
+
+  // Initialize set of addresses needed
+  m_Collections.resize(m_columnNames.size(), std::vector<const podio::CollectionBase*>(m_nSlots, nullptr));
+
+  // Initialize podio readers
+  for (size_t i = 0; i < m_nSlots; ++i) {
+    m_podioReaders.emplace_back(std::make_unique<podio::Reader>(podio::makeReader(m_filePathList)));
+  }
+
+  for (size_t i = 0; i < m_nSlots; ++i) {
+    m_frames.emplace_back(std::make_unique<podio::Frame>());
+  }
+}
+
+// Make every range available again, so that a further event loop on the same
+// data source does not find the ranges used up by the previous one
+void DataSource::Initialize() {
+  m_rangesAvailable = m_rangesAll;
+}
+
+// Hand out at most one range per slot and remove the ranges handed out from
+// the available ones
+std::vector<std::pair<ULong64_t, ULong64_t>> DataSource::GetEntryRanges() {
+  const auto nRanges = std::min<std::size_t>(m_rangesAvailable.size(), m_nSlots);
+  const auto first = m_rangesAvailable.begin();
+  const auto last = first + nRanges;
+
+  std::vector<std::pair<ULong64_t, ULong64_t>> rangesToBeProcessed(first, last);
+  m_rangesAvailable.erase(first, last);
+
+  return rangesToBeProcessed;
+}
+
+void DataSource::InitSlot(unsigned int, ULong64_t) {
+}
+
+// Read the requested event with the reader of the slot and update the
+// collection pointers of all columns in use
+bool DataSource::SetEntry(unsigned int slot, ULong64_t entry) {
+  m_frames[slot] = std::make_unique<podio::Frame>(m_podioReaders[slot]->readFrame(podio::Category::Event, entry));
+
+  for (auto& collectionIndex : m_activeCollections) {
+    m_Collections[collectionIndex][slot] = m_frames[slot]->get(m_columnNames.at(collectionIndex));
+  }
+
+  return true;
+}
+
+void DataSource::FinalizeSlot(unsigned int) {
+}
+
+void DataSource::Finalize() {
+}
+
+// Register the column as being in use and return, for every slot, the address
+// of the collection pointer that SetEntry keeps up to date
+std::vector<void*> DataSource::GetColumnReadersImpl(std::string_view columnName, const std::type_info&) {
+  auto itr = std::find(m_columnNames.begin(), m_columnNames.end(), columnName);
+  if (itr == m_columnNames.end()) {
+    std::string errMsg = "podio::DataSource: Can't find requested column \"";
+    errMsg += columnName;
+    errMsg += "\"!";
+    throw std::runtime_error(errMsg);
+  }
+  const auto columnIndex = static_cast<unsigned int>(std::distance(m_columnNames.begin(), itr));
+
+  // A column requested more than once is still retrieved only once per entry
+  if (std::find(m_activeCollections.begin(), m_activeCollections.end(), columnIndex) == m_activeCollections.end()) {
+    m_activeCollections.emplace_back(columnIndex);
+  }
+
+  std::vector<void*> columnReaders(m_nSlots);
+  for (size_t slotIndex = 0; slotIndex < m_nSlots; ++slotIndex) {
+    columnReaders[slotIndex] = static_cast<void*>(&m_Collections[columnIndex][slotIndex]);
+  }
+
+  return columnReaders;
+}
+
+const std::vector<std::string>& DataSource::GetColumnNames() const {
+  return m_columnNames;
+}
+
+bool DataSource::HasColumn(std::string_view columnName) const {
+  return std::find(m_columnNames.begin(), m_columnNames.end(), columnName) != m_columnNames.end();
+}
+
+std::string DataSource::GetTypeName(std::string_view columnName) const {
+  auto itr = std::find(m_columnNames.begin(), m_columnNames.end(), columnName);
+  if (itr == m_columnNames.end()) {
+    std::string errMsg = "podio::DataSource: Type name for \"";
+    errMsg += columnName;
+    errMsg += "\" not found!";
+    throw std::runtime_error(errMsg);
+  }
+
+  auto typeIndex = std::distance(m_columnNames.begin(), itr);
+
+  return m_columnTypes.at(typeIndex);
+}
+
+ROOT::RDataFrame CreateDataFrame(const std::vector<std::string>& filePathList) {
+  ROOT::RDataFrame rdf(std::make_unique<DataSource>(filePathList));
+
+  return rdf;
+}
+
+ROOT::RDataFrame CreateDataFrame(const std::string& filePath) {
+  ROOT::RDataFrame rdf(std::make_unique<DataSource>(filePath));
+
+  return rdf;
+}
+} // namespace podio
diff --git a/src/rds_selection.xml b/src/rds_selection.xml
new file mode 100644
index 000000000..ead05ceac
--- /dev/null
+++ b/src/rds_selection.xml
@@ -0,0 +1,5 @@
+<lcgdict>
+    <selection>
+        <class name="podio::DataSource"/>
+    </selection>
+</lcgdict>
diff --git a/tests/root_io/CMakeLists.txt b/tests/root_io/CMakeLists.txt
index 7f79a000b..9d97ab916 100644
--- a/tests/root_io/CMakeLists.txt
+++ b/tests/root_io/CMakeLists.txt
@@ -19,7 +19,16 @@ if(ENABLE_RNTUPLE)
     read_interface_rntuple.cpp
   )
 endif()
+if(ENABLE_DATASOURCE)
+  set(root_dependent_tests
+    ${root_dependent_tests}
+    read_with_rdatasource_root.cpp
+  )
+endif()
 set(root_libs TestDataModelDict ExtensionDataModelDict podio::podioRootIO podio::podioIO)
+if(ENABLE_DATASOURCE)
+  list(APPEND root_libs podio::podioDataSource)
+endif()
 foreach( sourcefile ${root_dependent_tests} )
   CREATE_PODIO_TEST(${sourcefile} "${root_libs}")
 endforeach()
@@ -40,6 +49,10 @@ if(ENABLE_RNTUPLE)
   set_property(TEST read_interface_rntuple PROPERTY DEPENDS write_interface_rntuple)
 endif()
 
+if(ENABLE_DATASOURCE)
+  set_property(TEST read_with_rdatasource_root PROPERTY DEPENDS write_frame_root)
+endif()
+
 add_executable(read_frame_legacy_root read_frame_legacy_root.cpp)
 target_link_libraries(read_frame_legacy_root PRIVATE "${root_libs}")
 
diff
--git a/tests/root_io/read_with_rdatasource_root.cpp b/tests/root_io/read_with_rdatasource_root.cpp
new file mode 100644
index 000000000..29bad4319
--- /dev/null
+++ b/tests/root_io/read_with_rdatasource_root.cpp
@@ -0,0 +1,45 @@
+#include "datamodel/ExampleClusterCollection.h"
+#include "podio/DataSource.h"
+
+#include <ROOT/RDataFrame.hxx>
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+/// Collect the energies of all clusters of the collection into one RVec, so
+/// that they can be defined as an RDataFrame column.
+ROOT::VecOps::RVec<float> getEnergy(const ExampleClusterCollection& inColl) {
+  ROOT::VecOps::RVec<float> result;
+  result.reserve(inColl.size());
+
+  for (const auto& cluster : inColl) {
+    result.push_back(cluster.energy());
+  }
+
+  return result;
+}
+
+/// Read the "clusters" collection of a podio file through podio::DataSource
+/// and histogram the cluster energies. The file defaults to example_frame.root
+/// and can be overridden with a single command line argument.
+int main(int argc, const char* argv[]) {
+  std::string inputFile = "example_frame.root";
+  if (argc == 2) {
+    inputFile = argv[1];
+  } else if (argc > 2) {
+    std::cout << "Wrong number of arguments" << std::endl;
+    std::cout << "Usage: " << argv[0] << " FILE" << std::endl;
+    return 1;
+  }
+
+  auto dframe = podio::CreateDataFrame(inputFile);
+  dframe.Describe().Print();
+  std::cout << std::endl;
+
+  // Booking the histogram is lazy, the event loop runs on the first access below
+  auto clusterEnergy = dframe.Define("cluster_energy", getEnergy, {"clusters"}).Histo1D("cluster_energy");
+  clusterEnergy->Print();
+
+  return EXIT_SUCCESS;
+}