From 91114655c7da745a78485c858a135623cb42ab56 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 8 Aug 2025 13:07:04 +0200 Subject: [PATCH 1/7] [NFC] Remove trailing spaces in RInterface.hxx --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index bd662bc633802..ff48cb48c1df0 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -882,7 +882,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -988,7 +988,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1036,7 +1036,7 @@ public: /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1091,7 +1091,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1126,7 +1126,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec or RVecs containing the varied /// values for the specified columns. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time @@ -1163,7 +1163,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -2362,13 +2362,13 @@ public: /// auto myGAE2 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); /// ~~~ /// - /// `GraphAssymErrors` should also be used for the cases in which values associated only with - /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. - /// In such cases, user should do the following: + /// `GraphAssymErrors` should also be used for the cases in which values associated only with + /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. + /// In such cases, user should do the following: /// ~~~{.cpp} /// // Create a column of zeros in RDataFrame - /// auto rdf_withzeros = rdf.Define("zero", "0"); - /// // or alternatively: + /// auto rdf_withzeros = rdf.Define("zero", "0"); + /// // or alternatively: /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;}); /// // Create the graph with y errors only /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh"); From 4a5d8a6b28db6387831a46b4f357a7230ac98dd6 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 8 Aug 2025 13:07:44 +0200 Subject: [PATCH 2/7] [RDF][Docs] Move main snapshot documentation to non-template overload. Due to the deprecation marker, the snapshot documentation is parsed incorrectly in doxygen. Here, the main documentation is moved to the non-template overload, and the other overloads are referring to this one using the full argument list. "See above" typically doesn't work, since doxygen sorts the functions. --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 47 +++++++++++----------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index ff48cb48c1df0..56ae9c5f8da44 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1251,6 +1251,27 @@ public: /// \param[in] options RSnapshotOptions struct with extra options to pass to the output TFile and TTree/RNTuple. /// \return a `RDataFrame` that wraps the snapshotted dataset. /// + template + R__DEPRECATED( + 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + const ColumnNames_t &columnList, + const RSnapshotOptions &options = RSnapshotOptions()) + { + return Snapshot(treename, filename, columnList, options); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. + /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnList The list of names of the columns/branches/fields to be written. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. + /// \return a `RDataFrame` that wraps the snapshotted dataset. + /// + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. + /// /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.') /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot. /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also @@ -1306,28 +1327,6 @@ public: /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts); /// ~~~ - template - R__DEPRECATED( - 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - const ColumnNames_t &columnList, - const RSnapshotOptions &options = RSnapshotOptions()) - { - return Snapshot(treename, filename, columnList, options); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. - /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. - /// \param[in] columnList The list of names of the columns/branches/fields to be written. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. - /// - /// See above for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options = RSnapshotOptions()) @@ -1464,7 +1463,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp = "", const RSnapshotOptions &options = RSnapshotOptions()) @@ -1507,7 +1506,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::initializer_list columnList, const RSnapshotOptions &options = RSnapshotOptions()) From 20c5c852274c3137152dbf1b8a356c6fd279a088 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 13:19:14 +0200 Subject: [PATCH 3/7] [RDF] Share column readers for Defines not affected by variations. When a Define is added to a branch that gets varied, a new column reader was created for every variation and for every slot, irrespective of whether the column gets varied or not. For snapshot with variations, this column would be written for every variation, although it always evaluates to the same value. To prevent this duplication, the column readers that point to the same Define are shared after this commit. The readers are still cloned per slot though, so the Define's value caches remain thread safe. This allows SnapshotWithVariations to detect that columns are identical, so they are not written multiple times. The memory savings amount to 24 bytes per suppressed column reader,so they won't have a notable impact. --- tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx | 2 +- tree/dataframe/src/RDefineReader.cxx | 37 ++++++++++++------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx index 34192c0ad90ee..754d4cdfcb5fe 100644 --- a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx @@ -62,7 +62,7 @@ class RDefinesWithReaders { // (see BookDefineJit). it is never null. std::shared_ptr fDefine; // Column readers per variation (in the map) per slot (in the vector). - std::vector>> fReadersPerVariation; + std::vector>> fReadersPerVariation; // Strings that were already used to represent column names in this RDataFrame instance. ROOT::Internal::RDF::RStringCache &fCachedColNames; diff --git a/tree/dataframe/src/RDefineReader.cxx b/tree/dataframe/src/RDefineReader.cxx index e374219a41b2b..2126037e70248 100644 --- a/tree/dataframe/src/RDefineReader.cxx +++ b/tree/dataframe/src/RDefineReader.cxx @@ -32,19 +32,28 @@ ROOT::Internal::RDF::RDefinesWithReaders::GetReader(unsigned int slot, std::stri if (it != defineReaders.end()) return *it->second; + std::shared_ptr readerToReturn; auto *define = fDefine.get(); - if (*nameIt != "nominal") - define = &define->GetVariedDefine(std::string(variationName)); - -#if !defined(__clang__) && __GNUC__ >= 7 && __GNUC_MINOR__ >= 3 - const auto insertion = - defineReaders.insert({*nameIt, std::make_unique(slot, *define)}); - return *insertion.first->second; -#else - // gcc < 7.3 has issues with passing the non-movable std::pair temporary into the insert call - auto reader = std::make_unique(slot, *define); - auto &ret = *reader; - defineReaders[*nameIt] = std::move(reader); - return ret; -#endif + if (*nameIt == "nominal") { + readerToReturn = std::make_shared(slot, *define); + } else { + auto *variedDefine = &define->GetVariedDefine(std::string(variationName)); + if (variedDefine == define) { + // The column in not affected by variations. We can return the same reader as for nominal + if (auto nominalReaderIt = defineReaders.find("nominal"); nominalReaderIt != defineReaders.end()) { + readerToReturn = nominalReaderIt->second; + } else { + // The nominal reader doesn't exist yet + readerToReturn = std::make_shared(slot, *define); + auto nominalNameIt = fCachedColNames.Insert("nominal"); + defineReaders.insert({*nominalNameIt, readerToReturn}); + } + } else { + readerToReturn = std::make_shared(slot, *variedDefine); + } + } + + defineReaders.insert({*nameIt, readerToReturn}); + + return *readerToReturn; } From 453a817e21fea366f4de9edeb183b8c28d2c29f1 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 20 Aug 2025 11:02:51 +0200 Subject: [PATCH 4/7] [RDF] Keep RJittedDefine pointers invariant when columns don't change. In order to not write a column multiple times in a snapshot with variations, the RDefineBase pointer needs to stay invariant when a variation is generated. However, the RJittedDefine was returning the pointer to its member, so it looked like the column was affected by variations. Here, this is changed to returning "this" when no variations are in effect. --- tree/dataframe/src/RJittedDefine.cxx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/src/RJittedDefine.cxx b/tree/dataframe/src/RJittedDefine.cxx index 99c369a2fb847..0ead1a3b0ce1d 100644 --- a/tree/dataframe/src/RJittedDefine.cxx +++ b/tree/dataframe/src/RJittedDefine.cxx @@ -66,5 +66,10 @@ void RJittedDefine::MakeVariations(const std::vector &variations) RDefineBase &RJittedDefine::GetVariedDefine(const std::string &variationName) { assert(fConcreteDefine != nullptr); - return fConcreteDefine->GetVariedDefine(variationName); + + auto &variedDefine = fConcreteDefine->GetVariedDefine(variationName); + if (&variedDefine == fConcreteDefine.get()) + return *this; // Ensures that the pointer is the same across all variations + else + return variedDefine; } From 387c01b950d5dfac9329e0f1fe233aaa4fdd7adf Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Thu, 21 Aug 2025 15:54:44 +0200 Subject: [PATCH 5/7] [RDF] Forbid calling VariationsFor on snapshots. Given that snapshots with variations will be enabled via RSnapshotOptions, the snapshot overload for Vary can be removed, and a static_assert can be used to signal the failure. --- tree/dataframe/inc/ROOT/RDFHelpers.hxx | 7 ++++--- tree/dataframe/src/RDFHelpers.cxx | 15 +++------------ tree/dataframe/test/dataframe_vary.cxx | 17 ----------------- 3 files changed, 7 insertions(+), 32 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDFHelpers.hxx b/tree/dataframe/inc/ROOT/RDFHelpers.hxx index 27fbcef39a794..34ae4f129d769 100644 --- a/tree/dataframe/inc/ROOT/RDFHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDFHelpers.hxx @@ -222,6 +222,10 @@ namespace Experimental { template RResultMap VariationsFor(RResultPtr resPtr) { + using SnapshotResult_t = ROOT::RDF::RInterface; + static_assert(!std::is_same_v, + "Snapshot with variations only can be enabled via RSnapshotOptions."); + R__ASSERT(resPtr != nullptr && "Calling VariationsFor on an empty RResultPtr"); // populate parts of the computation graph for which we only have "empty shells", e.g. RJittedActions and @@ -270,9 +274,6 @@ RResultMap VariationsFor(RResultPtr resPtr) *resPtr.fLoopManager, std::move(nominalAction), std::move(variedAction)); } -using SnapshotPtr_t = ROOT::RDF::RResultPtr>; -SnapshotPtr_t VariationsFor(SnapshotPtr_t resPtr); - /// \brief Add ProgressBar to a ROOT::RDF::RNode /// \param[in] df RDataFrame node at which ProgressBar is called. /// diff --git a/tree/dataframe/src/RDFHelpers.cxx b/tree/dataframe/src/RDFHelpers.cxx index a4d8ad422c623..dc4c5d7515937 100644 --- a/tree/dataframe/src/RDFHelpers.cxx +++ b/tree/dataframe/src/RDFHelpers.cxx @@ -141,15 +141,7 @@ unsigned int ROOT::RDF::RunGraphs(std::vector handles) return uniqueLoops.size(); } -ROOT::RDF::Experimental::SnapshotPtr_t ROOT::RDF::Experimental::VariationsFor(ROOT::RDF::Experimental::SnapshotPtr_t) -{ - throw std::logic_error("Varying a Snapshot result is not implemented yet."); -} - -namespace ROOT { -namespace RDF { - -namespace Experimental { +namespace ROOT::RDF::Experimental { void ThreadsPerTH3(unsigned int N) { @@ -398,6 +390,5 @@ void AddProgressBar(ROOT::RDataFrame dataframe) auto node = ROOT::RDF::AsRNode(dataframe); ROOT::RDF::Experimental::AddProgressBar(node); } -} // namespace Experimental -} // namespace RDF -} // namespace ROOT + +} // namespace ROOT::RDF::Experimental diff --git a/tree/dataframe/test/dataframe_vary.cxx b/tree/dataframe/test/dataframe_vary.cxx index f881329c5a099..85e6cc74d2f90 100644 --- a/tree/dataframe/test/dataframe_vary.cxx +++ b/tree/dataframe/test/dataframe_vary.cxx @@ -1557,23 +1557,6 @@ TEST_P(RDFVary, VaryTake) EXPECT_EQ(sorted(rs["x:1"]), std::vector({1, 2, 3})); } -TEST_P(RDFVary, VarySnapshot) -{ - const auto fname = "dummy.root"; - auto h = ROOT::RDataFrame(10) - .Define("x", [](ULong64_t e) { return int(e); }, {"rdfentry_"}) - .Vary( - "x", [](int x) { return ROOT::RVecI{x - 1, x + 1}; }, {"x"}, 2) - .Snapshot("t", fname, {"x"}); - EXPECT_THROW( - try { VariationsFor(h); } catch (const std::logic_error &err) { - const auto msg = "Varying a Snapshot result is not implemented yet."; - EXPECT_STREQ(err.what(), msg); - throw; - }, - std::logic_error); -} - // this is a regression test, we used to read from wrong addresses in this case TEST_P(RDFVary, MoreVariedColumnsThanVariations) { From 5bd53a7962a2f0ab80cd697cc2061bdf44000ef6 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Tue, 5 Aug 2025 13:52:51 +0200 Subject: [PATCH 6/7] [RDF] Unify members of Snapshot helpers into structs. In order to implement snapshots with systematic variations, it is beneficial to place all members related to output branches into structs. At present, this only constitutes a move from member of SnapshotHelper to member of RBranchData struct. The TTree snapshot helpers are changed accordingly. --- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 48 ++- tree/dataframe/src/RDFSnapshotHelpers.cxx | 322 +++++++++++------- 2 files changed, 225 insertions(+), 145 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index e7ebca8041e95..6f938ded33f28 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -102,6 +102,34 @@ public: UntypedSnapshotRNTupleHelper MakeNew(void *newName); }; +/// Stores properties of each output branch in a Snapshot. +struct RBranchData { + std::string fInputBranchName; // This contains resolved aliases + std::string fOutputBranchName; + const std::type_info *fInputTypeID = nullptr; + TBranch *fOutputBranch = nullptr; + void *fBranchAddressForCArrays = nullptr; // Used to detect if branch addresses need to be updated + + std::unique_ptr> fEmptyInstance; + bool fIsCArray = false; + bool fIsDefine = false; + + RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, const std::type_info *typeID, + TBranch *outputBranch = nullptr) + : fInputBranchName{std::move(inputBranchName)}, + fOutputBranchName{std::move(outputBranchName)}, + fInputTypeID{typeID}, + fOutputBranch{outputBranch}, + fIsDefine(isDefine) + { + } + void ClearBranchPointers() + { + fOutputBranch = nullptr; + fBranchAddressForCArrays = nullptr; + } +}; + class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionImpl { std::string fFileName; std::string fDirName; @@ -110,17 +138,10 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionIm std::unique_ptr fOutputFile; std::unique_ptr fOutputTree; // must be a ptr because TTrees are not copy/move constructible bool fBranchAddressesNeedReset{true}; - ColumnNames_t fInputBranchNames; // This contains the resolved aliases - ColumnNames_t fOutputBranchNames; TTree *fInputTree = nullptr; // Current input tree. Set at initialization time (`InitTask`) - // TODO we might be able to unify fBranches, fBranchAddresses and fOutputBranches - std::vector fBranches; // Addresses of branches in output, non-null only for the ones holding C arrays - std::vector fBranchAddresses; // Addresses of objects associated to output branches - RBranchSet fOutputBranches; - std::vector fIsDefine; + std::vector fBranchData; // Information for all output branches ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputColumnTypeIDs; // Types for the input columns public: UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, @@ -169,11 +190,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::vector> fOutputTrees; std::vector fBranchAddressesNeedReset; // vector does not allow concurrent writing of different elements std::vector fInputTrees; // Current input trees, one per slot. Set at initialization time (`InitTask`) - // Addresses of branches in output per slot, non-null only for the ones holding C arrays - std::vector> fBranches; - // Addresses of objects associated to output branches per slot, non-null only for the ones holding C arrays - std::vector> fBranchAddresses; - std::vector fOutputBranches; // Unique set of output branches, one per slot. + std::vector> fBranchData; // Information for all output branches of each slot // Attributes of the output TTree @@ -182,16 +199,11 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::string fTreeName; TFile *fOutputFile; // Non-owning view on the output file RSnapshotOptions fOptions; - std::vector fOutputBranchNames; // Attributes related to the computation graph ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputBranchNames; // This contains the resolved aliases - std::vector fInputColumnTypeIDs; // Types for the input columns - - std::vector fIsDefine; public: UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 94c254f4d7078..fc2bf7eb7dab2 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -34,8 +34,45 @@ #include #include +#include +#include +#include + +using ROOT::Internal::RDF::RBranchData; + namespace { +void AssertNoNullBranchAddresses(std::vector const &branches) +{ + std::vector branchesWithNullAddress; + for (auto const &branchData : branches) { + if (branchData.fOutputBranch->GetAddress() == nullptr) + branchesWithNullAddress.push_back(branchData.fOutputBranch); + } + + if (branchesWithNullAddress.empty()) + return; + + // otherwise build error message and throw + std::vector missingBranchNames; + std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), + std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); + std::string msg = "RDataFrame::Snapshot:"; + if (missingBranchNames.size() == 1) { + msg += " branch " + missingBranchNames[0] + + " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " + "it is"; + } else { + msg += " branches "; + for (const auto &bName : missingBranchNames) + msg += bName + ", "; + msg.resize(msg.size() - 2); // remove last ", " + msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; + } + msg += " not part of the set of branches that are being written out."; + throw std::runtime_error(msg); +} + TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) { if (inputTree) { @@ -49,35 +86,51 @@ TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) return nullptr; } -void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, TBranch *inputBranch, - const std::string &outputBranchName, int basketSize, void *address) +std::vector::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector &outputBranches, + std::vector::iterator thisBranch, + TBranch *inputBranch, int basketSize, void *address) { if (!inputBranch) - return; + return thisBranch; const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName()); if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec) - return; + return thisBranch; // must construct the leaflist for the output branch and create the branch in the output tree const auto *leaf = static_cast(inputBranch->GetListOfLeaves()->UncheckedAt(0)); if (!leaf) - return; + return thisBranch; const auto bname = leaf->GetName(); auto *sizeLeaf = leaf->GetLeafCount(); const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic()); // We proceed only if branch is a fixed-or-variable-sized array if (sizeLeaf || leaf->GetLenStatic() > 1) { - if (sizeLeaf && !outputBranches.Get(sizeLeafName)) { - // The output array branch `bname` has dynamic size stored in leaf `sizeLeafName`, but that leaf has not been - // added to the output tree yet. However, the size leaf has to be available for the creation of the array - // branch to be successful. So we create the size leaf here. - const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); - // Use Original basket size for Existing Branches otherwise use Custom basket Size. - const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); - // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for `sizeLeafName` - auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), - (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); - outputBranches.Insert(sizeLeafName, outputBranch); + if (sizeLeaf) { + // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's + // in the output tree. + auto sizeLeafIt = + std::find_if(outputBranches.begin(), outputBranches.end(), + [&sizeLeafName](RBranchData const &bd) { return bd.fOutputBranchName == sizeLeafName; }); + if (sizeLeafIt == outputBranches.end()) { + // The size leaf is not part of the output branches yet, so emplace an empty slot for it. + // This means that iterators need to be updated in case the container reallocates. + const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch); + outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr, + /*outputBranch=*/nullptr); + thisBranch = outputBranches.begin() + indexBeforeEmplace; + sizeLeafIt = outputBranches.end() - 1; + } + if (!sizeLeafIt->fOutputBranch) { + // The size leaf was emplaced, but not initialised yet + const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); + // Use Original basket size for Existing Branches otherwise use Custom basket Size. + const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); + // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for + // `sizeLeafName` + auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), + (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); + sizeLeafIt->fOutputBranch = outputBranch; + } } const auto btype = leaf->GetTypeName(); @@ -87,7 +140,7 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The " "leaf is of type '%s'. This column will not be written out.", bname, btype); - return; + return thisBranch; } const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype; @@ -102,23 +155,26 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet } return nullptr; }(); - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); - outputBranch->SetTitle(inputBranch->GetTitle()); - outputBranches.Insert(outputBranchName, outputBranch, true); + thisBranch->fOutputBranch = + outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); + thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle()); + thisBranch->fIsCArray = true; } + + return thisBranch; } -void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&outputBranchAddress, bool isCArray, - void *valueAddress) +void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress) { const static TClassRef TBOClRef("TBranchObject"); if (inputBranch && inputBranch->IsA() == TBOClRef) { - outputBranch.SetAddress(reinterpret_cast(inputBranch->GetAddress())); - } else if (outputBranch.IsA() != TBranch::Class()) { - outputBranchAddress = valueAddress; - outputBranch.SetAddress(&outputBranchAddress); + branchData.fOutputBranch->SetAddress(reinterpret_cast(inputBranch->GetAddress())); + } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) { + // This is a relatively rare case of a fixed-size array getting redefined + branchData.fBranchAddressForCArrays = valueAddress; + branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays); } else { - void *correctAddress = [valueAddress, isCArray]() -> void * { + void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * { if (isCArray) { // Address here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer @@ -127,29 +183,26 @@ void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&output } return valueAddress; }(); - outputBranch.SetAddress(correctAddress); - outputBranchAddress = valueAddress; + branchData.fOutputBranch->SetAddress(correctAddress); + branchData.fBranchAddressForCArrays = valueAddress; } } -void CreateFundamentalTypeBranch(TTree &outputTree, const std::string &outputBranchName, void *valueAddress, - const std::type_info &valueTypeID, ROOT::Internal::RDF::RBranchSet &outputBranches, - int bufSize) +void CreateFundamentalTypeBranch(TTree &outputTree, RBranchData &bd, void *valueAddress, int bufSize) { // Logic taken from // TTree::BranchImpRef( // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel) - auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(valueTypeID); + auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(*bd.fInputTypeID); if (rootTypeChar == ' ') { Warning("Snapshot", "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This " "column will not be written out.", - outputBranchName.c_str()); + bd.fOutputBranchName.c_str()); return; } - std::string leafList{outputBranchName + '/' + rootTypeChar}; - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar}; + bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); } /// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the @@ -238,13 +291,18 @@ void EnsureValidSnapshotRNTupleOutput(const ROOT::RDF::RSnapshotOptions &opts, c } } -void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, - int basketSize, const std::string &inputBranchName, const std::string &outputBranchName, - const std::type_info &valueTypeID, void *valueAddress, TBranch *&actionHelperBranchPtr, - void *&actionHelperBranchPtrAddress, bool isDefine) +void SetBranchesHelper(TTree *inputTree, TTree &outputTree, + std::vector &allBranchData, std::size_t currentIndex, + int basketSize, void *valueAddress) { + auto branchData = allBranchData.begin() + currentIndex; + auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName); - auto *inputBranch = isDefine ? nullptr : SearchForBranch(inputTree, inputBranchName); + if (branchData->fOutputBranch && valueAddress) { + // The output branch was already created, we just need to (re)set its address + SetBranchAddress(inputBranch, *branchData, valueAddress); + return; + } // Respect the original bufsize and splitlevel arguments // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid @@ -254,55 +312,47 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF: const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000); const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99; - if (auto *outputBranch = outputBranches.Get(outputBranchName); outputBranch && valueAddress) { - // The output branch was already created, we just need to (re)set its address - SetBranchAddress(inputBranch, *outputBranch, actionHelperBranchPtrAddress, - outputBranches.IsCArray(outputBranchName), valueAddress); - return; - } - - auto *dictionary = TDictionary::GetDictionary(valueTypeID); + auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID); if (dynamic_cast(dictionary)) { // Branch of fundamental type - CreateFundamentalTypeBranch(outputTree, outputBranchName, valueAddress, valueTypeID, outputBranches, bufSize); + CreateFundamentalTypeBranch(outputTree, *branchData, valueAddress, bufSize); return; } - if (!isDefine) { + if (!branchData->fIsDefine) { // Cases where we need a leaflist (e.g. C-style arrays) // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec). - CreateCStyleArrayBranch(outputTree, outputBranches, inputBranch, outputBranchName, bufSize, valueAddress); + branchData = CreateCStyleArrayBranch(outputTree, allBranchData, branchData, inputBranch, bufSize, valueAddress); } - if (auto *arrayBranch = outputBranches.Get(outputBranchName)) { + if (branchData->fOutputBranch) { // A branch was created in the previous function call - actionHelperBranchPtr = arrayBranch; if (valueAddress) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(valueAddress); - actionHelperBranchPtrAddress = rawRVec->data(); + branchData->fBranchAddressForCArrays = rawRVec->data(); } return; } if (auto *classPtr = dynamic_cast(dictionary)) { - TBranch *outputBranch{}; // Case of unsplit object with polymorphic type if (inputBranch && dynamic_cast(inputBranch) && valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImp(outputTree, outputBranchName.c_str(), classPtr, - inputBranch->GetAddress(), bufSize, splitLevel); + branchData->fOutputBranch = + ROOT::Internal::TreeUtils::CallBranchImp(outputTree, branchData->fOutputBranchName.c_str(), classPtr, + inputBranch->GetAddress(), bufSize, splitLevel); // General case, with valid address else if (valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef(outputTree, outputBranchName.c_str(), classPtr, - TDataType::GetType(valueTypeID), valueAddress, - bufSize, splitLevel); + branchData->fOutputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef( + outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID), + valueAddress, bufSize, splitLevel); // No value was passed, we're just creating a hollow branch to populate the dataset schema else - outputBranch = outputTree.Branch(outputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + branchData->fOutputBranch = + outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); return; } @@ -388,16 +438,16 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( fDirName(dirname), fTreeName(treename), fOptions(options), - fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), - fBranches(vbnames.size(), nullptr), - fBranchAddresses(vbnames.size(), nullptr), - fIsDefine(std::move(isDefine)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputColumnTypeIDs(colTypeIDs) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]); + } } // Define special member methods here where the definition of all the data member types is available @@ -449,17 +499,16 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches.size()); + assert(values.size() <= fBranchData.size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[i] && fOutputBranches.IsCArray(fOutputBranchNames[i])) { + if (fBranchData[i].fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[i] != data) { - // reset the branch address - fBranches[i]->SetAddress(data); - fBranchAddresses[i] = data; + if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) { + fBranchData[i].fOutputBranch->SetAddress(data); + fBranchData[i].fBranchAddressForCArrays = data; } } } @@ -468,26 +517,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetBranches(const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTree, *fOutputTree, fOutputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], fBranches[i], fBranchAddresses[i], - fIsDefine[i]); + assert(fBranchData.size() == values.size()); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches.AssertNoNullBranchAddresses(); + AssertNoNullBranchAddresses(fBranchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -551,16 +592,29 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelper{finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + inputColumnTypeIDs}; } ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( @@ -573,21 +627,25 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( fOutputTrees(fNSlots), fBranchAddressesNeedReset(fNSlots, 1), fInputTrees(fNSlots), - fBranches(fNSlots, std::vector(vbnames.size(), nullptr)), - fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), - fOutputBranches(fNSlots), fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputBranchNames(vbnames), - fInputColumnTypeIDs(colTypeIDs), - fIsDefine(std::move(isDefine)) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(fNSlots); + for (unsigned int slot = 0; slot < fNSlots; ++slot) { + fBranchData.emplace_back(); + auto &thisSlot = fBranchData.back(); + thisSlot.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]); + } + } } // Define special member methods here where the definition of all the data member types is available @@ -647,9 +705,10 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::FinalizeTask(unsigned in { if (fOutputTrees[slot]->GetEntries() > 0) fOutputFiles[slot]->Write(); + for (auto &branchData : fBranchData[slot]) + branchData.ClearBranchPointers(); // Pointers might go to an old tree, so they are stale now // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones) fOutputTrees[slot].reset(nullptr); - fOutputBranches[slot].Clear(); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector &values) @@ -674,17 +733,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UpdateCArraysPtrs(unsign // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches[slot].size()); + assert(values.size() <= fBranchData[slot].size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[slot][i] && fOutputBranches[slot].IsCArray(fOutputBranchNames[i])) { + auto &branchData = fBranchData[slot][i]; + if (branchData.fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[slot][i] != data) { + if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) { // reset the branch address - fBranches[slot][i]->SetAddress(data); - fBranchAddresses[slot][i] = data; + branchData.fOutputBranch->SetAddress(data); + branchData.fBranchAddressForCArrays = data; } } } @@ -694,26 +754,21 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetBranches(unsigned int const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], fOutputBranches[slot], fOptions.fBasketSize, - fInputBranchNames[i], fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], - fBranches[slot][i], fBranchAddresses[slot][i], fIsDefine[i]); + auto &branchData = fBranchData[slot]; + assert(branchData.size() == values.size()); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches[slot].AssertNoNullBranchAddresses(); + + AssertNoNullBranchAddresses(branchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + auto &branchData = fBranchData.front(); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, branchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -785,17 +840,30 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData.front()) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT{fNSlots, finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + std::move(inputColumnTypeIDs)}; } ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( From aed8ece889dc720065d8f630a306cb6041df411d Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 6 Aug 2025 11:53:09 +0200 Subject: [PATCH 7/7] [RDF] Remove RBranchSet from SnapshotHelpers. It is superseded by RBranchData. --- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 13 ---- tree/dataframe/src/RDFSnapshotHelpers.cxx | 67 ------------------- 2 files changed, 80 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 6f938ded33f28..69ad0634af14e 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -38,19 +38,6 @@ class TBufferMergerFile; namespace ROOT::Internal::RDF { -class RBranchSet { - std::vector fBranches; - std::vector fNames; - std::vector fIsCArray; - -public: - TBranch *Get(const std::string &name) const; - bool IsCArray(const std::string &name) const; - void Insert(const std::string &name, TBranch *address, bool isCArray = false); - void Clear(); - void AssertNoNullBranchAddresses(); -}; - class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RActionImpl { std::string fFileName; std::string fDirName; diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index fc2bf7eb7dab2..c874f8e9c3c5d 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -362,73 +362,6 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, } } // namespace -TBranch *ROOT::Internal::RDF::RBranchSet::Get(const std::string &name) const -{ - auto it = std::find(fNames.begin(), fNames.end(), name); - if (it == fNames.end()) - return nullptr; - return fBranches[std::distance(fNames.begin(), it)]; -} - -bool ROOT::Internal::RDF::RBranchSet::IsCArray(const std::string &name) const -{ - if (auto it = std::find(fNames.begin(), fNames.end(), name); it != fNames.end()) - return fIsCArray[std::distance(fNames.begin(), it)]; - return false; -} - -void ROOT::Internal::RDF::RBranchSet::Insert(const std::string &name, TBranch *address, bool isCArray) -{ - if (address == nullptr) { - throw std::logic_error("Trying to insert a null branch address."); - } - if (std::find(fBranches.begin(), fBranches.end(), address) != fBranches.end()) { - throw std::logic_error("Trying to insert a branch address that's already present."); - } - if (std::find(fNames.begin(), fNames.end(), name) != fNames.end()) { - throw std::logic_error("Trying to insert a branch name that's already present."); - } - fNames.emplace_back(name); - fBranches.emplace_back(address); - fIsCArray.push_back(isCArray); -} - -void ROOT::Internal::RDF::RBranchSet::Clear() -{ - fBranches.clear(); - fNames.clear(); - fIsCArray.clear(); -} - -void ROOT::Internal::RDF::RBranchSet::AssertNoNullBranchAddresses() -{ - std::vector branchesWithNullAddress; - std::copy_if(fBranches.begin(), fBranches.end(), std::back_inserter(branchesWithNullAddress), - [](TBranch *b) { return b->GetAddress() == nullptr; }); - - if (branchesWithNullAddress.empty()) - return; - - // otherwise build error message and throw - std::vector missingBranchNames; - std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), - std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); - std::string msg = "RDataFrame::Snapshot:"; - if (missingBranchNames.size() == 1) { - msg += " branch " + missingBranchNames[0] + - " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " - "it is"; - } else { - msg += " branches "; - for (const auto &bName : missingBranchNames) - msg += bName + ", "; - msg.resize(msg.size() - 2); // remove last ", " - msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; - } - msg += " not part of the set of branches that are being written out."; - throw std::runtime_error(msg); -} - ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine,