[ntuple] Fixes lookup & searching in the descriptor #17004
base: master
Changes from all commits:

- beb5dc7
- 0c0ae40
- bf99fa1
- 31f1d94
- 0f6cc50
- f3c9d40
- 5bd2205
- d5eae81
```diff
@@ -179,21 +179,32 @@ ROOT::Experimental::RColumnDescriptor ROOT::Experimental::RColumnDescriptor::Clo
 ROOT::Experimental::RClusterDescriptor::RPageRange::RPageInfoExtended
 ROOT::Experimental::RClusterDescriptor::RPageRange::Find(ClusterSize_t::ValueType idxInCluster) const
 {
-   // TODO(jblomer): binary search
-   RPageInfo pageInfo;
-   decltype(idxInCluster) firstInPage = 0;
-   NTupleSize_t pageNo = 0;
-   for (const auto &pi : fPageInfos) {
-      if (firstInPage + pi.fNElements > idxInCluster) {
-         pageInfo = pi;
-         break;
+   const auto N = fCumulativeNElements.size();
+   R__ASSERT(N > 0);
+   R__ASSERT(N == fPageInfos.size());
+
+   std::size_t left = 0;
+   std::size_t right = N - 1;
+   std::size_t midpoint = N;
+   while (left <= right) {
+      midpoint = (left + right) / 2;
+      if (fCumulativeNElements[midpoint] <= idxInCluster) {
+         left = midpoint + 1;
+         continue;
       }
-      firstInPage += pi.fNElements;
-      ++pageNo;
+
+      if ((midpoint == 0) || (fCumulativeNElements[midpoint - 1] <= idxInCluster))
+         break;
+
+      right = midpoint - 1;
    }
+   R__ASSERT(midpoint < N);
+
+   auto pageInfo = fPageInfos[midpoint];
+   decltype(idxInCluster) firstInPage = (midpoint == 0) ? 0 : fCumulativeNElements[midpoint - 1];
    R__ASSERT(firstInPage <= idxInCluster);
    R__ASSERT((firstInPage + pageInfo.fNElements) > idxInCluster);
-   return RPageInfoExtended{pageInfo, firstInPage, pageNo};
+   return RPageInfoExtended{pageInfo, firstInPage, midpoint};
 }

 std::size_t
```
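The hand-rolled loop above relies on the invariant that `fCumulativeNElements[i]` holds the total number of elements in pages `0..i`, so the page containing `idxInCluster` is the first page whose cumulative count exceeds the index. Below is a minimal standalone sketch of the same lookup expressed with `std::upper_bound` over a prefix-sum vector; the `PageLookup` type and the page sizes are illustrative stand-ins, not part of the PR:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for RPageRange: pages holding 3, 5 and 2 elements
// yield the prefix-sum vector {3, 8, 10}.
struct PageLookup {
   std::vector<std::uint64_t> fCumulativeNElements;

   // Returns the first page whose cumulative element count exceeds idx,
   // i.e. the page that contains element idx.
   std::size_t FindPageOf(std::uint64_t idx) const
   {
      auto it = std::upper_bound(fCumulativeNElements.begin(), fCumulativeNElements.end(), idx);
      assert(it != fCumulativeNElements.end()); // idx must be within the page range
      return static_cast<std::size_t>(it - fCumulativeNElements.begin());
   }
};

int main()
{
   PageLookup lookup{{3, 8, 10}};
   assert(lookup.FindPageOf(0) == 0); // elements 0..2 -> page 0
   assert(lookup.FindPageOf(3) == 1); // elements 3..7 -> page 1
   assert(lookup.FindPageOf(9) == 2); // elements 8..9 -> page 2
   return 0;
}
```

The first element of the found page is then the previous cumulative value (or 0 for page 0), which is exactly how the patch computes `firstInPage` from `fCumulativeNElements[midpoint - 1]`.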
```diff
@@ -372,42 +383,129 @@ ROOT::Experimental::RNTupleDescriptor::FindPhysicalColumnId(DescriptorId_t field
 ROOT::Experimental::DescriptorId_t
 ROOT::Experimental::RNTupleDescriptor::FindClusterId(DescriptorId_t physicalColumnId, NTupleSize_t index) const
 {
-   // TODO(jblomer): binary search?
-   for (const auto &cd : fClusterDescriptors) {
-      if (!cd.second.ContainsColumn(physicalColumnId))
-         continue;
-      auto columnRange = cd.second.GetColumnRange(physicalColumnId);
-      if (columnRange.Contains(index))
-         return cd.second.GetId();
-   }
-   return kInvalidDescriptorId;
+   if (GetNClusterGroups() == 0)
+      return kInvalidDescriptorId;
+
+   // Binary search in the cluster group list, followed by a binary search in the clusters of that cluster group
+
+   std::size_t cgLeft = 0;
+   std::size_t cgRight = GetNClusterGroups() - 1;
+   while (cgLeft <= cgRight) {
+      const std::size_t cgMidpoint = (cgLeft + cgRight) / 2;
+      const auto &clusterIds = GetClusterGroupDescriptor(fSortedClusterGroupIds[cgMidpoint]).GetClusterIds();
+      R__ASSERT(!clusterIds.empty());
+
+      const auto firstElementInGroup =
+         GetClusterDescriptor(clusterIds.front()).GetColumnRange(physicalColumnId).fFirstElementIndex;
+      if (firstElementInGroup > index) {
+         // Look into the lower half of cluster groups
+         R__ASSERT(cgMidpoint > 0);
+         cgRight = cgMidpoint - 1;
+         continue;
+      }
+
+      const auto &lastColumnRange = GetClusterDescriptor(clusterIds.back()).GetColumnRange(physicalColumnId);
+      if ((lastColumnRange.fFirstElementIndex + lastColumnRange.fNElements) <= index) {
+         // Look into the upper half of cluster groups
+         cgLeft = cgMidpoint + 1;
+         continue;
+      }
+
+      // Binary search in the current cluster group; since we already checked the element range boundaries,
+      // the element must be in that cluster group.
+      std::size_t clusterLeft = 0;
+      std::size_t clusterRight = clusterIds.size() - 1;
+      while (clusterLeft <= clusterRight) {
+         const std::size_t clusterMidpoint = (clusterLeft + clusterRight) / 2;
+         const auto clusterId = clusterIds[clusterMidpoint];
+         const auto &columnRange = GetClusterDescriptor(clusterId).GetColumnRange(physicalColumnId);
+
+         if (columnRange.Contains(index))
+            return clusterId;
+
+         if (columnRange.fFirstElementIndex > index) {
+            R__ASSERT(clusterMidpoint > 0);
+            clusterRight = clusterMidpoint - 1;
+            continue;
+         }
+
+         if (columnRange.fFirstElementIndex + columnRange.fNElements <= index) {
+            clusterLeft = clusterMidpoint + 1;
+            continue;
+         }
+      }
+      R__ASSERT(false);
+   }
+   return kInvalidDescriptorId;
 }

+ROOT::Experimental::DescriptorId_t ROOT::Experimental::RNTupleDescriptor::FindClusterId(NTupleSize_t entryIdx) const
+{
+   if (GetNClusterGroups() == 0)
+      return kInvalidDescriptorId;
+
+   // Binary search in the cluster group list, followed by a binary search in the clusters of that cluster group
+
+   std::size_t cgLeft = 0;
+   std::size_t cgRight = GetNClusterGroups() - 1;
+   while (cgLeft <= cgRight) {
+      const std::size_t cgMidpoint = (cgLeft + cgRight) / 2;
+      const auto &cgDesc = GetClusterGroupDescriptor(fSortedClusterGroupIds[cgMidpoint]);
+
+      if (cgDesc.GetMinEntry() > entryIdx) {
+         R__ASSERT(cgMidpoint > 0);
+         cgRight = cgMidpoint - 1;
+         continue;
+      }
+
+      if (cgDesc.GetMinEntry() + cgDesc.GetEntrySpan() <= entryIdx) {
+         cgLeft = cgMidpoint + 1;
+         continue;
+      }
+
+      // Binary search in the current cluster group; since we already checked the element range boundaries,
+      // the element must be in that cluster group.
+      const auto &clusterIds = cgDesc.GetClusterIds();
+      R__ASSERT(!clusterIds.empty());
+      std::size_t clusterLeft = 0;
+      std::size_t clusterRight = clusterIds.size() - 1;
+      while (clusterLeft <= clusterRight) {
+         const std::size_t clusterMidpoint = (clusterLeft + clusterRight) / 2;
+         const auto &clusterDesc = GetClusterDescriptor(clusterIds[clusterMidpoint]);
+
+         if (clusterDesc.GetFirstEntryIndex() > entryIdx) {
+            R__ASSERT(clusterMidpoint > 0);
+            clusterRight = clusterMidpoint - 1;
+            continue;
+         }
+
+         if (clusterDesc.GetFirstEntryIndex() + clusterDesc.GetNEntries() <= entryIdx) {
+            clusterLeft = clusterMidpoint + 1;
+            continue;
+         }
+
+         return clusterIds[clusterMidpoint];
+      }
+      R__ASSERT(false);
+   }
+   return kInvalidDescriptorId;
+}
+
 // TODO(jblomer): fix for cases of sharded clasters
 ROOT::Experimental::DescriptorId_t
 ROOT::Experimental::RNTupleDescriptor::FindNextClusterId(DescriptorId_t clusterId) const
 {
    const auto &clusterDesc = GetClusterDescriptor(clusterId);
-   auto firstEntryInNextCluster = clusterDesc.GetFirstEntryIndex() + clusterDesc.GetNEntries();
-   // TODO(jblomer): binary search?
-   for (const auto &cd : fClusterDescriptors) {
-      if (cd.second.GetFirstEntryIndex() == firstEntryInNextCluster)
-         return cd.second.GetId();
-   }
-   return kInvalidDescriptorId;
+   const auto firstEntryInNextCluster = clusterDesc.GetFirstEntryIndex() + clusterDesc.GetNEntries();
+   return FindClusterId(firstEntryInNextCluster);
 }
```
**Review comment** on lines +498 to +499:

> Is it worth shortcutting the common case here and checking if …

**Reply:**

> That's a good question. The "problem" is that I think that shortcut will currently always trigger. However, I also don't want to rely on descriptor ID ordering... I need to think about it.
```diff
 // TODO(jblomer): fix for cases of sharded clasters
 ROOT::Experimental::DescriptorId_t
 ROOT::Experimental::RNTupleDescriptor::FindPrevClusterId(DescriptorId_t clusterId) const
 {
    const auto &clusterDesc = GetClusterDescriptor(clusterId);
-   // TODO(jblomer): binary search?
-   for (const auto &cd : fClusterDescriptors) {
-      if (cd.second.GetFirstEntryIndex() + cd.second.GetNEntries() == clusterDesc.GetFirstEntryIndex())
-         return cd.second.GetId();
-   }
-   return kInvalidDescriptorId;
+   if (clusterDesc.GetFirstEntryIndex() == 0)
+      return kInvalidDescriptorId;
+   return FindClusterId(clusterDesc.GetFirstEntryIndex() - 1);
 }

 std::vector<ROOT::Experimental::DescriptorId_t>
```
**Review comment:**

> Same question here.
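Both `FindClusterId` overloads follow the same two-level pattern: an outer binary search over cluster groups sorted by first entry, then an inner binary search over the clusters of the matching group. The following self-contained sketch shows that pattern on plain structs; all type and field names are illustrative, and unlike the PR's closed-interval loops (which assert on a miss inside a group), this half-open variant reports a miss, e.g. for a gap left by sharded clusters:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical flattened model of the descriptor: clusters cover entry
// ranges and are grouped into cluster groups sorted by first entry.
struct Cluster {
   std::uint64_t fFirstEntry;
   std::uint64_t fNEntries;
};
struct ClusterGroup {
   std::uint64_t fMinEntry;
   std::uint64_t fEntrySpan;
   std::vector<Cluster> fClusters; // sorted by fFirstEntry
};

// Two-level lookup: binary search over groups, then over the clusters of the
// matching group. Returns (groupIdx, clusterIdx), or (-1, -1) on a miss.
std::pair<long, long> FindCluster(const std::vector<ClusterGroup> &groups, std::uint64_t entryIdx)
{
   std::size_t left = 0, right = groups.size();
   while (left < right) {
      const std::size_t mid = (left + right) / 2;
      const auto &g = groups[mid];
      if (g.fMinEntry > entryIdx) {
         right = mid; // look into the lower half of groups
      } else if (g.fMinEntry + g.fEntrySpan <= entryIdx) {
         left = mid + 1; // look into the upper half of groups
      } else {
         // entryIdx falls into this group's span; search its clusters
         std::size_t cl = 0, cr = g.fClusters.size();
         while (cl < cr) {
            const std::size_t cmid = (cl + cr) / 2;
            const auto &c = g.fClusters[cmid];
            if (c.fFirstEntry > entryIdx)
               cr = cmid;
            else if (c.fFirstEntry + c.fNEntries <= entryIdx)
               cl = cmid + 1;
            else
               return {static_cast<long>(mid), static_cast<long>(cmid)};
         }
         return {-1, -1}; // gap inside the group
      }
   }
   return {-1, -1}; // entryIdx beyond all groups
}

int main()
{
   std::vector<ClusterGroup> groups = {
      {0, 100, {{0, 50}, {50, 50}}},
      {100, 40, {{100, 40}}},
   };
   assert(FindCluster(groups, 75) == std::make_pair(0L, 1L));
   assert(FindCluster(groups, 120) == std::make_pair(1L, 0L));
   assert(FindCluster(groups, 140) == std::make_pair(-1L, -1L));
   return 0;
}
```

The outer search only touches two clusters per probed group (front and back) in the column-based overload, so the cost stays logarithmic in the number of groups plus logarithmic in the clusters of one group.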
```diff
@@ -489,8 +587,13 @@ ROOT::Experimental::RNTupleDescriptor::AddClusterGroupDetails(DescriptorId_t clu
          return R__FAIL("invalid attempt to re-populate existing cluster");
       }
    }
+   std::sort(clusterIds.begin(), clusterIds.end(),
+             [this](DescriptorId_t a, DescriptorId_t b)
+             {
+                return fClusterDescriptors[a].GetFirstEntryIndex() < fClusterDescriptors[b].GetFirstEntryIndex();
+             });
    auto cgBuilder = Internal::RClusterGroupDescriptorBuilder::FromSummary(iter->second);
-   cgBuilder.AddClusters(clusterIds);
+   cgBuilder.AddSortedClusters(clusterIds);
    iter->second = cgBuilder.MoveDescriptor().Unwrap();
    return RResult<void>::Success();
 }
```
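`AddClusterGroupDetails` now sorts the incoming cluster IDs by the first entry index of the clusters they refer to before attaching them, which establishes the ordering the per-group binary search depends on. A stripped-down sketch of that sort-by-looked-up-attribute comparator, with a hypothetical ID-to-descriptor map standing in for `fClusterDescriptors`:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

using DescriptorId_t = std::uint64_t;

// Hypothetical cluster record keyed by descriptor ID.
struct ClusterDesc {
   std::uint64_t fFirstEntryIndex;
};

int main()
{
   std::unordered_map<DescriptorId_t, ClusterDesc> clusters = {{7, {200}}, {2, {0}}, {5, {100}}};
   std::vector<DescriptorId_t> clusterIds = {7, 2, 5};

   // Sort the IDs by the first entry index of the cluster each one refers to,
   // mirroring the comparator added in AddClusterGroupDetails.
   std::sort(clusterIds.begin(), clusterIds.end(), [&](DescriptorId_t a, DescriptorId_t b) {
      return clusters[a].fFirstEntryIndex < clusters[b].fFirstEntryIndex;
   });

   assert((clusterIds == std::vector<DescriptorId_t>{2, 5, 7}));
   return 0;
}
```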
```diff
@@ -554,6 +657,7 @@ std::unique_ptr<ROOT::Experimental::RNTupleDescriptor> ROOT::Experimental::RNTup
       clone->fColumnDescriptors.emplace(d.first, d.second.Clone());
    for (const auto &d : fClusterGroupDescriptors)
       clone->fClusterGroupDescriptors.emplace(d.first, d.second.Clone());
+   clone->fSortedClusterGroupIds = fSortedClusterGroupIds;
    for (const auto &d : fClusterDescriptors)
       clone->fClusterDescriptors.emplace(d.first, d.second.Clone());
    for (const auto &d : fExtraTypeInfoDescriptors)
```
```diff
@@ -731,10 +835,17 @@ ROOT::Experimental::Internal::RClusterDescriptorBuilder::MoveDescriptor()
       return R__FAIL("unset cluster ID");
    if (fCluster.fNEntries == 0)
       return R__FAIL("empty cluster");
-   for (const auto &pr : fCluster.fPageRanges) {
+   for (auto &pr : fCluster.fPageRanges) {
       if (fCluster.fColumnRanges.count(pr.first) == 0) {
          return R__FAIL("missing column range");
       }
+      pr.second.fCumulativeNElements.clear();
+      pr.second.fCumulativeNElements.reserve(pr.second.fPageInfos.size());
+      NTupleSize_t sum = 0;
+      for (const auto &pi : pr.second.fPageInfos) {
+         sum += pi.fNElements;
+         pr.second.fCumulativeNElements.emplace_back(sum);
+      }
    }
    RClusterDescriptor result;
    std::swap(result, fCluster);
```
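The running-sum loop that fills `fCumulativeNElements` is an inclusive prefix scan over the page sizes. For illustration, here is the same computation via `std::transform_inclusive_scan` (C++17), using a hypothetical `PageInfo` stand-in rather than the PR's types:

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Hypothetical stand-in for RPageInfo.
struct PageInfo {
   std::uint32_t fNElements;
};

int main()
{
   const std::vector<PageInfo> pageInfos = {{3}, {5}, {2}};

   // Inclusive scan over the page sizes: cumulative[i] is the total element
   // count of pages 0..i, matching the running-sum loop in MoveDescriptor.
   std::vector<std::uint64_t> cumulative(pageInfos.size());
   std::transform_inclusive_scan(pageInfos.begin(), pageInfos.end(), cumulative.begin(),
                                 std::plus<std::uint64_t>{},
                                 [](const PageInfo &pi) { return std::uint64_t{pi.fNElements}; });

   assert((cumulative == std::vector<std::uint64_t>{3, 8, 10}));
   return 0;
}
```

Building the prefix sums once in `MoveDescriptor` keeps `RPageRange::Find` read-only, so lookups need no synchronization and run in O(log N) per call.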
```diff
@@ -827,6 +938,15 @@ ROOT::Experimental::RResult<void> ROOT::Experimental::Internal::RNTupleDescripto
 ROOT::Experimental::RNTupleDescriptor ROOT::Experimental::Internal::RNTupleDescriptorBuilder::MoveDescriptor()
 {
    EnsureValidDescriptor().ThrowOnError();
+   fDescriptor.fSortedClusterGroupIds.reserve(fDescriptor.fClusterGroupDescriptors.size());
+   for (const auto &[id, _] : fDescriptor.fClusterGroupDescriptors)
+      fDescriptor.fSortedClusterGroupIds.emplace_back(id);
+   std::sort(fDescriptor.fSortedClusterGroupIds.begin(), fDescriptor.fSortedClusterGroupIds.end(),
+             [this](DescriptorId_t a, DescriptorId_t b)
+             {
+                return fDescriptor.fClusterGroupDescriptors[a].GetMinEntry() <
+                       fDescriptor.fClusterGroupDescriptors[b].GetMinEntry();
+             });
    RNTupleDescriptor result;
    std::swap(result, fDescriptor);
    return result;
```
**Review comment:**

> Unrelated (for now): this requires deserializing all page lists to populate all cluster group descriptors. In the future, we may first want to search the loaded cluster groups, under the assumption that by loading the (global) entry first, we already have the necessary information...

**Reply:**

> Indeed. When we implement partial loading of page lists, we will need to modify this, e.g. to first look into the available page lists and then load the remaining ones.
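For reference, the index built in `MoveDescriptor` above follows a common pattern: because unordered-map iteration order is unspecified, the keys are first collected into a vector and then sorted by an attribute of the mapped values. A minimal sketch with hypothetical types standing in for the cluster group descriptors:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

using DescriptorId_t = std::uint64_t;

// Hypothetical stand-in for RClusterGroupDescriptor.
struct ClusterGroupDesc {
   std::uint64_t fMinEntry; // stand-in for GetMinEntry()
};

int main()
{
   std::unordered_map<DescriptorId_t, ClusterGroupDesc> groups = {{3, {500}}, {1, {0}}, {9, {250}}};

   // Unordered-map iteration order is unspecified, so an explicit index is
   // needed: collect the keys, then sort them by the groups' first entries.
   std::vector<DescriptorId_t> sortedIds;
   sortedIds.reserve(groups.size());
   for (const auto &[id, _] : groups)
      sortedIds.emplace_back(id);
   std::sort(sortedIds.begin(), sortedIds.end(),
             [&](DescriptorId_t a, DescriptorId_t b) { return groups[a].fMinEntry < groups[b].fMinEntry; });

   assert((sortedIds == std::vector<DescriptorId_t>{1, 9, 3}));
   return 0;
}
```

Note that the descriptor's `Clone` (see the hunk at line 657 above) must copy this index along with the maps, since it is derived state that would otherwise be lost.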