Skip to content

Commit

Permalink
Merge pull request #56 from niklak/feature/traversal-exp
Browse files Browse the repository at this point in the history
- `Matches` internal code changes.
- `NodeRef::find` an experimental method to find all descendant elements of a node that match a given path. It is much faster than `Selection::select` method.
  • Loading branch information
niklak authored Jan 15, 2025
2 parents 6c7b21e + 65be3d7 commit f2bf1d1
Show file tree
Hide file tree
Showing 13 changed files with 627 additions and 128 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,7 @@ jobs:
run: cargo test --verbose --all-targets --features "hashbrown"
- name: Run clippy
run: cargo clippy --verbose --all-targets -- -D warnings
- name: Install cargo audit
uses: taiki-e/install-action@cargo-audit
- name: Run audit
run: cargo audit
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ All notable changes to the `dom_query` crate will be documented in this file.
- Implemented `NodeRef::is_match` and `NodeRef::is` methods, which allow checking if a node matches
a given matcher (`&Matcher`) or selector (`&str`) without creating a `Selection` object.
- Implemented `Tree::base_uri`, a quick method that returns the base URI of the document based on the `href` attribute of the `<base>` element. `Document::base_uri` and `NodeRef::base_uri` provide the same functionality. Inspired by [Node: baseURI property]( https://developer.mozilla.org/en-US/docs/Web/API/Node/baseURI).
- Implemented `NodeRef::find`, an experimental method that finds all descendant elements of a node that match a given path. It is much faster than the `Selection::select` method.

### Changed

- `Selection`'s internal code changes aimed at reducing calls to `RefCell::borrow` and `RefCell::borrow_mut`.
- `Matches` internal code changes.

## [0.11.0] - 2024-12-10

Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = ["niklak <[email protected]>","importcjj <[email protected]>"]
edition = "2021"
readme = "README.md"
rust-version = "1.65"
exclude = [".*"]
exclude = [".*", "test-pages"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
Expand Down
2 changes: 2 additions & 0 deletions src/dom_tree.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
mod ops;
mod traversal;
mod tree;

pub use ops::TreeNodeOps;
pub use traversal::Traversal;
pub use tree::Tree;
66 changes: 0 additions & 66 deletions src/dom_tree/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,72 +111,6 @@ impl TreeNodeOps {
}
None
}

/// Returns the first child element of `id` accepted by the predicate `f`.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node whose children are scanned.
/// * `f` - The predicate; it is only invoked for children that are elements.
///
/// # Returns
///
/// The id of the first accepted child element, or `None` if there is none.
pub fn find_child_element<F>(nodes: Ref<Vec<TreeNode>>, id: NodeId, f: F) -> Option<NodeId>
where
    F: Fn(&TreeNode) -> bool,
{
    for child_id in child_nodes(Ref::clone(&nodes), &id, false) {
        // Ids that do not resolve to a stored node are skipped.
        let Some(child) = nodes.get(child_id.value) else {
            continue;
        };
        if child.is_element() && f(child) {
            return Some(child.id);
        }
    }
    None
}

/// Returns the first child element of `id` whose node name equals `name`.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node.
/// * `name` - The element name to look for.
///
/// # Returns
///
/// The id of the first child element with that name, or `None` if there is none.
pub fn find_child_element_by_name(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    name: &str,
) -> Option<NodeId> {
    // A node that is not an element can never match.
    let has_name = |candidate: &TreeNode| match candidate.as_element() {
        Some(element) => element.node_name().as_ref() == name,
        None => false,
    };
    Self::find_child_element(nodes, id, has_name)
}

/// Walks down from `id`, following one child element per entry of `names`.
///
/// At every step the first child element with the requested name is taken and
/// becomes the parent for the next step.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the starting node.
/// * `names` - The element names to follow, outermost first.
///
/// # Returns
///
/// The id of the element reached after the last name, or `None` as soon as a
/// step finds no matching child. An empty `names` yields `Some(id)`.
pub fn find_descendant_element(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    names: &[&str],
) -> Option<NodeId> {
    let mut current = id;
    for name in names {
        current = Self::find_child_element_by_name(Ref::clone(&nodes), current, name)?;
    }
    Some(current)
}
}

// manipulation
Expand Down
136 changes: 136 additions & 0 deletions src/dom_tree/traversal.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
use std::cell::Ref;

use crate::node::child_nodes;
use crate::node::{NodeId, TreeNode};
/// Stateless namespace type: its associated functions search a slice of
/// `TreeNode`s for elements by name and return the matching `NodeId`s.
pub struct Traversal {}

impl Traversal {
/// Returns the first child element of `id` accepted by the predicate `f`.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node whose children are scanned.
/// * `f` - The predicate; it is only invoked for children that are elements.
///
/// # Returns
///
/// The id of the first accepted child element, or `None` if there is none.
pub fn find_child_element<F>(nodes: Ref<Vec<TreeNode>>, id: NodeId, f: F) -> Option<NodeId>
where
    F: Fn(&TreeNode) -> bool,
{
    for child_id in child_nodes(Ref::clone(&nodes), &id, false) {
        // Ids that do not resolve to a stored node are skipped.
        let Some(child) = nodes.get(child_id.value) else {
            continue;
        };
        if child.is_element() && f(child) {
            return Some(child.id);
        }
    }
    None
}

/// Returns the first child element of `id` whose node name equals `name`.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the parent node.
/// * `name` - The element name to look for.
///
/// # Returns
///
/// The id of the first child element with that name, or `None` if there is none.
pub fn find_child_element_by_name(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    name: &str,
) -> Option<NodeId> {
    // A node that is not an element can never match.
    let has_name = |candidate: &TreeNode| match candidate.as_element() {
        Some(element) => element.node_name().as_ref() == name,
        None => false,
    };
    Self::find_child_element(nodes, id, has_name)
}

/// Walks down from `id`, following one child element per segment of `path`.
///
/// At every step the first child element with the requested name is taken and
/// becomes the parent for the next step.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the starting node.
/// * `path` - The sequence of element names to follow, outermost first.
///   Currently, only element names are supported.
///
/// # Returns
///
/// The id of the element reached after the last segment, or `None` as soon as a
/// step finds no matching child. An empty `path` yields `Some(id)`.
pub fn find_descendant_element(
    nodes: Ref<Vec<TreeNode>>,
    id: NodeId,
    path: &[&str],
) -> Option<NodeId> {
    let mut current = id;
    for name in path {
        current = Self::find_child_element_by_name(Ref::clone(&nodes), current, name)?;
    }
    Some(current)
}

/// Finds all descendant elements of a node that match the given path.
///
/// The path is resolved one segment at a time. For a segment, every element
/// matched by the previous segment (initially just `id`) is searched depth-first
/// for descendant elements whose name equals the segment. The search does not
/// descend into an element once it has matched. Elements matched by the last
/// segment form the result.
///
/// # Arguments
///
/// * `nodes` - The nodes of the tree.
/// * `id` - The id of the starting node.
/// * `path` - The sequence of element names to search for. Currently, only element names are supported.
///
/// # Returns
///
/// The ids of all elements matched by the last segment of `path`. The list is
/// empty if `path` is empty or if any segment matches nothing.
///
/// # Experimental
///
/// This method is experimental and may change in the future. The `path` argument will be revised.
pub fn find_descendant_elements(
    nodes: &Ref<Vec<TreeNode>>,
    id: NodeId,
    path: &[&str],
) -> Vec<NodeId> {
    let mut tops = vec![id];
    let mut res = vec![];
    for (idx, name) in path.iter().enumerate() {
        let is_last = idx + 1 == path.len();
        // Matches of this segment, gathered from *every* top of the current level.
        // They must not be mixed into `tops` while it is still being drained:
        // otherwise leftover tops of the previous level would be searched against
        // the next segment, skipping the current one.
        let mut next_tops = vec![];

        while let Some(top_id) = tops.pop() {
            // NOTE(review): the third argument of `child_nodes` presumably selects
            // reversed iteration so that popping the stack yields the desired
            // order on the last segment -- confirm against `child_nodes`.
            let mut ops: Vec<NodeId> = child_nodes(Ref::clone(nodes), &top_id, is_last)
                .filter(|id| nodes[id.value].is_element())
                .collect();
            let mut candidates = vec![];

            while let Some(node_id) = ops.pop() {
                // Only element ids are ever pushed onto `ops`, so the `else`
                // branch is not expected to be taken.
                let Some(node_name) = nodes
                    .get(node_id.value)
                    .and_then(|n| n.as_element().map(|el| el.node_name()))
                else {
                    continue;
                };

                if node_name.as_ref() == *name {
                    // A match is recorded and its subtree is not searched further.
                    candidates.push(node_id);
                    continue;
                }
                ops.extend(
                    child_nodes(Ref::clone(nodes), &node_id, is_last)
                        .filter(|id| nodes[id.value].is_element()),
                );
            }
            if is_last {
                res.extend(candidates);
            } else {
                // Stored reversed here and flipped as a whole once the level is
                // done: for a single top this reproduces exactly the stack the
                // previous implementation built, and for several tops it keeps
                // their groups in the order in which the tops were visited.
                next_tops.extend(candidates.into_iter().rev());
            }
        }
        next_tops.reverse();
        tops = next_tops;
    }
    res
}
}
3 changes: 2 additions & 1 deletion src/dom_tree/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::node::{
use crate::node::{Element, NodeData, NodeId, NodeRef, TreeNode};

use super::ops::TreeNodeOps;
use super::traversal::Traversal;

/// An implementation of arena-tree.
pub struct Tree {
Expand Down Expand Up @@ -80,7 +81,7 @@ impl Tree {
let root = self.root();
let nodes = self.nodes.borrow();

TreeNodeOps::find_descendant_element(Ref::clone(&nodes), root.id, &["html", "head", "base"])
Traversal::find_descendant_element(Ref::clone(&nodes), root.id, &["html", "head", "base"])
.and_then(|base_node_id| nodes.get(base_node_id.value))
.and_then(|base_node| base_node.as_element()?.attr("href"))
}
Expand Down
91 changes: 41 additions & 50 deletions src/matcher.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::fmt;
use std::{fmt, iter};

use cssparser::{CowRcStr, ParseError, SourceLocation, ToCss};
use html5ever::Namespace;
Expand Down Expand Up @@ -49,12 +49,10 @@ impl Matcher {
}
}

pub struct Matches<'a, T> {
roots: Vec<T>,
nodes: Vec<T>,
matcher: &'a Matcher,
pub struct Matches<'a, 'b> {
nodes: Vec<NodeRef<'a>>,
matcher: &'b Matcher,
set: NodeIdSet,
match_scope: MatchScope,
caches: SelectorCaches,
}

Expand All @@ -65,71 +63,64 @@ pub enum MatchScope {
ChildrenOnly,
}

impl<'a, T> Matches<'a, T> {
pub fn from_one(node: T, matcher: &'a Matcher, match_scope: MatchScope) -> Self {
impl<'a, 'b> Matches<'a, 'b> {
/// Builds the initial list of nodes stored in `Matches::nodes`.
///
/// With `MatchScope::IncludeNode` the root nodes themselves are returned. With
/// `MatchScope::ChildrenOnly` each root is replaced by the element nodes yielded
/// by its `children_it(true)`.
fn nodes_from_root<I: Iterator<Item = NodeRef<'a>>>(
    root_nodes: I,
    match_scope: MatchScope,
) -> Vec<NodeRef<'a>> {
    match match_scope {
        MatchScope::IncludeNode => root_nodes.collect(),
        MatchScope::ChildrenOnly => {
            let mut nodes = Vec::new();
            for root in root_nodes {
                nodes.extend(root.children_it(true).filter(|child| child.is_element()));
            }
            nodes
        }
    }
}
pub fn from_one(root_node: NodeRef<'a>, matcher: &'b Matcher, match_scope: MatchScope) -> Self {
let nodes = Self::nodes_from_root(iter::once(root_node), match_scope);
Self {
roots: vec![node],
nodes: vec![],
nodes,
matcher,
set: NodeIdSet::default(),
match_scope,
set: Default::default(),
caches: Default::default(),
}
}

pub fn from_list<I: Iterator<Item = T>>(
nodes: I,
matcher: &'a Matcher,
pub fn from_list<I: Iterator<Item = NodeRef<'a>>>(
root_nodes: I,
matcher: &'b Matcher,
match_scope: MatchScope,
) -> Self {
let nodes = Self::nodes_from_root(root_nodes, match_scope);

Self {
roots: nodes.collect(),
nodes: vec![],
nodes,
matcher,
set: NodeIdSet::default(),
match_scope,
set: Default::default(),
caches: Default::default(),
}
}
}

impl<'b> Iterator for Matches<'_, NodeRef<'b>> {
type Item = NodeRef<'b>;
impl<'a> Iterator for Matches<'a, '_> {
type Item = NodeRef<'a>;

fn next(&mut self) -> Option<Self::Item> {
loop {
if self.nodes.is_empty() {
let root = self.roots.pop()?;
match self.match_scope {
MatchScope::IncludeNode => {
self.nodes.push(root);
}
MatchScope::ChildrenOnly => {
self.nodes.extend(root.children_it(true));
}
}
while let Some(node) = self.nodes.pop() {
if self.set.contains(&node.id) {
continue;
}
self.nodes
.extend(node.children_it(true).filter(|n| n.is_element()));

while let Some(node) = self.nodes.pop() {
self.nodes.extend(node.children_it(true));

if self.set.contains(&node.id) {
continue;
}

if self
.matcher
.match_element_with_caches(&node, &mut self.caches)
{
self.set.insert(node.id);
return Some(node);
}
}

if self.roots.is_empty() {
return None;
if self
.matcher
.match_element_with_caches(&node, &mut self.caches)
{
self.set.insert(node.id);
return Some(node);
}
}
None
}
}

Expand Down
Loading

0 comments on commit f2bf1d1

Please sign in to comment.