diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 67b3483..af17b1f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -28,5 +28,7 @@ jobs: run: cargo test --verbose --all-targets --features "hashbrown" - name: Run clippy run: cargo clippy --verbose --all-targets -- -D warnings + - name: Install cargo audit + uses: taiki-e/install-action@cargo-audit - name: Run audit run: cargo audit diff --git a/CHANGELOG.md b/CHANGELOG.md index dbb642a..bc97ffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,12 @@ All notable changes to the `dom_query` crate will be documented in this file. - Implemented `NodeRef::is_match` and `NodeRef::is` methods, which allow checking if a node matches a given matcher (`&Matcher`) or selector (`&str`) without creating a `Selection` object. - Implemented `Tree::base_uri`, a quick method that returns the base URI of the document based on the `href` attribute of the `<base>` element. `Document::base_uri` and `NodeRef::base_uri` provide the same functionality. Inspired by [Node: baseURI property]( https://developer.mozilla.org/en-US/docs/Web/API/Node/baseURI). +- Implemented `NodeRef::find`, an experimental method that finds all descendant elements of a node that match a given path. It is much faster than the `Selection::select` method. ### Changed - `Selection`'s internal code changes aimed at reducing calls to `RefCell::borrow` and `RefCell::borrow_mut`. +- `Matches` internal code changes. 
## [0.11.0] - 2024-12-10 diff --git a/Cargo.toml b/Cargo.toml index 298cb0a..007f8d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ authors = ["niklak ","importcjj "] edition = "2021" readme = "README.md" rust-version = "1.65" -exclude = [".*"] +exclude = [".*", "test-pages"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/src/dom_tree.rs b/src/dom_tree.rs index 41630a8..7872d59 100644 --- a/src/dom_tree.rs +++ b/src/dom_tree.rs @@ -1,5 +1,7 @@ mod ops; +mod traversal; mod tree; pub use ops::TreeNodeOps; +pub use traversal::Traversal; pub use tree::Tree; diff --git a/src/dom_tree/ops.rs b/src/dom_tree/ops.rs index 4dc1741..ac1788c 100644 --- a/src/dom_tree/ops.rs +++ b/src/dom_tree/ops.rs @@ -111,72 +111,6 @@ impl TreeNodeOps { } None } - - /// Finds the first child element of a node that satisfies the given predicate. - /// - /// # Arguments - /// - /// * `nodes` - The nodes of the tree. - /// * `id` - The id of the parent node. - /// * `f` - The predicate to apply to each child element. - /// - /// # Returns - /// - /// The id of the first element that satisfies the predicate, if any. - pub fn find_child_element(nodes: Ref>, id: NodeId, f: F) -> Option - where - F: Fn(&TreeNode) -> bool, - { - child_nodes(Ref::clone(&nodes), &id, false) - .filter_map(|node_id| nodes.get(node_id.value)) - .filter(|tree_node| tree_node.is_element()) - .find(|tree_node| f(tree_node)) - .map(|tree_node| tree_node.id) - } - - /// Finds the first child element of a node that has the given name. - /// - /// # Arguments - /// - /// * `nodes` - The nodes of the tree. - /// * `id` - The id of the parent node. - /// * `name` - The name of the element to search for. - /// - /// # Returns - /// - /// The id of the first element that has the given name, if any. 
- pub fn find_child_element_by_name( - nodes: Ref>, - id: NodeId, - name: &str, - ) -> Option { - Self::find_child_element(nodes, id, |tree_node| { - tree_node - .as_element() - .map_or(false, |el| el.node_name().as_ref() == name) - }) - } - - /// Finds the first descendant element of a node that has the given names. - /// - /// # Arguments - /// - /// * `nodes` - The nodes of the tree. - /// * `id` - The id of the starting node. - /// * `names` - The names of the elements to search for. - /// - /// # Returns - /// - /// The id of the first descendant element that has the given names, if any. - pub fn find_descendant_element( - nodes: Ref>, - id: NodeId, - names: &[&str], - ) -> Option { - names.iter().try_fold(id, |current_id, name| { - Self::find_child_element_by_name(Ref::clone(&nodes), current_id, name) - }) - } } // manipulation diff --git a/src/dom_tree/traversal.rs b/src/dom_tree/traversal.rs new file mode 100644 index 0000000..a6b2a4d --- /dev/null +++ b/src/dom_tree/traversal.rs @@ -0,0 +1,136 @@ +use std::cell::Ref; + +use crate::node::child_nodes; +use crate::node::{NodeId, TreeNode}; +pub struct Traversal {} + +impl Traversal { + /// Finds the first child element of a node that satisfies the given predicate. + /// + /// # Arguments + /// + /// * `nodes` - The nodes of the tree. + /// * `id` - The id of the parent node. + /// * `f` - The predicate to apply to each child element. + /// + /// # Returns + /// + /// The id of the first element that satisfies the predicate, if any. + pub fn find_child_element(nodes: Ref>, id: NodeId, f: F) -> Option + where + F: Fn(&TreeNode) -> bool, + { + child_nodes(Ref::clone(&nodes), &id, false) + .filter_map(|node_id| nodes.get(node_id.value)) + .filter(|tree_node| tree_node.is_element()) + .find(|tree_node| f(tree_node)) + .map(|tree_node| tree_node.id) + } + + /// Finds the first child element of a node that has the given name. + /// + /// # Arguments + /// + /// * `nodes` - The nodes of the tree. 
+ /// * `id` - The id of the parent node. + /// * `name` - The name of the element to search for. + /// + /// # Returns + /// + /// The id of the first element that has the given name, if any. + pub fn find_child_element_by_name( + nodes: Ref>, + id: NodeId, + name: &str, + ) -> Option { + Self::find_child_element(nodes, id, |tree_node| { + tree_node + .as_element() + .map_or(false, |el| el.node_name().as_ref() == name) + }) + } + + /// Finds the first descendant element of a node that matches the given path. + /// + /// # Arguments + /// + /// * `nodes` - The nodes of the tree. + /// * `id` - The id of the starting node. + /// * `path` - The sequence of element names to follow; each name is looked up among the direct child elements of the node matched by the previous name. Currently, only element names are supported. + /// + /// # Returns + /// + /// The id of the element reached by following the whole path, or `None` as soon as one step has no matching child element. + pub fn find_descendant_element( + nodes: Ref>, + id: NodeId, + path: &[&str], + ) -> Option { + path.iter().try_fold(id, |current_id, name| { + Self::find_child_element_by_name(Ref::clone(&nodes), current_id, name) + }) + } + + /// Finds all descendant elements of a node that match the given path. + /// + /// # Arguments + /// + /// * `nodes` - The nodes of the tree. + /// * `id` - The id of the starting node. + /// * `path` - The sequence of element names to search for. Currently, only element names are supported. + /// + /// # Returns + /// + /// A list of ids of all descendant elements that have the given names. + /// + /// # Experimental + /// + /// This method is experimental and may change in the future. The `path` argument will be revised. NOTE(review): for a non-final path step the loop continues with the next name after handling a single start node, so start nodes still queued appear to be searched with the following name; confirm the results for paths whose intermediate steps match several elements. 
+ pub fn find_descendant_elements( + nodes: &Ref>, + id: NodeId, + path: &[&str], + ) -> Vec { + let mut tops = vec![id]; + let mut res = vec![]; + 'work_loop: for (idx, name) in path.iter().enumerate() { + let is_last = path.len() - 1 == idx; + + while let Some(id) = tops.pop() { + let mut ops: Vec = child_nodes(Ref::clone(nodes), &id, is_last) + .filter(|id| nodes[id.value].is_element()) + .collect(); + let mut candidates = vec![]; + + while let Some(node_id) = ops.pop() { + // Since these nodes are descendants of the primary node and + // were previously extracted from the `Tree` with only elements remaining, + // `else` case should be unreachable. + let Some(node_name) = nodes + .get(node_id.value) + .and_then(|n| n.as_element().map(|el| el.node_name())) + else { + continue; + }; + + if node_name.as_ref() == *name { + candidates.push(node_id); + continue; + } + ops.extend( + child_nodes(Ref::clone(nodes), &node_id, is_last) + .filter(|id| nodes[id.value].is_element()), + ); + } + if is_last { + res.extend(candidates); + } else { + tops.extend(candidates); + + continue 'work_loop; + } + } + } + res + } +} diff --git a/src/dom_tree/tree.rs b/src/dom_tree/tree.rs index cd0389a..671e512 100644 --- a/src/dom_tree/tree.rs +++ b/src/dom_tree/tree.rs @@ -13,6 +13,7 @@ use crate::node::{ use crate::node::{Element, NodeData, NodeId, NodeRef, TreeNode}; use super::ops::TreeNodeOps; +use super::traversal::Traversal; /// An implementation of arena-tree. 
pub struct Tree { @@ -80,7 +81,7 @@ impl Tree { let root = self.root(); let nodes = self.nodes.borrow(); - TreeNodeOps::find_descendant_element(Ref::clone(&nodes), root.id, &["html", "head", "base"]) + Traversal::find_descendant_element(Ref::clone(&nodes), root.id, &["html", "head", "base"]) .and_then(|base_node_id| nodes.get(base_node_id.value)) .and_then(|base_node| base_node.as_element()?.attr("href")) } diff --git a/src/matcher.rs b/src/matcher.rs index b00bd75..ab9c2e4 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -1,4 +1,4 @@ -use std::fmt; +use std::{fmt, iter}; use cssparser::{CowRcStr, ParseError, SourceLocation, ToCss}; use html5ever::Namespace; @@ -49,12 +49,10 @@ impl Matcher { } } -pub struct Matches<'a, T> { - roots: Vec, - nodes: Vec, - matcher: &'a Matcher, +pub struct Matches<'a, 'b> { + nodes: Vec>, + matcher: &'b Matcher, set: NodeIdSet, - match_scope: MatchScope, caches: SelectorCaches, } @@ -65,71 +63,64 @@ pub enum MatchScope { ChildrenOnly, } -impl<'a, T> Matches<'a, T> { - pub fn from_one(node: T, matcher: &'a Matcher, match_scope: MatchScope) -> Self { +impl<'a, 'b> Matches<'a, 'b> { + fn nodes_from_root>>( + root_nodes: I, + match_scope: MatchScope, + ) -> Vec> { + match match_scope { + MatchScope::IncludeNode => root_nodes.collect(), + MatchScope::ChildrenOnly => root_nodes + .flat_map(|node| node.children_it(true).filter(|n| n.is_element())) + .collect(), + } + } + pub fn from_one(root_node: NodeRef<'a>, matcher: &'b Matcher, match_scope: MatchScope) -> Self { + let nodes = Self::nodes_from_root(iter::once(root_node), match_scope); Self { - roots: vec![node], - nodes: vec![], + nodes, matcher, - set: NodeIdSet::default(), - match_scope, + set: Default::default(), caches: Default::default(), } } - pub fn from_list>( - nodes: I, - matcher: &'a Matcher, + pub fn from_list>>( + root_nodes: I, + matcher: &'b Matcher, match_scope: MatchScope, ) -> Self { + let nodes = Self::nodes_from_root(root_nodes, match_scope); + Self { - roots: 
nodes.collect(), - nodes: vec![], + nodes, matcher, - set: NodeIdSet::default(), - match_scope, + set: Default::default(), caches: Default::default(), } } } -impl<'b> Iterator for Matches<'_, NodeRef<'b>> { - type Item = NodeRef<'b>; +impl<'a> Iterator for Matches<'a, '_> { + type Item = NodeRef<'a>; fn next(&mut self) -> Option { - loop { - if self.nodes.is_empty() { - let root = self.roots.pop()?; - match self.match_scope { - MatchScope::IncludeNode => { - self.nodes.push(root); - } - MatchScope::ChildrenOnly => { - self.nodes.extend(root.children_it(true)); - } - } + while let Some(node) = self.nodes.pop() { + if self.set.contains(&node.id) { + continue; } + self.nodes + .extend(node.children_it(true).filter(|n| n.is_element())); - while let Some(node) = self.nodes.pop() { - self.nodes.extend(node.children_it(true)); - - if self.set.contains(&node.id) { - continue; - } - - if self - .matcher - .match_element_with_caches(&node, &mut self.caches) - { - self.set.insert(node.id); - return Some(node); - } - } - - if self.roots.is_empty() { - return None; + if self + .matcher + .match_element_with_caches(&node, &mut self.caches) + { + self.set.insert(node.id); + return Some(node); } } + None } } diff --git a/src/node/node_ref.rs b/src/node/node_ref.rs index 24e6050..30f48fa 100644 --- a/src/node/node_ref.rs +++ b/src/node/node_ref.rs @@ -10,6 +10,7 @@ use html5ever::Attribute; use tendril::StrTendril; +use crate::dom_tree::Traversal; use crate::entities::copy_attrs; use crate::Document; use crate::Matcher; @@ -659,4 +660,21 @@ impl NodeRef<'_> { pub fn base_uri(&self) -> Option { self.tree.base_uri() } + + /// Finds all descendant elements of this node that match the given path. + /// + /// The path is a sequence of element names. The method returns a vector of + /// [`NodeRef`]s that correspond to the matching elements. The elements are + /// returned in the order they appear in the document tree. 
+ /// + /// # Experimental + /// This method is experimental and may change in the future. The `path` argument will be revised. + pub fn find(&self, path: &[&str]) -> Vec { + let nodes = self.tree.nodes.borrow(); + let found_ids = Traversal::find_descendant_elements(&nodes, self.id, path); + found_ids + .iter() + .map(|node_id| NodeRef::new(*node_id, self.tree)) + .collect() + } } diff --git a/src/node/selector.rs b/src/node/selector.rs index 2a4ee9e..a781f53 100644 --- a/src/node/selector.rs +++ b/src/node/selector.rs @@ -1,6 +1,6 @@ use std::ops::Deref; -use html5ever::{local_name, namespace_url, ns}; +use html5ever::{namespace_url, ns}; use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}; use selectors::context::MatchingContext; use selectors::matching::ElementSelectorFlags; @@ -26,6 +26,7 @@ impl selectors::Element for NodeRef<'_> { /// Converts self into an opaque representation. It can be crucial. #[inline] fn opaque(&self) -> OpaqueElement { + // TODO: ? 
let nodes = self.tree.nodes.borrow(); let node = nodes.get(self.id.value).expect("element not in the tree!"); OpaqueElement::new(node) @@ -140,8 +141,7 @@ impl selectors::Element for NodeRef<'_> { } AnyLink | Link => match self.node_name() { Some(node_name) => { - matches!(node_name.deref(), "a" | "area" | "link") - && self.attr("href").is_some() + matches!(node_name.deref(), "a" | "area" | "link") && self.has_attr("href") } None => false, }, @@ -163,13 +163,8 @@ impl selectors::Element for NodeRef<'_> { fn is_link(&self) -> bool { self.query_or(false, |node| { if let NodeData::Element(ref e) = node.data { - return matches!( - e.name.local, - local_name!("a") | local_name!("area") | local_name!("link") - ) && e - .attrs - .iter() - .any(|attr| attr.name.local == local_name!("href")); + let node_name = e.name.local.as_ref(); + return matches!(node_name, "a" | "area" | "link") && e.has_attr("href"); } false }) diff --git a/test-pages/002.html b/test-pages/002.html new file mode 100644 index 0000000..f5972df --- /dev/null +++ b/test-pages/002.html @@ -0,0 +1,371 @@ + + + +
+
+
+

For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very nice API. It suffers from lack of separation of concerns. The input, output and state are all managed by interacting with one object, and state is tracked using events. Also, the event-based model doesn’t play well with JavaScript’s recent focus on Promise- and generator-based asynchronous programming. +

+

The Fetch API intends to fix most of these problems. It does this by introducing the same primitives to JS that are used in the HTTP protocol. In addition, it introduces a utility function fetch() that succinctly captures the intention of retrieving a resource from the network.

+

The Fetch specification, which defines the API, nails down the semantics of a user agent fetching a resource. This, combined with ServiceWorkers, is an attempt to:

+
    +
  1. Improve the offline experience.
  2. +
  3. Expose the building blocks of the Web to the platform as part of the + extensible web movement. +
  4. +
+

As of this writing, the Fetch API is available in Firefox 39 (currently Nightly) and Chrome 42 (currently dev). Github has a Fetch polyfill.

+

Feature detection

+

Fetch API support can be detected by checking for Headers,Request, Response or fetch on the window or worker scope.

+

Simple fetching

+

The most useful, high-level part of the Fetch API is the fetch() function. In its simplest form it takes a URL and returns a promise that resolves to the response. The response is captured as a Response object.

+
+
+
fetch("/data.json").then(function(res) {
+  // res instanceof Response == true.
+  if (res.ok) {
+    res.json().then(function(data) {
+      console.log(data.entries);
+    });
+  } else {
+    console.log("Looks like the response wasn't perfect, got status", res.status);
+  }
+}, function(e) {
+  console.log("Fetch failed!", e);
+});
+
+
+

Submitting some parameters, it would look like this:

+
+
+
fetch("http://www.example.org/submit.php", {
+  method: "POST",
+  headers: {
+    "Content-Type": "application/x-www-form-urlencoded"
+  },
+  body: "firstName=Nikhil&favColor=blue&password=easytoguess"
+}).then(function(res) {
+  if (res.ok) {
+    alert("Perfect! Your settings are saved.");
+  } else if (res.status == 401) {
+    alert("Oops! You are not authorized.");
+  }
+}, function(e) {
+  alert("Error submitting form!");
+});
+
+
+

The fetch() function’s arguments are the same as those passed to the +
+ Request() constructor, so you may directly pass arbitrarily complex requests to fetch() as discussed below. +

+

Headers

+

Fetch introduces 3 interfaces. These are Headers, Request and +
+ Response. They map directly to the underlying HTTP concepts, but have +
certain visibility filters in place for privacy and security reasons, such as +
supporting CORS rules and ensuring cookies aren’t readable by third parties. +

+

The Headers interface is a simple multi-map of names to values:

+
+
+
var content = "Hello World";
+var reqHeaders = new Headers();
+reqHeaders.append("Content-Type", "text/plain"
+reqHeaders.append("Content-Length", content.length.toString());
+reqHeaders.append("X-Custom-Header", "ProcessThisImmediately");
+
+
+

The same can be achieved by passing an array of arrays or a JS object literal +
to the constructor: +

+
+
+
reqHeaders = new Headers({
+  "Content-Type": "text/plain",
+  "Content-Length": content.length.toString(),
+  "X-Custom-Header": "ProcessThisImmediately",
+});
+
+
+

The contents can be queried and retrieved:

+
+
+
console.log(reqHeaders.has("Content-Type")); // true
+console.log(reqHeaders.has("Set-Cookie")); // false
+reqHeaders.set("Content-Type", "text/html");
+reqHeaders.append("X-Custom-Header", "AnotherValue");
+ 
+console.log(reqHeaders.get("Content-Length")); // 11
+console.log(reqHeaders.getAll("X-Custom-Header")); // ["ProcessThisImmediately", "AnotherValue"]
+ 
+reqHeaders.delete("X-Custom-Header");
+console.log(reqHeaders.getAll("X-Custom-Header")); // []
+
+
+

Some of these operations are only useful in ServiceWorkers, but they provide +
a much nicer API to Headers. +

+

Since Headers can be sent in requests, or received in responses, and have various limitations about what information can and should be mutable, Headers objects have a guard property. This is not exposed to the Web, but it affects which mutation operations are allowed on the Headers object. +
Possible values are: +

+
    +
  • “none”: default.
  • +
  • “request”: guard for a Headers object obtained from a Request (Request.headers).
  • +
  • “request-no-cors”: guard for a Headers object obtained from a Request created +
    with mode “no-cors”. +
  • +
  • “response”: naturally, for Headers obtained from Response (Response.headers).
  • +
  • “immutable”: Mostly used for ServiceWorkers, renders a Headers object +
    read-only. +
  • +
+

The details of how each guard affects the behaviors of the Headers object are +
in the specification. For example, you may not append or set a “request” guarded Headers’ “Content-Length” header. Similarly, inserting “Set-Cookie” into a Response header is not allowed so that ServiceWorkers may not set cookies via synthesized Responses. +

+

All of the Headers methods throw TypeError if name is not a + valid HTTP Header name. The mutation operations will throw TypeError if there is an immutable guard. Otherwise they fail silently. For example: +

+
+
+
var res = Response.error();
+try {
+  res.headers.set("Origin", "http://mybank.com");
+} catch(e) {
+  console.log("Cannot pretend to be a bank!");
+}
+
+
+

Request

+

The Request interface defines a request to fetch a resource over HTTP. URL, method and headers are expected, but the Request also allows specifying a body, a request mode, credentials and cache hints.

+

The simplest Request is of course, just a URL, as you may do to GET a resource. +

+
+
+
var req = new Request("/index.html");
+console.log(req.method); // "GET"
+console.log(req.url); // "http://example.com/index.html"
+
+
+

You may also pass a Request to the Request() constructor to create a copy. +
(This is not the same as calling the clone() method, which is covered in +
the “Reading bodies” section.). +

+
+
+
var copy = new Request(req);
+console.log(copy.method); // "GET"
+console.log(copy.url); // "http://example.com/index.html"
+
+
+

Again, this form is probably only useful in ServiceWorkers.

+

The non-URL attributes of the Request can only be set by passing initial +
values as a second argument to the constructor. This argument is a dictionary. +

+
+
+
var uploadReq = new Request("/uploadImage", {
+  method: "POST",
+  headers: {
+    "Content-Type": "image/png",
+  },
+  body: "image data"
+});
+
+
+

The Request’s mode is used to determine if cross-origin requests lead to valid responses, and which properties on the response are readable. Legal mode values are "same-origin", "no-cors" (default) and "cors".

+

The "same-origin" mode is simple, if a request is made to another origin with this mode set, the result is simply an error. You could use this to ensure that +
a request is always being made to your origin. +

+
+
+
var arbitraryUrl = document.getElementById("url-input").value;
+fetch(arbitraryUrl, { mode: "same-origin" }).then(function(res) {
+  console.log("Response succeeded?", res.ok);
+}, function(e) {
+  console.log("Please enter a same-origin URL!");
+});
+
+
+

The "no-cors" mode captures what the web platform does by default for scripts you import from CDNs, images hosted on other domains, and so on. First, it prevents the method from being anything other than “HEAD”, “GET” or “POST”. Second, if any ServiceWorkers intercept these requests, they may not add or override any headers except for these. Third, JavaScript may not access any properties of the resulting Response. This ensures that ServiceWorkers do not affect the semantics of the Web and prevents security and privacy issues that could arise from leaking data across domains.

+

"cors" mode is what you’ll usually use to make known cross-origin requests to access various APIs offered by other vendors. These are expected to adhere to +
the CORS protocol. Only a limited set of headers is exposed in the Response, but the body is readable. For example, you could get a list of Flickr’s most interesting photos today like this: +

+
+
+
var u = new URLSearchParams();
+u.append('method', 'flickr.interestingness.getList');
+u.append('api_key', '<insert api key here>');
+u.append('format', 'json');
+u.append('nojsoncallback', '1');
+ 
+var apiCall = fetch('https://api.flickr.com/services/rest?' + u);
+ 
+apiCall.then(function(response) {
+  return response.json().then(function(json) {
+    // photo is a list of photos.
+    return json.photos.photo;
+  });
+}).then(function(photos) {
+  photos.forEach(function(photo) {
+    console.log(photo.title);
+  });
+});
+
+
+

You may not read out the “Date” header since Flickr does not allow it via +
+ Access-Control-Expose-Headers. +

+
+
+
response.headers.get("Date"); // null
+
+
+

The credentials enumeration determines if cookies for the other domain are +
sent to cross-origin requests. This is similar to XHR’s withCredentials +
flag, but tri-valued as "omit" (default), "same-origin" and "include". +

+

The Request object will also give the ability to offer caching hints to the user-agent. This is currently undergoing some security review. Firefox exposes the attribute, but it has no effect.

+

Requests have two read-only attributes that are relevant to ServiceWorkers +
intercepting them. There is the string referrer, which is set by the UA to be +
the referrer of the Request. This may be an empty string. The other is +
+ context which is a rather large enumeration defining what sort of resource is being fetched. This could be “image” if the request is from an <img>tag in the controlled document, “worker” if it is an attempt to load a worker script, and so on. When used with the fetch() function, it is “fetch”. +

+

Response

+

Response instances are returned by calls to fetch(). They can also be created by JS, but this is only useful in ServiceWorkers.

+

We have already seen some attributes of Response when we looked at fetch(). The most obvious candidates are status, an integer (default value 200) and statusText (default value “OK”), which correspond to the HTTP status code and reason. The ok attribute is just a shorthand for checking that status is in the range 200-299 inclusive. +

+

headers is the Response’s Headers object, with guard “response”. The url attribute reflects the URL of the corresponding request.

+

Response also has a type, which is “basic”, “cors”, “default”, “error” or +
“opaque”. +

+
    +
  • "basic": normal, same origin response, with all headers exposed except +
    “Set-Cookie” and “Set-Cookie2″. +
  • +
  • "cors": response was received from a valid cross-origin request. + Certain headers and the bodymay be accessed. +
  • +
  • "error": network error. No useful information describing the error is available. The Response’s status is 0, headers are empty and immutable. This is the type for a Response obtained from Response.error().
  • +
  • "opaque": response for “no-cors” request to cross-origin resource. Severely
    + restricted
    +
  • +
+

The “error” type results in the fetch() Promise rejecting with TypeError. +

+

There are certain attributes that are useful only in a ServiceWorker scope. The +
idiomatic way to return a Response to an intercepted request in ServiceWorkers is: +

+
+
+
addEventListener('fetch', function(event) {
+  event.respondWith(new Response("Response body", {
+    headers: { "Content-Type" : "text/plain" }
+  });
+});
+
+
+

As you can see, Response has a two argument constructor, where both arguments are optional. The first argument is a body initializer, and the second is a dictionary to set the status, statusText and headers.

+

The static method Response.error() simply returns an error response. Similarly, Response.redirect(url, status) returns a Response resulting in +
a redirect to url. +

+

Dealing with bodies

+

Both Requests and Responses may contain body data. We’ve been glossing over it because of the various data types body may contain, but we will cover it in detail now.

+

A body is an instance of any of the following types.

+ +

In addition, Request and Response both offer the following methods to extract their body. These all return a Promise that is eventually resolved with the actual content.

+
    +
  • arrayBuffer() +
  • +
  • blob() +
  • +
  • json() +
  • +
  • text() +
  • +
  • formData() +
  • +
+

This is a significant improvement over XHR in terms of ease of use of non-text data!

+

Request bodies can be set by passing body parameters:

+
+
+
var form = new FormData(document.getElementById('login-form'));
+fetch("/login", {
+  method: "POST",
+  body: form
+})
+
+
+

Responses take the first argument as the body.

+
+
+
var res = new Response(new File(["chunk", "chunk"], "archive.zip",
+                       { type: "application/zip" }));
+
+
+

Both Request and Response (and by extension the fetch() function), will try to intelligently determine the content type. Request will also automatically set a “Content-Type” header if none is set in the dictionary.

+

Streams and cloning

+

It is important to realise that Request and Response bodies can only be read once! Both interfaces have a boolean attribute bodyUsed to determine if it is safe to read or not.

+
+
+
var res = new Response("one time use");
+console.log(res.bodyUsed); // false
+res.text().then(function(v) {
+  console.log(res.bodyUsed); // true
+});
+console.log(res.bodyUsed); // true
+ 
+res.text().catch(function(e) {
+  console.log("Tried to read already consumed Response");
+});
+
+
+

This decision allows easing the transition to an eventual stream-based Fetch API. The intention is to let applications consume data as it arrives, allowing for JavaScript to deal with larger files like videos, and perform things like compression and editing on the fly.

+

Often, you’ll want access to the body multiple times. For example, you can use the upcoming Cache API to store Requests and Responses for offline use, and Cache requires bodies to be available for reading.

+

So how do you read out the body multiple times within such constraints? The API provides a clone() method on the two interfaces. This will return a clone of the object, with a ‘new’ body. clone() MUST be called before the body of the corresponding object has been used. That is, clone() first, read later.

+
+
+
addEventListener('fetch', function(evt) {
+  var sheep = new Response("Dolly");
+  console.log(sheep.bodyUsed); // false
+  var clone = sheep.clone();
+  console.log(clone.bodyUsed); // false
+ 
+  clone.text();
+  console.log(sheep.bodyUsed); // false
+  console.log(clone.bodyUsed); // true
+ 
+  evt.respondWith(cache.add(sheep.clone()).then(function(e) {
+    return sheep;
+  });
+});
+
+
+

Future improvements

+

Along with the transition to streams, Fetch will eventually have the ability to abort running fetch()es and some way to report the progress of a fetch. These are provided by XHR, but are a little tricky to fit in the Promise-based nature of the Fetch API.

+

You can contribute to the evolution of this API by participating in discussions on the WHATWG mailing list and in the issues in the Fetch and + ServiceWorkerspecifications. +

+

For a better web!

+

The author would like to thank Andrea Marchesini, Anne van Kesteren and Ben
+ Kelly for helping with the specification and implementation.
+

+
+
+
+ + diff --git a/tests/node-traversal.rs b/tests/node-traversal.rs index 3ef0566..27e72d2 100644 --- a/tests/node-traversal.rs +++ b/tests/node-traversal.rs @@ -278,3 +278,26 @@ fn test_node_base_uri_none() { let doc = Document::from(ANCESTORS_CONTENTS); assert!(doc.base_uri().is_none()); } + +#[cfg_attr(not(target_arch = "wasm32"), test)] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn test_node_find() { + let html_contents = include_str!("../test-pages/hacker_news.html"); + let doc = Document::from(html_contents); + let a_sel = doc.select("body td a"); + let expected_ids: Vec = a_sel.nodes().iter().map(|n| n.id).collect(); + + let root = doc.root(); + let got_ids: Vec = root + .find(&["body", "td", "a"]) + .iter() + .map(|n| n.id) + .collect(); + + assert_eq!(got_ids, expected_ids); + + let len_fin_ne = root.find(&["body", "td", "p"]).len(); + assert_eq!(len_fin_ne, 0); + let len_sel_ne = doc.select("body td p").length(); + assert_eq!(len_sel_ne, 0) +} diff --git a/tests/selection-query.rs b/tests/selection-query.rs index 75bfb48..89ab317 100644 --- a/tests/selection-query.rs +++ b/tests/selection-query.rs @@ -1,5 +1,7 @@ mod data; +use std::collections::HashSet; + use data::{doc, ANCESTORS_CONTENTS, HEADING_CONTENTS}; use dom_query::{Document, Selection}; @@ -155,3 +157,25 @@ fn test_is_has() { assert!(prev_sel.is("*:has( > img:only-child)")); } + + +#[cfg_attr(not(target_arch = "wasm32"), test)] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn test_selection_unique() { + // This document contains many nested div elements and was taken from the dom_smoothie test data. + // I was investigating whether an additional uniqueness check during selection was necessary, + // as the results looked correct and unique without it, while the check added overhead. + // However, after removing the `set.contains` check from `Matches::next`, the dom_smoothie::Readability tests started failing. 
+ // Therefore, the current `Matches` implementation requires the uniqueness check despite the overhead. + + let contents = include_str!("../test-pages/002.html"); + let doc: Document = contents.into(); + + let div_sel = doc.select(".page").select("div").select("div > div"); + + let sel_ids = div_sel.nodes().iter().map(|n| n.id).collect::>(); + + let unique_ids = sel_ids.iter().cloned().collect::>(); + assert_eq!(sel_ids.len(), unique_ids.len()); + +} \ No newline at end of file