Skip to content

Commit

Permalink
Merge pull request #2 from holmofy/extract-macro
Browse files Browse the repository at this point in the history
Extract macro
  • Loading branch information
holmofy authored Jul 13, 2024
2 parents 02c8eb2 + 16de982 commit e365609
Show file tree
Hide file tree
Showing 16 changed files with 835 additions and 70 deletions.
13 changes: 11 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
[workspace]
members = [".", "reqwest-scraper-macros"]

[package]
name = "reqwest-scraper"
version = "0.2.1"
version = "0.3.0"
edition = "2021"
description = "Web scraping integration with reqwest"
license-file = "LICENSE"
repository = "https://github.com/holmofy/reqwest-scraper"

[lib]
name = "reqwest_scraper"
path = "src/lib.rs"

[dependencies]
async-trait = "0.1"
itertools = "0.13"
Expand All @@ -15,13 +22,15 @@ reqwest = { version = "0.12" }
scraper = { version = "0.19", optional = true }
serde = { version = "1.0", optional = true }
serde_json = { version = "1.0", optional = true }
reqwest-scraper-macros = { version = "0.3.0", path = "./reqwest-scraper-macros", optional = true }
thiserror = "1.0"

[features]
default = ["jsonpath", "css_selector", "xpath"]
default = ["jsonpath", "css_selector", "xpath", "macros"]
jsonpath = ["jsonpath_lib", "serde", "serde_json", "reqwest/json"]
xpath = ["libxml"]
css_selector = ["scraper"]
macros = ["reqwest-scraper-macros"]

[dev-dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
Expand Down
65 changes: 61 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ Extends [reqwest](https://github.com/seanmonstar/reqwest) to support multiple we
* [x] Use [JsonPath](#jsonpath) to select fields in json response
* [x] Select elements in HTML response using [CSS selector](#css-selector)
* [x] Evaluate the value in HTML response using [xpath expression](#xpath)
* [ ] Derive macro extract
* [x] [Derive macro extract](#macros)

### Start Guide

* add dependency
```toml
reqwest = { version = "0.12", features = ["json"] }
reqwest-scraper="0.2.1"
reqwest-scraper="0.3.0"
```
* use ScraperResponse
```rust
Expand Down Expand Up @@ -61,6 +61,7 @@ pub async fn request() -> Result<()> {

* `Html::select(selector: &str) -> Result<Selectable>`
* `Selectable::iter() -> impl Iterator<SelectItem>`
* `Selectable::first() -> Option<SelectItem>`
* `SelectItem::name() -> &str`
* `SelectItem::id() -> Option<&str>`
* `SelectItem::has_class(class: &str, case_sensitive: CaseSensitivity) -> bool`
Expand Down Expand Up @@ -117,8 +118,8 @@ async fn request() -> Result<()> {
* `Node::children() -> Vec<Node>`
* `Node::findnodes(relative_xpath: &str) -> Result<Vec<Node>>`
* `Node::findvalues(relative_xpath: &str) -> Result<Vec<String>>`
* `Node::findnode(relative_xpath: &str) -> Result<Node>`
* `Node::findvalue(relative_xpath: &str) -> Result<String>`
* `Node::findnode(relative_xpath: &str) -> Result<Option<Node>>`
* `Node::findvalue(relative_xpath: &str) -> Result<Option<String>>`

[**example**](./examples/xpath.rs):

Expand Down Expand Up @@ -163,6 +164,62 @@ async fn request() -> Result<()> {
}
```

<h3 id="macros">Derive macro extract</h3>

**use `FromCssSelector` & `selector` to extract html element into struct**
```rust
// define struct and derive the FromCssSelector trait
#[derive(Debug, FromCssSelector)]
#[selector(path = "#user-repositories-list > ul > li")]
struct Repo {
#[selector(path = "a[itemprop~='name']", default = "<unname>", text)]
name: String,

#[selector(path = "span[itemprop~='programmingLanguage']", text)]
program_lang: Option<String>,

#[selector(path = "div.topics-row-container>a", text)]
topics: Vec<String>,
}

// request
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.css_selector()
.await?;

// Use the generated `from_html` method to extract data into the struct
let items = Repo::from_html(html)?;
items.iter().for_each(|item| println!("{:?}", item));
```

**use `FromXPath` & `xpath` to extract html element into struct**
```rust
// define struct and derive the FromXPath trait
#[derive(Debug, FromXPath)]
#[xpath(path = "//div[@id='user-repositories-list']/ul/li")]
struct Repo {
#[xpath(path = ".//a[contains(@itemprop,'name')]/text()", default = "<unname>")]
name: String,

#[xpath(path = ".//span[contains(@itemprop,'programmingLanguage')]/text()")]
program_lang: Option<String>,

#[xpath(path = ".//div[contains(@class,'topics-row-container')]/a/text()")]
topics: Vec<String>,
}

let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.xpath()
.await?;

// Use the generated `from_xhtml` method to extract data into the struct
let items = Repo::from_xhtml(html)?;
items.iter().for_each(|item| println!("{:?}", item));
```


## Related Projects

* [reqwest](https://github.com/seanmonstar/reqwest)
Expand Down
66 changes: 58 additions & 8 deletions examples/html.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,80 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use reqwest_scraper::{FromCssSelector, ScraperResponse};

// Entry point for the CSS-selector example: starts a Tokio runtime and
// drives the async `request` demo to completion.
#[tokio::main]
async fn main() {
// Panics with "request error" on any failure — acceptable for an example binary.
request().await.expect("request error");
}

// One repository entry scraped from a GitHub profile's repository list.
// The derived `from_html` builds one `Repo` per `li` element matched by
// the container path below (NOTE(review): `FromCssSelector` is a
// project-local derive — field semantics inferred from its attributes).
#[derive(Debug, FromCssSelector)]
#[selector(path = "#user-repositories-list > ul > li")]
struct Repo {
// Repository name text; the `default` attribute supplies the literal
// "<unname>" when the anchor element is absent.
#[selector(path = "a[itemprop~='name']", default = "<unname>", text)]
name: String,

// Primary programming language, `None` when GitHub reports none.
#[selector(path = "span[itemprop~='programmingLanguage']", text)]
program_lang: Option<String>,

// Topic labels attached to the repository (may be empty).
#[selector(path = "div.topics-row-container>a", text)]
topics: Vec<String>,
}

async fn request() -> Result<()> {
let html = reqwest::get("https://github.com/holmofy")
.await?
.css_selector()
.await?;

// 1. Simple extract
assert_eq!(
html.select(".p-name")?.iter().nth(0).unwrap().text().trim(),
html.select(".p-name")?
.first()
.map(|e| e.text())
.unwrap_or("xxx".into()),
"holmofy"
);

let select_result = html.select(".vcard-details > li.vcard-detail")?;
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.css_selector()
.await?;

// 2. Select List Element
println!("\n2. Select List Element");
let select_result = html.select("#user-repositories-list > ul > li")?;

for item in select_result.iter() {
let name = item
.select("a[itemprop~='name']")?
.first()
.map(|e| e.text())
.unwrap_or("<unname>".into());

let program_lang = item
.select("span[itemprop~='programmingLanguage']")?
.first()
.map(|e| e.text());

let topics = item
.select("div.topics-row-container>a")?
.iter()
.map(|e| e.text())
.collect::<Vec<_>>();

for detail_item in select_result.iter() {
println!(
"{}",
detail_item.attr("aria-label").unwrap_or_else(|| "".into())
)
let item = Repo {
name,
program_lang,
topics,
};

println!("{:?}", item);
}

// 3. Extract By Derived Macros
println!("\n3. Extract By Derived Macros");

let items = Repo::from_html(html)?;
items.iter().for_each(|item| println!("{:?}", item));

Ok(())
}
26 changes: 26 additions & 0 deletions examples/json.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,29 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use serde::Deserialize;

// Deserialization target mirroring the `owner` object of the GitHub
// search API response; every field is a direct copy of the JSON payload.
#[derive(Debug, Deserialize)]
struct Owner {
login: String,
id: i64,
node_id: String,
avatar_url: String,
gravatar_id: String,
url: String,
html_url: String,
followers_url: String,
following_url: String,
gists_url: String,
starred_url: String,
subscriptions_url: String,
organizations_url: String,
repos_url: String,
events_url: String,
received_events_url: String,
// `type` is a Rust keyword, so the field is named `_type` and mapped
// to the JSON key "type" via the serde alias (deserialize-only here).
#[serde(alias = "type")]
_type: String,
site_admin: bool,
}

#[tokio::main]
async fn main() {
Expand All @@ -19,10 +43,12 @@ pub async fn request() -> Result<()> {
let total_count_str = json.select_as_str("$.total_count")?;
let total_count_int: i32 = json.select_one("$.total_count")?;
let names: Vec<String> = json.select("$.items[*].full_name")?;
let owners: Vec<Owner> = json.select("$.items[*].owner")?;

println!("{}", total_count_str);
println!("{}", total_count_int);
println!("{}", names.join("\t"));
owners.iter().for_each(|o| println!("{:#?}", o));

Ok(())
}
59 changes: 42 additions & 17 deletions examples/xpath.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use reqwest_scraper::{FromXPath, ScraperResponse};

// Entry point for the XPath example: starts a Tokio runtime and runs
// the async `request` demo.
#[tokio::main]
async fn main() {
// Panics with "request error" on any failure — acceptable for an example binary.
request().await.expect("request error");
}

// One repository entry extracted via XPath expressions; the derived
// `from_xhtml` builds one `Repo` per node matched by the container path
// (NOTE(review): `FromXPath` is a project-local derive — field semantics
// inferred from its attributes).
#[derive(Debug, FromXPath)]
#[xpath(path = "//div[@id='user-repositories-list']/ul/li")]
struct Repo {
// Repository name text; `default` supplies "<unname>" when the node is missing.
#[xpath(path = ".//a[contains(@itemprop,'name')]/text()", default = "<unname>")]
name: String,

// Primary programming language, `None` when the node is absent.
#[xpath(path = ".//span[contains(@itemprop,'programmingLanguage')]/text()")]
program_lang: Option<String>,

// Topic labels attached to the repository (may be empty).
#[xpath(path = ".//div[contains(@class,'topics-row-container')]/a/text()")]
topics: Vec<String>,
}

async fn request() -> Result<()> {
let html = reqwest::get("https://github.com/holmofy")
.await?
Expand All @@ -18,7 +31,6 @@ async fn request() -> Result<()> {
.as_node()
.unwrap()
.text();
println!("{}", name);
assert_eq!(name.trim(), "holmofy");

// iterate elements
Expand All @@ -31,27 +43,40 @@ async fn request() -> Result<()> {
for item in select_result.into_iter() {
let attr = item.attr("aria-label").unwrap_or_else(|| "".into());
println!("{}", attr);
println!("{}", item.text());
println!("{}", item.text().trim());
}

// attribute extract
let select_result = html
.select("//ul[contains(@class,'vcard-details')]/li[contains(@class,'vcard-detail')]/@aria-label")?
.as_strs();
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.xpath()
.await?;

println!("{}", select_result.len());
select_result.into_iter().for_each(|s| println!("{}", s));
// 2. Select List Element
println!("\n2. Select List Element");
let select_result = html.select("//div[@id='user-repositories-list']/ul/li")?;

//
let select_result = html
.select("//ul[contains(@class,'vcard-details')]/li[contains(@class,'vcard-detail')]/@aria-label")?
.as_nodes();
for item in select_result.as_nodes() {
let name = item.findvalue(".//a[contains(@itemprop,'name')]/text()")?.unwrap_or("".into());

println!("{}", select_result.len());
let program_lang =
item.findvalue(".//span[contains(@itemprop,'programmingLanguage')]/text()")?;

let topics = item.findvalues(".//div[contains(@class,'topics-row-container')]/a/text()")?;

let item = Repo {
name,
program_lang: program_lang,
topics,
};

println!("{:?}", item);
}

// 3. Extract By Derived Macros
println!("\n3. Extract By Derived Macros");

select_result
.into_iter()
.for_each(|n| println!("{}", n.name()));
let items = Repo::from_xhtml(html)?;
items.iter().for_each(|item| println!("{:?}", item));

Ok(())
}
21 changes: 21 additions & 0 deletions reqwest-scraper-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Manifest for the proc-macro companion crate providing the
# `FromCssSelector` / `FromXPath` derive macros for `reqwest-scraper`.
[package]
name = "reqwest-scraper-macros"
version = "0.3.0"
edition = "2021"
description = "Web scraping integration with reqwest"
# NOTE(review): assumes a LICENSE file exists in this crate directory — verify.
license-file = "LICENSE"
repository = "https://github.com/holmofy/reqwest-scraper"

[lib]
name = "reqwest_scraper_macros"
# Marks this crate as a compiler plugin exporting procedural macros.
proc-macro = true

[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
syn = "2.0"
darling = "0.20.10"
# Presumably used to validate CSS selectors at macro-expansion time — confirm.
scraper = { version = "0.19", default-features = false }

[dev-dependencies]
reqwest-scraper = { version = "0.3.0", path = "../" }
Loading

0 comments on commit e365609

Please sign in to comment.