Skip to content

Commit

Permalink
Update Hacker News HTML text parsing logic (#99)
Browse files Browse the repository at this point in the history
## Changes

- update parsing logic to reflect the new HN Algolia API change regarding the use of `<p>` for paragraph breaks
- clean up parsing code
  • Loading branch information
aome510 authored Oct 21, 2023
1 parent d09b1d2 commit 9301c39
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 83 deletions.
4 changes: 1 addition & 3 deletions hackernews_tui/src/client/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ impl HNClient {
format!("get item (id={item_id}) using {request_url}")
);

// The item's text returned from HN official APIs may have `<p>` tags representing
// paragraph breaks. Convert `<p>` tags to newlines to make the text easier to read.
let text = decode_html(&item.text.unwrap_or_default()).replace("<p>", "\n\n");
let text = decode_html(&item.text.unwrap_or_default());

// Construct the shortened text to represent the page's title if not exist
let chars = text.replace('\n', " ").chars().collect::<Vec<_>>();
Expand Down
28 changes: 15 additions & 13 deletions hackernews_tui/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ pub struct VoteData {
}

#[derive(Debug, Clone)]
/// A HackerNews item which can be either a story or a comment.
/// A Hacker News item which can be either a story or a comment.
///
/// This struct is a shared representation between a story and
/// a comment for rendering the item's content.
/// This struct is a shared representation between a story and a comment
/// and is used to render their content.
pub struct HnItem {
pub id: u32,
pub level: usize,
Expand Down Expand Up @@ -107,19 +107,18 @@ impl From<Story> for HnItem {
),
]);

let mut story_text = story.content;
// parse story's HTML content
let result = parse_hn_html_text(story.content, Style::default(), 0);

let minimized_text = if story_text.is_empty() {
// construct a minimized text representing the collapsed story's content
let minimized_text = if result.content.source().is_empty() {
metadata.clone()
} else {
story_text = format!("\n{story_text}");

utils::combine_styled_strings([metadata.clone(), StyledString::plain("... (more)")])
};

let mut text = metadata;
let result = parse_hn_html_text(story_text, Style::default(), 0);
text.append(result.s);
let text =
utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);

HnItem {
id: story.id,
Expand All @@ -144,17 +143,20 @@ impl From<Comment> for HnItem {
),
]);

let mut text = utils::combine_styled_strings([metadata.clone(), StyledString::plain("\n")]);
// constructs a minimized text representing the collapsed comment's content
let minimized_text = utils::combine_styled_strings([
metadata,
metadata.clone(),
StyledString::styled(
format!("({} more)", comment.n_children + 1),
component_style.metadata,
),
]);

// parse the comment's content
let result = parse_hn_html_text(comment.content, Style::default(), 0);
text.append(result.s);

let text =
utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);

HnItem {
id: comment.id,
Expand Down
38 changes: 21 additions & 17 deletions hackernews_tui/src/parser/article.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::html::HTMLParsedResult;
use super::html::HTMLTextParsedResult;
use super::rcdom::{Handle, NodeData, RcDom};
use crate::parser::html::HTMLTableParsedResult;
use crate::prelude::*;
Expand Down Expand Up @@ -40,7 +40,7 @@ impl Article {
/// # Arguments:
/// * `max_width`: the maximum width of the parsed content. This is mostly used
/// to construct an HTML table using `comfy_table`.
pub fn parse(&self, max_width: usize) -> Result<HTMLParsedResult> {
pub fn parse(&self, max_width: usize) -> Result<HTMLTextParsedResult> {
debug!("parse article ({:?})", self);

// parse HTML content into DOM node(s)
Expand Down Expand Up @@ -90,21 +90,21 @@ impl Article {
base_link_id: usize,
mut style: Style,
mut args: ArticleParseArgs,
) -> (HTMLParsedResult, bool) {
) -> (HTMLTextParsedResult, bool) {
// TODO: handle parsing <ol> tags correctly

debug!(
"parse dom node: {:?}, style: {:?}, args: {:?}",
node, style, args
);

let mut result = HTMLParsedResult::default();
let mut result = HTMLTextParsedResult::default();
let mut suffix = StyledString::new();

let mut visit_block_element_cb = || {
if !args.is_first_element_in_block {
result.s.append_plain("\n\n");
result.s.append_styled(&args.prefix, style);
result.content.append_plain("\n\n");
result.content.append_styled(&args.prefix, style);
}
args.is_first_element_in_block = true;
};
Expand All @@ -128,7 +128,7 @@ impl Article {

has_non_ws_text |= !text.trim().is_empty();

result.s.append_styled(text, style);
result.content.append_styled(text, style);
}
NodeData::Element {
ref name,
Expand All @@ -151,7 +151,9 @@ impl Article {
style = style.combine(component_style.header);
}
expanded_name!(html "br") => {
result.s.append_styled(format!("\n{}", args.prefix), style);
result
.content
.append_styled(format!("\n{}", args.prefix), style);
}
expanded_name!(html "p") => visit_block_element_cb(),
expanded_name!(html "code") => {
Expand All @@ -169,15 +171,15 @@ impl Article {

style = style.combine(component_style.multiline_code_block);

result.s.append_styled(" ", style);
result.content.append_styled(" ", style);
}
expanded_name!(html "blockquote") => {
visit_block_element_cb();

args.prefix = format!("{}▎ ", args.prefix);
style = style.combine(component_style.quote);

result.s.append_styled("▎ ", style);
result.content.append_styled("▎ ", style);
}
expanded_name!(html "table") => {
let mut table_result = HTMLTableParsedResult::default();
Expand Down Expand Up @@ -211,7 +213,7 @@ impl Article {
table.add_row(row.into_iter().map(|c| c.source().to_owned()));
}

result.s.append_styled(format!("\n\n{table}"), style);
result.content.append_styled(format!("\n\n{table}"), style);

return (result, true);
}
Expand All @@ -225,7 +227,7 @@ impl Article {
args.is_first_element_in_block = true;

result
.s
.content
.append_styled(format!("\n{}• ", args.prefix), style);
}
expanded_name!(html "img") => {
Expand All @@ -240,10 +242,12 @@ impl Article {
};

if !args.is_first_element_in_block {
result.s.append_plain("\n\n");
result.content.append_plain("\n\n");
}
result.s.append_styled(&img_desc, style);
result.s.append_styled(" (image)", component_style.metadata);
result.content.append_styled(&img_desc, style);
result
.content
.append_styled(" (image)", component_style.metadata);
}
expanded_name!(html "a") => {
// find `href` attribute of an <a> tag
Expand Down Expand Up @@ -291,7 +295,7 @@ impl Article {
}
});

result.s.append(suffix);
result.content.append(suffix);
(result, has_non_ws_text)
}

Expand Down Expand Up @@ -331,7 +335,7 @@ impl Article {
);

result.links.append(&mut child_result.links);
s.append(child_result.s);
s.append(child_result.content);
});

if !is_header {
Expand Down
Loading

0 comments on commit 9301c39

Please sign in to comment.