From 32cb38f46f2b283d9ffea34c2262f7be41c6c0f6 Mon Sep 17 00:00:00 2001 From: Jonas Kalderstam Date: Wed, 18 Dec 2024 21:21:41 +0100 Subject: [PATCH] fixed missing spaces inside some tags Signed-off-by: Jonas Kalderstam --- .../feeder/model/html/HtmlLinearizer.kt | 22 ++++--------------- .../feeder/model/html/HtmlLinearizerTest.kt | 17 ++++++++++++++ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/app/src/main/java/com/nononsenseapps/feeder/model/html/HtmlLinearizer.kt b/app/src/main/java/com/nononsenseapps/feeder/model/html/HtmlLinearizer.kt index 87f643f506..d3b15a97a0 100644 --- a/app/src/main/java/com/nononsenseapps/feeder/model/html/HtmlLinearizer.kt +++ b/app/src/main/java/com/nononsenseapps/feeder/model/html/HtmlLinearizer.kt @@ -69,7 +69,6 @@ class HtmlLinearizer { if (blockStyle.shouldSoftWrap) { node.appendCorrectlyNormalizedWhiteSpace( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } else { append(node.wholeText) @@ -113,7 +112,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH1) { element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -124,7 +122,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH2) { element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -135,7 +132,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH3) { element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -146,7 +142,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH4) { element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -157,7 +152,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH5) { 
element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -168,7 +162,6 @@ class HtmlLinearizer { withLinearTextAnnotation(LinearTextAnnotationH6) { element.appendCorrectlyNormalizedWhiteSpaceRecursively( linearTextBuilder, - stripLeading = linearTextBuilder.endsWithWhitespace, ) } } @@ -925,13 +918,10 @@ class HtmlLinearizer { * Can't use JSoup's text() method because that strips invisible characters * such as ZWNJ which are crucial for several languages. */ -fun TextNode.appendCorrectlyNormalizedWhiteSpace( - builder: LinearTextBuilder, - stripLeading: Boolean, -) { +fun TextNode.appendCorrectlyNormalizedWhiteSpace(builder: LinearTextBuilder) { wholeText.asUTF8Sequence() .dropWhile { - stripLeading && isCollapsableWhiteSpace(it) + builder.endsWithWhitespace && isCollapsableWhiteSpace(it) } .fold(false) { lastWasWhite, char -> if (isCollapsableWhiteSpace(char)) { @@ -946,17 +936,13 @@ fun TextNode.appendCorrectlyNormalizedWhiteSpace( } } -fun Element.appendCorrectlyNormalizedWhiteSpaceRecursively( - builder: LinearTextBuilder, - stripLeading: Boolean, -) { +fun Element.appendCorrectlyNormalizedWhiteSpaceRecursively(builder: LinearTextBuilder) { for (child in childNodes()) { when (child) { - is TextNode -> child.appendCorrectlyNormalizedWhiteSpace(builder, stripLeading) + is TextNode -> child.appendCorrectlyNormalizedWhiteSpace(builder) is Element -> child.appendCorrectlyNormalizedWhiteSpaceRecursively( builder, - stripLeading, ) } } diff --git a/app/src/test/java/com/nononsenseapps/feeder/model/html/HtmlLinearizerTest.kt b/app/src/test/java/com/nononsenseapps/feeder/model/html/HtmlLinearizerTest.kt index 5ff348bfe9..febae02256 100644 --- a/app/src/test/java/com/nononsenseapps/feeder/model/html/HtmlLinearizerTest.kt +++ b/app/src/test/java/com/nononsenseapps/feeder/model/html/HtmlLinearizerTest.kt @@ -36,6 +36,23 @@ class HtmlLinearizerTest { assertEquals(LinearText("Hello, world!", 
LinearTextBlockStyle.TEXT), result[0]) } + @Test + fun `spaces inside headers are kept`() { + val html = + """ +

<h2><a href="https://example.com">Link</a> <small>small</small></h2>

+ """.trimIndent() + val baseUrl = "https://example.com" + + val result = linearizer.linearize(html, baseUrl).elements + + assertEquals(1, result.size) + assertEquals( + LinearText("Link small", LinearTextBlockStyle.TEXT, LinearTextAnnotation(LinearTextAnnotationH2, 0, 9)), + result[0], + ) + } + @Test fun `should return annotations with bold, italic, and underline`() { val html = "Hello, world!"