Merge pull request #21866 from Yoast/html-parser/paragraph-length

Convert Sentence length and paragraph length to use HTML parser and enable AI button for both assessments
Yoast · Jan 6, 2025 · 88fb0a1
2 parents 870b6c8 + 9a74b44
commit 88fb0a1
Show file tree

Hide file tree

Showing 30 changed files with 775 additions and 649 deletions.
diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.html b/packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.html
@@ -1,10 +1,3 @@
-<!DOCTYPE html>
-<html lang="el">
-<head>
-	<meta charset="UTF-8">
-	<title>Ελληνική γλώσσα - Βικιπαίδεια</title>
-</head>
-<body>
 <p>Η <a href="/wiki/%CE%A6%CF%89%CE%BD%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1" title="Φωνολογία">φωνολογία</a>, η <a href="/wiki/%CE%9C%CE%BF%CF%81%CF%86%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1_(%CE%B3%CE%BB%CF%89%CF%83%CF%83%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1)" title="Μορφολογία (γλωσσολογία)">μορφολογία</a>, η <a href="/wiki/%CE%A3%CF%8D%CE%BD%CF%84%CE%B1%CE%BE%CE%B7_(%CE%B3%CE%BB%CF%89%CF%83%CF%83%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1)" title="Σύνταξη (γλωσσολογία)">σύνταξη</a> και το <a href="/wiki/%CE%9B%CE%B5%CE%BE%CE%B9%CE%BB%CF%8C%CE%B3%CE%B9%CE%BF" title="Λεξιλόγιο">λεξιλόγιο</a> της γλώσσας δείχνουν τόσο συντηρητικά όσο και καινοτόμα στοιχεία σε ολόκληρη την ιστορική πορεία της γλώσσας από την αρχαία έως τη σύγχρονη περίοδο. Η διαίρεση σε συμβατικές περιόδους είναι σχετικά αυθαίρετη, ειδικά επειδή σε όλες τις περιόδους ύπαρξης της η αρχαία ελληνική έχει απολαύσει υψηλό κύρος και οι εγγράμματοι άνθρωποι χρησιμοποιούσαν πολλά δάνεια από τα αρχαία ελληνικά.
 </p>
 <h3><span id=".CE.A6.CF.89.CE.BD.CE.BF.CE.BB.CE.BF.CE.B3.CE.AF.CE.B1"></span><span class="mw-headline" id="Φωνολογία">Φωνολογία</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=%CE%95%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B3%CE%BB%CF%8E%CF%83%CF%83%CE%B1&amp;veaction=edit&amp;section=8" class="mw-editsection-visualeditor" title="Επεξεργασία ενότητας: Φωνολογία">Επεξεργασία</a><span class="mw-editsection-divider"> | </span><a href="/w/index.php?title=%CE%95%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B3%CE%BB%CF%8E%CF%83%CF%83%CE%B1&amp;action=edit&amp;section=8" title="Επεξεργασία ενότητας: Φωνολογία">επεξεργασία κώδικα</a><span class="mw-editsection-bracket">]</span></span></h3>
@@ -173,7 +166,5 @@ <h3><span id=".CE.95.CE.BB.CE.BB.CE.B7.CE.BD.CE.B9.CE.BA.CF.8C_.CE.B1.CE.BB.CF.8
 <p>Τα ελληνικά γράφονται στο ελληνικό αλφάβητο από τον 9ο αιώνα π.Χ. περίπου. Δημιουργήθηκε με την τροποποίηση του <a href="/wiki/%CE%A6%CE%BF%CE%B9%CE%BD%CE%B9%CE%BA%CE%B9%CE%BA%CF%8C_%CE%B1%CE%BB%CF%86%CE%AC%CE%B2%CE%B7%CF%84%CE%BF" title="Φοινικικό αλφάβητο">φοινικικού αλφαβήτου</a>, με την καινοτομία της υιοθέτησης ορισμένων νέων γραμμάτων για την γραφή των φωνηέντων. Η παραλλαγή του αλφαβήτου που χρησιμοποιείται σήμερα είναι ουσιαστικά η ύστερη <a href="/wiki/%CE%99%CF%89%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B4%CE%B9%CE%AC%CE%BB%CE%B5%CE%BA%CF%84%CE%BF%CF%82" title="Ιωνική διάλεκτος">Ιωνική</a> παραλλαγή, η οποία εισήχθη για την γραφή της <a href="/wiki/%CE%91%CF%84%CF%84%CE%B9%CE%BA%CE%AE_%CE%B4%CE%B9%CE%AC%CE%BB%CE%B5%CE%BA%CF%84%CE%BF%CF%82" title="Αττική διάλεκτος">αττικής διαλέκτου</a> το 403 π.Χ. Στην κλασική ελληνική, όπως και στην κλασική λατινική, υπήρχαν μόνο κεφαλαία γράμματα. Τα πεζά ελληνικά γράμματα αναπτύχθηκαν πολύ αργότερα από τους μεσαιωνικούς γραμματείς για να επιτρέψουν ένα ταχύτερο, πιο βολικό τρόπο γραφής με τη χρήση <a href="/wiki/%CE%9C%CE%B5%CE%BB%CE%AC%CE%BD%CE%B7" title="Μελάνη">μελανιού</a> και πένας.
 </p><p>Το ελληνικό αλφάβητο αποτελείται από 24 γράμματα, το καθένα με κεφαλαία και πεζά γράμματα. Το <a href="/wiki/%CE%A3%CE%AF%CE%B3%CE%BC%CE%B1" title="Σίγμα">σίγμα</a> έχει μια πρόσθετη πεζή μορφή (ς) που χρησιμοποιείται στο τέλος μιας λέξης:
 </p>
-</body>
-</html>
 
 <!-- "Ελληνική γλώσσα" by Wikipedia (EL) is licensed under CC-BY-SA 2.0 (https://creativecommons.org/licenses/by-sa/2.0/) -->
diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.js
@@ -66,7 +66,7 @@ const expectedResults = {
 	textLength: {
 		isApplicable: true,
 		score: 9,
-		resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 2913 words. Good job!",
+		resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 2910 words. Good job!",
 	},
 	externalLinks: {
 		isApplicable: true,
@@ -117,25 +117,25 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 3,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 3 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,
-		score: 6,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 27.8% of the sentences contain more than 20 words, " +
+		score: 3,
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 30.9% of the sentences contain more than 20 words, " +
 			"which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
 	},
 	textTransitionWords: {
 		isApplicable: true,
-		score: 3,
-		resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 19.6% of the sentences contain" +
+		score: 6,
+		resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 20.2% of the sentences contain" +
 			" transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
 	},
 	passiveVoice: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 25.8% of the sentences contain passive voice, " +
+		resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 26.6% of the sentences contain passive voice, " +
 			"which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>" +
 			"Try to use their active counterparts</a>.",
 	},

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/en/englishPaperForPerformanceTest.js b/packages/yoastseo/spec/fullTextTests/testTexts/en/englishPaperForPerformanceTest.js
@@ -115,8 +115,8 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 6,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/es/spanishPaperForPerformanceTest.js b/packages/yoastseo/spec/fullTextTests/testTexts/es/spanishPaperForPerformanceTest.js
@@ -115,8 +115,8 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 6,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/fa/farsiPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/fa/farsiPaper.js
@@ -118,8 +118,8 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 3,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 3 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/fr/frenchPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/fr/frenchPaper.js
@@ -109,13 +109,13 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 3,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 2 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 40.8% of the sentences contain more" +
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 39.6% of the sentences contain more" +
 			" than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>" +
 			"Try to shorten the sentences</a>.",
 	},

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/he/hebrewPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/he/hebrewPaper.js
@@ -120,7 +120,7 @@ const expectedResults = {
 	textSentenceLength: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 66.7% of the sentences contain more than 15 words," +
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 78.9% of the sentences contain more than 15 words," +
 			" which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
 	},
 	textTransitionWords: {

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
@@ -124,7 +124,7 @@ const expectedResults = {
 	textSentenceLength: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 50.8% of the sentences contain more than 40 characters, " +
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 51.6% of the sentences contain more than 40 characters, " +
 			"which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
 	},
 	textTransitionWords: {

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaper.js
@@ -109,7 +109,7 @@ const expectedResults = {
 	textSentenceLength: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 45.8% of the sentences contain more than 20 words, which is more than the recommended maximum of 15%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 46.6% of the sentences contain more than 20 words, which is more than the recommended maximum of 15%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
 	},
 	textTransitionWords: {
 		isApplicable: true,

diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaperForPerformanceTest.js b/packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaperForPerformanceTest.js
@@ -109,8 +109,8 @@ const expectedResults = {
 	},
 	textParagraphTooLong: {
 		isApplicable: true,
-		score: 9,
-		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
+		score: 6,
+		resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
 	},
 	textSentenceLength: {
 		isApplicable: true,

diff --git a/packages/yoastseo/spec/languageProcessing/helpers/html/matchParagraphsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/html/matchParagraphsSpec.js
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
@@ -1,41 +1,81 @@
 import sentencesLength from "../../../../src/languageProcessing/helpers/sentence/sentencesLength";
+import getSentencesFromTree from "../../../../src/languageProcessing/helpers/sentence/getSentencesFromTree";
 import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
 import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
 import Paper from "../../../../src/values/Paper";
+import buildTree from "../../../specHelpers/parse/buildTree";
 
 describe( "A test to count sentence lengths.", function() {
 	it( "should not return a length for an empty sentence", function() {
-		const sentences = [ "", "A sentence" ];
-		const mockResearcher = new EnglishResearcher( new Paper( "" ) );
+		const mockPaper = new Paper( "<p></p><p>A sentence</p>" );
+		const mockResearcher = new EnglishResearcher( mockPaper );
+		buildTree( mockPaper, mockResearcher );
 
-		const lengths = sentencesLength( sentences, mockResearcher );
+		const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
 
-		expect( lengths ).toEqual( [
-			{ sentence: "A sentence", sentenceLength: 2 },
-		] );
+		expect( sentenceLengths.length ).toEqual( 1 );
+		expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 2 );
+		expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A sentence" );
 	} );
 
 	it( "should return the sentences and their length (the HTML tags should not be counted if present)", function() {
-		const sentences = [ "A <strong>good</strong> text", "this is a <span style='color: blue;'> textstring </span>" ];
-		const mockResearcher = new EnglishResearcher( new Paper( "" ) );
+		const mockPaper = new Paper( "<p>A <strong>good</strong> text</p>" +
+			"<p>this is a <span style='color: blue;'>string</span></p>" );
+		const mockResearcher = new EnglishResearcher( mockPaper );
+		buildTree( mockPaper, mockResearcher );
 
-		const lengths = sentencesLength( sentences, mockResearcher );
+		const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
 
-		expect( lengths ).toEqual( [
-			{ sentence: "A <strong>good</strong> text", sentenceLength: 3 },
-			{ sentence: "this is a <span style='color: blue;'> textstring </span>", sentenceLength: 4 },
-		] );
+		expect( sentenceLengths.length ).toEqual( 2 );
+		expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 3 );
+		expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A good text" );
+		expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 4 );
+		expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "this is a string" );
+	} );
+
+	it( "should return the correct length for sentences containing hyphens", function() {
+		const mockPaper = new Paper(
+			"<p>My know-it-all mother-in-law made a state-of-the-art U-turn.</p>" +
+			"<p>Her ex-husband found that low-key amazing.</p>" );
+		const mockResearcher = new EnglishResearcher( mockPaper );
+		buildTree( mockPaper, mockResearcher );
+
+		const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
+
+		expect( sentenceLengths.length ).toEqual( 2 );
+		expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 7 );
+		expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 6 );
+	} );
+
+	it( "should return the correct length for sentences containing leading and trailing spaces including the first and last token that is not spaces", function() {
+		const mockPaper = new Paper(
+			"<p> The first sentence.</p><p>The second sentence. </p>" );
+		const mockResearcher = new EnglishResearcher( mockPaper );
+		buildTree( mockPaper, mockResearcher );
+
+		const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
+
+		expect( sentenceLengths.length ).toEqual( 2 );
+		expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 3 );
+		expect( sentenceLengths[ 0 ].firstToken ).toEqual( { sourceCodeRange: { endOffset: 7, startOffset: 4 }, text: "The" } );
+		expect( sentenceLengths[ 0 ].lastToken ).toEqual( { sourceCodeRange: { endOffset: 23, startOffset: 22 }, text: "." } );
+		expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 3 );
+		expect( sentenceLengths[ 1 ].firstToken ).toEqual( { sourceCodeRange: { endOffset: 33, startOffset: 30 }, text: "The" } );
+		expect( sentenceLengths[ 1 ].lastToken ).toEqual( { sourceCodeRange: { endOffset: 50, startOffset: 49 }, text: "." } );
 	} );
 
 	it( "should return the sentences and their length for Japanese (so counting characters)", function() {
-		const sentences = [ "自然おのずから存在しているもの", "歩くさわやかな森 <span style='color: red;'> 自然 </span>" ];
-		const mockJapaneseResearcher = new JapaneseResearcher( new Paper( "" ) );
+		const mockPaper = new Paper( "<p>自然おのずから存在しているもの</p>" +
+			"<p>歩くさわやかな森 <span style='color: red;'> 自然 </span></p>" );
+		const mockJapaneseResearcher = new JapaneseResearcher( mockPaper );
+		buildTree( mockPaper, mockJapaneseResearcher );
 
-		const lengths = sentencesLength( sentences, mockJapaneseResearcher );
+		const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockJapaneseResearcher );
 
-		expect( lengths ).toEqual( [
-			{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
-			{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
-		] );
+		expect( sentenceLengths.length ).toEqual( 2 );
+		expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 15 );
+		expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "自然おのずから存在しているもの" );
+		expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 10 );
+		expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "歩くさわやかな森  自然 " );
 	} );
 } );