Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Converts the sentence length assessment to use the HTML parser #21820

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions packages/yoastseo/spec/fullTextTests/runFullTextTests.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import { getLanguagesWithWordComplexity } from "../../src/helpers";

// Import test papers.
import testPapers from "./testTexts";
import fs from "fs";

testPapers.forEach( function( testPaper ) {
// eslint-disable-next-line max-statements
Expand All @@ -65,6 +66,34 @@ testPapers.forEach( function( testPaper ) {

buildTree( paper, researcher );

/**
 * Writes the given content to a file inside the temporary directory `tmp/`.
 *
 * The directory is relative to the current working directory
 * (i.e., packages/yoastseo/tmp/ if this function is called from packages/yoastseo/)
 * and is created first if it does not exist yet.
 *
 * @param {string} filename The name of the file.
 * @param {string} content The content of the file.
 * @returns {void}
 */
const writeToTempFile = ( filename, content ) => {
	const dir = "tmp/";

	// With `recursive: true`, mkdirSync does not throw when the directory already exists.
	// This removes the need for a separate existence check, and with it the window in which
	// the directory could be created between the check and the mkdir call.
	fs.mkdirSync( dir, { recursive: true } );

	// Writes the data to this temporary directory.
	fs.writeFileSync( dir + filename, content );
};

// Collects one ;-separated row per sentence: the first word of the sentence, followed by its length.
const sentences = researcher.getResearch( "countSentencesFromText" );
const resultLines = sentences.map( ( { sentence, sentenceLength } ) => {
	// NOTE(review): the sentence-length specs read the text through `sentence.text`, so the research
	// presumably returns sentence objects rather than strings; a plain string is still accepted here.
	const sentenceText = typeof sentence === "string" ? sentence : sentence.text;
	return sentenceText.trimStart().split( " " )[ 0 ] + ";" + sentenceLength;
} );

// Set doExport to true to write the results to a temporary file.
// It stays false by default so that a regular test run does not write files to the working directory.
const doExport = false;
if ( doExport ) {
	writeToTempFile( testPaper.name + ".csv", resultLines.join( "\n" ) );
}

const expectedResults = testPaper.expectedResults;

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,41 +1,64 @@
import sentencesLength from "../../../../src/languageProcessing/helpers/sentence/sentencesLength";
import getSentencesFromTree from "../../../../src/languageProcessing/helpers/sentence/getSentencesFromTree";
import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
import Paper from "../../../../src/values/Paper";
import buildTree from "../../../specHelpers/parse/buildTree";

describe( "A test to count sentence lengths.", function() {
it( "should not return a length for an empty sentence", function() {
const sentences = [ "", "A sentence" ];
const mockResearcher = new EnglishResearcher( new Paper( "" ) );
const mockPaper = new Paper( "<p></p><p>A sentence</p>" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const lengths = sentencesLength( sentences, mockResearcher );
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );

expect( lengths ).toEqual( [
{ sentence: "A sentence", sentenceLength: 2 },
] );
expect( sentenceLengths.length ).toEqual( 1 );
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 2 );
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A sentence" );
} );

it( "should return the sentences and their length (the HTML tags should not be counted if present)", function() {
const sentences = [ "A <strong>good</strong> text", "this is a <span style='color: blue;'> textstring </span>" ];
const mockResearcher = new EnglishResearcher( new Paper( "" ) );
const mockPaper = new Paper( "<p>A <strong>good</strong> text</p>" +
"<p>this is a <span style='color: blue;'>string</span></p>" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const lengths = sentencesLength( sentences, mockResearcher );
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );

expect( lengths ).toEqual( [
{ sentence: "A <strong>good</strong> text", sentenceLength: 3 },
{ sentence: "this is a <span style='color: blue;'> textstring </span>", sentenceLength: 4 },
] );
expect( sentenceLengths.length ).toEqual( 2 );
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 3 );
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A good text" );
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 4 );
expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "this is a string" );
} );

it( "should return the correct length for sentences containing hyphens", function() {
const mockPaper = new Paper(
"<p>My know-it-all mother-in-law made a state-of-the-art U-turn.</p>" +
"<p>Her ex-husband found that low-key amazing.</p>" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );

expect( sentenceLengths.length ).toEqual( 2 );
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 7 );
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 6 );
} );

it( "should return the sentences and their length for Japanese (so counting characters)", function() {
const sentences = [ "自然おのずから存在しているもの", "歩くさわやかな森 <span style='color: red;'> 自然 </span>" ];
const mockJapaneseResearcher = new JapaneseResearcher( new Paper( "" ) );
const mockPaper = new Paper( "<p>自然おのずから存在しているもの</p>" +
"<p>歩くさわやかな森 <span style='color: red;'> 自然 </span></p>" );
const mockJapaneseResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockJapaneseResearcher );

const lengths = sentencesLength( sentences, mockJapaneseResearcher );
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockJapaneseResearcher );

expect( lengths ).toEqual( [
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
] );
expect( sentenceLengths.length ).toEqual( 2 );
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 15 );
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "自然おのずから存在しているもの" );
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 10 );
expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "歩くさわやかな森 自然 " );
} );
} );
Original file line number Diff line number Diff line change
@@ -1,68 +1,122 @@
/* eslint-disable capitalized-comments, spaced-comment */
import getSentences from "../../../src/languageProcessing/researches/countSentencesFromText.js";
import Paper from "../../../src/values/Paper";
import EnglishResearcher from "../../../src/languageProcessing/languages/en/Researcher";
import JapaneseResearcher from "../../../src/languageProcessing/languages/ja/Researcher";
import buildTree from "../../specHelpers/parse/buildTree";

describe( "counts words in sentences from text", function() {
let paper;

it( "returns sentences with question mark", function() {
paper = new Paper( "Hello. How are you? Bye" );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 1 );
expect( getSentences( paper, new EnglishResearcher() )[ 1 ].sentenceLength ).toBe( 3 );
expect( getSentences( paper, new EnglishResearcher() )[ 2 ].sentenceLength ).toBe( 1 );
const mockPaper = new Paper( "Hello. How are you? Bye" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 1 );
expect( sentences[ 1 ].sentenceLength ).toBe( 3 );
expect( sentences[ 2 ].sentenceLength ).toBe( 1 );
} );
it( "returns sentences with exclamation mark", function() {
paper = new Paper( "Hello. How are you! Bye" );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 1 );
expect( getSentences( paper, new EnglishResearcher() )[ 1 ].sentenceLength ).toBe( 3 );
expect( getSentences( paper, new EnglishResearcher() )[ 2 ].sentenceLength ).toBe( 1 );
const mockPaper = new Paper( "Hello. How are you! Bye" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 1 );
expect( sentences[ 1 ].sentenceLength ).toBe( 3 );
expect( sentences[ 2 ].sentenceLength ).toBe( 1 );
} );
it( "returns sentences with many spaces", function() {
paper = new Paper( "Hello. How are you! Bye" );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 1 );
expect( getSentences( paper, new EnglishResearcher() )[ 1 ].sentenceLength ).toBe( 3 );
expect( getSentences( paper, new EnglishResearcher() )[ 2 ].sentenceLength ).toBe( 1 );
const mockPaper = new Paper( "Hello. How are you! Bye" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 1 );
expect( sentences[ 1 ].sentenceLength ).toBe( 3 );
expect( sentences[ 2 ].sentenceLength ).toBe( 1 );
} );
it( "returns sentences with html-tags, should only count words", function() {
paper = new Paper( "This is a text <img src='image.jpg' alt='a bunch of words in an alt-tag' />" );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 4 );
const mockPaper = new Paper( "This is a text <img src='https://example.com/image.jpg' alt='a bunch of words in an alt-tag' />" );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 4 );
} );
it( "returns sentences with html-tags, should only count words", function() {
paper = new Paper( "This is a text <img src='http://domain.com/image.jpg' alt='a bunch of words in an alt-tag' />. Another sentence." );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 4 );
expect( getSentences( paper, new EnglishResearcher() )[ 1 ].sentenceLength ).toBe( 2 );
const mockPaper = new Paper( "This is a text <img src='https://example.com/image.jpg' alt='a bunch of words in an alt-tag' />. Another sentence." );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 4 );
expect( sentences[ 1 ].sentenceLength ).toBe( 2 );
} );
it( "should not count sentences inside elements we want to exclude from the analysis", function() {
paper = new Paper( "This is a text. <code>With some code.</code>. Another sentence." );
expect( getSentences( paper, new EnglishResearcher() )[ 0 ].sentenceLength ).toBe( 4 );
expect( getSentences( paper, new EnglishResearcher() )[ 1 ].sentenceLength ).toBe( 2 );
const mockPaper = new Paper( "This is a text. <code>With some code.</code>. Another sentence." );
const mockResearcher = new EnglishResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 4 );
expect( sentences[ 1 ].sentenceLength ).toBe( 2 );
} );
/*it( "returns sentences with question mark in Japanese", function() {
paper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
expect( getSentences( paper, new JapaneseResearcher() )[ 0 ].sentenceLength ).toBe( 8 );
expect( getSentences( paper, new JapaneseResearcher() )[ 1 ].sentenceLength ).toBe( 7 );
expect( getSentences( paper, new JapaneseResearcher() )[ 2 ].sentenceLength ).toBe( 5 );
it( "returns sentences with question mark in Japanese", function() {
const mockPaper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with exclamation mark", function() {
paper = new Paper( "雨が降っている. いつ終わるの!さようなら" );
expect( getSentences( paper, new JapaneseResearcher() )[ 0 ].sentenceLength ).toBe( 8 );
expect( getSentences( paper, new JapaneseResearcher() )[ 1 ].sentenceLength ).toBe( 7 );
expect( getSentences( paper, new JapaneseResearcher() )[ 2 ].sentenceLength ).toBe( 5 );
const mockPaper = new Paper( "雨が降っている. いつ終わるの!さようなら" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with many spaces", function() {
paper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
expect( getSentences( paper, new JapaneseResearcher() )[ 0 ].sentenceLength ).toBe( 8 );
expect( getSentences( paper, new JapaneseResearcher() )[ 1 ].sentenceLength ).toBe( 7 );
expect( getSentences( paper, new JapaneseResearcher() )[ 2 ].sentenceLength ).toBe( 5 );
const mockPaper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with html-tags, should count characters in Japanese", function() {
paper = new Paper( "いつ終わるの <img src='image.jpg' alt='自分を大事にして下さい' />" );
expect( getSentences( paper, new JapaneseResearcher() )[ 0 ].sentenceLength ).toBe( 6 );
const mockPaper = new Paper( "いつ終わるの <img src='image.jpg' alt='自分を大事にして下さい' />" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 6 );
} );
it( "returns sentences with html-tags, should count characters in Japanese", function() {
paper = new Paper( "いつ終わるの <img src='http://domain.com/image.jpg' alt='自分を大事にして下さい' />. 春がやってきます。" );
expect( getSentences( paper, new JapaneseResearcher() )[ 0 ].sentenceLength ).toBe( 7 );
expect( getSentences( paper, new JapaneseResearcher() )[ 1 ].sentenceLength ).toBe( 9 );
} );*/
const mockPaper = new Paper( "いつ終わるの <img src='http://domain.com/image.jpg' alt='自分を大事にして下さい' />. 春がやってきます。" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 7 );
expect( sentences[ 1 ].sentenceLength ).toBe( 9 );
} );
} );
Loading
Loading