WIP: Start hand-written PHP combinator-based parser

The goal of this alternative parser is to be faster than the one generated directly from the PEG grammar. It must not take multiple seconds to parse a document, no matter how big. This parser is currently designed to produce the blocks and their attributes, leaving all inside content raw. It is based on the seminal paper "Monadic Parser Combinators" by Hutton and Meijer, University of Nottingham, 1996. I have had a hard time finding literature detailing the performance of such an approach in PHP. Although this approach builds up many user-defined function calls, it avoids creating closures by passing around descriptions of parsers in the PHP callback style. That is, instead of passing partial functions, we pass a callable string and an array of partial arguments. The input is appended as the final argument before calling the given parser. A further speedup taken in this approach involves defining parse rules "on the branches" so as to reduce backtracking. That is, if two rules share a prefix then we can parse the prefix once and descend into a `first_of` branch instead of having a top-level `first_of` branch and duplicating the prefix. The latter approach has been taken in the formal grammar spec to make the rules easier to read. RegExp patterns have also been duplicated where possible in order to combine rules. That is, although we could ignore whitespace and then ignore a closing block comment, instead I'm combining them into a single RegExp pattern so that we benefit from the faster performance of the RegExp engine and from making only one call instead of two. Each call passes around its own state on the parse "stack."
WordPress · Jul 10, 2017 · cbe58e2 · cbe58e2
1 parent 46ae03c
commit cbe58e2
Show file tree

Hide file tree

Showing 3 changed files with 391 additions and 0 deletions.
diff --git a/gutenberg.php b/gutenberg.php
@@ -20,6 +20,7 @@
 	require_once dirname( __FILE__ ) . '/lib/client-assets.php';
 	require_once dirname( __FILE__ ) . '/lib/i18n.php';
 	require_once dirname( __FILE__ ) . '/lib/parser.php';
+	require_once dirname( __FILE__ ) . '/lib/block-parser.php';
 	require_once dirname( __FILE__ ) . '/lib/register.php';
 
 	// Register server-side code for individual blocks.

diff --git a/lib/block-parser.php b/lib/block-parser.php
@@ -0,0 +1,205 @@
+<?php
+
+if (!class_exists('Gutenberg_Block_Parser_State', false)):
+
+/**
+ * Mutable state container for a single parse.
+ *
+ * Nothing in this file reads or writes it yet; it is a placeholder for
+ * the state the parser will eventually thread through its calls.
+ */
+class Gutenberg_Block_Parser_State {
+	/**
+	 * Presumably the stack of blocks that are currently open - confirm
+	 * once the parser starts using it. Starts out unset (null).
+	 *
+	 * @var mixed
+	 */
+	public $block_stack = null;
+}
+
+endif;
+
+if (!class_exists('Gutenberg_Block_Parser', false)):
+
+/**
+ * Hand-written, combinator-based parser for block comment delimiters.
+ *
+ * Parsers are never passed around as closures. A parser is described by
+ * a callable plus an array of leading arguments; the combinators append
+ * the input string as the last argument and invoke the callable through
+ * call_user_func_array().
+ *
+ * Callables are spelled with the full class name rather than `self::`
+ * because `'self::method'` callable strings are deprecated as of PHP 8.2.
+ */
+class Gutenberg_Block_Parser {
+	// Every pattern is anchored with ^ because match() always consumes
+	// from the start of the remaining input.
+	const BLOCK_COMMENT_OPEN  = '(^<!--)';
+	const BLOCK_COMMENT_CLOSE = '(^/?-->)';
+	const BLOCK_NAME          = '(^[[:alpha:]](?:[[:alnum:]]|/[[:alnum:]])*)i';
+	// The `s` modifier lets the attribute JSON span several lines; the
+	// body stops at the first "}" followed by whitespace and a comment closer.
+	const BLOCK_ATTRIBUTES    = '(^{(?:(?!}[ \t\r\n]+/?-->).)*})s';
+	const WS                  = '(^[ \t\r\n])';
+	const WSS                 = '(^[ \t\r\n]+)';
+
+	const MAX_RUNTIME         = 1; // give up after one second
+
+	/**
+	 * Entry point: parses a document within the MAX_RUNTIME budget.
+	 *
+	 * @param string $input Document to parse.
+	 * @return array Whatever proceed() yields, or an empty array when the
+	 *               input is empty or the time budget is exhausted.
+	 */
+	public function parse( $input ) {
+		$tic = microtime( true );
+
+		// trampoline for stack-safe recursion of the actual parser;
+		// proceed() currently finishes in one step, so the body returns
+		// on its first pass
+		while ( '' !== (string) $input && ( microtime( true ) - $tic ) < self::MAX_RUNTIME ) {
+			return $this->proceed( $input );
+		}
+
+		return array();
+	}
+
+	/**
+	 * Performs one step of the parse. Placeholder while the parser is a
+	 * work in progress: it succeeds with the value 'test' and consumes
+	 * nothing.
+	 *
+	 * @param string $input Remaining document.
+	 * @return array Result of succeed( 'test', $input ).
+	 */
+	public function proceed( $input ) {
+		return self::succeed( 'test', $input );
+	}
+
+	/**
+	 * Parses a void block such as `<!-- wp:core/void {"a":1} /-->`.
+	 *
+	 * The shared prefix (opener and block name) is parsed once and the
+	 * with/without-attributes alternatives branch afterwards, which
+	 * avoids re-parsing the prefix on backtrack.
+	 *
+	 * @param string $input Remaining document.
+	 * @return array array( block, remaining ) on success, empty array on failure.
+	 */
+	public static function block_void( $input ) {
+		$ignore   = __CLASS__ . '::ignore';
+		$match    = __CLASS__ . '::match';
+		$first_of = __CLASS__ . '::first_of';
+		$sequence = __CLASS__ . '::sequence';
+
+		$closer = array( $ignore, array( $match, array( '(^[ \t\r\n]+/-->)' ) ) );
+
+		$result = self::sequence( array(
+			array( $ignore, array( $match, array( '(^<!--[ \t\r\n]+wp:)' ) ) ),
+			array( $match,  array( self::BLOCK_NAME ) ),
+			array( $first_of, array( array(
+				$closer,
+				array( $sequence, array( array(
+					array( $ignore, array( $match, array( self::WSS ) ) ),
+					array( $match,  array( self::BLOCK_ATTRIBUTES ) ),
+					$closer
+				) ) )
+			) ) )
+		), $input );
+
+		if ( empty( $result ) ) {
+			return array();
+		}
+
+		list( $productions, $remaining ) = $result;
+
+		// $productions[0] holds the regex matches for the block name.
+		// $productions[1] only exists when the attributes branch was
+		// taken; it is the inner sequence's output, whose single entry
+		// holds the regex matches for the attribute JSON.
+		$blockName = $productions[0][0];
+		$raw_attrs = isset( $productions[1][0][0] ) ? $productions[1][0][0] : '';
+
+		$attrs = '' === $raw_attrs ? array() : json_decode( $raw_attrs, true );
+		if ( ! is_array( $attrs ) ) {
+			// json_decode() yields null for malformed JSON; keep attrs an array
+			$attrs = array();
+		}
+
+		return array( self::block( $blockName, $attrs, '' ), $remaining );
+	}
+
+	//-----------------------------------------
+	// Parser a :: String -> [ ( a, String ) ]
+	//
+	// A parser is a function which takes a string
+	// and returns a list of things and strings
+	//
+	// An empty list is a failed parse
+	//
+	// The polymorphic "a" will eventually be a block
+	//
+	// In practice the combinators below return a single
+	// array( value, remaining ) pair on success
+	//-----------------------------------------
+
+	/**
+	 * Parser that always succeeds with $value and consumes nothing.
+	 *
+	 * NOTE(review): this returns a list holding one pair, whereas the
+	 * other combinators return the bare pair - confirm which shape is
+	 * intended before composing this inside sequence() and friends.
+	 *
+	 * @param mixed  $value Value to produce.
+	 * @param string $input Remaining document.
+	 * @return array array( array( $value, $input ) ).
+	 */
+	public static function succeed( $value, $input ) {
+		return array( array( $value, $input ) );
+	}
+
+	/**
+	 * Parser that always fails.
+	 *
+	 * @param string $input Remaining document (unused).
+	 * @return array Always an empty array.
+	 */
+	public static function fail( $input ) {
+		return array();
+	}
+
+	/**
+	 * Runs a parser and throws its production away, keeping only the
+	 * fact that input was consumed.
+	 *
+	 * @param callable $parser      Parser to run.
+	 * @param array    $parser_args Leading arguments for the parser.
+	 * @param string   $input       Remaining document.
+	 * @return array array( array(), remaining ) on success, empty array on failure.
+	 */
+	public static function ignore( $parser, $parser_args, $input ) {
+		$result = call_user_func_array( $parser, array_merge( $parser_args, array( $input ) ) );
+
+		if ( empty( $result ) ) {
+			return array();
+		}
+
+		// the empty-array production is what sequence() filters out
+		return array( array(), $result[1] );
+	}
+
+	/**
+	 * Matches an exact string at the start of the input.
+	 *
+	 * @param string $value Text to expect.
+	 * @param string $input Remaining document.
+	 * @return array array( $value, remaining ) on success, empty array on failure.
+	 */
+	public static function literal( $value, $input ) {
+		$length = strlen( $value );
+
+		// strncmp() only inspects the prefix, whereas strpos() would scan
+		// the whole remaining document when there is no match
+		if ( 0 !== strncmp( $input, $value, $length ) ) {
+			return array();
+		}
+
+		// substr() returns false at end of input before PHP 8
+		return array( $value, (string) substr( $input, $length ) );
+	}
+
+	/**
+	 * Matches a regular expression at the start of the input.
+	 *
+	 * The pattern must be anchored with ^: the consumed length is taken
+	 * from the length of the full match, not from its offset.
+	 *
+	 * @param string $pattern PCRE pattern including delimiters.
+	 * @param string $input   Remaining document.
+	 * @return array array( matches, remaining ) on success, empty array on failure.
+	 */
+	public static function match( $pattern, $input ) {
+		$matches = array();
+
+		if ( ! preg_match( $pattern, $input, $matches ) ) {
+			return array();
+		}
+
+		// substr() returns false at end of input before PHP 8
+		return array( $matches, (string) substr( $input, strlen( $matches[ 0 ] ) ) );
+	}
+
+	/**
+	 * Runs a parser and transforms its production with $f.
+	 *
+	 * $f is only called when the parser succeeds, receives the production
+	 * as its single argument, and its return value becomes the new
+	 * production. The input consumed by the parser stays consumed.
+	 *
+	 * @param callable $f           Transformation applied to the production.
+	 * @param callable $parser      Parser to run.
+	 * @param array    $parser_args Leading arguments for the parser.
+	 * @param string   $input       Remaining document.
+	 * @return array array( $f( production ), remaining ) on success, empty array on failure.
+	 */
+	public static function map( $f, $parser, $parser_args, $input ) {
+		$result = call_user_func_array( $parser, array_merge( $parser_args, array( $input ) ) );
+
+		if ( empty( $result ) ) {
+			return array();
+		}
+
+		list( $value, $remaining ) = $result;
+
+		return array( call_user_func( $f, $value ), $remaining );
+	}
+
+	/**
+	 * Runs parsers one after another, each on what the previous one left.
+	 *
+	 * Productions equal to an empty array (what ignore() yields) are left
+	 * out of the output; every other production is kept, including falsy
+	 * ones such as the string '0'.
+	 *
+	 * @param array  $parsers_and_args List of array( parser, parser_args ).
+	 * @param string $input            Remaining document.
+	 * @return array array( productions, remaining ) when all parsers succeed,
+	 *               empty array as soon as one fails.
+	 */
+	public static function sequence( $parsers_and_args, $input ) {
+		$output = array();
+		$remaining = $input;
+
+		foreach ( $parsers_and_args as $parser_and_args ) {
+			list( $parser, $parser_args ) = $parser_and_args;
+
+			$result = call_user_func_array( $parser, array_merge( $parser_args, array( $remaining ) ) );
+
+			if ( empty( $result ) ) {
+				return array();
+			}
+
+			list( $next, $remaining ) = $result;
+
+			if ( array() !== $next ) {
+				$output[] = $next;
+			}
+		}
+
+		return array( $output, $remaining );
+	}
+
+	/**
+	 * Tells whether a value is non-empty by PHP's empty() rules.
+	 *
+	 * Kept for callers outside this class; sequence() does its own
+	 * filtering because empty() also rejects values such as '0'.
+	 *
+	 * @param mixed $value Value to inspect.
+	 * @return bool
+	 */
+	public static function is_not_empty( $value ) {
+		return ! empty( $value );
+	}
+
+	/**
+	 * Tries each parser on the same input and returns the first success.
+	 *
+	 * @param array  $parsers_and_args List of array( parser, parser_args ).
+	 * @param string $input            Remaining document.
+	 * @return array Result of the first succeeding parser, or empty array.
+	 */
+	public static function first_of( $parsers_and_args, $input ) {
+		foreach ( $parsers_and_args as $parser_and_args ) {
+			list( $parser, $parser_args ) = $parser_and_args;
+
+			$result = call_user_func_array( $parser, array_merge( $parser_args, array( $input ) ) );
+
+			if ( ! empty( $result ) ) {
+				return $result;
+			}
+		}
+
+		return array();
+	}
+
+	/**
+	 * Applies a parser repeatedly until it fails. Never fails itself.
+	 *
+	 * A round that succeeds without consuming input is recorded once and
+	 * then ends the repetition, since repeating it would loop forever.
+	 *
+	 * @param callable $parser      Parser to repeat.
+	 * @param array    $parser_args Leading arguments for the parser.
+	 * @param string   $input       Remaining document.
+	 * @return array array( productions, remaining ).
+	 */
+	public static function zero_or_more( $parser, $parser_args, $input ) {
+		$output = array();
+		$remaining = $input;
+
+		while ( true ) {
+			$result = call_user_func_array( $parser, array_merge( $parser_args, array( $remaining ) ) );
+			if ( empty( $result ) ) {
+				break;
+			}
+
+			list( $next, $rest ) = $result;
+			$output[] = $next;
+
+			$made_progress = strlen( $rest ) < strlen( $remaining );
+			$remaining = $rest;
+
+			if ( ! $made_progress ) {
+				break;
+			}
+		}
+
+		return array( $output, $remaining );
+	}
+
+	/**
+	 * Like zero_or_more() but fails unless the parser succeeds at least once.
+	 *
+	 * @param callable $parser      Parser to repeat.
+	 * @param array    $parser_args Leading arguments for the parser.
+	 * @param string   $input       Remaining document.
+	 * @return array array( productions, remaining ) on success, empty array on failure.
+	 */
+	public static function one_or_more( $parser, $parser_args, $input ) {
+		$result = self::zero_or_more( $parser, $parser_args, $input );
+
+		return empty( $result[0] )
+			? array()
+			: $result;
+	}
+
+	/**
+	 * Builds the array representation of a parsed block.
+	 *
+	 * @param string $blockName  Block name, e.g. 'core/void'.
+	 * @param array  $attrs      Decoded block attributes.
+	 * @param string $rawContent Unparsed inner content.
+	 * @return array
+	 */
+	public static function block( $blockName, $attrs, $rawContent ) {
+		return array(
+			'blockName'  => $blockName,
+			'attrs'      => $attrs,
+			'rawContent' => $rawContent
+		);
+	}
+
+	/**
+	 * Builds a 'freeform' block, with no attributes, around the given content.
+	 *
+	 * @param string $rawContent Unparsed content.
+	 * @return array
+	 */
+	public static function freeform( $rawContent ) {
+		return self::block( 'freeform', array(), $rawContent );
+	}
+}
+
+endif;
diff --git a/phpunit/class.block-parser-test.php b/phpunit/class.block-parser-test.php
@@ -0,0 +1,185 @@
+<?php
+
+use PHPUnit\Framework\TestCase;
+
+require_once dirname( dirname( __FILE__ ) ) . '/lib/block-parser.php';
+
+/**
+ * Unit tests for the Gutenberg_Block_Parser combinators and for the
+ * void-block rule built on top of them.
+ */
+class Block_Parser_Test extends TestCase {
+	/**
+	 * Parser instance, rebuilt before every test.
+	 *
+	 * @var Gutenberg_Block_Parser
+	 */
+	private $parser;
+
+	/**
+	 * Runs the instance-level parser over a document.
+	 */
+	public function parse( $input ) {
+		return $this->parser->parse( $input );
+	}
+
+	public function setUp() {
+		$this->parser = new Gutenberg_Block_Parser();
+	}
+
+	/**
+	 * Callable naming the `literal` combinator, shared by the tests below.
+	 */
+	private static function literal_parser() {
+		return [ 'Gutenberg_Block_Parser', 'literal' ];
+	}
+
+	public function test_combinator_succeed() {
+		$actual = Gutenberg_Block_Parser::succeed( 'test', 'bork' );
+
+		$this->assertEquals( [ [ 'test', 'bork' ] ], $actual );
+	}
+
+	public function test_combinator_fail() {
+		$actual = Gutenberg_Block_Parser::fail( 'bork' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_combinator_literal_success() {
+		$actual = Gutenberg_Block_Parser::literal( 'test', 'test string' );
+
+		$this->assertEquals( [ 'test', ' string' ], $actual );
+	}
+
+	public function test_combinator_literal_fail() {
+		$actual = Gutenberg_Block_Parser::literal( 'test', 'no match' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_combinator_ignore() {
+		$actual = Gutenberg_Block_Parser::ignore( self::literal_parser(), [ '123' ], '123abc' );
+
+		$this->assertEquals( [ [], 'abc' ], $actual );
+	}
+
+	public function test_combinator_ignore_fail() {
+		$actual = Gutenberg_Block_Parser::ignore( self::literal_parser(), [ 'abc' ], '123abc' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_combinator_match_success() {
+		$actual = Gutenberg_Block_Parser::match( '(^[a-z_]+)', 'test_val = 5' );
+
+		$this->assertEquals( [ [ 'test_val' ], ' = 5' ], $actual );
+	}
+
+	public function test_combinator_match_groups_success() {
+		$actual = Gutenberg_Block_Parser::match( '(^([a-z_]+) = (\d+))', 'test_val = 5;' );
+
+		$this->assertEquals( [ [ 'test_val = 5', 'test_val', '5' ], ';' ], $actual );
+	}
+
+	public function test_combinator_match_fail() {
+		$actual = Gutenberg_Block_Parser::match( '(^[a-z_]+)', ';test_val = 5' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_combinator_zero_or_more() {
+		$actual = Gutenberg_Block_Parser::zero_or_more( self::literal_parser(), [ 'a' ], 'aaaxyz' );
+
+		$this->assertEquals( [ [ 'a', 'a', 'a' ], 'xyz' ], $actual );
+	}
+
+	public function test_combinator_zero_or_more_failure() {
+		$actual = Gutenberg_Block_Parser::zero_or_more( self::literal_parser(), [ 'a' ], 'bbb' );
+
+		$this->assertEquals( [ [], 'bbb' ], $actual );
+	}
+
+	public function test_combinator_one_or_more() {
+		$actual = Gutenberg_Block_Parser::one_or_more( self::literal_parser(), [ 'a' ], 'aabb' );
+
+		$this->assertEquals( [ [ 'a', 'a' ], 'bb' ], $actual );
+	}
+
+	public function test_combinator_one_or_more_failure() {
+		$actual = Gutenberg_Block_Parser::one_or_more( self::literal_parser(), [ 'a' ], 'bbb' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_combinator_sequence() {
+		$a_then_b = [
+			[ self::literal_parser(), [ 'a' ] ],
+			[ self::literal_parser(), [ 'b' ] ],
+		];
+
+		$actual = Gutenberg_Block_Parser::sequence( $a_then_b, 'abcd' );
+
+		$this->assertEquals( [ [ 'a', 'b' ], 'cd' ], $actual );
+	}
+
+	public function test_combinator_sequence_failure() {
+		$a_then_b = [
+			[ self::literal_parser(), [ 'a' ] ],
+			[ self::literal_parser(), [ 'b' ] ],
+		];
+
+		$actual = Gutenberg_Block_Parser::sequence( $a_then_b, 'acd' );
+
+		$this->assertEquals( [], $actual );
+	}
+
+	public function test_block_void_no_attrs() {
+		$expected_block = [ 'blockName' => 'core/void', 'attrs' => [], 'rawContent' => '' ];
+
+		$actual = Gutenberg_Block_Parser::block_void( '<!-- wp:core/void /-->' );
+
+		$this->assertEquals( [ $expected_block, '' ], $actual );
+	}
+
+	public function test_block_void_with_empty_attrs() {
+		$expected_block = [ 'blockName' => 'core/void', 'attrs' => [], 'rawContent' => '' ];
+
+		$actual = Gutenberg_Block_Parser::block_void( '<!-- wp:core/void {} /-->' );
+
+		$this->assertEquals( [ $expected_block, '' ], $actual );
+	}
+
+	public function test_block_void_with_non_empty_attrs() {
+		$expected_block = [
+			'blockName'  => 'core/void',
+			'attrs'      => [ 'val' => 1337 ],
+			'rawContent' => '',
+		];
+
+		$actual = Gutenberg_Block_Parser::block_void( '<!-- wp:core/void { "val": 1337 } /-->' );
+
+		$this->assertEquals( [ $expected_block, '' ], $actual );
+	}
+}