Skip to content

Commit

Permalink
WIP: Start hand-written PHP combinator-based parser
Browse files Browse the repository at this point in the history
The goal of this alternative parser is to make a faster parser than the
one generated by the PEG grammar directly. It must not take multiple
seconds to parse a document no matter how big.

This parser currently is designed to produce the blocks and their
attributes, leaving all inside content raw.

This is based on the seminal paper "Monadic Parser Combinators" by
Hutton and Meijer, University of Nottingham, 1996. I have had a hard
time finding literature detailing the performance of such an approach in
PHP.

Although this approach will be building up user-defined function
calls it avoids creating closures by means of passing around
descriptions of parsers in the PHP callback style. That is, instead of
passing partial functions, we pass a callable string and an array of
partial arguments. The input is appended as the final argument before
calling the given parser.

A further speedup taken in this approach involves defining parse rules "on
the branches" so as to reduce backtracking. That is, if two rules share
a prefix then we can parse the prefix and descend into a `first_of`
branch instead of having a top-level `first_of` branch and duplicating
the prefix. The latter approach has been taken in the formal grammar
spec to make the rules easier to read.

RegExp patterns have also been duplicated where possible to combine
rules. That is, although we could ignore whitespace and then ignore a
closing block comment, instead I'm combining the RegExp pattern so that
we benefit from the faster performance of the RegExp and from only
making one call instead of two. Each call passes around its own state on
the parse "stack."
  • Loading branch information
dmsnell committed Jul 10, 2017
1 parent 46ae03c commit cbe58e2
Show file tree
Hide file tree
Showing 3 changed files with 391 additions and 0 deletions.
1 change: 1 addition & 0 deletions gutenberg.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
require_once dirname( __FILE__ ) . '/lib/client-assets.php';
require_once dirname( __FILE__ ) . '/lib/i18n.php';
require_once dirname( __FILE__ ) . '/lib/parser.php';
require_once dirname( __FILE__ ) . '/lib/block-parser.php';
require_once dirname( __FILE__ ) . '/lib/register.php';

// Register server-side code for individual blocks.
Expand Down
205 changes: 205 additions & 0 deletions lib/block-parser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
<?php

if ( ! class_exists( 'Gutenberg_Block_Parser_State', false ) ) {

	/**
	 * Mutable state container for the block parser.
	 *
	 * Only declared when no class of this name has been loaded yet
	 * (autoloading is deliberately not triggered by the guard).
	 */
	class Gutenberg_Block_Parser_State {
		/**
		 * Presumably the stack of blocks that are currently open while parsing.
		 * It has no default value, so it is null on a fresh instance.
		 *
		 * NOTE(review): nothing in this file reads or writes this property yet;
		 * confirm the intended shape once the parser starts using it.
		 *
		 * @var mixed
		 */
		public $block_stack;
	}

}

if (!class_exists('Gutenberg_Block_Parser', false)):

/**
 * Hand-written, combinator-based parser for block comment delimiters.
 *
 * Parsers are never passed around as closures. Instead a parser is described
 * by a callable plus an array of leading ("partial") arguments; the input
 * string is appended as the last argument right before the call.
 *
 * Result convention used by every combinator except succeed():
 *   - success: array( $production, $remaining_input )
 *   - failure: array()
 *
 * succeed() follows the list-of-pairs shape instead, i.e. it returns
 * array( array( $value, $input ) ).
 *
 * All regular expressions use parentheses as the PCRE delimiters and are
 * anchored with "^" so that they only ever match at the start of the input.
 */
class Gutenberg_Block_Parser {
	const BLOCK_COMMENT_OPEN = '(^<!--)';
	const BLOCK_COMMENT_CLOSE = '(^/?-->)';
	const BLOCK_NAME = '(^[[:alpha:]](?:[[:alnum:]]|/[[:alnum:]])*)i';
	const BLOCK_ATTRIBUTES = '(^{(?:((?!}[ \t\r\n]+/?-->).)*)})';
	const WS = '(^[ \t\r\n])';
	const WSS = '(^[ \t\r\n]+)';

	const MAX_RUNTIME = 1; // give up after one second

	/**
	 * Entry point: parses a document.
	 *
	 * The guard previously read an undefined `$this->input` property, so the
	 * method always fell through and returned null; it now inspects the
	 * `$input` argument.
	 *
	 * @param string $input Document to parse.
	 * @return array|null Result of proceed(), or null when $input is not a
	 *                    non-empty string or the time budget is exhausted.
	 */
	public function parse( $input ) {
		if ( ! is_string( $input ) || '' === $input ) {
			return null;
		}

		$tic = microtime( true );

		// trampoline for stack-safe recursion of the actual parser; at the
		// moment there is a single step, so the first iteration returns.
		while ( ( microtime( true ) - $tic ) < self::MAX_RUNTIME ) {
			return $this->proceed( $input );
		}

		return null;
	}

	/**
	 * Performs one parsing step. Currently a placeholder that succeeds with
	 * the fixed value 'test' without consuming any input.
	 *
	 * @param string $input Remaining input.
	 * @return array array( array( 'test', $input ) )
	 */
	public function proceed( $input ) {
		// succeed() is a static method of this class, not a global function.
		return self::succeed( 'test', $input );
	}

	/**
	 * Parses a void block, e.g. `<!-- wp:core/void {"a":1} /-->`.
	 *
	 * The shared prefix (opener and block name) is parsed once and only the
	 * tail branches on "closer" versus "attributes then closer".
	 *
	 * @param string $input Input that should start with a void block.
	 * @return array array( $block, $remaining ) on success, array() on failure.
	 *               `attrs` is the json_decode() result of the attribute text
	 *               (null when that text is not valid JSON) or array() when
	 *               the block carries no attributes.
	 */
	public static function block_void( $input ) {
		// Class-qualified callable strings; 'self::method' strings are
		// deprecated as callables since PHP 8.2.
		$ignore = __CLASS__ . '::ignore';
		$match  = __CLASS__ . '::match';

		$void_closer = array( $ignore, array( $match, array( '(^[ \t\r\n]+/-->)' ) ) );

		$result = self::sequence( array(
			array( $ignore, array( $match, array( '(^<!--[ \t\r\n]+wp:)' ) ) ),
			array( $match, array( self::BLOCK_NAME ) ),
			array( __CLASS__ . '::first_of', array( array(
				$void_closer,
				array( __CLASS__ . '::sequence', array( array(
					array( $ignore, array( $match, array( self::WSS ) ) ),
					array( $match, array( self::BLOCK_ATTRIBUTES ) ),
					$void_closer,
				) ) ),
			) ) ),
		), $input );

		if ( empty( $result ) ) {
			return array();
		}

		list( $productions, $remaining ) = $result;

		// $productions[0] holds the regex matches for the block name.
		$blockName = $productions[0][0];

		// The attribute production only exists when the second branch matched;
		// sequence() filters out the empty production of the bare closer.
		$raw_attrs = isset( $productions[1][0][0] ) ? $productions[1][0][0] : '';

		$attrs = '' !== $raw_attrs
			? json_decode( $raw_attrs, true )
			: array();

		return array( self::block( $blockName, $attrs, '' ), $remaining );
	}

	//-----------------------------------------
	// Parser a :: String -> [ ( a, String ) ]
	//
	// A parser is a function which takes a string
	// and returns a list of things and strings
	//
	// An empty list is a failed parse
	//
	// The polymorphic "a" will eventually be a block
	//-----------------------------------------

	/**
	 * Parser that always succeeds with $value and consumes nothing.
	 *
	 * @param mixed  $value Value to produce.
	 * @param string $input Input, returned untouched.
	 * @return array array( array( $value, $input ) )
	 */
	public static function succeed( $value, $input ) {
		return array( array( $value, $input ) );
	}

	/**
	 * Parser that always fails.
	 *
	 * @param string $input Ignored.
	 * @return array Always array().
	 */
	public static function fail( $input ) {
		return array();
	}

	/**
	 * Runs a parser and throws its production away, keeping only the
	 * remaining input.
	 *
	 * @param callable $parser      Parser to run.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input to parse.
	 * @return array array( array(), $remaining ) on success, array() on failure.
	 */
	public static function ignore( $parser, $parser_args, $input ) {
		$result = self::run( $parser, $parser_args, $input );

		if ( empty( $result ) ) {
			return array();
		}

		list( /* production */, $remaining ) = $result;

		return array( array(), $remaining );
	}

	/**
	 * Matches an exact string at the start of the input.
	 *
	 * Only the prefix is compared, so a miss does not scan the whole input.
	 *
	 * @param string $value Text that must appear at the start of $input.
	 * @param string $input Input to parse.
	 * @return array array( $value, $remaining ) on success, array() on failure.
	 */
	public static function literal( $value, $input ) {
		$length = strlen( $value );

		if ( 0 !== strncmp( $input, $value, $length ) ) {
			return array();
		}

		// The cast guards against substr() returning false at the end of the
		// input on older PHP versions.
		return array( $value, (string) substr( $input, $length ) );
	}

	/**
	 * Matches a regular expression against the input.
	 *
	 * The pattern must be anchored with "^": the remaining input is computed
	 * by dropping as many bytes as the full match is long.
	 *
	 * @param string $pattern PCRE pattern including delimiters.
	 * @param string $input   Input to parse.
	 * @return array array( $matches, $remaining ) on success, where $matches
	 *               is the preg_match() match array; array() on failure.
	 */
	public static function match( $pattern, $input ) {
		$matches = array();

		$is_match = preg_match( $pattern, $input, $matches );

		return $is_match
			? array( $matches, (string) substr( $input, strlen( $matches[ 0 ] ) ) )
			: array();
	}

	/**
	 * Runs a parser and transforms its production with $f.
	 *
	 * $f is only called after a successful parse and receives the production
	 * as its single argument; the input consumed by the parser stays consumed.
	 *
	 * @param callable $f           Transformation applied to the production.
	 * @param callable $parser      Parser to run.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input to parse.
	 * @return array array( $f( $production ), $remaining ) on success,
	 *               array() on failure.
	 */
	public static function map( $f, $parser, $parser_args, $input ) {
		$result = self::run( $parser, $parser_args, $input );

		if ( empty( $result ) ) {
			return array();
		}

		list( $production, $remaining ) = $result;

		return array( call_user_func( $f, $production ), $remaining );
	}

	/**
	 * Runs parsers one after another, each on what the previous one left.
	 *
	 * Productions for which is_not_empty() is false (for example the array()
	 * produced by ignore()) are dropped from the output and the output is
	 * re-indexed from zero.
	 *
	 * @param array  $parsers_and_args List of array( $parser, $parser_args ).
	 * @param string $input            Input to parse.
	 * @return array array( $productions, $remaining ) when every parser
	 *               succeeds, array() as soon as one fails.
	 */
	public static function sequence( $parsers_and_args, $input ) {
		$output = array();
		$remaining = $input;

		foreach ( $parsers_and_args as $parser_and_args ) {
			list( $parser, $parser_args ) = $parser_and_args;

			$result = self::run( $parser, $parser_args, $remaining );

			if ( empty( $result ) ) {
				return array();
			}

			list( $next, $remaining ) = $result;
			$output[] = $next;
		}

		$kept = array_filter( $output, array( __CLASS__, 'is_not_empty' ) );

		return array( array_values( $kept ), $remaining );
	}

	/**
	 * Tells whether a production should be kept by sequence().
	 *
	 * Uses PHP's empty(), so '', '0', 0, null, false and array() all count
	 * as empty.
	 *
	 * @param mixed $value Production to test.
	 * @return bool
	 */
	public static function is_not_empty( $value ) {
		return ! empty( $value );
	}

	/**
	 * Tries each parser on the same input and returns the first success.
	 *
	 * @param array  $parsers_and_args List of array( $parser, $parser_args ).
	 * @param string $input            Input to parse.
	 * @return array Result of the first successful parser, array() if none
	 *               succeeds.
	 */
	public static function first_of( $parsers_and_args, $input ) {
		foreach ( $parsers_and_args as $parser_and_args ) {
			list( $parser, $parser_args ) = $parser_and_args;

			$result = self::run( $parser, $parser_args, $input );

			if ( ! empty( $result ) ) {
				return $result;
			}
		}

		return array();
	}

	/**
	 * Applies a parser as often as it succeeds; never fails.
	 *
	 * @param callable $parser      Parser to repeat.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input to parse.
	 * @return array array( $productions, $remaining ); $productions is
	 *               array() when the parser does not match at all.
	 */
	public static function zero_or_more( $parser, $parser_args, $input ) {
		return self::repeat( $parser, $parser_args, $input );
	}

	/**
	 * Applies a parser as often as it succeeds, requiring at least one match.
	 *
	 * @param callable $parser      Parser to repeat.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input to parse.
	 * @return array array( $productions, $remaining ) on success, array()
	 *               when the parser does not match even once.
	 */
	public static function one_or_more( $parser, $parser_args, $input ) {
		$result = self::repeat( $parser, $parser_args, $input );

		return empty( $result[0] )
			? array()
			: $result;
	}

	/**
	 * Builds the array representation of a block.
	 *
	 * @param string $blockName  Name of the block.
	 * @param mixed  $attrs      Block attributes.
	 * @param string $rawContent Unparsed inner content.
	 * @return array Map with the keys blockName, attrs and rawContent.
	 */
	public static function block( $blockName, $attrs, $rawContent ) {
		return array(
			'blockName' => $blockName,
			'attrs' => $attrs,
			'rawContent' => $rawContent
		);
	}

	/**
	 * Builds a 'freeform' block without attributes around raw content.
	 *
	 * @param string $rawContent Unparsed content.
	 * @return array Block as built by block().
	 */
	public static function freeform( $rawContent ) {
		return self::block( 'freeform', array(), $rawContent );
	}

	/**
	 * Calls a parser description: the input is appended to the partial
	 * arguments and the callable is invoked.
	 *
	 * @param callable $parser      Parser to call.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input passed as the last argument.
	 * @return array Whatever the parser returns.
	 */
	private static function run( $parser, $parser_args, $input ) {
		return call_user_func_array( $parser, array_merge( $parser_args, array( $input ) ) );
	}

	/**
	 * Shared loop of zero_or_more() and one_or_more().
	 *
	 * Stops at the first failure, and also right after a success that left
	 * the input unchanged: repeating such a parser would never terminate.
	 *
	 * @param callable $parser      Parser to repeat.
	 * @param array    $parser_args Leading arguments for $parser.
	 * @param string   $input       Input to parse.
	 * @return array array( $productions, $remaining )
	 */
	private static function repeat( $parser, $parser_args, $input ) {
		$output = array();
		$remaining = $input;

		while ( true ) {
			$result = self::run( $parser, $parser_args, $remaining );

			if ( empty( $result ) ) {
				break;
			}

			list( $next, $rest ) = $result;
			$output[] = $next;

			$consumed_nothing = ( $rest === $remaining );
			$remaining = $rest;

			if ( $consumed_nothing ) {
				break;
			}
		}

		return array( $output, $remaining );
	}
}

endif;
185 changes: 185 additions & 0 deletions phpunit/class.block-parser-test.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
<?php

use PHPUnit\Framework\TestCase;

require_once dirname( dirname( __FILE__ ) ) . '/lib/block-parser.php';

/**
 * Unit tests for the parser combinators of Gutenberg_Block_Parser and for
 * the void-block rule built from them.
 *
 * Successful parses are expected as array( $production, $remaining ) and
 * failed parses as an empty array; succeed() is the one combinator expected
 * to wrap its pair in an outer list.
 */
class Block_Parser_Test extends TestCase {
	/**
	 * Parser instance created freshly for every test.
	 *
	 * @var Gutenberg_Block_Parser
	 */
	private $parser;

	/**
	 * Callable description of the `literal` combinator, used as the inner
	 * parser by most combinator tests.
	 *
	 * @return array
	 */
	private static function literal_parser() {
		return [ 'Gutenberg_Block_Parser', 'literal' ];
	}

	/**
	 * Convenience wrapper around the instance parser's parse() method.
	 *
	 * @param string $input Document to parse.
	 * @return mixed Whatever the parser returns.
	 */
	public function parse( $input ) {
		return $this->parser->parse( $input );
	}

	public function setUp() {
		$this->parser = new Gutenberg_Block_Parser();
	}

	public function test_combinator_succeed() {
		$this->assertEquals(
			[ [ 'test', 'bork' ] ],
			Gutenberg_Block_Parser::succeed( 'test', 'bork' )
		);
	}

	public function test_combinator_fail() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::fail( 'bork' )
		);
	}

	public function test_combinator_literal_success() {
		$this->assertEquals(
			[ 'test', ' string' ],
			Gutenberg_Block_Parser::literal( 'test', 'test string' )
		);
	}

	public function test_combinator_literal_fail() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::literal( 'test', 'no match' )
		);
	}

	public function test_combinator_ignore() {
		$this->assertEquals(
			[ [], 'abc' ],
			Gutenberg_Block_Parser::ignore(
				self::literal_parser(),
				[ '123' ],
				'123abc'
			)
		);
	}

	public function test_combinator_ignore_fail() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::ignore(
				self::literal_parser(),
				[ 'abc' ],
				'123abc'
			)
		);
	}

	public function test_combinator_match_success() {
		$this->assertEquals(
			[ [ 'test_val' ], ' = 5' ],
			Gutenberg_Block_Parser::match( '(^[a-z_]+)', 'test_val = 5' )
		);
	}

	public function test_combinator_match_groups_success() {
		$this->assertEquals(
			[ [ 'test_val = 5', 'test_val', '5' ], ';' ],
			Gutenberg_Block_Parser::match( '(^([a-z_]+) = (\d+))', 'test_val = 5;' )
		);
	}

	public function test_combinator_match_fail() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::match( '(^[a-z_]+)', ';test_val = 5' )
		);
	}

	public function test_combinator_zero_or_more() {
		$this->assertEquals(
			[ [ 'a', 'a', 'a' ], 'xyz' ],
			Gutenberg_Block_Parser::zero_or_more(
				self::literal_parser(),
				[ 'a' ],
				'aaaxyz'
			)
		);
	}

	public function test_combinator_zero_or_more_failure() {
		// Zero matches is still a success: empty production, input untouched.
		$this->assertEquals(
			[ [], 'bbb' ],
			Gutenberg_Block_Parser::zero_or_more(
				self::literal_parser(),
				[ 'a' ],
				'bbb'
			)
		);
	}

	public function test_combinator_one_or_more() {
		$this->assertEquals(
			[ [ 'a', 'a' ], 'bb' ],
			Gutenberg_Block_Parser::one_or_more(
				self::literal_parser(),
				[ 'a' ],
				'aabb'
			)
		);
	}

	public function test_combinator_one_or_more_failure() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::one_or_more(
				self::literal_parser(),
				[ 'a' ],
				'bbb'
			)
		);
	}

	public function test_combinator_sequence() {
		$this->assertEquals(
			[ [ 'a', 'b' ], 'cd' ],
			Gutenberg_Block_Parser::sequence( [
				[ self::literal_parser(), [ 'a' ] ],
				[ self::literal_parser(), [ 'b' ] ],
			], 'abcd' )
		);
	}

	public function test_combinator_sequence_failure() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::sequence( [
				[ self::literal_parser(), [ 'a' ] ],
				[ self::literal_parser(), [ 'b' ] ],
			], 'acd' )
		);
	}

	public function test_combinator_first_of_first_alternative() {
		$this->assertEquals(
			[ 'a', 'bcd' ],
			Gutenberg_Block_Parser::first_of( [
				[ self::literal_parser(), [ 'a' ] ],
				[ self::literal_parser(), [ 'b' ] ],
			], 'abcd' )
		);
	}

	public function test_combinator_first_of_later_alternative() {
		// The first alternative fails, so the second one is tried on the same input.
		$this->assertEquals(
			[ 'b', 'cd' ],
			Gutenberg_Block_Parser::first_of( [
				[ self::literal_parser(), [ 'a' ] ],
				[ self::literal_parser(), [ 'b' ] ],
			], 'bcd' )
		);
	}

	public function test_combinator_first_of_failure() {
		$this->assertEquals(
			[],
			Gutenberg_Block_Parser::first_of( [
				[ self::literal_parser(), [ 'a' ] ],
				[ self::literal_parser(), [ 'b' ] ],
			], 'xyz' )
		);
	}

	public function test_block_void_no_attrs() {
		$this->assertEquals(
			[ [ 'blockName' => 'core/void', 'attrs' => [], 'rawContent' => '' ], '' ],
			Gutenberg_Block_Parser::block_void(
				'<!-- wp:core/void /-->'
			)
		);
	}

	public function test_block_void_with_empty_attrs() {
		$this->assertEquals(
			[ [ 'blockName' => 'core/void', 'attrs' => [], 'rawContent' => '' ], '' ],
			Gutenberg_Block_Parser::block_void(
				'<!-- wp:core/void {} /-->'
			)
		);
	}

	public function test_block_void_with_non_empty_attrs() {
		$this->assertEquals(
			[ [
				'blockName' => 'core/void',
				'attrs' => [
					'val' => 1337
				],
				'rawContent' => ''
			], '' ],
			Gutenberg_Block_Parser::block_void(
				'<!-- wp:core/void { "val": 1337 } /-->'
			)
		);
	}
}

0 comments on commit cbe58e2

Please sign in to comment.