forked from erikrose/mediawiki-parser
-
-
Notifications
You must be signed in to change notification settings - Fork 18
/
preprocessor.pijnu
125 lines (103 loc) · 7.69 KB
/
preprocessor.pijnu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
preprocessor
<toolset>
def replace_by_space(node):
    """Overwrite the value of a matched node with a single space.

    Used as a transformation by the grammar's ``eol_to_space`` rule so that
    the end-of-line run it matches is emitted as one blank.

    The node is modified in place; nothing is returned.
    """
    node.value = ' '
<definition>
# Codes
# Line terminators, written as hex escapes.
# LF is the line feed (0x0a) and CR is the carriage return (0x0d).
LF : '\x0a'
CR : '\x0d'
# CRLF is CR followed by LF. EOL tries it first so that this pair is
# consumed as one end of line rather than as two separate ones.
CRLF : "\x0d\x0a"
EOL : CRLF / LF / CR
# Horizontal tab (0x09), written as a hex escape so that it cannot be
# confused with (or silently converted to) the SPACE character.
TAB : "\x09"
QUOTE : "\""
# Square brackets come in two flavours: the plain rules carry no
# transformation, the *_DROP rules carry the `drop` transformation.
L_BRACKET : "["
R_BRACKET : "\]"
L_BRACKET_DROP : "[" : drop
R_BRACKET_DROP : "\]" : drop
# Every rule below carries the `drop` transformation.
# NOTE(review): presumably `drop` removes the matched node from the
# result - confirm against the pijnu documentation.
L_BRACE : "{" : drop
R_BRACE : "}" : drop
SPACE : " " : drop
SPACETAB : SPACE / TAB : drop
SPACETABEOL : SPACE / TAB / EOL : drop
PIPE : "|" : drop
BANG : "!" : drop
EQUAL : "=" : drop
LT : "<" : drop
GT : ">" : drop
HASH : "#" : drop
DASH : "-" : drop
AMP : "&" : drop
SEMICOLON : ";" : drop
# Template delimiters are two braces, template-parameter delimiters three.
TEMPLATE_BEGIN : L_BRACE{2} : drop
TEMPLATE_END : R_BRACE{2} : drop
PARAMETER_BEGIN : L_BRACE{3} : drop
PARAMETER_END : R_BRACE{3} : drop
# Predefined tags
# Literal opening and closing markers of the <nowiki>, <pre> and <source> tags.
NOWIKI_BEGIN : "<nowiki>"
NOWIKI_END : "</nowiki>"
PRE_BEGIN : "<pre>"
PRE_END : "</pre>"
# The <source> opening tag is assembled from parts (SOURCE_BEGIN) because it
# may carry an attribute between "<source" and ">".
SOURCE_TAG_START : "<source"
SOURCE_TAG_GT : GT
# Optional attribute of the form lang="..." preceded by any amount of
# spaces, tabs or ends of line.
SOURCE_TAG_ATTRIBUTE : ( SPACETABEOL* "lang=" QUOTE [a..zA..Z0..9 ]+ QUOTE )
SOURCE_BEGIN : SOURCE_TAG_START SOURCE_TAG_ATTRIBUTE? SPACETABEOL* SOURCE_TAG_GT
SOURCE_END : "</source>"
# Characters
# any_char: one character in the range \x20..\xff. '/' is listed as an
# explicit alternative although it already lies inside that range.
any_char : [\x20..\xff] / '/'
# esc_char and esc_char_drop list the characters that have a special meaning
# in this grammar. They differ only in the bracket rules they use
# (L_BRACKET/R_BRACKET versus L_BRACKET_DROP/R_BRACKET_DROP).
esc_char : L_BRACKET/R_BRACKET/PIPE/L_BRACE/R_BRACE/LT/GT/AMP/SEMICOLON
esc_char_drop : L_BRACKET_DROP/R_BRACKET_DROP/PIPE/L_BRACE/R_BRACE/LT/GT/AMP/SEMICOLON
# raw_char: an any_char that is not one of the special characters
# (negative lookahead on esc_char_drop).
raw_char : !esc_char_drop any_char
# raw_text: one or more raw_char or TAB, with the `join` transformation applied.
raw_text : (raw_char / TAB)+ : join
# HTML comments
# HTML comments are totally ignored and do not appear in the final text
# comment_content: any mix of characters in \x20..\xff and spaces/tabs/ends
# of line, stopping before the first "-->" (negative lookahead on DASH{2} GT).
comment_content : ((!(DASH{2} GT) [\x20..\xff])+ / SPACETABEOL)*
# html_comment: "<!--" comment_content "-->", with the `drop` transformation.
html_comment : LT BANG DASH{2} comment_content DASH{2} GT : drop
# Text
# page_name: one or more raw_char (no special characters, no TAB), with the
# `join` transformation applied. Used by the template rule as the template name.
page_name : raw_char+ : join
# Template parameters
# Those parameters should be substituted by their value when the current page is a template
# or by their optional default value in any case
# Shape: PARAMETER_BEGIN parameter_id, an optional "|" followed by a default
# value, then PARAMETER_END.
parameter_id : raw_char+ : join
# NOTE(review): parameter_value is defined a second time in the Templates
# section of this file. Which definition optional_default_value ends up
# using depends on how pijnu resolves duplicate rule names - confirm.
parameter_value : inline? : keep
optional_default_value : (PIPE SPACETABEOL* parameter_value)? SPACETABEOL* : liftNode
# substitute_template_parameter is not defined in this file's <toolset>;
# presumably it is supplied by the code that builds the parser - confirm.
template_parameter : PARAMETER_BEGIN parameter_id optional_default_value PARAMETER_END : substitute_template_parameter
# Links
# LINK_PIPE matches the same character as PIPE but applies `restore`.
# NOTE(review): presumably `restore` undoes the `drop` carried by PIPE so the
# "|" stays in the joined link text - confirm against the pijnu documentation.
LINK_PIPE : PIPE : restore
# internal_link: two opening brackets, inline content optionally followed by
# "|"-separated inline parts, two closing brackets.
internal_link : L_BRACKET{2} inline (LINK_PIPE inline)* R_BRACKET{2} : join
# external_link: one opening bracket, inline content optionally followed by
# SPACE-separated inline parts, one closing bracket.
external_link : L_BRACKET inline (SPACE inline)* R_BRACKET : join
# internal_link is tried first, so a leading "[[" is not taken as an external link.
link : internal_link / external_link
# Templates
# EOL_KEEP / TAB_KEEP match the same input as EOL / TAB and apply `restore`.
EOL_KEEP : EOL : restore
TAB_KEEP : TAB : restore
# value_content: any sequence of inline content, or single characters / ends
# of line, stopping before optional whitespace followed by TEMPLATE_END or PIPE.
value_content : (inline / (!(SPACETABEOL* (TEMPLATE_END / PIPE)) (any_char / EOL_KEEP)))* : keep
# NOTE(review): parameter_value is also defined in the Template parameters
# section of this file; this is the second definition of the same name.
parameter_value : value_content SPACETABEOL*
optional_value : parameter_value?
# parameter_equal: "=" with optional surrounding spaces, tabs or ends of line.
parameter_equal : SPACETABEOL* EQUAL SPACETABEOL*
# parameter_name: raw characters up to (not including) a special character
# or a parameter_equal.
parameter_name : (!(esc_char_drop/parameter_equal) raw_char)+ : join
# named_parameter: name = value, where the value may be absent.
named_parameter : parameter_name parameter_equal optional_value
# standalone_parameter: a value with no name; may be empty.
standalone_parameter : value_content? : join
# parameter: a "|" (with optional surrounding whitespace) followed by a named
# parameter if one matches, otherwise by a standalone parameter.
parameter : SPACETABEOL* PIPE SPACETABEOL* (named_parameter/standalone_parameter) : liftValue
parameters : parameter*
# template: TEMPLATE_BEGIN page_name parameters TEMPLATE_END, with optional
# whitespace after the opening and before the closing delimiter.
# substitute_template is not defined in this file's <toolset>; presumably it
# is supplied by the code that builds the parser - confirm.
template : TEMPLATE_BEGIN SPACETABEOL* page_name parameters SPACETABEOL* TEMPLATE_END : substitute_template
# inline allows templates/links to be nested inside templates/links
# structure: a link, a template or a template parameter, tried in that order.
structure : link / template / template_parameter
# inline refers to structure, whose alternatives refer back to inline.
# NOTE(review): the trailing `@` is presumably pijnu's marker for such a
# recursive rule - confirm against the pijnu documentation.
inline : (structure / raw_text)+ : @
# Entities: "&#" digits ";" (numbered) and "&" letters ";" (named).
# substitute_numbered_entity / substitute_named_entity are not defined in this
# file's <toolset>; presumably supplied by the code that builds the parser - confirm.
numbered_entity : AMP HASH [0..9]+ SEMICOLON : substitute_numbered_entity
named_entity : AMP [a..zA..Z]+ SEMICOLON : substitute_named_entity
entity : named_entity / numbered_entity
# Pre and nowiki tags
# Preformatted acts like nowiki (disables wikitext parsing)
# We allow any char without parsing them as long as the tag is not closed
# pre_text: every any_char up to (not including) PRE_END. EOL is not among
# the accepted alternatives here.
pre_text : (!PRE_END any_char)* : join
preformatted : PRE_BEGIN pre_text PRE_END : liftValue
# eol_to_space: a run of EOL whose node value is overwritten with a single
# space by the toolset function replace_by_space.
# NOTE(review): EOL* also matches the empty string, and eol_to_space is used
# inside the repetition in nowiki_text. On a character matched by neither
# any_char nor EOL (e.g. a tab), this may loop or insert spurious spaces,
# depending on how pijnu treats zero-length matches - confirm; EOL+ may be intended.
eol_to_space : EOL* : replace_by_space
# nowiki_text: characters and end-of-line runs up to (not including) NOWIKI_END.
nowiki_text : (!NOWIKI_END (any_char/eol_to_space))* : join
nowiki : NOWIKI_BEGIN nowiki_text NOWIKI_END : liftValue
# source_text: tabs, characters and ends of line up to (not including)
# SOURCE_END; tabs and ends of line go through the *_KEEP rules.
source_text : (!SOURCE_END (TAB_KEEP/any_char/EOL_KEEP))* : join
source : SOURCE_BEGIN source_text SOURCE_END : restore liftValue
# Text types
# styled_text: constructs that carry a substitute_* transformation
# (templates, template parameters, entities).
styled_text : template / template_parameter / entity
# not_styled_text: HTML comments and the pre / nowiki / source tag blocks.
not_styled_text : html_comment / preformatted / nowiki / source
# allowed_char: exactly one special character that was not consumed by any
# of the constructs above, with `restore liftValue` applied.
allowed_char : esc_char_drop{1} : restore liftValue
allowed_text : raw_text / allowed_char
# wikitext: one or more of the alternatives, tried in the order listed:
# not_styled_text, then styled_text, then allowed_text, then a bare EOL.
wikitext : (not_styled_text / styled_text / allowed_text / EOL)+ : join