Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gherkin/python: Support gherkin markdown #2103

Merged
merged 29 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
164545c
feat: create Python Markdown token matcher. Match a markdown Feature…
temyers Oct 21, 2022
15b875a
feat: Match a markdown FeatureLine in French
temyers Oct 21, 2022
398d062
feature: Implement GherkinInMarkdownTokenMatcher spec: it_matches_bul…
temyers Oct 22, 2022
6e16794
feature: Implement GherkinInMarkdownTokenMatcher spec: test_it_matche…
temyers Oct 22, 2022
18af45b
feature: Implement GherkinInMarkdownTokenMatcher spec: test_it_matche…
temyers Oct 22, 2022
5548a18
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
64fec50
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
43ceb27
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
7010a0f
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
2682c47
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
6a7f169
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
4756acf
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
ae10cef
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
784f28a
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
ebfe5f4
feature: Implement GherkinInMarkdownTokenMatcher specs
temyers Oct 22, 2022
83fd8bb
clean up code
temyers Oct 22, 2022
aef421c
clean up comments
temyers Oct 24, 2022
cbe35cd
Merge branch 'main' into feature/gherkin-markdown-py
temyers Oct 24, 2022
12e8729
fix: backport f-string to python2 compatible implementation
temyers Oct 24, 2022
5589712
fix: backport textwrap.indent for python 2.7
temyers Oct 24, 2022
6242c33
fix: specify utf-8 encoding for python 2.x support
temyers Oct 24, 2022
c2e57db
build: update VSCode devcontainer to reference cucumber/cucumber-buil…
temyers Oct 24, 2022
4963f96
fix: (WIP)use unicode strings for python 2.x support
temyers Oct 24, 2022
c8a320c
fix: add unicode encoding to make test pass.
temyers Oct 24, 2022
6af967c
remove dev container
temyers Nov 8, 2022
e3bdfe3
remove IDE configuration
temyers Nov 8, 2022
fbcb536
ignore IDE configuration files (vscode)
temyers Nov 8, 2022
0ecd574
move markdown matcher to new file.
temyers Nov 8, 2022
36cc5c7
Add markdown to acceptance test feature files
temyers Nov 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Image tag to build from; defaults to "latest" and can be overridden with a build arg.
ARG VARIANT="latest"
# Base the dev container on the cucumber/cucumber-build image at the selected tag.
FROM cucumber/cucumber-build:${VARIANT}


20 changes: 20 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/ubuntu
{
"name": "Ubuntu",
temyers marked this conversation as resolved.
Show resolved Hide resolved
"build": {
"dockerfile": "Dockerfile",
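// 'VARIANT' is passed to the Dockerfile as a build arg, where it selects the tag of the cucumber/cucumber-build base image (it is not an Ubuntu release name).
// NOTE(review): the Ubuntu template advised using ubuntu-22.04 or ubuntu-18.04 on local arm64/Apple Silicon; confirm whether that still applies to this image.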
"args": { "VARIANT": "latest" }
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "uname -a",

// Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
"remoteUser": "cukebot"
}
temyers marked this conversation as resolved.
Show resolved Hide resolved
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"gherkin/python/gherkin/test"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
temyers marked this conversation as resolved.
Show resolved Hide resolved
241 changes: 241 additions & 0 deletions gherkin/python/gherkin/token_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,247 @@
from .dialect import Dialect
from .errors import NoSuchLanguageException

# Source: https://stackoverflow.com/a/8348914
try:
    import textwrap
    textwrap.indent
except AttributeError:  # textwrap.indent was not added until Python 3.3
    def indent(text, amount, ch=' '):
        """Prefix each line of ``text`` with ``amount`` repetitions of ``ch``.

        Fallback for interpreters without ``textwrap.indent``. Lines that
        consist only of whitespace are left untouched so that this branch
        produces the same output as ``textwrap.indent`` does by default.
        """
        padding = amount * ch
        return ''.join(
            padding + line if line.strip() else line
            for line in text.splitlines(True)
        )
else:
    def indent(text, amount, ch=' '):
        """Prefix each line of ``text`` with ``amount`` repetitions of ``ch``.

        Thin wrapper around ``textwrap.indent``; whitespace-only lines are
        not prefixed.
        """
        return textwrap.indent(text, amount * ch)


# Regex prefix for a Markdown bullet: optional leading whitespace, one of
# '*', '+' or '-', then optional whitespace. Captured as group 1.
KEYWORD_PREFIX_BULLET = '^(\\s*[*+-]\\s*)'
# Regex prefix for a Markdown header: one to six '#' characters followed by a
# single whitespace character. Captured as group 1.
KEYWORD_PREFIX_HEADER = '^(#{1,6}\\s)'

class GherkinInMarkdownTokenMatcher(object):
    """Match lines of a Markdown document against Gherkin token types.

    Markdown headers carry the Feature/Rule/Background/Scenario/Examples
    keywords, bullet items carry step keywords, backtick-quoted ``@names``
    are tags, and pipe tables indented by two to five whitespace characters
    are Gherkin table rows. Every ``match_*`` method receives a token, records
    the outcome on it through ``_set_token_matched`` and returns a boolean.
    """
    LANGUAGE_RE = re.compile(r"^\s*#\s*language\s*:\s*([a-zA-Z\-_]+)\s*$")

    def __init__(self, dialect_name='en'):
        """Create a matcher that uses ``dialect_name`` as its default dialect."""
        self._default_dialect_name = dialect_name
        self._change_dialect(dialect_name)
        self.reset()

    def reset(self):
        """Restore the default dialect and clear all per-document state."""
        if self.dialect_name != self._default_dialect_name:
            self._change_dialect(self._default_dialect_name)
        self._indent_to_remove = 0
        self._active_doc_string_separator = None
        self.matched_feature_line = False

    def match_FeatureLine(self, token):
        """Match the feature line; at most one line per document matches.

        Returns False for every call made after a feature line has been
        matched. Otherwise always returns True: either the line is a
        "# Feature: ..." header, or the whole line is taken as the feature
        line.
        """
        if self.matched_feature_line:
            # A feature line was already found; nothing else can be one.
            self._set_token_matched(token, None)
            return False

        # We first try to match "# Feature: blah"
        matched = self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.feature_keywords, ':', token, 'FeatureLine')
        # If we didn't match "# Feature: blah", we still match this line
        # as a FeatureLine.
        # The reason for this is that users may not want to be constrained
        # by having this as their first line.
        if not matched:
            self._set_token_matched(token, 'FeatureLine', token.line.get_line_text())
        self.matched_feature_line = True
        return True

    def match_RuleLine(self, token):
        """Match a header line of the form "# Rule: ..."."""
        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.rule_keywords, ':', token, 'RuleLine')

    def match_ScenarioLine(self, token):
        """Match a header line carrying a scenario or scenario-outline keyword."""
        return (
            self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.scenario_keywords, ':', token, 'ScenarioLine')
            or self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.scenario_outline_keywords, ':', token, 'ScenarioLine')
        )

    def match_BackgroundLine(self, token):
        """Match a header line of the form "# Background: ..."."""
        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.background_keywords, ':', token, 'BackgroundLine')

    def match_ExamplesLine(self, token):
        """Match a header line of the form "# Examples: ..."."""
        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.examples_keywords, ':', token, 'ExamplesLine')

    def match_TableRow(self, token):
        """Match an indented pipe-table row that is not a GFM separator row."""
        # Gherkin tables must be indented 2-5 spaces in order to be
        # distinguished from non-Gherkin tables.
        if not re.match('^\\s\\s\\s?\\s?\\s?\\|', token.line.get_line_text(0)):
            return False

        table_cells = token.line.table_cells
        if self._is_gfm_table_separator(table_cells):
            return False

        self._set_token_matched(token, 'TableRow', keyword='|', items=table_cells)
        return True

    def _is_gfm_table_separator(self, table_cells):
        """Return True if any cell text looks like a GFM separator (e.g. ``:---:``)."""
        return any(re.match('^:?-+:?$', cell['text']) for cell in table_cells)

    def match_StepLine(self, token):
        """Match a bullet item that starts with a step keyword.

        The '*' step keyword is excluded, as the bullet itself may be '*'.
        """
        step_keywords = (self.dialect.given_keywords +
                         self.dialect.when_keywords +
                         self.dialect.then_keywords +
                         self.dialect.and_keywords +
                         self.dialect.but_keywords)
        non_star_step_keywords = [kw for kw in step_keywords if kw.strip() != '*']
        return self._match_title_line(KEYWORD_PREFIX_BULLET, non_star_step_keywords, '', token, 'StepLine')

    def match_Comment(self, token):
        """Treat GFM table separator rows as comments.

        Returns True for a separator row (the token is left as it was) and
        False for any other line, after clearing the token's matched type.
        """
        if token.line.startswith('|'):
            if self._is_gfm_table_separator(token.line.table_cells):
                return True
        self._set_token_matched(token, None)
        return False

    def match_Empty(self, token):
        """Match blank lines and lines that no other matcher recognises."""
        result = False
        if token.line.is_empty():
            result = True
        # NOTE(review): these probes are used for their return value only, but
        # they also write to the token and to matcher state (for example
        # match_DocStringSeparator updates the active separator) - confirm
        # that this is intended before reordering anything here.
        if (not self.match_TagLine(token) and
                not self.match_FeatureLine(token) and
                not self.match_ScenarioLine(token) and
                not self.match_BackgroundLine(token) and
                not self.match_ExamplesLine(token) and
                not self.match_RuleLine(token) and
                not self.match_TableRow(token) and
                not self.match_Comment(token) and
                not self.match_Language(token) and
                not self.match_DocStringSeparator(token) and
                not self.match_EOF(token) and
                not self.match_StepLine(token)):
            # neutered
            result = True

        if result:
            self._set_token_matched(token, 'Empty', indent=0)
        return result

    # We've made a deliberate choice not to support `# language: [ISO 639-1]` headers or similar
    # in Markdown. Users should specify a language globally.
    def match_Language(self, token):
        """Never matches; raises ValueError when ``token`` is falsy."""
        if not token:
            raise ValueError('no token')
        return False

    def match_TagLine(self, token):
        """Match a line containing one or more backtick-quoted ``@tags``."""
        tags = [
            {
                # +1 for the 1-based column, +1 to skip the opening backtick.
                'column': token.line.indent + match.start() + 2,
                'text': match.group(1),
            }
            for match in re.finditer('`(@[^`]+)`', token.line.get_line_text())
        ]
        if not tags:
            return False

        self._set_token_matched(token, 'TagLine', items=tags)
        return True

    def match_DocStringSeparator(self, token):
        """Match an opening or closing doc string fence."""
        if not self._active_doc_string_separator:
            # open
            return (self._match_DocStringSeparator(token, '"""', True) or
                    self._match_DocStringSeparator(token, '````', True) or
                    self._match_DocStringSeparator(token, '```', True))
        # close
        return self._match_DocStringSeparator(token, self._active_doc_string_separator, False)

    def _match_DocStringSeparator(self, token, separator, is_open):
        """Match ``separator`` at the start of the line and update fence state."""
        if not token.line.startswith(separator):
            return False

        content_type = ''
        if is_open:
            content_type = token.line.get_rest_trimmed(len(separator))
            self._active_doc_string_separator = separator
            self._indent_to_remove = token.line.indent
        else:
            self._active_doc_string_separator = None
            self._indent_to_remove = 0

        # The separator is passed as the keyword; that's needed for pretty printing.
        self._set_token_matched(token, 'DocStringSeparator', content_type, separator)
        return True

    def match_Other(self, token):
        """Match any line, keeping its text minus the doc string indent."""
        # take the entire line, except removing DocString indents
        text = token.line.get_line_text(self._indent_to_remove)
        self._set_token_matched(token, 'Other', self._unescaped_docstring(text), indent=0)
        return True

    def match_EOF(self, token):
        """Match the end-of-file token."""
        if not token.eof():
            return False

        self._set_token_matched(token, 'EOF')
        return True

    def _match_title_line(self, prefix, keywords, keywordSuffix, token, token_type):
        """Match ``<prefix><keyword><keywordSuffix><text>`` on the token's line.

        ``prefix`` is a regex whose first group is the Markdown marker, and
        ``keywords`` are literal alternatives. On success the token receives
        the keyword, the stripped trailing text and an indent that includes
        the length of the marker. Returns True on a match, False otherwise.
        """
        if not keywords:
            # An empty alternation would match a bare "<prefix><suffix>".
            return False

        keywords_or_list = "|".join(re.escape(keyword) for keyword in keywords)
        pattern = u'{}({}){}(.*)'.format(prefix, keywords_or_list, keywordSuffix)
        match = re.search(pattern, token.line.get_line_text())
        if not match:
            return False

        self._set_token_matched(
            token,
            token_type,
            match.group(3).strip(),
            match.group(2),
            indent=token.line.indent + len(match.group(1)),
        )
        return True

    def _set_token_matched(self, token, matched_type, text=None,
                           keyword=None, keyword_type=None, indent=None, items=None):
        """Record a match result on ``token``.

        ``text`` must be a string or None. When ``indent`` is None the line's
        own indent is used (0 if the token has no line). The token's location
        column is set to the 1-based matched indent.
        """
        if items is None:
            items = []
        token.matched_type = matched_type
        # text == '' should not result in None
        token.matched_text = text.rstrip('\r\n') if text is not None else None
        token.matched_keyword = keyword
        token.matched_keyword_type = keyword_type
        if indent is not None:
            token.matched_indent = indent
        else:
            token.matched_indent = token.line.indent if token.line else 0
        token.matched_items = items
        token.location['column'] = token.matched_indent + 1
        token.matched_gherkin_dialect = self.dialect_name

    def _change_dialect(self, dialect_name, location=None):
        """Switch to ``dialect_name`` and rebuild the keyword-type table.

        Raises NoSuchLanguageException when the dialect lookup returns a
        falsy value.
        """
        dialect = Dialect.for_name(dialect_name)
        if not dialect:
            raise NoSuchLanguageException(dialect_name, location)

        self.dialect_name = dialect_name
        self.dialect = dialect
        # NOTE(review): keyword_types is populated here but not read by any
        # method of this class - confirm whether step tokens should carry it.
        self.keyword_types = defaultdict(list)
        for keyword in self.dialect.given_keywords:
            self.keyword_types[keyword].append('Context')
        for keyword in self.dialect.when_keywords:
            self.keyword_types[keyword].append('Action')
        for keyword in self.dialect.then_keywords:
            self.keyword_types[keyword].append('Outcome')
        for keyword in self.dialect.and_keywords + self.dialect.but_keywords:
            self.keyword_types[keyword].append('Conjunction')

    def _unescaped_docstring(self, text):
        """Undo escaping of the active fence inside doc string content."""
        if self._active_doc_string_separator == '"""':
            return text.replace('\\"\\"\\"', '"""')
        elif self._active_doc_string_separator == '```':
            return text.replace('\\`\\`\\`', '```')
        else:
            return text



class TokenMatcher(object):
LANGUAGE_RE = re.compile(r"^\s*#\s*language\s*:\s*([a-zA-Z\-_]+)\s*$")
Expand Down
Empty file added gherkin/python/test/__init__.py
Empty file.
Loading