diff --git a/.gitignore b/.gitignore
index b43110ee90..d25fc170c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 .DS_Store
 .rsynced
 .idea/
+.devcontainer/
+.vscode/
 executables
 node_modules
diff --git a/gherkin/python/Makefile b/gherkin/python/Makefile
index 9df5132d12..169d207584 100644
--- a/gherkin/python/Makefile
+++ b/gherkin/python/Makefile
@@ -1,6 +1,6 @@
 SHELL := /usr/bin/env bash
-GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature")
-BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature")
+GOOD_FEATURE_FILES = $(shell find ../testdata/good -name "*.feature" -o -name "*.feature.md")
+BAD_FEATURE_FILES = $(shell find ../testdata/bad -name "*.feature" -o -name "*.feature.md")
 
 TOKENS = $(patsubst ../testdata/%.feature,acceptance/testdata/%.feature.tokens,$(GOOD_FEATURE_FILES))
 ASTS = $(patsubst ../testdata/%.feature,acceptance/testdata/%.feature.ast.ndjson,$(GOOD_FEATURE_FILES))
diff --git a/gherkin/python/gherkin/token_matcher.py b/gherkin/python/gherkin/token_matcher.py
index 259de08de1..01d8139b2e 100644
--- a/gherkin/python/gherkin/token_matcher.py
+++ b/gherkin/python/gherkin/token_matcher.py
@@ -3,6 +3,17 @@
 from .dialect import Dialect
 from .errors import NoSuchLanguageException
 
+# Source: https://stackoverflow.com/a/8348914
+try:
+    import textwrap
+    textwrap.indent
+except AttributeError:  # undefined function (wasn't added until Python 3.3)
+    def indent(text, amount, ch=' '):
+        padding = amount * ch
+        return ''.join(padding + line for line in text.splitlines(True))
+else:
+    def indent(text, amount, ch=' '):
+        return textwrap.indent(text, amount * ch)
 
 class TokenMatcher(object):
     LANGUAGE_RE = re.compile(r"^\s*#\s*language\s*:\s*([a-zA-Z\-_]+)\s*$")
diff --git a/gherkin/python/gherkin/token_matcher_markdown.py b/gherkin/python/gherkin/token_matcher_markdown.py
new file mode 100644
index 0000000000..3172001805
--- /dev/null
+++ b/gherkin/python/gherkin/token_matcher_markdown.py
@@ -0,0 +1,231 @@
+import re
+from collections import defaultdict
+from .dialect import Dialect
+from .errors import NoSuchLanguageException
+
+KEYWORD_PREFIX_BULLET = '^(\\s*[*+-]\\s*)'
+KEYWORD_PREFIX_HEADER = '^(#{1,6}\\s)'
+
+class GherkinInMarkdownTokenMatcher(object):
+    LANGUAGE_RE = re.compile(r"^\s*#\s*language\s*:\s*([a-zA-Z\-_]+)\s*$")
+
+    def __init__(self, dialect_name='en'):
+        self._default_dialect_name = dialect_name
+        self._change_dialect(dialect_name)
+        self.reset()
+
+    def reset(self):
+        if self.dialect_name != self._default_dialect_name:
+            self._change_dialect(self._default_dialect_name)
+        self._indent_to_remove = 0
+        self._active_doc_string_separator = None
+        self.matched_feature_line = False
+
+    def match_FeatureLine(self, token):
+
+        if(self.matched_feature_line):
+            self._set_token_matched(token, None)
+
+        # We first try to match "# Feature: blah"
+        result = self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.feature_keywords, ':', token, 'FeatureLine')
+        # If we didn't match "# Feature: blah", we still match this line
+        # as a FeatureLine.
+        # The reason for this is that users may not want to be constrained by having this as their first line.
+        if not result:
+            self._set_token_matched(token, 'FeatureLine', token.line.get_line_text())
+        self.matched_feature_line = result
+        return result
+
+    def match_RuleLine(self, token):
+        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.rule_keywords, ':', token, 'RuleLine')
+
+    def match_ScenarioLine(self, token):
+        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.scenario_keywords, ':', token, 'ScenarioLine') or self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.scenario_outline_keywords, ':', token, 'ScenarioLine')
+
+    def match_BackgroundLine(self, token):
+        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.background_keywords, ':', token, 'BackgroundLine')
+
+    def match_ExamplesLine(self, token):
+        return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.examples_keywords, ':', token, 'ExamplesLine')
+
+    def match_TableRow(self, token):
+        # Gherkin tables must be indented 2-5 spaces in order to be distinguished from non-Gherkin tables
+
+        if re.match('^\\s\\s\\s?\\s?\\s?\\|', token.line.get_line_text(0)):
+            table_cells = token.line.table_cells
+            if(self._is_gfm_table_separator(table_cells)):
+                return False
+
+            self._set_token_matched(token, 'TableRow', keyword='|', items=token.line.table_cells)
+
+            return True
+        return False
+
+    def _is_gfm_table_separator(self, table_cells):
+        text_of_table_cells = map(lambda x: x['text'], table_cells)
+        separator_values = list(filter(lambda x: re.match('^:?-+:?$', x), text_of_table_cells))
+        return len(separator_values) > 0
+
+    def match_StepLine(self, token):
+        nonStarStepKeywords = (self.dialect.given_keywords +
+                               self.dialect.when_keywords +
+                               self.dialect.then_keywords +
+                               self.dialect.and_keywords +
+                               self.dialect.but_keywords)
+        return self._match_title_line(KEYWORD_PREFIX_BULLET, nonStarStepKeywords, '', token, 'StepLine')
+
+    def match_Comment(self, token):
+        if(token.line.startswith('|')):
+            table_cells = token.line.table_cells
+            if(self._is_gfm_table_separator(table_cells)):
+                return True
+        return self._set_token_matched(token, None, False)
+
+    def match_Empty(self, token):
+
+        result = False
+        if token.line.is_empty():
+            result = True
+        if (not self.match_TagLine(token) and
+                not self.match_FeatureLine(token) and
+                not self.match_ScenarioLine(token) and
+                not self.match_BackgroundLine(token) and
+                not self.match_ExamplesLine(token) and
+                not self.match_RuleLine(token) and
+                not self.match_TableRow(token) and
+                not self.match_Comment(token) and
+                not self.match_Language(token) and
+                not self.match_DocStringSeparator(token) and
+                not self.match_EOF(token) and
+                not self.match_StepLine(token)
+                ):
+            # neutered
+            result = True
+
+        if(result):
+            self._set_token_matched(token, 'Empty', indent=0)
+            return result
+        return False
+
+    # We've made a deliberate choice not to support `# language: [ISO 639-1]` headers or similar
+    # in Markdown. Users should specify a language globally.
+    def match_Language(self, token):
+        if not token:
+            raise ValueError('no token')
+        return False
+
+    def match_TagLine(self, token):
+
+        tags = []
+        matching_tags = re.finditer('`(@[^`]+)`', token.line.get_line_text())
+        idx = 0
+        for match in matching_tags:
+            tags.append({
+                'column': token.line.indent + match.start(idx) + 2,
+                'text': match.group(1)
+            })
+
+        if(len(tags) == 0):
+            return False
+
+        self._set_token_matched(token, 'TagLine', items=tags)
+        return True
+
+    def match_DocStringSeparator(self, token):
+        if not self._active_doc_string_separator:
+            # open
+            return (self._match_DocStringSeparator(token, '"""', True) or
+                    self._match_DocStringSeparator(token, '````', True) or self._match_DocStringSeparator(token, '```', True))
+        else:
+            # close
+            return self._match_DocStringSeparator(token, self._active_doc_string_separator, False)
+
+    def _match_DocStringSeparator(self, token, separator, is_open):
+        if not token.line.startswith(separator):
+            return False
+
+        content_type = ''
+        if is_open:
+            content_type = token.line.get_rest_trimmed(len(separator))
+            self._active_doc_string_separator = separator
+            self._indent_to_remove = token.line.indent
+        else:
+            self._active_doc_string_separator = None
+            self._indent_to_remove = 0
+
+        # TODO: Use the separator as keyword. That's needed for pretty printing.
+        self._set_token_matched(token, 'DocStringSeparator', content_type, separator)
+        return True
+
+    def match_Other(self, token):
+        # take the entire line, except removing DocString indents
+        text = token.line.get_line_text(self._indent_to_remove)
+        self._set_token_matched(token, 'Other', self._unescaped_docstring(text), indent=0)
+        return True
+
+    def match_EOF(self, token):
+        if not token.eof():
+            return False
+
+        self._set_token_matched(token, 'EOF')
+        return True
+
+    def _match_title_line(self, prefix, keywords, keywordSuffix, token, token_type):
+
+        keywords_or_list = "|".join(map(lambda x: re.escape(x), keywords))
+        match = re.search(u'{}({}){}(.*)'.format(prefix, keywords_or_list, keywordSuffix), token.line.get_line_text())
+        indent = token.line.indent
+        result = False
+
+        if(match):
+            matchedKeyword = match.group(2)
+            indent += len(match.group(1))
+            self._set_token_matched(token, token_type, match.group(3).strip(), matchedKeyword, indent=indent)
+            return True
+        return False
+
+    def _set_token_matched(self, token, matched_type, text=None,
+                           keyword=None, keyword_type=None, indent=None, items=None):
+        if items is None:
+            items = []
+        token.matched_type = matched_type
+        # text == '' should not result in None
+        token.matched_text = text.rstrip('\r\n') if text is not None else None
+        token.matched_keyword = keyword
+        token.matched_keyword_type = keyword_type
+        if indent is not None:
+            token.matched_indent = indent
+        else:
+            token.matched_indent = token.line.indent if token.line else 0
+        token.matched_items = items
+        token.location['column'] = token.matched_indent + 1
+        token.matched_gherkin_dialect = self.dialect_name
+
+    def _change_dialect(self, dialect_name, location=None):
+        dialect = Dialect.for_name(dialect_name)
+        if not dialect:
+            raise NoSuchLanguageException(dialect_name, location)
+
+        self.dialect_name = dialect_name
+        self.dialect = dialect
+        self.keyword_types = defaultdict(list)
+        for keyword in self.dialect.given_keywords:
+            self.keyword_types[keyword].append('Context')
+        for keyword in self.dialect.when_keywords:
+            self.keyword_types[keyword].append('Action')
+        for keyword in self.dialect.then_keywords:
+            self.keyword_types[keyword].append('Outcome')
+        for keyword in self.dialect.and_keywords + self.dialect.but_keywords:
+            self.keyword_types[keyword].append('Conjunction')
+
+    def _unescaped_docstring(self, text):
+        if self._active_doc_string_separator == '"""':
+            return text.replace('\\"\\"\\"', '"""')
+        elif self._active_doc_string_separator == '```':
+            return text.replace('\\`\\`\\`', '```')
+        else:
+            return text
\ No newline at end of file
diff --git a/gherkin/python/test/__init__.py b/gherkin/python/test/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gherkin/python/test/gherkin_in_markdown_token_matcher_test.py b/gherkin/python/test/gherkin_in_markdown_token_matcher_test.py
new file mode 100644
index 0000000000..82a28215bb
--- /dev/null
+++ b/gherkin/python/test/gherkin_in_markdown_token_matcher_test.py
@@ -0,0 +1,232 @@
+# coding=utf-8
+
+import pytest
+from gherkin.token import Token
+from gherkin.token_matcher_markdown import GherkinInMarkdownTokenMatcher
+from gherkin.gherkin_line import GherkinLine
+location = { 'line': 1, 'column': 1 }
+
+def test_it_matches_FeatureLine():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Feature: hello''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_FeatureLine(token)
+    assert token.matched_type == 'FeatureLine'
+    assert token.matched_keyword == 'Feature'
+    assert token.matched_text == 'hello'
+
+def test_it_matches_FeatureLine_in_French():
+    tm = GherkinInMarkdownTokenMatcher('fr')
+    line = GherkinLine(u'''## Fonctionnalité: hello''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_FeatureLine(token)
+    assert token.matched_type == 'FeatureLine'
+    assert token.matched_keyword == u'Fonctionnalité'
+    assert token.matched_text == 'hello'
+
+def test_it_matches_bullet_Step():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''   * Given I have 3 cukes''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_StepLine(token)
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'Given '
+    assert token.matched_text == 'I have 3 cukes'
+    assert token.location['column'] == 6
+
+def test_it_matches_plus_Step():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''   + Given I have 3 cukes''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_StepLine(token)
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'Given '
+    assert token.matched_text == 'I have 3 cukes'
+    assert token.location['column'] == 6
+
+def test_it_matches_hyphen_Step():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''   - Given I have 3 cukes''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_StepLine(token)
+    assert token.matched_type == 'StepLine'
+    assert token.matched_keyword == 'Given '
+    assert token.matched_text == 'I have 3 cukes'
+    assert token.location['column'] == 6
+
+def test_it_matches_arbitrary_text_as_Other():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''Whatever''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_Other(token)
+    assert token.matched_type == 'Other'
+
+def test_it_matches_a_non_keyword_line_as_Other():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''* whatever Given''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_Other(token)
+    assert token.matched_type == 'Other'
+
+def test_it_matches_a_non_keyword_header_line_as_Other():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## The world is wet''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_Other(token)
+    assert token.matched_type == 'Other'
+
+def test_it_matches_3_ticks_docstring_separator():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''  ```somefink''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_DocStringSeparator(token)
+    assert token.matched_type == 'DocStringSeparator'
+    assert token.matched_keyword == '```'
+    assert token.matched_text == 'somefink'
+
+def test_it_matches_4_ticks_docstring_separator():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''  ````''', location['line'])
+    t1 = Token(gherkin_line=line, location=location)
+    assert tm.match_DocStringSeparator(t1)
+    assert t1.matched_type == 'DocStringSeparator'
+    assert t1.matched_keyword == '````'
+    assert t1.matched_indent == 2
+    assert t1.matched_text == ''
+
+    t2 = Token(gherkin_line=GherkinLine('''  ```''', location['line']), location=location)
+    assert tm.match_Other(t2)
+    assert t2.matched_type == 'Other'
+    assert t2.matched_keyword == None
+    assert t2.matched_text == '```'
+
+    t3 = Token(gherkin_line=GherkinLine('''  ````''', location['line']), location=location)
+    assert tm.match_DocStringSeparator(t3)
+    assert t3.matched_type == 'DocStringSeparator'
+    assert t3.matched_keyword == '````'
+    assert t3.matched_indent == 2
+    assert t3.matched_text == ''
+
+def test_it_matches_table_row_indented_2_spaces():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    gherkin_line = GherkinLine('''  |foo|bar|''', location['line'])
+    token = Token(gherkin_line, location)
+    assert tm.match_TableRow(token)
+    assert token.matched_type == 'TableRow'
+    assert token.matched_keyword == '|'
+    expected_items = [
+        {'column': 4, 'text': 'foo'},
+        {'column': 8, 'text': 'bar'}
+    ]
+    assert token.matched_items == expected_items
+
+def test_it_matches_table_row_indented_5_spaces():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    gherkin_line = GherkinLine('''     |foo|bar|''', location['line'])
+    token = Token(gherkin_line, location)
+    assert tm.match_TableRow(token)
+    assert token.matched_type == 'TableRow'
+    assert token.matched_keyword == '|'
+    expected_items = [
+        {'column': 7, 'text': 'foo'},
+        {'column': 11, 'text': 'bar'}
+    ]
+    assert token.matched_items == expected_items
+
+def test_it_does_not_match_table_row_indented_1_space():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    gherkin_line = GherkinLine(''' |foo|bar|''', location['line'])
+    token = Token(gherkin_line, location)
+    assert not tm.match_TableRow(token)
+
+def test_it_does_not_match_table_row_indented_6_space():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    gherkin_line = GherkinLine('''      |foo|bar|''', location['line'])
+    token = Token(gherkin_line, location)
+    assert not tm.match_TableRow(token)
+
+def test_it_matches_table_separator_row_as_comment():
+    tm = GherkinInMarkdownTokenMatcher('en')
+
+    l1 = GherkinLine('  | h1 | h2 |', location['line'])
+    t1 = Token(l1, location)
+    assert tm.match_TableRow(t1)
+
+    l2 = GherkinLine('  | --- | --- |', location['line'])
+    t2 = Token(l2, location)
+    assert not tm.match_TableRow(t2)
+    assert tm.match_Comment(t2)
+
+def test_it_matches_indented_tags():
+    tm = GherkinInMarkdownTokenMatcher('en')
+
+    l1 = GherkinLine('  `@foo` `@bar`', location['line'])
+    t1 = Token(l1, location)
+    assert tm.match_TagLine(t1)
+
+    assert t1.matched_type == 'TagLine'
+    expected_items = [
+        {'column': 4, 'text': '@foo'},
+        {'column': 11, 'text': '@bar'}
+    ]
+    assert t1.matched_items == expected_items
+
+def test_it_matches_unindented_tags():
+    tm = GherkinInMarkdownTokenMatcher('en')
+
+    l1 = GherkinLine('`@foo`   `@bar`', location['line'])
+    t1 = Token(l1, location)
+    assert tm.match_TagLine(t1)
+
+    assert t1.matched_type == 'TagLine'
+    expected_items = [
+        {'column': 2, 'text': '@foo'},
+        {'column': 11, 'text': '@bar'}
+    ]
+    assert t1.matched_items == expected_items
+
+def test_it_matches_RuleLine():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Rule: the world''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_RuleLine(token)
+    assert token.matched_type == 'RuleLine'
+    assert token.matched_keyword == 'Rule'
+    assert token.matched_text == 'the world'
+
+def test_it_matches_ScenarioLine():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Scenario: the one where''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_ScenarioLine(token)
+    assert token.matched_type == 'ScenarioLine'
+    assert token.matched_keyword == 'Scenario'
+    assert token.matched_text == 'the one where'
+
+def test_it_matches_ScenarioLine_outline():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Scenario Outline: the ones where''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_ScenarioLine(token)
+    assert token.matched_type == 'ScenarioLine'
+    assert token.matched_keyword == 'Scenario Outline'
+    assert token.matched_text == 'the ones where'
+
+def test_it_matches_backgroundLine():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Background: once upon a time''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_BackgroundLine(token)
+    assert token.matched_type == 'BackgroundLine'
+    assert token.matched_keyword == 'Background'
+    assert token.matched_text == 'once upon a time'
+
+def test_it_matches_ExamplesLine():
+    tm = GherkinInMarkdownTokenMatcher('en')
+    line = GherkinLine('''## Examples: ''', location['line'])
+    token = Token(gherkin_line=line, location=location)
+    assert tm.match_ExamplesLine(token)
+    assert token.matched_type == 'ExamplesLine'
+    assert token.matched_keyword == 'Examples'
+    assert token.matched_text == ''
\ No newline at end of file
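
Usage sketch (not part of the diff above): how the new Markdown matcher could be wired into the existing gherkin Python parser. This is a minimal illustration that assumes Parser.parse accepts a token matcher as its second argument, that TokenScanner accepts a source string, and that the returned AST is a dict with a 'feature' entry, mirroring the plain-Gherkin TokenMatcher code path; treat those signatures as assumptions rather than part of this change.

    # Minimal sketch (assumed API): parse Markdown-embedded Gherkin with the new matcher.
    from gherkin.parser import Parser
    from gherkin.token_scanner import TokenScanner
    from gherkin.token_matcher_markdown import GherkinInMarkdownTokenMatcher

    # A small .feature.md-style source: headers for Feature/Scenario, a bullet for the step.
    source = "\n".join([
        "# Feature: hello",
        "## Scenario: hi",
        "   * Given I have 3 cukes",
        "",
    ])

    parser = Parser()
    # Assumption: parse(token_scanner_or_str, token_matcher) as in the non-Markdown path.
    ast = parser.parse(TokenScanner(source), GherkinInMarkdownTokenMatcher('en'))
    print(ast['feature']['name'])  # expected: 'hello'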