Merge pull request #72 from MontiCore/unicode_identifier

simplify Name-token and keyword detection, fix fragmented identifiers
MontiCore · Dec 30, 2024 · bb23e03 · bb23e03
2 parents 992bf30 + 569afa4
commit bb23e03
Show file tree

Hide file tree

Showing 17 changed files with 214 additions and 190 deletions.
diff --git a/monticore-generator/src/main/java/de/monticore/codegen/mc2cd/TransformationHelper.java b/monticore-generator/src/main/java/de/monticore/codegen/mc2cd/TransformationHelper.java
@@ -14,7 +14,6 @@
 import de.monticore.cdbasis._ast.ASTCDDefinition;
 import de.monticore.cdbasis._ast.ASTCDType;
 import de.monticore.grammar.MCGrammarSymbolTableHelper;
-import de.monticore.grammar.RegExpBuilder;
 import de.monticore.grammar.grammar._ast.*;
 import de.monticore.grammar.grammar._symboltable.AdditionalAttributeSymbol;
 import de.monticore.grammar.grammar._symboltable.MCGrammarSymbol;
@@ -38,19 +37,19 @@
 import java.io.IOException;
 import java.util.*;
 import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
 import java.util.stream.Collectors;
 
 public final class TransformationHelper {
 
-  public static final String DEFAULT_FILE_EXTENSION = ".java";
-
   public static final String AST_PREFIX = "AST";
 
   public static final String LIST_SUFFIX = "s";
 
   public static final int STAR = -1;
 
+  @Deprecated // Use LexNamer.NAME_PATTERN after release 7.7.0
+  public static final Pattern NAME_PATTERN = Pattern.compile("([a-z]|[A-Z]|[_]|[$])([a-z]|[A-Z]|[_]|[0-9]|[$])*");
+
   protected static List<String> reservedCdNames = Arrays.asList(
       // CD4A
       "derived",
@@ -609,38 +608,5 @@ public static Optional<Integer> getMax(ASTAdditionalAttribute ast) {
     }
     return Optional.empty();
   }
-
-  public static boolean isFragment(ASTProd astNode) {
-    return !(astNode instanceof ASTLexProd)
-        || ((ASTLexProd) astNode).isFragment();
-  }
-
-  public static Optional<Pattern> calculateLexPattern(MCGrammarSymbol grammar,
-      ASTLexProd lexNode) {
-    Optional<Pattern> ret = Optional.empty();
-
-    final String lexString = getLexString(grammar, lexNode);
-    try {
-      if ("[[]".equals(lexString)) {
-        return Optional.ofNullable(Pattern.compile("[\\[]"));
-      } else {
-        return Optional.ofNullable(Pattern.compile(lexString));
-      }
-    } catch (PatternSyntaxException e) {
-      Log.error("0xA0913 Internal error with pattern handling for lex rules. Pattern: " + lexString
-          + "\n", e);
-    }
-    return ret;
-  }
-
-  protected static String getLexString(MCGrammarSymbol grammar, ASTLexProd lexNode) {
-    StringBuilder builder = new StringBuilder();
-    RegExpBuilder regExp = new RegExpBuilder(builder, grammar);
-    Grammar_WithConceptsTraverser traverser = Grammar_WithConceptsMill.traverser();
-    traverser.add4Grammar(regExp);
-    traverser.setGrammarHandler(regExp);
-    lexNode.accept(traverser);
-    return builder.toString();
-  }
-
+
 }
diff --git a/monticore-generator/src/main/java/de/monticore/codegen/parser/MCGrammarInfo.java b/monticore-generator/src/main/java/de/monticore/codegen/parser/MCGrammarInfo.java
@@ -3,9 +3,6 @@
 package de.monticore.codegen.parser;
 
 import com.google.common.collect.*;
-import de.monticore.codegen.cd2java.DecorationHelper;
-import de.monticore.codegen.mc2cd.TransformationHelper;
-import de.monticore.grammar.DirectLeftRecursionDetector;
 import de.monticore.grammar.LexNamer;
 import de.monticore.grammar.MCGrammarSymbolTableHelper;
 import de.monticore.grammar.PredicatePair;
@@ -24,10 +21,8 @@
 
 import java.util.*;
 import java.util.Map.Entry;
-import java.util.regex.Pattern;
 
-import static de.monticore.codegen.mc2cd.TransformationHelper.calculateLexPattern;
-import static de.monticore.codegen.mc2cd.TransformationHelper.isFragment;
+import static de.monticore.codegen.mc2cd.TransformationHelper.NAME_PATTERN;
 
 /**
  * Contains information about a grammar which is required for the parser
@@ -39,23 +34,18 @@ public class MCGrammarInfo {
    * Keywords of the processed grammar and its super grammars
    */
   protected Set<String> keywords = Sets.newLinkedHashSet();
-
-  /**
-   * Lexer patterns
-   */
-  protected Map<MCGrammarSymbol, List<Pattern>> lexerPatterns = new LinkedHashMap<>();
 
   /**
    * Additional java code for parser defined in antlr concepts of the processed
    * grammar and its super grammars
    */
-  protected List<String> additionalParserJavaCode = new ArrayList<String>();
+  protected List<String> additionalParserJavaCode = new ArrayList<>();
 
   /**
    * Additional java code for lexer defined in antlr concepts of the processed
    * grammar and its super grammars
    */
-  protected List<String> additionalLexerJavaCode = new ArrayList<String>();
+  protected List<String> additionalLexerJavaCode = new ArrayList<>();
 
   /**
    * Predicates
@@ -78,7 +68,6 @@ public class MCGrammarInfo {
 
   public MCGrammarInfo(MCGrammarSymbol grammarSymbol) {
     this.grammarSymbol = grammarSymbol;
-    buildLexPatterns();
     findAllKeywords();
     grammarSymbol.getTokenRulesWithInherited().forEach(t -> addSplitRule(t));
     grammarSymbol.getKeywordRulesWithInherited().forEach(k -> keywordRules.add(k));
@@ -117,7 +106,7 @@ protected void addSubRules() {
     for (MCGrammarSymbol grammar : grammarsToHandle) {
       HashMap<String, List<ASTRuleReference>> ruleMap = Maps.newLinkedHashMap();
       // Collect superclasses and superinterfaces for classes
-      for (ASTClassProd classProd : ((ASTMCGrammar) grammar.getAstNode())
+      for (ASTClassProd classProd : (grammar.getAstNode())
           .getClassProdList()) {
         List<ASTRuleReference> ruleRefs = Lists.newArrayList();
         ruleRefs.addAll(classProd.getSuperRuleList());
@@ -126,17 +115,15 @@ protected void addSubRules() {
       }
 
       // Collect superclasses and superinterfaces for abstract classes
-      for (ASTAbstractProd classProd : ((ASTMCGrammar) grammar.getAstNode())
-          .getAbstractProdList()) {
+      for (ASTAbstractProd classProd : grammar.getAstNode().getAbstractProdList()) {
         List<ASTRuleReference> ruleRefs = Lists.newArrayList();
         ruleRefs.addAll(classProd.getSuperRuleList());
         ruleRefs.addAll(classProd.getSuperInterfaceRuleList());
         ruleMap.put(classProd.getName(), ruleRefs);
       }
 
       // Collect superinterfaces for interfaces
-      for (ASTInterfaceProd classProd : ((ASTMCGrammar) grammar.getAstNode())
-          .getInterfaceProdList()) {
+      for (ASTInterfaceProd classProd : grammar.getAstNode().getInterfaceProdList()) {
         List<ASTRuleReference> ruleRefs = Lists.newArrayList();
         ruleRefs.addAll(classProd.getSuperInterfaceRuleList());
         ruleMap.put(classProd.getName(), ruleRefs);
@@ -166,20 +153,6 @@ protected void addSubrule(String superrule, String subrule, ASTRuleReference rul
   }
 
 
-  protected Collection<String> addLeftRecursiveRuleForProd(ASTClassProd ast) {
-    List<ASTProd> superProds = TransformationHelper.getAllSuperProds(ast);
-    Collection<String> names = new ArrayList<>();
-    superProds.forEach(s -> names.add(s.getName()));
-    DirectLeftRecursionDetector detector = new DirectLeftRecursionDetector();
-    for (ASTAlt alt : ast.getAltList()) {
-      if (detector.isAlternativeLeftRecursive(alt, names)) {
-        names.add(ast.getName());
-        return names;
-      }
-    }
-    return Lists.newArrayList();
-  }
-
   /**
    * @return grammarSymbol
    */
@@ -248,45 +221,22 @@ public Set<String> getKeywords() {
   }
 
   /**
-   * Checks if the terminal or constant <code>name</code> is a and has to be
-   * defined in the parser.
+   * Checks if the terminal or constant <code>name</code> is a keyword and could
+   * be replaced by a name
    * 
    * @param name - rule to check
-   * @return true, if the terminal or constant <code>name</code> is a and has to
-   * be defined in the parser.
+   * @return true, if the terminal or constant <code>name</code> is a keyword and could
+   * be replaced by a name
    */
-  public boolean isKeyword(String name, MCGrammarSymbol grammar) {
-    boolean matches = false;
-    boolean found = false;
-
-    // Check with options
-    if (mustBeKeyword(name)) {
-      matches = true;
-      found = true;
-    }
-
-    // Automatically detect if not specified
-    if (!found && lexerPatterns.containsKey(grammar)) {
-      for (Pattern p : lexerPatterns.get(grammar)) {
-
-        if (p.matcher(name).matches()) {
-          matches = true;
-          Log.debug(name + " is considered as a keyword because it matches " + p + " "
-              + "(grammarsymtab)", MCGrammarSymbol.class.getSimpleName());
-          break;
-        }
-
-      }
-    }
-
-    return matches;
+  public boolean isKeyword(String name) {
+    return keywords.contains(name);
   }
 
   public List<PredicatePair> getSubRulesForParsing(String ruleName) {
     // Consider superclass
     Optional<ProdSymbol> ruleByName = grammarSymbol.getProdWithInherited(ruleName);
     List<PredicatePair> predicateList = Lists.newArrayList();
-    if (!ruleByName.isPresent()) {
+    if (ruleByName.isEmpty()) {
       return predicateList;
     }
 
@@ -324,9 +274,7 @@ protected void findAllKeywords() {
               }
             }
           }
-          Optional<MCGrammarSymbol> refGrammarSymbol = MCGrammarSymbolTableHelper
-              .getMCGrammarSymbol(astProd.getEnclosingScope());
-          TerminalVisitor tv = new TerminalVisitor(refGrammarSymbol);
+          TerminalVisitor tv = new TerminalVisitor();
           Grammar_WithConceptsTraverser traverser = Grammar_WithConceptsMill.traverser();
           traverser.add4Grammar(tv);
           astProd.accept(traverser);
@@ -335,59 +283,9 @@ protected void findAllKeywords() {
     }
 
   }
-
-  protected void buildLexPatterns() {
-    buildLexPatterns(grammarSymbol);
-    grammarSymbol.getSuperGrammarSymbols().forEach(g -> buildLexPatterns(g));
-  }
-
-  protected void buildLexPatterns(MCGrammarSymbol grammar) {
-    List<Pattern> patterns = lexerPatterns.get(grammar);
-    if (patterns == null) {
-      patterns = new ArrayList<>();
-      lexerPatterns.put(grammar, patterns);
-    }
-
-    for (ProdSymbol rule : grammar.getProdsWithInherited().values()) {
-      if (rule.isPresentAstNode() && rule.isIsLexerProd()) {
-        if (!isFragment(rule.getAstNode())) {
-          Optional<Pattern> lexPattern = calculateLexPattern(
-              grammar,
-                  (ASTLexProd) rule.getAstNode());
-
-          if (lexPattern.isPresent()) {
-            patterns.add(lexPattern.get());
-          }
-        }
-      }
-    }
-  }
-
-  public static String getListName(ASTNonTerminal a) {
-    String name;
-    if (a.isPresentUsageName()) {
-      name = a.getUsageName();
-    } else {
-      // Use Nonterminal name as attribute name starting with lower case
-      // for a list (iterated) nonterminal a 's' is added for the name
-      name = a.getName();
-    }
-    return name + DecorationHelper.GET_SUFFIX_LIST;
-  }
-
-
-  protected boolean mustBeKeyword(String rule) {
-    return keywords.contains(rule);
-  }
 
   protected class TerminalVisitor implements GrammarVisitor2 {
 
-    TerminalVisitor(Optional<MCGrammarSymbol> refGrammarSymbol) {
-      this.refGrammarSymbol = refGrammarSymbol;
-    }
-
-    Optional<MCGrammarSymbol> refGrammarSymbol;
-
     public GrammarTraverser getTraverser() {
       return traverser;
     }
@@ -400,16 +298,14 @@ public void setTraverser(GrammarTraverser traverser) {
 
     @Override
     public void visit(ASTTerminal keyword) {
-      if (isKeyword(keyword.getName(), grammarSymbol)
-              || (refGrammarSymbol.isPresent() && isKeyword(keyword.getName(), refGrammarSymbol.get()))) {
+      if (NAME_PATTERN.matcher(keyword.getName()).matches()) {
         keywords.add(keyword.getName());
       }
     }
 
     @Override
     public void visit(ASTConstant keyword) {
-      if (isKeyword(keyword.getName(), grammarSymbol)
-              || (refGrammarSymbol.isPresent() && isKeyword(keyword.getName(), refGrammarSymbol.get()))) {
+      if (NAME_PATTERN.matcher(keyword.getName()).matches()) {
         keywords.add(keyword.getName());
       }
     }

diff --git a/monticore-generator/src/main/java/de/monticore/codegen/parser/antlr/Grammar2Antlr.java b/monticore-generator/src/main/java/de/monticore/codegen/parser/antlr/Grammar2Antlr.java
@@ -258,7 +258,7 @@ public void handle(ASTConstantGroup ast) {
 
       if (x.isPresentKeyConstant()) {
         addToCodeSection(createKeyPredicate(x.getKeyConstant().getStringList(), tmpName + label));
-      } else if (!grammarInfo.isKeyword(x.getName(), grammarEntry)) {
+      } else if (!grammarInfo.isKeyword(x.getName())) {
         addToCodeSection(tmpName + label + parserHelper.getOrComputeLexSymbolName(x.getName()));
       } else if (grammarInfo.getKeywordRules().contains(x.getName())) {
         addToCodeSection(tmpName + label + parserHelper.getKeyRuleName(x.getName()));
@@ -378,7 +378,7 @@ public void visit(ASTTerminal ast) {
     String rulename;
     if (ast.getName().isEmpty()) {
       rulename = "";
-    } else if (grammarInfo.isKeyword(ast.getName(), grammarEntry) && grammarInfo.getKeywordRules().contains(ast.getName())) {
+    } else if (grammarInfo.isKeyword(ast.getName()) && grammarInfo.getKeywordRules().contains(ast.getName())) {
       rulename = parserHelper.getKeyRuleName(ast.getName());
     } else {
       rulename = parserHelper.getOrComputeLexSymbolName(ast.getName().intern());
@@ -890,7 +890,7 @@ boolean getASTMax(ASTNonTerminal ast) {
   protected void addActionForKeyword(ASTTerminal keyword, ProdSymbol rule, boolean isList, String tmpNamePlusLbl) {
     addToCodeSection("(");
     String rulename = "";
-    if (grammarInfo.isKeyword(keyword.getName(), grammarEntry)) {
+    if (grammarInfo.isKeyword(keyword.getName())) {
       rulename = parserHelper.getOrComputeLexSymbolName(keyword.getName());
     }
 

diff --git a/monticore-generator/src/main/java/de/monticore/codegen/parser/antlr/Grammar2ParseVisitor.java b/monticore-generator/src/main/java/de/monticore/codegen/parser/antlr/Grammar2ParseVisitor.java
@@ -636,7 +636,7 @@ public void handle(ASTTerminal node) {
   protected String getRuleName(String name) {
     if (name.isEmpty()) {
       return "";
-    } else if (grammarInfo.isKeyword(name, parserGeneratorHelper.getGrammarSymbol()) && grammarInfo.getKeywordRules().contains(name)) {
+    } else if (grammarInfo.isKeyword(name) && grammarInfo.getKeywordRules().contains(name)) {
       return parserGeneratorHelper.getKeyRuleName(name);
     } else {
       return parserGeneratorHelper.getCachedLexSymbolName(name.intern()).orElse("##no-usagename-for-rulename");
@@ -781,7 +781,7 @@ protected String getRuleName(ASTConstant constant) {
     } else if (constant.isPresentTokenConstant()) {
       return parserGeneratorHelper.getCachedLexSymbolName(constant.getTokenConstant().getString())
               .orElse("##no-usagename-rulename-tc");
-    } else if (!grammarInfo.isKeyword(constant.getName(), parserGeneratorHelper.getGrammarSymbol())) {
+    } else if (!grammarInfo.isKeyword(constant.getName())) {
       return parserGeneratorHelper.getCachedLexSymbolName(constant.getName())
               .orElse("##no-usagename-rulename-k");
     } else if (grammarInfo.getKeywordRules().contains(constant.getName())) {

diff --git a/monticore-grammar/src/main/grammars/de/monticore/MCBasics.mc4 b/monticore-grammar/src/main/grammars/de/monticore/MCBasics.mc4
@@ -20,7 +20,7 @@ component grammar MCBasics {
   token Name =
     ( 'a'..'z' | 'A'..'Z' | '_' | '$' )
     ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' | '$' )*;
-  
+
   /*=================================================================*/
 
   fragment token NEWLINE =

diff --git a/monticore-grammar/src/main/java/de/monticore/grammar/LexNamer.java b/monticore-grammar/src/main/java/de/monticore/grammar/LexNamer.java
@@ -6,6 +6,7 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import de.monticore.grammar.grammar._symboltable.MCGrammarSymbol;
 import de.se_rwth.commons.logging.Log;
@@ -16,7 +17,9 @@
  * 
  */
 public class LexNamer {
-
+
+  public static final Pattern NAME_PATTERN = Pattern.compile("([a-z]|[A-Z]|[_]|[$])([a-z]|[A-Z]|[_]|[0-9]|[$])*");
+
   protected int constantCounter = 0;
 
   protected int lexCounter = 0;