Commit 386541d: semtok
perrinjerome committed Jun 23, 2024
1 parent 52277d1
Showing 5 changed files with 71 additions and 116 deletions.
151 changes: 66 additions & 85 deletions server/buildoutls/semantic_tokens.py
@@ -1,15 +1,13 @@
 from typing import Literal, TypeAlias
 import logging
 from io import StringIO
 import pygments
 
 from lsprotocol.types import SemanticTokens, Range, Position
 import attrs
 import pygments.lexers
 import pygments.token
 
-from .types import SEMANTIC_TOKENS
-from .buildout import BuildoutProfile, BuildoutTemplate
+from .buildout import BuildoutProfile
 from .recipes import RecipeOptionKind


@@ -47,43 +45,50 @@ def token_type(self):
   }.get(self.kind, 0)


-def get_python_tokens(source_code: str) -> list[int]:
-  data = []
-  lexer = pygments.lexers.get_lexer_by_name("python")
-  for token in lexer.get_tokens_unprocessed(source_code):
-    pass
-  return data
+# https://microsoft.github.io/language-server-protocol/specifications/lsp/3.18/specification/#textDocument_semanticTokens
+SEMANTIC_TOKEN_TYPES = [
+  "comment",
+  "string",
+  "number",
+  "keyword",
+  "class",
+  "function",
+  "variable",
+  "type",
+]
 
-def get_token_type(token_pygment_type: pygments.token._TokenType) -> int:
-  from pygments.token import Keyword, Comment, String, Number, Token, Name
+token_type_by_type = {t: SEMANTIC_TOKEN_TYPES.index(t) for t in SEMANTIC_TOKEN_TYPES}
+
+
+def get_token_type(token_pygment_type: pygments.token._TokenType) -> int | None:
+  from pygments.token import Keyword, Comment, String, Number, Name
 
-  if token_pygment_type in Keyword:
-    return 15
   if token_pygment_type in Comment:
-    return 17
+    return token_type_by_type["comment"]
   if token_pygment_type in String:
-    return 18
+    return token_type_by_type["string"]
   if token_pygment_type in Number:
-    return 19
+    return token_type_by_type["number"]
   if token_pygment_type in Name.Class:
-    return 2
+    return token_type_by_type["class"]
   if token_pygment_type in Name.Function:
-    return 12
-  return 0
+    return token_type_by_type["function"]
+  if token_pygment_type in Name.Builtin or token_pygment_type in Keyword.Constant:
+    return token_type_by_type["type"]
+  if token_pygment_type in Name:
+    return token_type_by_type["variable"]
+  if token_pygment_type in Keyword:
+    return token_type_by_type["keyword"]
+  return None
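Editor's note: the hard-coded return values in the old version (15, 17, 18, 19, …) were indices into the SEMANTIC_TOKENS legend that this commit deletes from types.py; token_type_by_type replaces them with indices into the new, shorter SEMANTIC_TOKEN_TYPES legend. As a sketch of how the new mapping behaves (illustration only, not part of the commit; assumes pygments is installed — get_tokens_unprocessed yields (index, token_type, text) tuples):

  import pygments.lexers

  lexer = pygments.lexers.get_lexer_by_name("python")
  for _, ttype, text in lexer.get_tokens_unprocessed("x = 1  # hi"):
      print(repr(text), get_token_type(ttype))
  # 'x'    -> 6 ("variable", since a plain Name token matches the Name check)
  # '1'    -> 2 ("number")
  # '# hi' -> 0 ("comment")
  # '=' and whitespace -> None (no semantic token is emitted for them)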


 def get_semantic_tokens_and_semantic_errors(
-  parsed: BuildoutProfile | BuildoutTemplate | None,
+  parsed: BuildoutProfile,
 ) -> tuple[SemanticTokens, list]:
   data: list[int] = []
   errors: list = []
 
-  if not isinstance(parsed, BuildoutProfile):
-    return SemanticTokens(data=data), errors
-
   prev_line, prev_character = 0, 0
 
   for section_value in parsed.values():
     if recipe := section_value.getRecipe():
       for option_key, option_value in section_value.items():
@@ -92,22 +97,18 @@ def get_semantic_tokens_and_semantic_errors(
         if option_definition.kind == RecipeOptionKind.PythonScript:
           lexer = pygments.lexers.get_lexer_by_name("python")
 
-          source_code = (
-            option_value.value
-          )  # XXX isn't this too much white space normalized ?
-
-          # XXX character_adjust is wrong, we want the indentation
-          character_adjust = option_value.location.range.start.character
-          line_adjust = option_value.location.range.start.line
-          character = character_adjust
-          line = line_adjust
-          prev_line, prev_character = line, character
+          source_code = option_value.value
+          delta_line = option_value.location.range.start.line + 1
+          character_adjust = delta_start = 4  # TODO indentation
 
           for pos, token_pygment_type, token_text in lexer.get_tokens_unprocessed(
             source_code
           ):
-            line = prev_line
-            character = prev_character
-            if token_text in ("\n", "\r", "\r\n"):
-              line += 1
-              character = character_adjust
-            length = len(token_text)
 
             # A specific token i in the file consists of the following array indices:
             #
             # at index 5*i - deltaLine: token line number, relative to the previous token
@@ -119,56 +120,36 @@ def get_semantic_tokens_and_semantic_errors(
             # SemanticTokensLegend.tokenTypes. We currently ask that tokenType < 65536.
             # at index 5*i+4 - tokenModifiers: each set bit will be looked up in
             # SemanticTokensLegend.tokenModifiers
-            tok = [
-              line - prev_line,
-              character,
-              length,
-              get_token_type(token_pygment_type),
-              0,
-            ]
-            logger.info("adding token %s", tok)
-            data.extend(tok)
-            prev_line = line
-            prev_character = character + length
-
-          try:
-            raise SyntaxError("skip")
-            source_code = option_value.value
-            for token in get_python_tokens(source_code):
-              line, character = token.start.line, token.start.character
-
-              if line == prev_line:
-                pos = character - prev_character
-              else:
-                pos = character
-
-              length = len(token.text)
-
-              # A specific token i in the file consists of the following array indices:
-              #
-              # at index 5*i - deltaLine: token line number, relative to the previous token
-              # at index 5*i+1 - deltaStart: token start character, relative to the previous token (relative to 0 or the previous token’s start if they are on the same line)
-              # at index 5*i+2 - length: the length of the token.
-              # at index 5*i+3 - tokenType: will be looked up in SemanticTokensLegend.tokenTypes. We currently ask that tokenType < 65536.
-              # at index 5*i+4 - tokenModifiers: each set bit will be looked up in SemanticTokensLegend.tokenModifiers
-              data.extend(
-                [
-                  line - prev_line,
-                  pos,
-                  length,
-                  token.token_type,
-                  0,
-                ]
-              )
-              prev_line = line
-              prev_character = character + length
-
-              # module = ast.parse(
-              #   source=option_value.value,
-              #   filename="<option>",
-              # )
-          except SyntaxError as e:
-            pass
+            token_type = get_token_type(token_pygment_type)
+            if token_type is not None:
+              # explode a token spanning multiple lines into multiple tokens
+              for token_text_line in token_text.splitlines():
+                tok = [
+                  delta_line,
+                  delta_start,
+                  len(token_text_line),
+                  token_type,
+                  0,
+                ]
+                if 1:
+                  logger.info(
+                    f"🌞 token {delta_line=}, {delta_start=} {tok=} {token_text=} {token_text_line=} {token_pygment_type=}"
+                  )
+                data.extend(tok)
+                delta_line = 1
+                delta_start = character_adjust
+              delta_line = 0
+              delta_start = len(token_text)
+            else:
+              if line_count := (token_text.replace("\r\n", "\n").count("\n")):
+                delta_line += line_count
+                delta_start = character_adjust
+              else:
+                delta_start += len(token_text)
+              if 0:
+                logger.info(
+                  f"skipped token {line_count=} {delta_line=}, {delta_start=} {token_text=} {token_pygment_type=}"
+                )
 
           # prev_line, prev_character = params.prev_line, params.prev_character
           # this_range = Range(
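Editor's note — a worked example of the relative encoding described in the comment above (illustration only, not part of the commit; uses the new legend, where "comment" is 0 and "number" is 2). A comment of length 5 at line 2, column 4, followed by a number of length 3 at line 2, column 12, encodes as:

  data = [2, 4, 5, 0, 0,   # deltaLine 2 from document start, deltaStart 4
          0, 8, 3, 2, 0]   # same line, so deltaStart is 12 - 4 = 8

The per-line reset in the loop above (delta_line = 1, delta_start = character_adjust) applies the same rule when a multi-line token is exploded into one token per line: each continuation piece is one line below the previous one, and its start is measured from column 0.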
6 changes: 4 additions & 2 deletions server/buildoutls/server.py
@@ -771,12 +771,14 @@ async def lsp_document_link(

 @server.feature(
   TEXT_DOCUMENT_SEMANTIC_TOKENS_FULL,
-  SemanticTokensLegend(token_types=types.SEMANTIC_TOKENS, token_modifiers=[]),
+  SemanticTokensLegend(
+    token_types=semantic_tokens.SEMANTIC_TOKEN_TYPES, token_modifiers=[]
+  ),
 )
 async def lsp_semantic_tokens_full(
   ls: LanguageServer, params: SemanticTokensParams
 ) -> SemanticTokens:
-  parsed = await buildout.open(ls, params.text_document.uri)
+  parsed = await buildout.parse(ls, params.text_document.uri)
 
   tokens, errors = semantic_tokens.get_semantic_tokens_and_semantic_errors(
     parsed,
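Editor's note: the legend registered with the feature must list token types in exactly the order used to produce the tokenType indices in the data array, which is why it now points at semantic_tokens.SEMANTIC_TOKEN_TYPES instead of the deleted types.SEMANTIC_TOKENS. A sketch of how a client would decode a response against this legend (illustration only; not part of the commit):

  line = char = 0
  for i in range(0, len(tokens.data), 5):
      delta_line, delta_start, length, token_type, _modifiers = tokens.data[i:i + 5]
      line += delta_line
      # deltaStart is absolute on a new line, relative on the same line
      char = delta_start if delta_line else char + delta_start
      print(f"{line}:{char} len={length} {SEMANTIC_TOKEN_TYPES[token_type]}")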
1 change: 0 additions & 1 deletion server/buildoutls/tests/test_semantic_tokens.py
@@ -1,7 +1,6 @@
 from lsprotocol.types import (
   TextDocumentIdentifier,
   SemanticTokensParams,
-  SemanticTokensRangeParams,
 )
 from pygls.server import LanguageServer

28 changes: 0 additions & 28 deletions server/buildoutls/types.py
@@ -37,31 +37,3 @@ class OpenPypiPageCommandParams(TypedDict):
 class UpdateMD5SumCommandParams(TypedDict):
   document_uri: str
   section_name: str
-
-
-# https://microsoft.github.io/language-server-protocol/specifications/lsp/3.18/specification/#textDocument_semanticTokens
-SEMANTIC_TOKENS: List[str] = [
-  "namespace",
-  "type",
-  "class",
-  "enum",
-  "interface",
-  "struct",  # 5
-  "typeParameter",
-  "parameter",
-  "variable",
-  "property",
-  "enumMember",  # 10
-  "event",
-  "function",
-  "method",
-  "macro",
-  "keyword",  # 15
-  "modifier",
-  "comment",
-  "string",
-  "number",
-  "regexp",
-  "operator",
-  "decorator",
-]
1 change: 1 addition & 0 deletions server/setup.py
@@ -48,6 +48,7 @@
"pytest-cov",
"pytest",
"types-cachetools",
"types-pygments",
"types-setuptools",
"types-toml",
"ruff",
