
Commit

[Build] Better fix for break (#871)
This is an improvement on #867, which addressed a build issue by excluding the latest release of `llama-cpp-python` when setting up the GitHub runner machines. The assertion checks a self-consistency condition, but a failure need not be fatal (as evidenced by the fact that we turn the check off for Phi-3 models). However, if the check does fail, it is useful to have that on record in bug reports for any subsequent failures. Hence, switch from `assert` to `warnings.warn()`.
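Since the failed check now surfaces as a `UserWarning` (the default category for `warnings.warn()` called with a plain string) rather than an `AssertionError`, callers can decide how loud it should be using the standard library's warning filters. A minimal sketch of two obvious options, using only stock `warnings` calls; the generation step itself is elided:

```python
import warnings

# Option A: while debugging a generation problem, promote the warning back
# into a hard failure so the self-consistency break is impossible to miss.
warnings.filterwarnings("error", category=UserWarning)

# Option B: record warnings during a run so they can be pasted into a bug report.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # ... run the model / generation code here ...
    for w in caught:
        print(f"{w.category.__name__}: {w.message}")
```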
riedgar-ms authored Jun 1, 2024
1 parent c009b1a commit 15ff21d
Showing 5 changed files with 16 additions and 6 deletions.
.github/workflows/action_gpu_unit_tests.yml (2 changes: 1 addition & 1 deletion)
@@ -57,7 +57,7 @@ jobs:
run: |
pip install accelerate
pip uninstall -y llama-cpp-python
- CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.76"
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/action_plain_unit_tests.yml (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ jobs:
run: |
pip install sentencepiece
pip uninstall -y llama-cpp-python
pip install "llama-cpp-python!=0.2.58,!=0.2.76"
pip install "llama-cpp-python!=0.2.58"
- name: Run tests (except server)
shell: bash
run: |
.github/workflows/ci_tests.yml (2 changes: 1 addition & 1 deletion)
@@ -53,7 +53,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
- CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.76"
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/notebook_tests.yml (2 changes: 1 addition & 1 deletion)
@@ -56,7 +56,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
- CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.76"
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
guidance/models/_model.py (14 changes: 12 additions & 2 deletions)
@@ -4,8 +4,10 @@
import logging
import queue
import re
+ import textwrap
import threading
import time
+ import warnings


from pprint import pprint
@@ -838,8 +840,16 @@ def _cleanup_tokens(self, token_ids, token_byte_positions):

# another ugly hack for tokenizers that are not stable on encode/decode cycles
# currently only Phi-3, should generalize this method if we see more of these
-        if not hasattr(self, "_disable_retokenize_check"):
-            assert token_byte_positions[-1] == last_pos, "Cross check last_pos"
+        if token_byte_positions[-1] != last_pos:
+            if not hasattr(self, "_disable_retokenize_check"):
+                msg = textwrap.dedent(
+                    """Self-consistency check in _cleanup_tokens() failed.
+                    This is not a fatal issue, but if there are subsequent
+                    generation problems, please include this warning in
+                    your bug report."""
+                )
+                warnings.warn(msg)

return token_ids, token_byte_positions
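
As the diff above shows, the escape hatch for known-unstable tokenizers is unchanged: `_cleanup_tokens()` only checks for the presence of a `_disable_retokenize_check` attribute, so setting it (to any value) suppresses the warning just as it previously suppressed the assert. A small hypothetical helper to illustrate the pattern; the `engine` parameter stands in for whatever object defines `_cleanup_tokens()`:

```python
def disable_retokenize_check(engine) -> None:
    """Hypothetical helper: opt a model/engine out of the retokenization
    self-consistency check (as is already done for Phi-3 models). Only the
    presence of the attribute matters to _cleanup_tokens(); its value is ignored."""
    engine._disable_retokenize_check = True
```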

Expand Down
