Merge branch 'main' into chore/bump-version-to-0.15.0

langgenius · Jan 7, 2025 · 29402f1 · 29402f1
2 parents 7769fdb + 41f39bf
commit 29402f1
Show file tree

Hide file tree

Showing 7 changed files with 105 additions and 34 deletions.
diff --git a/api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py b/api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py
@@ -1,13 +1,10 @@
-from concurrent.futures import ProcessPoolExecutor
-from os.path import abspath, dirname, join
 from threading import Lock
-from typing import Any, cast
+from typing import Any
 
-from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer  # type: ignore
+import tiktoken
 
 _tokenizer: Any = None
 _lock = Lock()
-_executor = ProcessPoolExecutor(max_workers=1)
 
 
 class GPT2Tokenizer:
@@ -17,22 +14,28 @@ def _get_num_tokens_by_gpt2(text: str) -> int:
         use gpt2 tokenizer to get num tokens
         """
         _tokenizer = GPT2Tokenizer.get_encoder()
-        tokens = _tokenizer.encode(text, verbose=False)
+        tokens = _tokenizer.encode(text)
         return len(tokens)
 
     @staticmethod
     def get_num_tokens(text: str) -> int:
-        future = _executor.submit(GPT2Tokenizer._get_num_tokens_by_gpt2, text)
-        result = future.result()
-        return cast(int, result)
+        # The process pool costs more CPU, so count tokens in-process until we find a better way to handle it.
+        #
+        # future = _executor.submit(GPT2Tokenizer._get_num_tokens_by_gpt2, text)
+        # result = future.result()
+        # return cast(int, result)
+        return GPT2Tokenizer._get_num_tokens_by_gpt2(text)
 
     @staticmethod
     def get_encoder() -> Any:
         global _tokenizer, _lock
         with _lock:
             if _tokenizer is None:
-                base_path = abspath(__file__)
-                gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
-                _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
+                # Try to use tiktoken to get the tokenizer because it is faster
+                #
+                _tokenizer = tiktoken.get_encoding("gpt2")
+                # base_path = abspath(__file__)
+                # gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
+                # _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
 
             return _tokenizer
diff --git a/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py b/api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py
@@ -377,7 +377,10 @@ def _generate(
                 for tool in tools:
                     formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))
 
-                data["tools"] = formatted_tools
+                if prompt_messages[-1].role.value == "tool":
+                    data["tools"] = None
+                else:
+                    data["tools"] = formatted_tools
 
         if stop:
             data["stop"] = stop

diff --git a/api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.5.yaml b/api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.5.yaml
@@ -0,0 +1,37 @@
+model: fishaudio/fish-speech-1.5
+model_type: tts
+model_properties:
+  default_voice: 'fishaudio/fish-speech-1.5:alex'
+  voices:
+    - mode: "fishaudio/fish-speech-1.5:alex"
+      name: "Alex（男声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:benjamin"
+      name: "Benjamin（男声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:charles"
+      name: "Charles（男声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:david"
+      name: "David（男声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:anna"
+      name: "Anna（女声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:bella"
+      name: "Bella（女声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:claire"
+      name: "Claire（女声）"
+      language: [ "zh-Hans", "en-US" ]
+    - mode: "fishaudio/fish-speech-1.5:diana"
+      name: "Diana（女声）"
+      language: [ "zh-Hans", "en-US" ]
+  audio_type: 'mp3'
+  max_workers: 5
+  # stream: false
+pricing:
+  input: '0.015'
+  output: '0'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
@@ -2,6 +2,7 @@
 import io
 import json
 import logging
+import operator
 import os
 import tempfile
 from typing import cast
@@ -10,6 +11,8 @@
 import pandas as pd
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
+from docx.table import Table
+from docx.text.paragraph import Paragraph
 
 from configs import dify_config
 from core.file import File, FileTransferMethod, file_manager
@@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
         doc_file = io.BytesIO(file_content)
         doc = docx.Document(doc_file)
         text = []
-        # Process paragraphs
-        for paragraph in doc.paragraphs:
-            if paragraph.text.strip():
-                text.append(paragraph.text)
 
-        # Process tables
-        for table in doc.tables:
-            # Table header
-            try:
-                # table maybe cause errors so ignore it.
-                if len(table.rows) > 0 and table.rows[0].cells is not None:
+        # Keep track of paragraph and table positions
+        content_items: list[tuple[int, str, Table | Paragraph]] = []
+
+        # Process paragraphs and tables
+        for i, paragraph in enumerate(doc.paragraphs):
+            if paragraph.text.strip():
+                content_items.append((i, "paragraph", paragraph))
+
+        for i, table in enumerate(doc.tables):
+            content_items.append((i, "table", table))
+
+        # Sort by per-list index (paragraphs and tables are numbered separately, so order is approximate)
+        content_items.sort(key=operator.itemgetter(0))
+
+        # Process sorted content
+        for _, item_type, item in content_items:
+            if item_type == "paragraph":
+                if isinstance(item, Table):
+                    continue
+                text.append(item.text)
+            elif item_type == "table":
+                # Process tables
+                if not isinstance(item, Table):
+                    continue
+                try:
                     # Check if any cell in the table has text
                     has_content = False
-                    for row in table.rows:
+                    for row in item.rows:
                         if any(cell.text.strip() for cell in row.cells):
                             has_content = True
                             break
 
                     if has_content:
-                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
-                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
-                        for row in table.rows[1:]:
-                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
+                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
+                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
+                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
+
+                        for row in item.rows[1:]:
+                            # Replace newlines with <br> in each cell
+                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
+                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
+
                         text.append(markdown_table)
-            except Exception as e:
-                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
-                continue
+                except Exception as e:
+                    logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
+                    continue
 
         return "\n".join(text)
+
     except Exception as e:
         raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
 

diff --git a/docker/.env.example b/docker/.env.example
@@ -926,3 +926,5 @@ CREATE_TIDB_SERVICE_JOB_ENABLED=false
 # Maximum number of submitted thread count in a ThreadPool for parallel node execution
 MAX_SUBMIT_COUNT=100
 
+# The maximum allowed top-k value for RAG.
+TOP_K_MAX_VALUE=10
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
@@ -386,6 +386,7 @@ x-shared-env: &shared-api-worker-env
   CSP_WHITELIST: ${CSP_WHITELIST:-}
   CREATE_TIDB_SERVICE_JOB_ENABLED: ${CREATE_TIDB_SERVICE_JOB_ENABLED:-false}
   MAX_SUBMIT_COUNT: ${MAX_SUBMIT_COUNT:-100}
+  TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-10}
 
 services:
   # API service

diff --git a/web/app/components/app/configuration/config-prompt/prompt-editor-height-resize-wrap.tsx b/web/app/components/app/configuration/config-prompt/prompt-editor-height-resize-wrap.tsx
@@ -26,13 +26,15 @@ const PromptEditorHeightResizeWrap: FC<Props> = ({
   const [clientY, setClientY] = useState(0)
   const [isResizing, setIsResizing] = useState(false)
   const [prevUserSelectStyle, setPrevUserSelectStyle] = useState(getComputedStyle(document.body).userSelect)
+  const [oldHeight, setOldHeight] = useState(height)
 
   const handleStartResize = useCallback((e: React.MouseEvent<HTMLElement>) => {
     setClientY(e.clientY)
     setIsResizing(true)
+    setOldHeight(height)
     setPrevUserSelectStyle(getComputedStyle(document.body).userSelect)
     document.body.style.userSelect = 'none'
-  }, [])
+  }, [height])
 
   const handleStopResize = useCallback(() => {
     setIsResizing(false)
@@ -44,8 +46,7 @@ const PromptEditorHeightResizeWrap: FC<Props> = ({
       return
 
     const offset = e.clientY - clientY
-    let newHeight = height + offset
-    setClientY(e.clientY)
+    let newHeight = oldHeight + offset
     if (newHeight < minHeight)
       newHeight = minHeight
     onHeightChange(newHeight)