Skip to content

Commit

Permalink
Merge branch 'main' into chore/bump-version-to-0.15.0
Browse files Browse the repository at this point in the history
  • Loading branch information
laipz8200 committed Jan 7, 2025
2 parents 7769fdb + 41f39bf commit 29402f1
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 34 deletions.
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from concurrent.futures import ProcessPoolExecutor
from os.path import abspath, dirname, join
from threading import Lock
from typing import Any, cast
from typing import Any

from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer # type: ignore
import tiktoken

_tokenizer: Any = None
_lock = Lock()
_executor = ProcessPoolExecutor(max_workers=1)


class GPT2Tokenizer:
Expand All @@ -17,22 +14,28 @@ def _get_num_tokens_by_gpt2(text: str) -> int:
use gpt2 tokenizer to get num tokens
"""
_tokenizer = GPT2Tokenizer.get_encoder()
tokens = _tokenizer.encode(text, verbose=False)
tokens = _tokenizer.encode(text)
return len(tokens)

@staticmethod
def get_num_tokens(text: str) -> int:
future = _executor.submit(GPT2Tokenizer._get_num_tokens_by_gpt2, text)
result = future.result()
return cast(int, result)
# Running this in a separate process needs more CPU resources, so we have reverted to calling it directly until we find a better way to handle it.
#
# future = _executor.submit(GPT2Tokenizer._get_num_tokens_by_gpt2, text)
# result = future.result()
# return cast(int, result)
return GPT2Tokenizer._get_num_tokens_by_gpt2(text)

@staticmethod
def get_encoder() -> Any:
global _tokenizer, _lock
with _lock:
if _tokenizer is None:
base_path = abspath(__file__)
gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
_tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
# Try to use tiktoken to get the tokenizer because it is faster
#
_tokenizer = tiktoken.get_encoding("gpt2")
# base_path = abspath(__file__)
# gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
# _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)

return _tokenizer
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,10 @@ def _generate(
for tool in tools:
formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))

data["tools"] = formatted_tools
if prompt_messages[-1].role.value == "tool":
data["tools"] = None
else:
data["tools"] = formatted_tools

if stop:
data["stop"] = stop
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
model: fishaudio/fish-speech-1.5
model_type: tts
model_properties:
default_voice: 'fishaudio/fish-speech-1.5:alex'
voices:
- mode: "fishaudio/fish-speech-1.5:alex"
name: "Alex(男声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:benjamin"
name: "Benjamin(男声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:charles"
name: "Charles(男声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:david"
name: "David(男声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:anna"
name: "Anna(女声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:bella"
name: "Bella(女声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:claire"
name: "Claire(女声)"
language: [ "zh-Hans", "en-US" ]
- mode: "fishaudio/fish-speech-1.5:diana"
name: "Diana(女声)"
language: [ "zh-Hans", "en-US" ]
audio_type: 'mp3'
max_workers: 5
# stream: false
pricing:
input: '0.015'
output: '0'
unit: '0.001'
currency: RMB
60 changes: 42 additions & 18 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import json
import logging
import operator
import os
import tempfile
from typing import cast
Expand All @@ -10,6 +11,8 @@
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.table import Table
from docx.text.paragraph import Paragraph

from configs import dify_config
from core.file import File, FileTransferMethod, file_manager
Expand Down Expand Up @@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
doc_file = io.BytesIO(file_content)
doc = docx.Document(doc_file)
text = []
# Process paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text.append(paragraph.text)

# Process tables
for table in doc.tables:
# Table header
try:
# Processing a table may raise errors; if it does, skip that table.
if len(table.rows) > 0 and table.rows[0].cells is not None:
# Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = []

# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))

for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))

# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))

# Process sorted content
for _, item_type, item in content_items:
if item_type == "paragraph":
if isinstance(item, Table):
continue
text.append(item.text)
elif item_type == "table":
# Process tables
if not isinstance(item, Table):
continue
try:
# Check if any cell in the table has text
has_content = False
for row in table.rows:
for row in item.rows:
if any(cell.text.strip() for cell in row.cells):
has_content = True
break

if has_content:
markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
for row in table.rows[1:]:
markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
markdown_table = f"| {' | '.join(cell_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"

for row in item.rows[1:]:
# Replace newlines with <br> in each cell
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += "| " + " | ".join(row_cells) + " |\n"

text.append(markdown_table)
except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
continue
except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
continue

return "\n".join(text)

except Exception as e:
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e

Expand Down
2 changes: 2 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -926,3 +926,5 @@ CREATE_TIDB_SERVICE_JOB_ENABLED=false
# Maximum number of threads that can be submitted to a ThreadPool for parallel node execution
MAX_SUBMIT_COUNT=100

# The maximum allowed top-k value for RAG.
TOP_K_MAX_VALUE=10
1 change: 1 addition & 0 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ x-shared-env: &shared-api-worker-env
CSP_WHITELIST: ${CSP_WHITELIST:-}
CREATE_TIDB_SERVICE_JOB_ENABLED: ${CREATE_TIDB_SERVICE_JOB_ENABLED:-false}
MAX_SUBMIT_COUNT: ${MAX_SUBMIT_COUNT:-100}
TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-10}

services:
# API service
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@ const PromptEditorHeightResizeWrap: FC<Props> = ({
const [clientY, setClientY] = useState(0)
const [isResizing, setIsResizing] = useState(false)
const [prevUserSelectStyle, setPrevUserSelectStyle] = useState(getComputedStyle(document.body).userSelect)
const [oldHeight, setOldHeight] = useState(height)

const handleStartResize = useCallback((e: React.MouseEvent<HTMLElement>) => {
setClientY(e.clientY)
setIsResizing(true)
setOldHeight(height)
setPrevUserSelectStyle(getComputedStyle(document.body).userSelect)
document.body.style.userSelect = 'none'
}, [])
}, [height])

const handleStopResize = useCallback(() => {
setIsResizing(false)
Expand All @@ -44,8 +46,7 @@ const PromptEditorHeightResizeWrap: FC<Props> = ({
return

const offset = e.clientY - clientY
let newHeight = height + offset
setClientY(e.clientY)
let newHeight = oldHeight + offset
if (newHeight < minHeight)
newHeight = minHeight
onHeightChange(newHeight)
Expand Down

0 comments on commit 29402f1

Please sign in to comment.