From bc795df1408afdb602a03aa6755eca57724a2508 Mon Sep 17 00:00:00 2001
From: shandianchengzi <1252402849@qq.com>
Date: Mon, 18 Dec 2023 03:23:47 +0800
Subject: [PATCH] =?UTF-8?q?=E8=AE=BE=E7=BD=AE=E7=83=AD=E8=AF=8D=E6=9B=BF?=
 =?UTF-8?q?=E6=8D=A2=E4=B9=8B=E5=90=8E=E5=86=8D=E6=AC=A1=E8=B0=83=E6=95=B4?=
 =?UTF-8?q?=E4=B8=AD=E8=8B=B1=E6=96=87=E4=B9=8B=E9=97=B4=E7=9A=84=E7=A9=BA?=
 =?UTF-8?q?=E6=A0=BC=EF=BC=8C=E9=81=BF=E5=85=8D=E4=B8=AD=E8=8B=B1=E7=9B=B8?=
 =?UTF-8?q?=E4=BA=92=E6=9B=BF=E6=8D=A2=E5=90=8E=E7=A9=BA=E6=A0=BC=E5=BC=82?=
 =?UTF-8?q?=E5=B8=B8=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 core_client.py | 11 ++++++++++-
 core_server.py | 39 +--------------------------------------
 globs_var.py   | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 39 deletions(-)
 create mode 100644 globs_var.py

diff --git a/core_client.py b/core_client.py
index 40ba32a..6a0b355 100644
--- a/core_client.py
+++ b/core_client.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 
+from globs_var import *
 import os
 import sys
 import platform
@@ -348,6 +349,12 @@ async def do_recognize():
         decoding_results = hot_sub_en.热词替换(decoding_results)
     if hot_rule: 
         decoding_results = hot_sub_rule.热词替换(decoding_results)
+    
+    result_0 = decoding_results
+
+    # 调整中英空格排版
+    if format_spell:
+        decoding_results = result_1 = en_in_zh.sub(adjust_space, decoding_results)
 
     # 打印结果
     if paste:   
@@ -362,7 +369,9 @@ async def do_recognize():
         keyboard.write(decoding_results)
     
     # 终端显示结果
-    console.print(f'识别结果：[green4]{decoding_results}')
+    console.print(f'识别结果：[green4]{result_0}')
+    if format_spell:
+        console.print(f'    调整中英空格排版：{result_1}')
     console.print(f'    录音时长：{len(samples1) / 16000: >8.2f}s')
     console.print(f'    识别时长：{t2 - t1: >8.2f}s')
     console.print(f'    Real Time Factor: {(t2-t1) / (len(samples1)/16000): >5.2f}')
diff --git a/core_server.py b/core_server.py
index ac48b3d..5b35631 100644
--- a/core_server.py
+++ b/core_server.py
@@ -1,4 +1,5 @@
 
+from globs_var import *
 from os import path, sep, mkdir, makedirs, getcwd, chdir
 import sys
 if 'BASE_DIR' not in globals():
@@ -15,8 +16,6 @@
 from pathlib import Path
 import time
 import asyncio
-import re
-from string import digits, ascii_letters
 
 import numpy as np
 import websockets
@@ -33,7 +32,6 @@
 
 format_num      = True      # 输出时是否将中文数字转为阿拉伯数字
 format_punc     = True      # 输出时是否启用标点符号引擎（在 MacOS 上标点引擎似乎有问题，应当改为 False）
-format_spell    = True      # 输出时是否调整中英之间的空格
 
 model_dir = Path() / 'models'
 paraformer_path = Path() / 'models' / 'paraformer-offline-zh' / 'model.onnx'
@@ -66,41 +64,6 @@ class args:
 
 # ========================================================================
 
-en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写，x 表示开启注释模式
-    ([\u4e00-\u9fa5]|[a-z0-9]+\s)?      # 左侧是中文，或者英文加空格
-    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
-    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右是中文，或者英文加空格
-""")
-
-def adjust_space(original: re.Match):
-    left : str = original.group(1)
-    center : str = original.group(2)
-    right : str = original.group(3)
-    # 如果拼写字母中间有空格，就把空格都去掉
-    if center:
-        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()
-        # 测试地址 https://regex101.com/r/1Vtu7V/1
-        # final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
-    
-    # 如果英文的左边有汉字或英文，给两组之间加上空格
-    if left :
-        if left.strip(digits) == left and center.lstrip(digits) == center :  # 左侧结尾不是数字，中间开头不是数字
-            final = ' ' + final
-        final = left.rstrip() + final
-    
-    # 如果英文左边的汉字被前一个组消费了，就要手动去看一下前一个字是不是中文
-    elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]): 
-        if center.lstrip(digits) == center:     # 确保中间开头不是数字
-            final = ' ' + final
-        
-    # 如果英文的右边有汉字，给中英之间加上空格
-    if right:
-        if center.rstrip(digits) == center:     # 确保中间结尾不是数字
-            final += ' '
-        final += right.lstrip()
-
-    return final
-
 async def ws_serve(websocket, path):
     global loop
     global format_num, format_punc, format_spell
diff --git a/globs_var.py b/globs_var.py
new file mode 100644
index 0000000..c9e3b0d
--- /dev/null
+++ b/globs_var.py
@@ -0,0 +1,41 @@
+format_spell    = True      # Whether to adjust the spaces between Chinese and English text in the output
+
+import re
+from string import digits, ascii_letters
+
+# ========================================================================
+# en_in_zh: group 2 is a run of ASCII letters/digits/spaces; groups 1 and 3 are its optional left/right neighbours (the notes inside the pattern belong to the string literal).
+en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写，x 表示开启注释模式
+    ([\u4e00-\u9fa5]|[a-z0-9]+\s)?      # 左侧是中文，或者英文加空格
+    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
+    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右是中文，或者英文加空格
+""")
+
+def adjust_space(original: re.Match):
+    """Replacement callback for ``en_in_zh.sub``: fix spacing around one letter/digit run.
+
+    ``original`` is a match with groups (left, center, right); returns the new text for it.
+    """
+    left : str = original.group(1)
+    center : str = original.group(2)
+    right : str = original.group(3)
+    # Drop the spaces between individually spelled characters (e.g. "a b c" -> "abc").
+    # Pattern playground: https://regex101.com/r/1Vtu7V/1
+    final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()
+    # Left neighbour captured: join with one space unless left has a digit at either end
+    # or center starts with a digit.
+    if left :
+        if left.strip(digits) == left and center.lstrip(digits) == center :
+            final = ' ' + final
+        final = left.rstrip() + final
+    # A Chinese character on the left may have been consumed by the previous match, so look
+    # at the preceding character; start(2) > 0 keeps index -1 from wrapping to the string end.
+    elif original.start(2) > 0 and re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]):
+        if center.lstrip(digits) == center:     # center must not start with a digit
+            final = ' ' + final
+    # Right neighbour captured: add one space unless center ends with a digit.
+    if right:
+        if center.rstrip(digits) == center:
+            final += ' '
+        final += right.lstrip()
+    return final
\ No newline at end of file