From bc795df1408afdb602a03aa6755eca57724a2508 Mon Sep 17 00:00:00 2001 From: shandianchengzi <1252402849@qq.com> Date: Mon, 18 Dec 2023 03:23:47 +0800 Subject: [PATCH] =?UTF-8?q?=E8=AE=BE=E7=BD=AE=E7=83=AD=E8=AF=8D=E6=9B=BF?= =?UTF-8?q?=E6=8D=A2=E4=B9=8B=E5=90=8E=E5=86=8D=E6=AC=A1=E8=B0=83=E6=95=B4?= =?UTF-8?q?=E4=B8=AD=E8=8B=B1=E6=96=87=E4=B9=8B=E9=97=B4=E7=9A=84=E7=A9=BA?= =?UTF-8?q?=E6=A0=BC=EF=BC=8C=E9=81=BF=E5=85=8D=E4=B8=AD=E8=8B=B1=E7=9B=B8?= =?UTF-8?q?=E4=BA=92=E6=9B=BF=E6=8D=A2=E5=90=8E=E7=A9=BA=E6=A0=BC=E5=BC=82?= =?UTF-8?q?=E5=B8=B8=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core_client.py | 11 ++++++++++- core_server.py | 39 +-------------------------------------- globs_var.py | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 39 deletions(-) create mode 100644 globs_var.py diff --git a/core_client.py b/core_client.py index 40ba32a..6a0b355 100644 --- a/core_client.py +++ b/core_client.py @@ -1,5 +1,6 @@ # coding: utf-8 +from globs_var import * import os import sys import platform @@ -348,6 +349,12 @@ async def do_recognize(): decoding_results = hot_sub_en.热词替换(decoding_results) if hot_rule: decoding_results = hot_sub_rule.热词替换(decoding_results) + + result_0 = decoding_results + + # 调整中英空格排版 + if format_spell: + decoding_results = result_1 = en_in_zh.sub(adjust_space, decoding_results) # 打印结果 if paste: @@ -362,7 +369,9 @@ async def do_recognize(): keyboard.write(decoding_results) # 终端显示结果 - console.print(f'识别结果:[green4]{decoding_results}') + console.print(f'识别结果:[green4]{result_0}') + if format_spell: + console.print(f' 调整中英空格排版:{result_1}') console.print(f' 录音时长:{len(samples1) / 16000: >8.2f}s') console.print(f' 识别时长:{t2 - t1: >8.2f}s') console.print(f' Real Time Factor: {(t2-t1) / (len(samples1)/16000): >5.2f}') diff --git a/core_server.py b/core_server.py index ac48b3d..5b35631 100644 --- a/core_server.py +++ b/core_server.py @@ -1,4 +1,5 @@ +from globs_var 
import * from os import path, sep, mkdir, makedirs, getcwd, chdir import sys if 'BASE_DIR' not in globals(): @@ -15,8 +16,6 @@ from pathlib import Path import time import asyncio -import re -from string import digits, ascii_letters import numpy as np import websockets @@ -33,7 +32,6 @@ format_num = True # 输出时是否将中文数字转为阿拉伯数字 format_punc = True # 输出时是否启用标点符号引擎(在 MacOS 上标点引擎似乎有问题,应当改为 False) -format_spell = True # 输出时是否调整中英之间的空格 model_dir = Path() / 'models' paraformer_path = Path() / 'models' / 'paraformer-offline-zh' / 'model.onnx' @@ -66,41 +64,6 @@ class args: # ======================================================================== -en_in_zh = re.compile(r"""(?ix) # i 表示忽略大小写,x 表示开启注释模式 - ([\u4e00-\u9fa5]|[a-z0-9]+\s)? # 左侧是中文,或者英文加空格 - ([a-z0-9 ]+) # 中间是一个或多个「英文数字加空格」 - ([\u4e00-\u9fa5]|[a-z0-9]+)? # 右是中文,或者英文加空格 -""") - -def adjust_space(original: re.Match): - left : str = original.group(1) - center : str = original.group(2) - right : str = original.group(3) - # 如果拼写字母中间有空格,就把空格都去掉 - if center: - final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip() - # 测试地址 https://regex101.com/r/1Vtu7V/1 - # final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip() - - # 如果英文的左边有汉字或英文,给两组之间加上空格 - if left : - if left.strip(digits) == left and center.lstrip(digits) == center : # 左侧结尾不是数字,中间开头不是数字 - final = ' ' + final - final = left.rstrip() + final - - # 如果英文左边的汉字被前一个组消费了,就要手动去看一下前一个字是不是中文 - elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]): - if center.lstrip(digits) == center: # 确保中间开头不是数字 - final = ' ' + final - - # 如果英文的右边有汉字,给中英之间加上空格 - if right: - if center.rstrip(digits) == center: # 确保中间结尾不是数字 - final += ' ' - final += right.lstrip() - - return final - async def ws_serve(websocket, path): global loop global format_num, format_punc, format_spell diff --git a/globs_var.py b/globs_var.py new file mode 100644 index 0000000..c9e3b0d --- /dev/null +++ b/globs_var.py @@ -0,0 +1,41 @@ +format_spell = True # 输出时是否调整中英之间的空格 + 
+import re
+from string import digits, ascii_letters
+
+# ========================================================================
+# Groups: 1 = optional left neighbour, 2 = ASCII letter/digit/space run, 3 = optional right neighbour.
+en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写,x 表示开启注释模式
+    ([\u4e00-\u9fa5]|[a-z0-9]+\s)?      # 左侧是中文,或者英文加空格
+    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
+    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右是中文,或者英文加空格
+""")
+
+def adjust_space(original: re.Match):
+    """Re-space one ``en_in_zh`` match; meant as the ``repl`` callback of ``en_in_zh.sub``.
+
+    Removes the spaces inside spelled-out letters and puts one space between the
+    ASCII run and a neighbouring group, unless a digit touches that boundary.
+    """
+    left: str = original.group(1)    # None when the optional group did not match
+    center: str = original.group(2)  # mandatory group, never empty for en_in_zh
+    right: str = original.group(3)   # None when the optional group did not match
+    # Drop the spaces inside spelled-out letters (tested at https://regex101.com/r/1Vtu7V/1).
+    final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()
+    if left:
+        # Captured left neighbour: add a space unless a digit touches the boundary.
+        if left.strip(digits) == left and center.lstrip(digits) == center:
+            final = ' ' + final
+        final = left.rstrip() + final
+    # The left Chinese character may have been consumed by the previous match, so
+    # peek at the preceding character. Guard start == 0: index -1 would wrap around
+    # to the LAST character of the string and add a bogus leading space.
+    elif original.start(2) > 0 and re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]):
+        if center.lstrip(digits) == center:  # the run must not start with a digit
+            final = ' ' + final
+    if right:
+        # Captured right neighbour: add a space unless the run ends with a digit.
+        if center.rstrip(digits) == center:
+            final += ' '
+        final += right.lstrip()
+    return final
\ No newline at end of file