From 39063c48bba447fe742acaf6e6ed9acba09280f4 Mon Sep 17 00:00:00 2001 From: da3dsoul Date: Mon, 3 Apr 2023 20:24:09 -0400 Subject: [PATCH] Improve Silero's Preprocessor to Handle Punctuation and Whitespace Better --- extensions/silero_tts/tts_preprocessor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/extensions/silero_tts/tts_preprocessor.py b/extensions/silero_tts/tts_preprocessor.py index 4ce2035..3b3146f 100644 --- a/extensions/silero_tts/tts_preprocessor.py +++ b/extensions/silero_tts/tts_preprocessor.py @@ -48,7 +48,11 @@ def preprocess(string): # For now, expand abbreviations to pronunciations string = replace_abbreviations(string) + # cleanup whitespaces + string = re.sub(r'\s+([,.?!\'])', r'\1', string) string = string.strip() + string = ' '.join(string.split()) + return string @@ -97,7 +101,7 @@ def num_to_words(text): def replace_abbreviations(string): - pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]') + pattern = re.compile(r'(^|[\s("\'\[<])([A-Z]{2,4})([\s,.?!)"\'\]>]|$)') result = string while True: match = pattern.search(result)