From 7d4e419dbe518b8a22756b583042436ad57fdfda Mon Sep 17 00:00:00 2001 From: da3dsoul Date: Mon, 3 Apr 2023 18:16:27 -0400 Subject: [PATCH] Improve Silero's Preprocessor to Handle Roman Numerals and Abbreviations Better --- extensions/silero_tts/tts_preprocessor.py | 68 ++++++++++++++++------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/extensions/silero_tts/tts_preprocessor.py b/extensions/silero_tts/tts_preprocessor.py index d7f8d42..4ce2035 100644 --- a/extensions/silero_tts/tts_preprocessor.py +++ b/extensions/silero_tts/tts_preprocessor.py @@ -7,7 +7,7 @@ alphabet_map = { "B": " Bee ", "C": " See ", "D": " Dee ", - "E": " II ", + "E": " Ii ", "F": " Eff ", "G": " Jee ", "H": " Eich ", @@ -38,18 +38,64 @@ def preprocess(string): string = string.replace('“', '') string = string.replace('\n', ' ') string = remove_commas(string) + string = replace_roman(string) string = hyphen_range_to(string) string = num_to_words(string) - string = string.strip() + # TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually # try to say the abbreviation or spell it out as I've done below is not agreed upon # For now, expand abbreviations to pronunciations string = replace_abbreviations(string) + string = string.strip() return string +def remove_surrounded_chars(string): + # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR + # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string' + return re.sub(r'\*[^*]*?(\*|$)', '', string) + + +def replace_roman(string): + pattern = re.compile(r'\s[IVXLCDM]+[\s,.?!)"\'\]>]') + result = string + while True: + match = pattern.search(result) + if match is None: + break + + start = match.start() + end = match.end() + result = result[0:start+1] + str(roman_to_int(result[start+1:end-1])) + result[end-1:len(result)] + + return result + + +def roman_to_int(s): + rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + int_val = 0 + for i in range(len(s)): + if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]: + int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]] + else: + int_val += rom_val[s[i]] + return int_val + + +def hyphen_range_to(text): + pattern = re.compile(r'(\d+)[-–](\d+)') + result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text) + return result + + +def num_to_words(text): + pattern = re.compile(r'\d+') + result = pattern.sub(lambda x: num2words(int(x.group())), text) + return result + + def replace_abbreviations(string): pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]') result = string @@ -81,24 +127,6 @@ def match_mapping(char, result): return result + char -def remove_surrounded_chars(string): - # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR - # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string' - return re.sub(r'\*[^*]*?(\*|$)', '', string) - - -def hyphen_range_to(text): - pattern = re.compile(r'(\d+)[-–](\d+)') - result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text) - return result - - -def num_to_words(text): - pattern = re.compile(r'\d+') - result = pattern.sub(lambda x: num2words(int(x.group())), text) - return result - - def remove_commas(text): import re pattern = re.compile(r'(\d),(\d)')