talon-scripts/community/core/create_spoken_forms.py
2025-08-19 08:06:37 -04:00

540 lines
18 KiB
Python

import itertools
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, List, Mapping, Optional
from talon import Module, actions
from .keys.symbols import symbols_for_create_spoken_forms
from .numbers.numbers import digits_map, scales, teens, tens
from .user_settings import track_csv_list
# Talon module on which the Actions class in this file is registered.
mod = Module()

# Default for the minimum_term_length parameter of the spoken-form actions.
DEFAULT_MINIMUM_TERM_LENGTH = 2

# Longest lowercase word that create_exploded_forms spells out letter by letter.
EXPLODE_MAX_LEN = 3

# Tokenizer for names: an optionally capitalized lowercase word, a run of
# capitals that is not followed by a lowercase letter, or a run of digits.
FANCY_REGULAR_EXPRESSION = r"[A-Z]?[a-z]+|[A-Z]+(?![a-z])|[0-9]+"

# Alternation of every distinct symbol, escaped so each one matches literally.
SYMBOLS_REGEX = "|".join(
    map(re.escape, set(symbols_for_create_spoken_forms.values()))
)

# Initial file-extension pattern; on_extensions replaces it once the CSV data
# arrives. As written it should not match any text.
FILE_EXTENSIONS_REGEX = r"^\b$"

# Mapping most recently handed to on_extensions (empty until then).
file_extensions = {}
def update_regex():
    """Recompile the two module-level tokenizer patterns.

    Rebinds the globals REGEX_NO_SYMBOLS (words, digit runs and file
    extensions) and REGEX_WITH_SYMBOLS (the same plus SYMBOLS_REGEX) from the
    current values of FANCY_REGULAR_EXPRESSION, FILE_EXTENSIONS_REGEX and
    SYMBOLS_REGEX.
    """
    global REGEX_NO_SYMBOLS, REGEX_WITH_SYMBOLS

    base_alternatives = [FANCY_REGULAR_EXPRESSION, FILE_EXTENSIONS_REGEX]
    REGEX_NO_SYMBOLS = re.compile("|".join(base_alternatives))
    REGEX_WITH_SYMBOLS = re.compile("|".join(base_alternatives + [SYMBOLS_REGEX]))


# Compile the patterns once at import time so they exist before any CSV
# callback has run.
update_regex()
@track_csv_list("file_extensions.csv", headers=("File extension", "Name"))
def on_extensions(values):
    """Store the file-extension mapping and rebuild the tokenizer patterns.

    Args:
        values: mapping delivered for file_extensions.csv; its values are the
            extension strings that get turned into regex alternatives.

    Side effects:
        Rebinds the globals ``file_extensions`` and ``FILE_EXTENSIONS_REGEX``
        and calls ``update_regex()`` so the compiled patterns pick up the
        change.
    """
    global FILE_EXTENSIONS_REGEX
    global file_extensions
    file_extensions = values

    # Each extension only counts at the very end of the text, hence the "$".
    # Blank entries are skipped: they would contribute a bare "$" alternative
    # that matches the empty string.
    extension_patterns = []
    for file_extension in values.values():
        stripped = file_extension.strip()
        if stripped:
            extension_patterns.append(re.escape(stripped) + "$")

    # With nothing to match, fall back to the never-matching placeholder.
    # Joining an empty list would give "", and an empty alternative in the
    # combined pattern matches the empty string at every position.
    FILE_EXTENSIONS_REGEX = "|".join(extension_patterns) or r"^\b$"
    update_regex()
# Mapping most recently handed to on_abbreviations (empty until then);
# create_abbreviated_forms reads it.
abbreviations_list = {}


@track_csv_list("abbreviations.csv", headers=("Abbreviation", "Spoken Form"))
def on_abbreviations(values):
    """Remember the mapping delivered for abbreviations.csv."""
    global abbreviations_list
    abbreviations_list = values
# Written form -> spoken word. Digits are keyed by their string form ("0".."9");
# symbol entries are added second, so they win if a key appears in both.
REVERSE_PRONUNCIATION_MAP = {str(value): key for key, value in digits_map.items()}
REVERSE_PRONUNCIATION_MAP.update(
    (value, key) for key, value in symbols_for_create_spoken_forms.items()
)

# Lookup tables for create_spoken_form_for_number. Their layout differs
# slightly from numbers.py: each table is padded with leading blanks so it can
# be indexed directly by a digit or by a three-digit group's position.

# Indexed by the ones digit: ["", "one", "two", ... "nine"] or equivalents.
ones = [""] + [REVERSE_PRONUNCIATION_MAP[str(digit)] for digit in range(1, 10)]

# Indexed by the tens digit: ["", "", "twenty", "thirty", ... "ninety"] or
# equivalents. Slots 0 and 1 are blank.
twenties = ["", "", *tens]

# Indexed by the position of a three-digit group, lowest group first: a blank
# followed by every entry of scales except the first (which is used separately
# as the hundreds word).
thousands = [""] + list(itertools.islice(scales, 1, None))
def create_spoken_form_for_number(num: int):
    """Spell out an integer in words.

    ``num`` is converted with ``str()`` and cut into groups of three digits
    starting from the right. At most ten groups (30 digits) are consumed.
    Groups that are zero contribute no words, so 0 yields an empty string.
    """
    digits = str(num)
    digit_count = len(digits)

    # Collect the three-digit groups, least significant group first.
    groups = []
    for width in range(3, 33, 3):
        # How far this window reaches past the front of the digit string.
        overshoot = width - digit_count
        if overshoot > 2:
            # The previous window already covered the leading digits.
            break
        window = digits[-width:]
        if overshoot <= 0:
            groups.append(int(window[:3]))
        else:
            # Only one or two leading digits remain for the top group.
            groups.append(int(window[: 3 - overshoot]))

    # Turn each group into hundreds / tens / ones words plus its scale word.
    words = []
    for position, group in enumerate(groups):
        if group == 0:
            continue
        scale_word = thousands[position]
        hundreds_digit, below_hundred = divmod(group % 1000, 100)
        tens_digit, ones_digit = divmod(below_hundred, 10)

        if tens_digit == 0:
            group_words = [ones[ones_digit], scale_word]
        elif tens_digit == 1:
            group_words = [teens[ones_digit], scale_word]
        else:
            group_words = [twenties[tens_digit], ones[ones_digit], scale_word]
        if hundreds_digit > 0:
            group_words = [ones[hundreds_digit], scales[0]] + group_words

        # Higher groups are visited later but are spoken first.
        words = group_words + words

    # Blank table entries are dropped before joining.
    return " ".join(word for word in words if word)
def create_spoken_form_years(num: str):
    """Create the year-style spoken form for a number between 1000 and 9999.

    Args:
        num: the number as a string; anything ``int()`` accepts works,
            including an ``int``.

    Returns:
        The spoken form (for example 1925 -> "nineteen twenty five",
        1900 -> "nineteen hundred", 2005 -> "two thousand five"), or None when
        the value is outside 1000..9999.
    """
    val = int(num)
    if val > 9999 or val < 1000:
        return None

    centuries, remainder = divmod(val, 100)
    words = []

    if centuries % 10 != 0:
        words.append(create_spoken_form_for_number(centuries))
        # 1900 -> nineteen hundred
        if remainder == 0:
            words.append(scales[0])
    elif remainder < 10:
        # 200X -> two thousand x. The bound must be 10, not 9: with 9 the year
        # 2009 fell through to the branch below and came out as "twenty nine".
        words.append(REVERSE_PRONUNCIATION_MAP[str(centuries // 10)])
        words.append(scales[1])
    else:
        # 20XX -> twenty xx
        words.append(create_spoken_form_for_number(centuries))

    if remainder != 0:
        if remainder < 10:
            # 1906 -> "nineteen six"
            # todo: decide if we want "nineteen oh five" or an "and";
            # both seem like a waste
            words.append(REVERSE_PRONUNCIATION_MAP[str(remainder)])
        else:
            words.append(create_spoken_form_for_number(remainder))

    return " ".join(words)
# # ---------- create_spoken_form_years (uncomment to run) ----------
# def test_year(year: str, expected: str):
# result = create_spoken_form_years(year)
# print(f"test_year: test string = {year}, result = {result}, expected = {expected}")
# assert create_spoken_form_years(year) == expected
# print("************* test_year tests ******************")
# test_year("1100", "eleven hundred")
# test_year("1905", "nineteen five")
# test_year("1910", "nineteen ten")
# test_year("1925", "nineteen twenty five")
# test_year("2000", "two thousand")
# test_year("2005", "two thousand five")
# test_year("2020", "twenty twenty")
# test_year("2019", "twenty nineteen")
# test_year("2085", "twenty eighty five")
# test_year("2100", "twenty one hundred")
# test_year("2105", "twenty one five")
# test_year("9999", "ninety nine ninety nine")
# print("************* test_year tests done**************")
def create_single_spoken_form(source: str):
    """
    Returns the spoken form of a single token.
    (1) If the lower-cased token is a key of REVERSE_PRONUNCIATION_MAP, returns
        the mapped word.
    (2) Otherwise, a completely upper-case token is returned unchanged.
    (3) Otherwise, returns the lower-cased token.
    """
    lowered = source.lower()
    if lowered in REVERSE_PRONUNCIATION_MAP:
        return REVERSE_PRONUNCIATION_MAP[lowered]

    # Completely uppercase words are left alone so later passes can deal with
    # them. Everything else is normalized to lower case to help subsequent
    # abbreviation lookups, etc.
    return source if source.isupper() else lowered
def create_exploded_forms(spoken_forms: List[str]):
    """Explode short words into letters and packed words into separate words.

    A lowercase form without spaces whose length is between 2 and
    EXPLODE_MAX_LEN is emitted twice: unchanged, and spelled out in capitals
    ("usb" -> "usb" and "U S B"). Any other form is emitted once, with each
    known packed word replaced ("readme" -> "read me").
    """
    # TODO: This could be moved somewhere else, possibly seeded from something like
    # words to replace...
    packed_words = {"readme": "read me"}

    exploded = []
    for form in spoken_forms:
        is_short_lowercase_word = (
            " " not in form and form.islower() and 1 < len(form) <= EXPLODE_MAX_LEN
        )
        if is_short_lowercase_word:
            # Keep a regular copy (ie: "nas") next to the spelled-out one.
            exploded.extend([form, " ".join(form.upper())])
            continue

        exploded.append(
            " ".join(packed_words.get(word, word) for word in form.split(" "))
        )
    return exploded
def create_extension_forms(spoken_forms: List[str]):
    """Add variants for forms that contain a known file extension.

    Every input form yields a "truncated" variant with the extension tokens
    left out (identical to the input when it contains none). A form that
    contains an extension token also yields a variant with the token replaced
    by its name from file_extensions, and one with that name preceded by the
    spoken word for ".". Returns the variants as a set.
    """
    # Stripped extension text -> the key it is stored under in file_extensions.
    names_by_extension = {
        extension.strip(): name for name, extension in file_extensions.items()
    }

    results = []
    for form in spoken_forms:
        named = []
        dotted = []
        truncated = []
        found_extension = False

        for token in form.split(" "):
            if token not in names_by_extension:
                named.append(token)
                dotted.append(token)
                truncated.append(token)
                continue

            # NOTE: If we ever run in to file extensions in the middle of file
            # name, the truncated form is going to be busted. ie: foo.md.disabled
            extension_name = names_by_extension[token]
            found_extension = True
            named.append(extension_name)
            dotted.extend((REVERSE_PRONUNCIATION_MAP["."], extension_name))
            # The truncated form purposefully gets nothing for this token.

        if found_extension:
            results.append(" ".join(named))
            results.append(" ".join(dotted))
        results.append(" ".join(truncated))

    return set(results)
def create_cased_forms(spoken_forms: List[str]):
    """Add lower-case and spelled-out variants of upper-case words.

    For each form, two variants are produced: one where every upper-case word
    is lower-cased ("README" -> "readme") and one where it is split into
    letters ("README" -> "R E A D M E"). Other words are kept as they are.
    Returns the variants as a set.
    """
    cased = set()
    for form in spoken_forms:
        tokens = form.split(" ")
        cased.add(
            " ".join(token.lower() if token.isupper() else token for token in tokens)
        )
        cased.add(
            " ".join(" ".join(token) if token.isupper() else token for token in tokens)
        )
    return cased
def create_abbreviated_forms(spoken_forms: List[str]):
    """Add variants with words swapped through the abbreviations mapping.

    abbreviations_list is inverted so that its values become lookup keys. For
    each form, one variant replaces every word found in that inverted mapping
    with the corresponding key; the untouched form is kept as well. Returns
    the variants as a set.
    """
    swapped_abbreviation_map = {
        value: key for key, value in abbreviations_list.items()
    }

    results = set()
    for form in spoken_forms:
        results.add(
            " ".join(
                swapped_abbreviation_map.get(word, word) for word in form.split(" ")
            )
        )
        # Splitting on " " and re-joining with " " reproduces the form itself.
        results.add(form)
    return results
def create_spoken_number_forms(source: List[str]):
    """
    Create a set of spoken forms by transforming numbers in source into spoken forms.
    This creates a first pass of spoken forms with numbers translated, but will go
    through multiple other passes.

    Args:
        source: the tokens of one name, in order.

    Returns:
        A set holding up to three space-joined forms:
        - the digit-wise form ("12" -> "one two"), always present;
        - the full-number form ("12" -> "twelve"), when a multi-digit token
          was found;
        - the year-style form ("1900" -> "nineteen hundred"), when a token
          has one and it differs from the full-number form.
    """
    # create_spoken_form_for_number consumes at most ten three-digit groups.
    # A longer token would be verbalized without its leading digits, so it is
    # passed through like any other non-number token instead.
    max_verbalized_digits = 30

    # pieces for the spoken form with individual digits
    full_form_digit_wise = []
    # pieces for the spoken form with the spoken version of the number
    full_form_fancy_numbers = []
    # pieces for the spoken form with year-like numbers
    full_form_spoken_form_years = []
    # whether a version with a full number (>= 10) translated was created
    has_fancy_number_version = False
    # whether a version with a year-like number translated was created
    has_spoken_form_years = False

    for substring in source:
        # isdecimal() rather than isnumeric(): isnumeric() also accepts
        # characters such as "²" or "½" that int() rejects with ValueError.
        if 1 < len(substring) <= max_verbalized_digits and substring.isdecimal():
            has_fancy_number_version = True
            val = int(substring)
            spoken_form_years = create_spoken_form_years(val)
            spoken_form = create_spoken_form_for_number(val)

            if spoken_form_years:
                has_spoken_form_years = True
                full_form_spoken_form_years.append(spoken_form_years)
            else:
                full_form_spoken_form_years.append(spoken_form)
            full_form_fancy_numbers.append(spoken_form)

            # build the serial digit version
            for digit in substring:
                full_form_digit_wise.append(create_single_spoken_form(digit))
        else:
            spoken_form = create_single_spoken_form(substring)
            full_form_digit_wise.append(spoken_form)
            full_form_fancy_numbers.append(spoken_form)
            full_form_spoken_form_years.append(spoken_form)

    spoken_forms = []
    if has_fancy_number_version:
        spoken_forms.append(" ".join(full_form_fancy_numbers))
    if has_spoken_form_years:
        result = " ".join(full_form_spoken_form_years)
        if result not in spoken_forms:
            spoken_forms.append(result)
    spoken_forms.append(" ".join(full_form_digit_wise))

    return set(spoken_forms)
def create_spoken_forms_from_regex(source: str, pattern: re.Pattern):
    """
    Creates a list of spoken forms for source using the provided regex pattern.

    Apostrophes are removed from source, the pattern's matches become the
    initial tokens, and the tokens are then run through the number, extension,
    case, explode and abbreviation passes in turn. The result has duplicates
    removed.
    """
    cleaned_source = source.replace("'", "")
    spoken_forms = [match.group(0) for match in pattern.finditer(cleaned_source)]

    # NOTE: Order is sometimes important
    pipeline = (
        create_spoken_number_forms,
        create_extension_forms,
        create_cased_forms,
        create_exploded_forms,
        create_abbreviated_forms,
        create_extension_forms,
    )
    for transform in pipeline:
        spoken_forms = transform(spoken_forms)

    return list(dict.fromkeys(spoken_forms))
def generate_string_subsequences(
    source: str,
    words_to_exclude: list[str],
    minimum_term_length: int,
):
    """Return the words and leading word sequences of source.

    Includes (case is left unchanged):
      1. Each word in source, eg "foo bar baz" -> "foo", "bar", "baz".
      2. Each leading subsequence of words from source,
         eg "foo bar baz" -> "foo", "foo bar", "foo bar baz"
         (but not "bar baz" - TODO: is this intentional?)

    Except for:
      3. strings shorter than minimum_term_length
      4. strings in words_to_exclude.

    Duplicates are removed; the order of the returned list is unspecified.
    """
    words = source.split(" ")

    # WARNING: lower-casing here would create unwanted duplication of broken up
    # uppercase words, eg 'R E A D M E' -> 'r e a d m e'. Everything else should
    # be lower case already.
    candidates = {word.strip() for word in words}
    candidates.update(
        " ".join(words[:end]).strip() for end in range(1, len(words) + 1)
    )

    return [
        candidate
        for candidate in candidates
        if (
            candidate not in words_to_exclude
            and len(candidate) >= minimum_term_length
        )
    ]
@dataclass
class SpeakableItem:
    """A source name paired with the value it stands for.

    Used by create_spoken_forms_from_map to pick a winner when several sources
    produce the same spoken form.
    """

    # Source text the spoken forms were generated from.
    name: str
    # Value returned for a spoken form that resolves to this item.
    value: Any
@mod.action_class
class Actions:
    def create_spoken_forms(
        source: str,
        words_to_exclude: Optional[list[str]] = None,
        minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
        generate_subsequences: bool = True,
    ) -> list[str]:
        """Create spoken forms for a given source"""
        spoken_forms_without_symbols = create_spoken_forms_from_regex(
            source, REGEX_NO_SYMBOLS
        )
        # todo: this could probably be optimized out if there's no symbols
        spoken_forms_with_symbols = create_spoken_forms_from_regex(
            source, REGEX_WITH_SYMBOLS
        )

        # The two passes may produce identical forms; a set collapses them.
        spoken_forms = {*spoken_forms_with_symbols, *spoken_forms_without_symbols}

        if generate_subsequences:
            # todo: do we care about the subsequences that are excluded.
            # the only one that seems relevant are the full spoken form for
            # NOTE(review): the list comes from a pipeline whose last step
            # returns a set, so which form ends up last may not be
            # deterministic - confirm.
            last_form_without_symbols = spoken_forms_without_symbols[-1]
            spoken_forms.update(
                generate_string_subsequences(
                    last_form_without_symbols,
                    words_to_exclude or [],
                    minimum_term_length,
                )
            )

        # Avoid empty spoken forms.
        return list(filter(None, spoken_forms))

    def create_spoken_forms_from_list(
        sources: list[str],
        words_to_exclude: Optional[list[str]] = None,
        minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
        generate_subsequences: bool = True,
    ) -> dict[str, str]:
        """Create spoken forms for all sources in a list, doing conflict resolution"""
        # Each source stands for itself, so delegate to the map variant.
        identity_map = {source: source for source in sources}
        return actions.user.create_spoken_forms_from_map(
            identity_map,
            words_to_exclude,
            minimum_term_length,
            generate_subsequences,
        )

    def create_spoken_forms_from_map(
        sources: Mapping[str, Any],
        words_to_exclude: Optional[list[str]] = None,
        minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
        generate_subsequences: bool = True,
    ) -> dict[str, Any]:
        """Create spoken forms for all sources in a map, doing conflict resolution"""
        # spoken form -> every source item that produced it
        candidates_by_form: defaultdict[str, list[SpeakableItem]] = defaultdict(list)
        for name, value in sources.items():
            item = SpeakableItem(name, value)
            for spoken_form in actions.user.create_spoken_forms(
                name, words_to_exclude, minimum_term_length, generate_subsequences
            ):
                candidates_by_form[spoken_form].append(item)

        # When several sources share a spoken form, the one with the shortest
        # name wins; ties go to the source seen first.
        return {
            spoken_form: min(
                candidates, key=lambda speakable_item: len(speakable_item.name)
            ).value
            for spoken_form, candidates in candidates_by_form.items()
        }