540 lines
18 KiB
Python
540 lines
18 KiB
Python
import itertools
|
|
import re
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import Any, List, Mapping, Optional
|
|
|
|
from talon import Module, actions
|
|
|
|
from .keys.symbols import symbols_for_create_spoken_forms
|
|
from .numbers.numbers import digits_map, scales, teens, tens
|
|
from .user_settings import track_csv_list
|
|
|
|
mod = Module()
|
|
|
|
DEFAULT_MINIMUM_TERM_LENGTH = 2
|
|
EXPLODE_MAX_LEN = 3
|
|
FANCY_REGULAR_EXPRESSION = r"[A-Z]?[a-z]+|[A-Z]+(?![a-z])|[0-9]+"
|
|
SYMBOLS_REGEX = "|".join(
|
|
re.escape(symbol) for symbol in set(symbols_for_create_spoken_forms.values())
|
|
)
|
|
FILE_EXTENSIONS_REGEX = r"^\b$"
|
|
file_extensions = {}
|
|
|
|
|
|
def update_regex():
|
|
global REGEX_NO_SYMBOLS
|
|
global REGEX_WITH_SYMBOLS
|
|
REGEX_NO_SYMBOLS = re.compile(
|
|
"|".join(
|
|
[
|
|
FANCY_REGULAR_EXPRESSION,
|
|
FILE_EXTENSIONS_REGEX,
|
|
]
|
|
)
|
|
)
|
|
REGEX_WITH_SYMBOLS = re.compile(
|
|
"|".join([FANCY_REGULAR_EXPRESSION, FILE_EXTENSIONS_REGEX, SYMBOLS_REGEX])
|
|
)
|
|
|
|
|
|
update_regex()
|
|
|
|
|
|
@track_csv_list("file_extensions.csv", headers=("File extension", "Name"))
|
|
def on_extensions(values):
|
|
global FILE_EXTENSIONS_REGEX
|
|
global file_extensions
|
|
file_extensions = values
|
|
FILE_EXTENSIONS_REGEX = "|".join(
|
|
re.escape(file_extension.strip()) + "$" for file_extension in values.values()
|
|
)
|
|
update_regex()
|
|
|
|
|
|
abbreviations_list = {}
|
|
|
|
|
|
@track_csv_list("abbreviations.csv", headers=("Abbreviation", "Spoken Form"))
|
|
def on_abbreviations(values):
|
|
global abbreviations_list
|
|
abbreviations_list = values
|
|
|
|
|
|
REVERSE_PRONUNCIATION_MAP = {
|
|
**{str(value): key for key, value in digits_map.items()},
|
|
**{value: key for key, value in symbols_for_create_spoken_forms.items()},
|
|
}
|
|
|
|
# begin: create the lists etc necessary for create_spoken_word_for_number
|
|
# by convention, each entry in the list has an append space... until I clean up the function
|
|
# the algorithm's expectation is slightly different from numbers.py
|
|
|
|
# ["", "one ", "two ",... "nine "] or equivalents
|
|
ones = [""] + [
|
|
REVERSE_PRONUNCIATION_MAP[str(index)] for index in range(10) if index != 0
|
|
]
|
|
|
|
# ["","","twenty","thirty","forty",..."ninety"]
|
|
# or equivalent
|
|
twenties = ["", ""] + list(tens)
|
|
# print("twenties = " + str(twenties))
|
|
|
|
thousands = [""] + [val for index, val in enumerate(scales) if index != 0]
|
|
# print("thousands = " + thousands)
|
|
# end: create the lists necessary for create_spoken_word_for_number
|
|
|
|
|
|
def create_spoken_form_for_number(num: int):
|
|
"""Creates a spoken form for an integer"""
|
|
|
|
n3 = []
|
|
r1 = ""
|
|
# create numeric string
|
|
ns = str(num)
|
|
for k in range(3, 33, 3):
|
|
r = ns[-k:]
|
|
q = len(ns) - k
|
|
# break if end of ns has been reached
|
|
if q < -2:
|
|
break
|
|
else:
|
|
if q >= 0:
|
|
n3.append(int(r[:3]))
|
|
elif q >= -1:
|
|
n3.append(int(r[:2]))
|
|
elif q >= -2:
|
|
n3.append(int(r[:1]))
|
|
r1 = r
|
|
|
|
# break each group of 3 digits into
|
|
# ones, tens/twenties, hundreds
|
|
words = []
|
|
for i, x in enumerate(n3):
|
|
b1 = x % 10
|
|
b2 = (x % 100) // 10
|
|
b3 = (x % 1000) // 100
|
|
if x == 0:
|
|
continue # skip
|
|
else:
|
|
t = thousands[i]
|
|
|
|
# print(str(b1) + ", " + str(b2) + ", " + str(b3))
|
|
if b2 == 0:
|
|
words = [ones[b1], t] + words
|
|
elif b2 == 1:
|
|
words = [teens[b1], t] + words
|
|
elif b2 > 1:
|
|
words = [twenties[b2], ones[b1], t] + words
|
|
if b3 > 0:
|
|
words = [ones[b3], scales[0]] + words
|
|
|
|
# filter out the empty strings and join
|
|
return " ".join(filter(None, words))
|
|
|
|
|
|
def create_spoken_form_years(num: str):
|
|
"""Creates spoken forms for numbers 1000 <= num <= 9999. Returns None if not supported"""
|
|
|
|
val = int(num)
|
|
if val > 9999 or val < 1000:
|
|
return None
|
|
|
|
centuries = val // 100
|
|
remainder = val % 100
|
|
|
|
words = []
|
|
|
|
if centuries % 10 != 0:
|
|
words.append(create_spoken_form_for_number(centuries))
|
|
|
|
# 1900 -> nineteen hundred
|
|
if remainder == 0:
|
|
words.append(scales[0])
|
|
else:
|
|
# 200X -> two thousand x
|
|
if remainder < 9:
|
|
words.append(REVERSE_PRONUNCIATION_MAP[str(centuries // 10)])
|
|
words.append(scales[1])
|
|
|
|
# 20XX => twenty xx
|
|
else:
|
|
words.append(create_spoken_form_for_number(str(centuries)))
|
|
|
|
if remainder != 0:
|
|
# 1906 => "nineteen six"
|
|
if remainder < 10:
|
|
# todo: decide if we want nineteen oh five"
|
|
# todo: decide if we want "and"
|
|
# both seem like a waste
|
|
# if centuries % 10 != 0:
|
|
# words.append("oh")
|
|
|
|
words.append(REVERSE_PRONUNCIATION_MAP[str(remainder)])
|
|
else:
|
|
words.append(create_spoken_form_for_number(remainder))
|
|
|
|
return " ".join(words)
|
|
|
|
|
|
# # ---------- create_spoken_form_years (uncomment to run) ----------
|
|
# def test_year(year: str, expected: str):
|
|
# result = create_spoken_form_years(year)
|
|
# print(f"test_year: test string = {year}, result = {result}, expected = {expected}")
|
|
# assert create_spoken_form_years(year) == expected
|
|
|
|
|
|
# print("************* test_year tests ******************")
|
|
# test_year("1100", "eleven hundred")
|
|
# test_year("1905", "nineteen five")
|
|
# test_year("1910", "nineteen ten")
|
|
# test_year("1925", "nineteen twenty five")
|
|
# test_year("2000", "two thousand")
|
|
# test_year("2005", "two thousand five")
|
|
# test_year("2020", "twenty twenty")
|
|
# test_year("2019", "twenty nineteen")
|
|
# test_year("2085", "twenty eighty five")
|
|
# test_year("2100", "twenty one hundred")
|
|
# test_year("2105", "twenty one five")
|
|
# test_year("9999", "ninety nine ninety nine")
|
|
# print("************* test_year tests done**************")
|
|
|
|
|
|
def create_single_spoken_form(source: str):
|
|
"""
|
|
Returns a spoken form of a string
|
|
(1) Returns the value from REVERSE_PRONUNCIATION_MAP if it exists
|
|
(2) Splits allcaps into separate letters ("ABC" -> A B C)
|
|
(3) Otherwise, returns the lower case source.
|
|
"""
|
|
normalized_source = source.lower()
|
|
try:
|
|
mapped_source = REVERSE_PRONUNCIATION_MAP[normalized_source]
|
|
except KeyError:
|
|
# Leave completely uppercase words alone, as we can deal with them later.
|
|
# Otherwise normalized the rest to help with subsequent abbreviation lookups,
|
|
# etc.
|
|
if source.isupper():
|
|
mapped_source = source
|
|
else:
|
|
mapped_source = source.lower()
|
|
return mapped_source
|
|
|
|
|
|
def create_exploded_forms(spoken_forms: List[str]):
|
|
"""Exploded common packed words into separate words"""
|
|
# TODO: This could be moved somewhere else, possibly seeded from something like
|
|
# words to replace...
|
|
packed_words = {"readme": "read me"}
|
|
|
|
new_spoken_forms = []
|
|
for line in spoken_forms:
|
|
exploded_form = []
|
|
# ex: "vm" or "usb" explodes into "V M" or "U S B"
|
|
|
|
if (
|
|
" " not in line
|
|
and line.islower()
|
|
and len(line) > 1
|
|
and len(line) <= EXPLODE_MAX_LEN
|
|
):
|
|
new_spoken_forms.append(line) # Keep a regular copy (ie: "nas")
|
|
new_spoken_forms.append(" ".join(line.upper()))
|
|
# ex: "readme" explodes into "read me"
|
|
else:
|
|
for word in line.split(" "):
|
|
if word in packed_words.keys():
|
|
exploded_form.append(packed_words[word])
|
|
else:
|
|
exploded_form.append(word)
|
|
new_spoken_forms.append(" ".join(exploded_form))
|
|
return new_spoken_forms
|
|
|
|
|
|
def create_extension_forms(spoken_forms: List[str]):
|
|
"""Add extension forms"""
|
|
new_spoken_forms = []
|
|
|
|
file_extensions_map = {v.strip(): k for k, v in file_extensions.items()}
|
|
for line in spoken_forms:
|
|
have_file_extension = False
|
|
file_extension_forms = []
|
|
dotted_extension_form = []
|
|
truncated_forms = []
|
|
for substring in line.split(" "):
|
|
# NOTE: If we ever run in to file extensions in the middle of file name, the
|
|
# truncated form is going to be busted. ie: foo.md.disabled
|
|
|
|
if substring in file_extensions_map.keys():
|
|
file_extension_forms.append(file_extensions_map[substring])
|
|
dotted_extension_form.append(REVERSE_PRONUNCIATION_MAP["."])
|
|
dotted_extension_form.append(file_extensions_map[substring])
|
|
have_file_extension = True
|
|
# purposefully down update truncated
|
|
else:
|
|
file_extension_forms.append(substring)
|
|
dotted_extension_form.append(substring)
|
|
truncated_forms.append(substring)
|
|
# print(file_extension_forms)
|
|
if have_file_extension:
|
|
new_spoken_forms.append(" ".join(file_extension_forms))
|
|
new_spoken_forms.append(" ".join(dotted_extension_form))
|
|
new_spoken_forms.append(" ".join(truncated_forms))
|
|
|
|
return set(dict.fromkeys(new_spoken_forms))
|
|
|
|
|
|
def create_cased_forms(spoken_forms: List[str]):
|
|
"""Add lower and upper case forms"""
|
|
new_spoken_forms = []
|
|
|
|
for line in spoken_forms:
|
|
lower_forms = []
|
|
upper_forms = []
|
|
# print(line)
|
|
for substring in line.split(" "):
|
|
if substring.isupper():
|
|
lower_forms.append(substring.lower())
|
|
upper_forms.append(" ".join(substring))
|
|
else:
|
|
lower_forms.append(substring)
|
|
upper_forms.append(substring)
|
|
|
|
new_spoken_forms.append(" ".join(lower_forms))
|
|
new_spoken_forms.append(" ".join(upper_forms))
|
|
|
|
return set(dict.fromkeys(new_spoken_forms))
|
|
|
|
|
|
def create_abbreviated_forms(spoken_forms: List[str]):
|
|
"""Add abbreviated case forms"""
|
|
new_spoken_forms = []
|
|
|
|
swapped_abbreviation_map = {v: k for k, v in abbreviations_list.items()}
|
|
for line in spoken_forms:
|
|
unabbreviated_forms = []
|
|
abbreviated_forms = []
|
|
for substring in line.split(" "):
|
|
if substring in swapped_abbreviation_map.keys():
|
|
abbreviated_forms.append(swapped_abbreviation_map[substring])
|
|
else:
|
|
abbreviated_forms.append(substring)
|
|
unabbreviated_forms.append(substring)
|
|
|
|
new_spoken_forms.append(" ".join(abbreviated_forms))
|
|
new_spoken_forms.append(" ".join(unabbreviated_forms))
|
|
|
|
return set(dict.fromkeys(new_spoken_forms))
|
|
|
|
|
|
def create_spoken_number_forms(source: List[str]):
|
|
"""
|
|
Create a list of spoken forms by transforming numbers in source into spoken forms.
|
|
This creates a first pass of spoken forms with numbers translated, but will go
|
|
through multiple other passes.
|
|
"""
|
|
|
|
# list of spoken forms returned
|
|
spoken_forms = []
|
|
|
|
# contains the pieces for the spoken form with individual digits
|
|
full_form_digit_wise = []
|
|
|
|
# contains the pieces for the spoken form with the spoken version of the number
|
|
full_form_fancy_numbers = []
|
|
|
|
# contains the pieces for the spoken form for years like "1900" => nineteen hundred
|
|
full_form_spoken_form_years = []
|
|
|
|
# indicates whether or not we processed created a version with the full number (>10) translated
|
|
has_fancy_number_version = False
|
|
|
|
# indicates whether or not we processed created a version with the year-like ("1900" => nineteen hundred) numbers
|
|
has_spoken_form_years = False
|
|
|
|
for substring in source:
|
|
# for piece in pieces:
|
|
# substring = piece.group(0)
|
|
length = len(substring)
|
|
|
|
# the length is currently capped at 31 digits
|
|
if length > 1 and length <= 31 and substring.isnumeric():
|
|
has_fancy_number_version = True
|
|
val = int(substring)
|
|
spoken_form_years = create_spoken_form_years(val)
|
|
spoken_form = create_spoken_form_for_number(val)
|
|
|
|
if spoken_form_years:
|
|
has_spoken_form_years = True
|
|
full_form_spoken_form_years.append(spoken_form_years)
|
|
else:
|
|
full_form_spoken_form_years.append(spoken_form)
|
|
|
|
full_form_fancy_numbers.append(spoken_form)
|
|
|
|
# build the serial digit version
|
|
for digit in substring:
|
|
full_form_digit_wise.append(create_single_spoken_form(digit))
|
|
|
|
else:
|
|
spoken_form = create_single_spoken_form(substring)
|
|
full_form_digit_wise.append(spoken_form)
|
|
full_form_fancy_numbers.append(spoken_form)
|
|
full_form_spoken_form_years.append(spoken_form)
|
|
|
|
if has_fancy_number_version:
|
|
spoken_forms.append(" ".join(full_form_fancy_numbers))
|
|
|
|
if has_spoken_form_years:
|
|
result = " ".join(full_form_spoken_form_years)
|
|
if result not in spoken_forms:
|
|
spoken_forms.append(result)
|
|
|
|
spoken_forms.append(" ".join(full_form_digit_wise))
|
|
return set(dict.fromkeys(spoken_forms))
|
|
|
|
|
|
def create_spoken_forms_from_regex(source: str, pattern: re.Pattern):
|
|
"""
|
|
Creates a list of spoken forms for source using the provided regex pattern.
|
|
For numeric pieces detected by the regex, generates both digit-wise and full
|
|
spoken forms for the numbers where appropriate.
|
|
"""
|
|
source_without_apostrophes = source.replace("'", "")
|
|
pieces = list(pattern.finditer(source_without_apostrophes))
|
|
spoken_forms = list(map(lambda x: x.group(0), pieces))
|
|
|
|
# NOTE: Order is sometimes important
|
|
transforms = [
|
|
create_spoken_number_forms,
|
|
create_extension_forms,
|
|
create_cased_forms,
|
|
create_exploded_forms,
|
|
create_abbreviated_forms,
|
|
create_extension_forms,
|
|
]
|
|
|
|
for func in transforms:
|
|
spoken_forms = func(spoken_forms)
|
|
|
|
return list(dict.fromkeys(spoken_forms))
|
|
|
|
|
|
def generate_string_subsequences(
|
|
source: str,
|
|
words_to_exclude: list[str],
|
|
minimum_term_length: int,
|
|
):
|
|
# Includes (lower-cased):
|
|
# 1. Each word in source, eg "foo bar baz" -> "foo", "bar", "baz".
|
|
# 2. Each leading subsequence of words from source,
|
|
# eg "foo bar baz" -> "foo", "foo bar", "foo bar baz"
|
|
# (but not "bar baz" - TODO: is this intentional?)
|
|
#
|
|
# Except for:
|
|
# 3. strings shorter than minimum_term_length
|
|
# 4. strings in words_to_exclude.
|
|
term_sequence = source.split(" ")
|
|
terms = {
|
|
# WARNING: This .lower() version creates unwanted duplication of broken up
|
|
# uppercase words, eg 'R E A D M E' -> 'r e a d m e'. Everything else should be
|
|
# lower case already
|
|
# term.lower().strip()
|
|
term.strip()
|
|
for term in (
|
|
term_sequence
|
|
+ list(itertools.accumulate([f"{term} " for term in term_sequence]))
|
|
)
|
|
}
|
|
return [
|
|
term
|
|
for term in terms
|
|
if (term not in words_to_exclude and len(term) >= minimum_term_length)
|
|
]
|
|
|
|
|
|
@dataclass
|
|
class SpeakableItem:
|
|
name: str
|
|
value: Any
|
|
|
|
|
|
@mod.action_class
|
|
class Actions:
|
|
def create_spoken_forms(
|
|
source: str,
|
|
words_to_exclude: Optional[list[str]] = None,
|
|
minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
|
|
generate_subsequences: bool = True,
|
|
) -> list[str]:
|
|
"""Create spoken forms for a given source"""
|
|
|
|
spoken_forms_without_symbols = create_spoken_forms_from_regex(
|
|
source, REGEX_NO_SYMBOLS
|
|
)
|
|
|
|
# todo: this could probably be optimized out if there's no symbols
|
|
spoken_forms_with_symbols = create_spoken_forms_from_regex(
|
|
source, REGEX_WITH_SYMBOLS
|
|
)
|
|
|
|
# some may be identical, so ensure the list is reduced
|
|
spoken_forms = set(spoken_forms_with_symbols + spoken_forms_without_symbols)
|
|
|
|
# only generate the subsequences if requested
|
|
if generate_subsequences:
|
|
# todo: do we care about the subsequences that are excluded.
|
|
# the only one that seems relevant are the full spoken form for
|
|
spoken_forms.update(
|
|
generate_string_subsequences(
|
|
spoken_forms_without_symbols[-1],
|
|
words_to_exclude or [],
|
|
minimum_term_length,
|
|
)
|
|
)
|
|
|
|
# Avoid empty spoken forms.
|
|
return [x for x in spoken_forms if x]
|
|
|
|
def create_spoken_forms_from_list(
|
|
sources: list[str],
|
|
words_to_exclude: Optional[list[str]] = None,
|
|
minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
|
|
generate_subsequences: bool = True,
|
|
) -> dict[str, str]:
|
|
"""Create spoken forms for all sources in a list, doing conflict resolution"""
|
|
return actions.user.create_spoken_forms_from_map(
|
|
{source: source for source in sources},
|
|
words_to_exclude,
|
|
minimum_term_length,
|
|
generate_subsequences,
|
|
)
|
|
|
|
def create_spoken_forms_from_map(
|
|
sources: Mapping[str, Any],
|
|
words_to_exclude: Optional[list[str]] = None,
|
|
minimum_term_length: int = DEFAULT_MINIMUM_TERM_LENGTH,
|
|
generate_subsequences: bool = True,
|
|
) -> dict[str, Any]:
|
|
"""Create spoken forms for all sources in a map, doing conflict resolution"""
|
|
all_spoken_forms: defaultdict[str, list[SpeakableItem]] = defaultdict(list)
|
|
|
|
for name, value in sources.items():
|
|
spoken_forms = actions.user.create_spoken_forms(
|
|
name, words_to_exclude, minimum_term_length, generate_subsequences
|
|
)
|
|
for spoken_form in spoken_forms:
|
|
all_spoken_forms[spoken_form].append(SpeakableItem(name, value))
|
|
|
|
final_spoken_forms = {}
|
|
for spoken_form, spoken_form_sources in all_spoken_forms.items():
|
|
if len(spoken_form_sources) > 1:
|
|
final_spoken_forms[spoken_form] = min(
|
|
spoken_form_sources,
|
|
key=lambda speakable_item: len(speakable_item.name),
|
|
).value
|
|
else:
|
|
final_spoken_forms[spoken_form] = spoken_form_sources[0].value
|
|
|
|
return final_spoken_forms
|