talon-scripts/community/core/snippets/snippets_parser.py

import logging
import re
from copy import deepcopy
from pathlib import Path
from typing import Callable, Union

from .snippet_types import Snippet, SnippetVariable


class SnippetDocument:
    file: str
    line_doc: int
    line_body: int
    variables: list[SnippetVariable] = []
    name: str | None = None
    description: str | None = None
    phrases: list[str] | None = None
    insertionScopes: list[str] | None = None
    languages: list[str] | None = None
    body: str | None = None

    def __init__(self, file: str, line_doc: int, line_body: int):
        self.file = file
        self.line_doc = line_doc
        self.line_body = line_body


def create_snippets_from_file(file: Path) -> list[Snippet]:
    documents = parse_file(file)
    return create_snippets(documents)


def create_snippets(documents: list[SnippetDocument]) -> list[Snippet]:
    if len(documents) == 0:
        return []

    if documents[0].body is None:
        default_context = documents[0]
        documents = documents[1:]
    else:
        default_context = SnippetDocument("", -1, -1)

    snippets: list[Snippet] = []

    for doc in documents:
        snippet = create_snippet(doc, default_context)
        if snippet:
            snippets.append(snippet)

    return snippets


def create_snippet(
    document: SnippetDocument,
    default_context: SnippetDocument,
) -> Snippet | None:
    body = normalize_snippet_body_tabs(document.body)
    variables = combine_variables(default_context.variables, document.variables)
    body, variables = add_final_stop_to_snippet_body(body, variables)

    snippet = Snippet(
        name=document.name or default_context.name or "",
        description=document.description or default_context.description,
        languages=document.languages or default_context.languages,
        phrases=document.phrases or default_context.phrases,
        insertion_scopes=document.insertionScopes or default_context.insertionScopes,
        variables=variables,
        body=body,
    )

    if not validate_snippet(document, snippet):
        return None

    return snippet


def validate_snippet(document: SnippetDocument, snippet: Snippet) -> bool:
    is_valid = True

    if not snippet.name:
        error(document.file, document.line_doc, "Missing snippet name")
        is_valid = False

    if snippet.variables is None:
        error(document.file, document.line_doc, "Missing snippet variables")
        return False

    for variable in snippet.variables:
        var_name = f"${variable.name}"
        if not is_variable_in_body(variable.name, snippet.body):
            error(
                document.file,
                document.line_body,
                f"Variable '{var_name}' missing in body '{snippet.body}'",
            )
            is_valid = False

        if variable.insertion_formatters is not None and snippet.phrases is None:
            error(
                document.file,
                document.line_doc,
                f"Snippet phrase required when using '{var_name}.insertionFormatter'",
            )
            is_valid = False

        if variable.wrapper_scope is not None and variable.wrapper_phrases is None:
            error(
                document.file,
                document.line_doc,
                f"'{var_name}.wrapperPhrase' required when using '{var_name}.wrapperScope'",
            )
            is_valid = False

    return is_valid


def is_variable_in_body(variable_name: str, body: str) -> bool:
    return (
        re.search(create_variable_regular_expression(variable_name), body) is not None
    )


def create_variable_regular_expression(variable_name: str) -> str:
    # $value or ${value} or ${value:default}
    # *? is used to find the smallest possible match.
    # This stops multiple stops from being treated as a single stop.
    return rf"\${variable_name}|\${{{variable_name}.*?}}"


def combine_variables(
    default_variables: list[SnippetVariable],
    document_variables: list[SnippetVariable],
) -> list[SnippetVariable]:
    variables: dict[str, SnippetVariable] = {}

    for variable in [*default_variables, *document_variables]:
        if variable.name not in variables:
            variables[variable.name] = SnippetVariable(variable.name)

        new_variable = variables[variable.name]

        if variable.insertion_formatters is not None:
            new_variable.insertion_formatters = variable.insertion_formatters

        if variable.wrapper_phrases is not None:
            new_variable.wrapper_phrases = variable.wrapper_phrases

        if variable.wrapper_scope is not None:
            new_variable.wrapper_scope = variable.wrapper_scope

    return list(variables.values())


def add_final_stop_to_snippet_body(
    body: str, variables: list[SnippetVariable]
) -> tuple[str, list[SnippetVariable]]:
    """Make the snippet body end with stop $0 to allow exiting the snippet with `snip next`.
    If the snippet has a stop named `0`, it will get replaced with the largest number of a snippet variable name
    plus 1 with the original variable metadata for stop `0` now associated with the replacement.
    """
    if body:
        final_stop_matches = find_variable_matches("0", body)

        # Only make a change if the snippet body does not end with a final stop.
        if not (
            len(final_stop_matches) > 0 and final_stop_matches[-1].end() == len(body)
        ):
            biggest_variable_number: int | None = find_largest_variable_number(body)
            if biggest_variable_number is not None:
                replacement_name = str(biggest_variable_number + 1)
                body = replace_final_stop(body, replacement_name, final_stop_matches)
                variables = replace_variables_for_final_stop(
                    variables, replacement_name
                )
            body += "$0"

    return body, variables


def replace_final_stop(body: str, replacement_name: str, final_stop_matches) -> str:
    # Dealing with matches in reverse means replacing a match
    # does not change the location of the remaining matches.
    for match in reversed(final_stop_matches):
        replacement = match.group().replace("0", replacement_name, 1)
        body = body[: match.start()] + replacement + body[match.end() :]
    return body


def replace_variables_for_final_stop(variables, replacement_name: str):
    variables_clone = deepcopy(variables)
    for variable in variables_clone:
        if variable.name == "0":
            variable.name = replacement_name
    return variables_clone


def find_variable_matches(variable_name: str, body: str) -> list[re.Match[str]]:
    """Find every match of a variable in the body"""
    expression = create_variable_regular_expression(variable_name)
    matches = [m for m in re.finditer(expression, body)]
    return matches


def find_largest_variable_number(body: str) -> int | None:
    # Find all snippet stops with a numeric variable name
    # +? is used to find the smallest possible match.
    # We need this here to avoid treating multiple stops as a single one
    regular_expression = rf"\$\d+?|\${{\d+?:.*?}}|\${{\d+?}}"
    matches = re.findall(regular_expression, body)
    if matches:
        numbers = [
            compute_first_integer_in_string(match)
            for match in matches
            if match is not None
        ]
        if numbers:
            return max(numbers)
    return None


def compute_first_integer_in_string(text: str) -> int | None:
    start_index: int | None = None
    ending_index: int | None = None
    for i, char in enumerate(text):
        if char.isdigit():
            if start_index is None:
                start_index = i
            ending_index = i + 1
        elif start_index is not None:
            break
    if start_index is not None:
        integer_text = text[start_index:ending_index]
        return int(integer_text)
    return None


def normalize_snippet_body_tabs(body: str | None) -> str:
    if not body:
        return ""

    # If snippet body already contains tabs. No change.
    if "\t" in body:
        return body

    lines = []
    smallest_indentation = None

    for line in body.splitlines():
        match = re.search(r"^\s+", line)
        indentation = match.group() if match is not None else ""

        # Keep track of smallest non-empty indentation
        if len(indentation) > 0 and (
            smallest_indentation is None or len(indentation) < len(smallest_indentation)
        ):
            smallest_indentation = indentation

        lines.append({"indentation": indentation, "rest": line[len(indentation) :]})

    # No indentation found in snippet body. No change.
    if smallest_indentation is None:
        return body

    normalized_lines = [
        reconstruct_line(smallest_indentation, line["indentation"], line["rest"])
        for line in lines
    ]

    return "\n".join(normalized_lines)


def reconstruct_line(smallest_indentation: str, indentation: str, rest: str) -> str:
    # Update indentation by replacing each occurrent of smallest space indentation with a tab
    indentation = indentation.replace(smallest_indentation, "\t")
    return f"{indentation}{rest}"


# ---------- Snippet file parser ----------


def parse_file(file: Path) -> list[SnippetDocument]:
    with open(file, encoding="utf-8") as f:
        content = f.read()
    return parse_file_content(file.name, content)


def parse_file_content(file: str, text: str) -> list[SnippetDocument]:
    doc_texts = re.split(r"^---\n?$", text, flags=re.MULTILINE)
    documents: list[SnippetDocument] = []
    line = 0

    for i, doc_text in enumerate(doc_texts):
        optional_body = i == 0 and len(doc_texts) > 1
        document = parse_document(file, line, optional_body, doc_text)
        if document is not None:
            documents.append(document)
        line += doc_text.count("\n") + 1

    return documents


def parse_document(
    file: str,
    line: int,
    optional_body: bool,
    text: str,
) -> Union[SnippetDocument, None]:
    parts = re.split(r"^-$", text, maxsplit=1, flags=re.MULTILINE)
    line_body = line + parts[0].count("\n") + 1
    org_doc = SnippetDocument(file, line, line_body)
    document = parse_context(file, line, org_doc, parts[0])

    if len(parts) == 2:
        body = parse_body(parts[1])
        if body is not None:
            if document is None:
                document = org_doc
            document.body = body

    if document and not document.body and not optional_body:
        error(file, line, f"Missing body in snippet document '{text}'")
        return None

    return document


def parse_context(
    file: str,
    line: int,
    document: SnippetDocument,
    text: str,
) -> Union[SnippetDocument, None]:
    lines = [l.strip() for l in text.splitlines()]
    keys: set[str] = set()
    variables: dict[str, SnippetVariable] = {}

    def get_variable(name: str) -> SnippetVariable:
        if name not in variables:
            variables[name] = SnippetVariable(name)
        return variables[name]

    for i, line_text in enumerate(lines):
        if line_text:
            parse_context_line(
                file,
                line + i,
                document,
                keys,
                get_variable,
                line_text,
            )

    if len(keys) == 0:
        return None

    document.variables = list(variables.values())

    return document


def parse_context_line(
    file: str,
    line: int,
    document: SnippetDocument,
    keys: set[str],
    get_variable: Callable[[str], SnippetVariable],
    text: str,
):
    parts = text.split(":")

    if len(parts) != 2:
        error(file, line, f"Invalid line '{text}'")
        return

    key = parts[0].strip()
    value = parts[1].strip()

    if not key or not value:
        error(file, line, f"Invalid line '{text}'")
        return

    if key in keys:
        warn(file, line, f"Duplicate key '{key}'")

    keys.add(key)

    match key:
        case "name":
            document.name = value
        case "description":
            document.description = value
        case "phrase":
            document.phrases = parse_vector_value(value)
        case "insertionScope":
            document.insertionScopes = parse_vector_value(value)
        case "language":
            document.languages = parse_vector_value(value)
        case _:
            if key.startswith("$"):
                parse_variable(file, line, get_variable, key, value)
            else:
                warn(file, line, f"Unknown key '{key}'")


def parse_variable(
    file: str,
    line_numb: int,
    get_variable: Callable[[str], SnippetVariable],
    key: str,
    value: str,
):
    parts = key.split(".")

    if len(parts) != 2:
        error(file, line_numb, f"Invalid variable key '{key}'")
        return

    name = parts[0][1:]
    field = parts[1]

    match field:
        case "insertionFormatter":
            get_variable(name).insertion_formatters = parse_vector_value(value)
        case "wrapperPhrase":
            get_variable(name).wrapper_phrases = parse_vector_value(value)
        case "wrapperScope":
            get_variable(name).wrapper_scope = value
        case _:
            warn(file, line_numb, f"Unknown variable key '{key}'")


def parse_body(text: str) -> Union[str, None]:
    # Find first line that is not empty. Preserve indentation.
    match_leading = re.search(r"^[ \t]*\S", text, flags=re.MULTILINE)

    if match_leading is None:
        return None

    return text[match_leading.start() :].rstrip()


def parse_vector_value(value: str) -> list[str]:
    return [v.strip() for v in value.split("|")]


def error(file: str, line: int, message: str):
    logging.error(f"{file}:{line+1} | {message}")


def warn(file: str, line: int, message: str):
    logging.warning(f"{file}:{line+1} | {message}")