defmodule CljElixir.Reader do
  @moduledoc """
  Reader for CljElixir: tokenizes source text and parses it into CljElixir AST.

  The reader has two phases:

  1. Tokenizer — converts source text into a flat list of tokens
  2. Parser — recursive descent over the token list, producing CljElixir AST nodes

  ## AST representation

  Literals are themselves: integers, floats, strings, booleans, nil, atoms (keywords).

  Compound forms use tagged tuples:

      {:symbol, meta, name}
      {:list, meta, [elements]}
      {:vector, meta, [elements]}
      {:map, meta, [k1, v1, k2, v2, ...]}
      {:set, meta, [elements]}
      {:tuple, meta, [elements]}
      {:regex, meta, pattern}
      {:quote, meta, form}
      {:with_meta, meta, {metadata, target}}
      {:anon_fn, meta, body}
      {:quasiquote, meta, form}
      {:unquote, meta, form}
      {:splice_unquote, meta, form}
      {:deref, meta, form}
  """

  alias CljElixir.Reader.Token

  # ── Public API ──────────────────────────────────────────────────────

  @doc """
  Read a string of CljElixir source into a list of AST forms.

  Returns `{:ok, [form]}` on success, `{:error, message}` on failure.
  """
  @spec read_string(String.t()) :: {:ok, list()} | {:error, String.t()}
  def read_string(source) when is_binary(source) do
    case tokenize(source) do
      {:ok, tokens} -> parse_all(tokens, [])
      {:error, _} = err -> err
    end
  end

  # ════════════════════════════════════════════════════════════════════
  # TOKENIZER
  # ════════════════════════════════════════════════════════════════════

  # Converts source text into `{:ok, [%Token{}]}` or `{:error, message}`,
  # tracking 1-based line/col positions for every token.
  @doc false
  def tokenize(source) do
    chars = String.to_charlist(source)
    tokenize_loop(chars, 1, 1, [])
  end

  # ---------- end of input ----------
  defp tokenize_loop([], _line, _col, acc), do: {:ok, Enum.reverse(acc)}

  # ---------- newline ----------
  # \n alone, \r\n pairs, and lone \r each count as exactly one newline.
  defp tokenize_loop([?\n | rest], line, _col, acc),
    do: tokenize_loop(rest, line + 1, 1, acc)

  defp tokenize_loop([?\r, ?\n | rest], line, _col, acc),
    do: tokenize_loop(rest, line + 1, 1, acc)

  defp tokenize_loop([?\r | rest], line, _col, acc),
    do: tokenize_loop(rest, line + 1, 1, acc)

  # ---------- whitespace / commas (commas are whitespace, Clojure-style) ----------
  defp tokenize_loop([c | rest], line, col, acc) when c in [?\s, ?\t, ?,],
    do: tokenize_loop(rest, line, col + 1, acc)

  # ---------- comments ----------
  defp tokenize_loop([?; | rest], line, _col, acc) do
    rest = skip_comment(rest)
    # skip_comment stops at (but does not consume) the newline or EOF.
    # Let the main loop's newline handler increment line/col.
    tokenize_loop(rest, line, 1, acc)
  end

  # ---------- strings ----------
  defp tokenize_loop([?" | rest], line, col, acc) do
    case read_string_literal(rest, line, col + 1, []) do
      {:ok, value, rest2, end_line, end_col} ->
        token = %Token{type: :string, value: value, line: line, col: col}
        tokenize_loop(rest2, end_line, end_col, [token | acc])

      {:error, msg} ->
        {:error, msg}
    end
  end

  # ---------- dispatch sequences: #{ #el[ #( #" ----------
  # These must precede the single-character clauses for #, {, (, and ".
  defp tokenize_loop([?#, ?e, ?l, ?[ | rest], line, col, acc) do
    token = %Token{type: :hash_el_lbracket, value: "#el[", line: line, col: col}
    tokenize_loop(rest, line, col + 4, [token | acc])
  end

  defp tokenize_loop([?#, ?{ | rest], line, col, acc) do
    token = %Token{type: :hash_lbrace, value: "\#{", line: line, col: col}
    tokenize_loop(rest, line, col + 2, [token | acc])
  end

  defp tokenize_loop([?#, ?( | rest], line, col, acc) do
    token = %Token{type: :hash_lparen, value: "#(", line: line, col: col}
    tokenize_loop(rest, line, col + 2, [token | acc])
  end

  defp tokenize_loop([?#, ?" | rest], line, col, acc) do
    case read_string_literal(rest, line, col + 2, []) do
      {:ok, value, rest2, end_line, end_col} ->
        token = %Token{type: :hash_string, value: value, line: line, col: col}
        tokenize_loop(rest2, end_line, end_col, [token | acc])

      {:error, msg} ->
        {:error, msg}
    end
  end

  # ---------- splice-unquote ~@ (must come before unquote ~) ----------
  defp tokenize_loop([?~, ?@ | rest], line, col, acc) do
    token = %Token{type: :splice_unquote, value: "~@", line: line, col: col}
    tokenize_loop(rest, line, col + 2, [token | acc])
  end

  # ---------- unquote ~ ----------
  defp tokenize_loop([?~ | rest], line, col, acc) do
    token = %Token{type: :unquote, value: "~", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- delimiters ----------
  defp tokenize_loop([?( | rest], line, col, acc) do
    token = %Token{type: :lparen, value: "(", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  defp tokenize_loop([?) | rest], line, col, acc) do
    token = %Token{type: :rparen, value: ")", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  defp tokenize_loop([?[ | rest], line, col, acc) do
    token = %Token{type: :lbracket, value: "[", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  defp tokenize_loop([?] | rest], line, col, acc) do
    token = %Token{type: :rbracket, value: "]", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  defp tokenize_loop([?{ | rest], line, col, acc) do
    token = %Token{type: :lbrace, value: "{", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  defp tokenize_loop([?} | rest], line, col, acc) do
    token = %Token{type: :rbrace, value: "}", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- quote ' ----------
  defp tokenize_loop([?' | rest], line, col, acc) do
    token = %Token{type: :quote, value: "'", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- quasiquote ` ----------
  defp tokenize_loop([?` | rest], line, col, acc) do
    token = %Token{type: :quasiquote, value: "`", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- metadata ^ ----------
  defp tokenize_loop([?^ | rest], line, col, acc) do
    token = %Token{type: :meta, value: "^", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- deref @ ----------
  defp tokenize_loop([?@ | rest], line, col, acc) do
    token = %Token{type: :deref, value: "@", line: line, col: col}
    tokenize_loop(rest, line, col + 1, [token | acc])
  end

  # ---------- keywords ----------
  defp tokenize_loop([?: | rest], line, col, acc) do
    case read_keyword(rest, line, col) do
      {:ok, kw_value, rest2, end_line, end_col} ->
        token = %Token{type: :keyword, value: kw_value, line: line, col: col}
        tokenize_loop(rest2, end_line, end_col, [token | acc])

      {:error, msg} ->
        {:error, msg}
    end
  end

  # ---------- negative numbers: - ----------
  # Since whitespace is always consumed before reaching tokenize_loop,
  # a standalone `-` followed by a digit is always a negative number literal.
  # The `-` inside symbol names (like `my-func`) is consumed by the symbol reader
  # and never reaches this clause as a standalone character.
  defp tokenize_loop([?- | rest], line, col, acc) do
    if starts_with_digit?(rest) do
      {:ok, token, rest2, end_col} = read_number(rest, line, col + 1, [?-])
      # read_number positioned the token at the first digit; re-anchor it at the sign.
      token = %{token | line: line, col: col}
      tokenize_loop(rest2, line, end_col, [token | acc])
    else
      # It's a symbol starting with -
      case read_symbol([?- | rest], line, col) do
        {:ok, token, rest2, end_col} ->
          tokenize_loop(rest2, line, end_col, [token | acc])
      end
    end
  end

  # ---------- numbers ----------
  defp tokenize_loop([c | _] = chars, line, col, acc) when c in ?0..?9 do
    {:ok, token, rest2, end_col} = read_number(chars, line, col, [])
    tokenize_loop(rest2, line, end_col, [token | acc])
  end

  # ---------- symbols (and true/false/nil) ----------
  defp tokenize_loop([c | _] = chars, line, col, acc)
       when c in ?a..?z or c in ?A..?Z or c == ?_ or c == ?* or c == ?! or c == ?? or
              c == ?< or c == ?> or c == ?= or c == ?+ or c == ?. or c == ?& or c == ?% do
    case read_symbol(chars, line, col) do
      {:ok, token, rest, end_col} ->
        tokenize_loop(rest, line, end_col, [token | acc])
    end
  end

  # ---------- catch-all: unexpected character ----------
  defp tokenize_loop([c | _], line, col, _acc) do
    # Render the offending codepoint as UTF-8 text. (The previous
    # interpolation, `#{<>}`, was invalid syntax — `<>` needs operands —
    # so this clause did not even compile.)
    {:error, "Unexpected character '#{<<c::utf8>>}' at line #{line}, col #{col}"}
  end

  # ── Tokenizer helpers ───────────────────────────────────────────────

  # Characters that can continue a symbol (after the start)
  defp symbol_continue_char?(c) when c in ?a..?z, do: true
  defp symbol_continue_char?(c) when c in ?A..?Z, do: true
  defp symbol_continue_char?(c) when c in ?0..?9, do: true

  defp symbol_continue_char?(c)
       when c in [?_, ?*, ?!, ??, ?<, ?>, ?=, ?+, ?-, ?/, ?., ?%, ?&, ?#],
       do: true

  defp symbol_continue_char?(_), do: false

  defp starts_with_digit?([c | _]) when c in ?0..?9, do: true
  defp starts_with_digit?(_), do: false

  # Advances past comment text; stops at (does not consume) \n, \r, or EOF.
  defp skip_comment([?\n | _] = rest), do: rest
  defp skip_comment([?\r | _] = rest), do: rest
  defp skip_comment([]), do: []
  defp skip_comment([_ | rest]), do: skip_comment(rest)

  # ── String literal reader ──────────────────────────────────────────

  # Reads up to the closing quote, resolving \" \\ \n \t \r escapes.
  # Returns {:ok, string, rest, end_line, end_col} — strings may span lines,
  # so the caller must adopt end_line/end_col. Unknown escapes (e.g. \x) are
  # kept literally: the backslash and the next char fall through to the
  # generic clause one character at a time.
  defp read_string_literal([], line, _col, _acc),
    do: {:error, "Unterminated string starting at line #{line}"}

  defp read_string_literal([?" | rest], line, col, acc),
    do: {:ok, IO.chardata_to_string(Enum.reverse(acc)), rest, line, col + 1}

  defp read_string_literal([?\\, ?" | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 2, [?" | acc])

  defp read_string_literal([?\\, ?\\ | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 2, [?\\ | acc])

  defp read_string_literal([?\\, ?n | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 2, [?\n | acc])

  defp read_string_literal([?\\, ?t | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 2, [?\t | acc])

  defp read_string_literal([?\\, ?r | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 2, [?\r | acc])

  defp read_string_literal([?\n | rest], line, _col, acc),
    do: read_string_literal(rest, line + 1, 1, [?\n | acc])

  defp read_string_literal([c | rest], line, col, acc),
    do: read_string_literal(rest, line, col + 1, [c | acc])

  # ── Keyword reader ─────────────────────────────────────────────────

  # Both clauses return {:ok, atom, rest, end_line, end_col} or {:error, msg}.
  # end_line matters because a quoted keyword may span multiple lines.
  # (Previously the quoted-keyword clause discarded the end line, leaving the
  # tokenizer's line counter stale after a multi-line :"..." keyword.)
  #
  # NOTE(review): String.to_atom/1 creates atoms dynamically — atoms are never
  # garbage-collected, so reading untrusted source is a memory-exhaustion
  # vector. Fine for trusted input; consider interning or a limit otherwise.

  # Quoted keyword: :"some-name"
  defp read_keyword([?" | rest], line, col) do
    case read_string_literal(rest, line, col + 2, []) do
      {:ok, value, rest2, end_line, end_col} ->
        {:ok, String.to_atom(value), rest2, end_line, end_col}

      {:error, msg} ->
        {:error, msg}
    end
  end

  # Regular keyword: :name, :my-key, :ok
  defp read_keyword(chars, line, col) do
    {name_chars, rest} = take_keyword_chars(chars, [])

    case name_chars do
      [] ->
        {:error, "Expected keyword name after ':'"}

      _ ->
        name = IO.chardata_to_string(Enum.reverse(name_chars))
        atom_val = String.to_atom(name)
        # +1 for the leading ':' itself
        {:ok, atom_val, rest, line, col + 1 + length(name_chars)}
    end
  end

  defp take_keyword_chars([c | rest], acc)
       when c in ?a..?z or c in ?A..?Z or c in ?0..?9 or
              c in [?_, ?-, ?!, ??, ?., ?/, ?*, ?+, ?>, ?<, ?=, ?&, ?#],
       do: take_keyword_chars(rest, [c | acc])

  defp take_keyword_chars(rest, acc), do: {acc, rest}

  # ── Number reader ──────────────────────────────────────────────────

  # Reads an integer or float token. `prefix` is a (reversed) charlist seed,
  # used for the '-' sign of negative literals; end_col is computed relative
  # to `col`, which already accounts for the prefix characters consumed by
  # the caller (hence the `- length(prefix)` correction).
  defp read_number(chars, line, col, prefix) do
    {digit_chars, rest} = take_digits(chars, prefix)

    case rest do
      [?. | after_dot] ->
        case after_dot do
          [d | _] when d in ?0..?9 ->
            {frac_chars, rest2} = take_digits(after_dot, [?. | digit_chars])
            str = IO.chardata_to_string(Enum.reverse(frac_chars))
            {float_val, ""} = Float.parse(str)
            end_col = col + String.length(str) - length(prefix)
            token = %Token{type: :float, value: float_val, line: line, col: col}
            {:ok, token, rest2, end_col}

          _ ->
            # dot not followed by digit — just an integer, leave dot for next token
            str = IO.chardata_to_string(Enum.reverse(digit_chars))
            {int_val, ""} = Integer.parse(str)
            end_col = col + String.length(str) - length(prefix)
            token = %Token{type: :integer, value: int_val, line: line, col: col}
            {:ok, token, rest, end_col}
        end

      _ ->
        str = IO.chardata_to_string(Enum.reverse(digit_chars))
        {int_val, ""} = Integer.parse(str)
        end_col = col + String.length(str) - length(prefix)
        token = %Token{type: :integer, value: int_val, line: line, col: col}
        {:ok, token, rest, end_col}
    end
  end

  defp take_digits([c | rest], acc) when c in ?0..?9, do: take_digits(rest, [c | acc])
  defp take_digits(rest, acc), do: {acc, rest}

  # ── Symbol reader ──────────────────────────────────────────────────

  # Reads a symbol; the names true/false/nil become literal tokens instead.
  defp read_symbol(chars, line, col) do
    {sym_chars, rest} = take_symbol_chars(chars, [])
    name = IO.chardata_to_string(Enum.reverse(sym_chars))
    end_col = col + String.length(name)

    token =
      case name do
        "true" -> %Token{type: :boolean, value: true, line: line, col: col}
        "false" -> %Token{type: :boolean, value: false, line: line, col: col}
        "nil" -> %Token{type: :nil, value: nil, line: line, col: col}
        _ -> %Token{type: :symbol, value: name, line: line, col: col}
      end

    {:ok, token, rest, end_col}
  end

  defp take_symbol_chars([c | rest], acc) do
    if (acc == [] && symbol_start_char?(c)) || (acc != [] && symbol_continue_char?(c)) do
      take_symbol_chars(rest, [c | acc])
    else
      {acc, [c | rest]}
    end
  end

  defp take_symbol_chars([], acc), do: {acc, []}

  defp symbol_start_char?(c) when c in ?a..?z, do: true
  defp symbol_start_char?(c) when c in ?A..?Z, do: true
  defp symbol_start_char?(c) when c in [?_, ?*, ?!, ??, ?<, ?>, ?=, ?+, ?-, ?., ?&, ?%], do: true
  defp symbol_start_char?(_), do: false

  # ════════════════════════════════════════════════════════════════════
  # PARSER — Recursive Descent
  # ════════════════════════════════════════════════════════════════════

  # Parse all top-level forms until tokens are exhausted
  defp parse_all([], acc), do: {:ok, Enum.reverse(acc)}

  defp parse_all(tokens, acc) do
    case parse_form(tokens) do
      {:ok, form, rest} -> parse_all(rest, [form | acc])
      {:error, _} = err -> err
    end
  end

  # ── Parse a single form ────────────────────────────────────────────
  # Each clause returns {:ok, ast, remaining_tokens} or {:error, msg}.

  # Literals — represented as themselves, no wrapper tuple
  defp parse_form([%Token{type: :integer, value: v} | rest]), do: {:ok, v, rest}
  defp parse_form([%Token{type: :float, value: v} | rest]), do: {:ok, v, rest}
  defp parse_form([%Token{type: :string, value: v} | rest]), do: {:ok, v, rest}
  defp parse_form([%Token{type: :keyword, value: v} | rest]), do: {:ok, v, rest}
  defp parse_form([%Token{type: :boolean, value: v} | rest]), do: {:ok, v, rest}
  defp parse_form([%Token{type: :nil} | rest]), do: {:ok, nil, rest}

  # Symbol
  defp parse_form([%Token{type: :symbol, value: name, line: l, col: c} | rest]),
    do: {:ok, {:symbol, %{line: l, col: c}, name}, rest}

  # List ( ... )
  defp parse_form([%Token{type: :lparen, line: l, col: c} | rest]) do
    case parse_until(rest, :rparen) do
      {:ok, elements, rest2} -> {:ok, {:list, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # Vector [ ... ]
  defp parse_form([%Token{type: :lbracket, line: l, col: c} | rest]) do
    case parse_until(rest, :rbracket) do
      {:ok, elements, rest2} -> {:ok, {:vector, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # Map { ... } — elements are a flat [k1, v1, k2, v2, ...] list
  defp parse_form([%Token{type: :lbrace, line: l, col: c} | rest]) do
    case parse_until(rest, :rbrace) do
      {:ok, elements, rest2} -> {:ok, {:map, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # Set #{ ... }
  defp parse_form([%Token{type: :hash_lbrace, line: l, col: c} | rest]) do
    case parse_until(rest, :rbrace) do
      {:ok, elements, rest2} -> {:ok, {:set, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # BEAM tuple #el[ ... ]
  defp parse_form([%Token{type: :hash_el_lbracket, line: l, col: c} | rest]) do
    case parse_until(rest, :rbracket) do
      {:ok, elements, rest2} -> {:ok, {:tuple, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # Anonymous function #( ... ) — the body is kept as a list form
  defp parse_form([%Token{type: :hash_lparen, line: l, col: c} | rest]) do
    case parse_until(rest, :rparen) do
      {:ok, elements, rest2} ->
        body = {:list, %{line: l, col: c}, elements}
        {:ok, {:anon_fn, %{line: l, col: c}, body}, rest2}

      {:error, _} = err ->
        err
    end
  end

  # Regex #"..."
  defp parse_form([%Token{type: :hash_string, value: pattern, line: l, col: c} | rest]),
    do: {:ok, {:regex, %{line: l, col: c}, pattern}, rest}

  # Quote '
  defp parse_form([%Token{type: :quote, line: l, col: c} | rest]) do
    case parse_form(rest) do
      {:ok, form, rest2} -> {:ok, {:quote, %{line: l, col: c}, form}, rest2}
      {:error, _} = err -> err
    end
  end

  # Quasiquote `
  defp parse_form([%Token{type: :quasiquote, line: l, col: c} | rest]) do
    case parse_form(rest) do
      {:ok, form, rest2} -> {:ok, {:quasiquote, %{line: l, col: c}, form}, rest2}
      {:error, _} = err -> err
    end
  end

  # Unquote ~
  defp parse_form([%Token{type: :unquote, line: l, col: c} | rest]) do
    case parse_form(rest) do
      {:ok, form, rest2} -> {:ok, {:unquote, %{line: l, col: c}, form}, rest2}
      {:error, _} = err -> err
    end
  end

  # Splice-unquote ~@
  defp parse_form([%Token{type: :splice_unquote, line: l, col: c} | rest]) do
    case parse_form(rest) do
      {:ok, form, rest2} -> {:ok, {:splice_unquote, %{line: l, col: c}, form}, rest2}
      {:error, _} = err -> err
    end
  end

  # Deref @
  defp parse_form([%Token{type: :deref, line: l, col: c} | rest]) do
    case parse_form(rest) do
      {:ok, form, rest2} -> {:ok, {:deref, %{line: l, col: c}, form}, rest2}
      {:error, _} = err -> err
    end
  end

  # Metadata ^ — reads the metadata value, then the form it attaches to
  defp parse_form([%Token{type: :meta, line: l, col: c} | rest]) do
    case parse_meta_value(rest, l, c) do
      {:ok, meta_form, rest2} ->
        case parse_form(rest2) do
          {:ok, target, rest3} ->
            {:ok, {:with_meta, %{line: l, col: c}, {meta_form, target}}, rest3}

          {:error, _} = err ->
            err
        end

      {:error, _} = err ->
        err
    end
  end

  # Unexpected token (e.g. a stray closing delimiter at top level)
  defp parse_form([%Token{type: type, line: l, col: c} | _]),
    do: {:error, "Unexpected token #{type} at line #{l}, col #{c}"}

  defp parse_form([]), do: {:error, "Unexpected end of input"}

  # ── Parse helpers ──────────────────────────────────────────────────

  # Parse elements until a closing delimiter token type is found
  defp parse_until(tokens, closer) do
    parse_until_loop(tokens, closer, [])
  end

  defp parse_until_loop([], closer, _acc) do
    name = delimiter_name(closer)
    {:error, "Unexpected end of input, expected '#{name}'"}
  end

  defp parse_until_loop([%Token{type: type} | rest], closer, acc) when type == closer do
    {:ok, Enum.reverse(acc), rest}
  end

  defp parse_until_loop(tokens, closer, acc) do
    case parse_form(tokens) do
      {:ok, form, rest} -> parse_until_loop(rest, closer, [form | acc])
      {:error, _} = err -> err
    end
  end

  # Parse the value after ^ (metadata)

  # ^{...} — map metadata
  defp parse_meta_value([%Token{type: :lbrace, line: l, col: c} | rest], _ml, _mc) do
    case parse_until(rest, :rbrace) do
      {:ok, elements, rest2} -> {:ok, {:map, %{line: l, col: c}, elements}, rest2}
      {:error, _} = err -> err
    end
  end

  # ^:keyword — sugar for ^{:keyword true}
  defp parse_meta_value([%Token{type: :keyword, value: kw, line: l, col: c} | rest], _ml, _mc) do
    meta_map = {:map, %{line: l, col: c}, [kw, true]}
    {:ok, meta_map, rest}
  end

  # ^symbol — the symbol form itself is used as the metadata value
  defp parse_meta_value([%Token{type: :symbol} | _] = tokens, _ml, _mc) do
    case parse_form(tokens) do
      {:ok, form, rest} -> {:ok, form, rest}
      {:error, _} = err -> err
    end
  end

  defp parse_meta_value(_tokens, ml, mc) do
    {:error, "Expected metadata value (map, keyword, or symbol) at line #{ml}, col #{mc}"}
  end

  defp delimiter_name(:rparen), do: ")"
  defp delimiter_name(:rbracket), do: "]"
  defp delimiter_name(:rbrace), do: "}"
end