Files
CljElixir/lib/clj_elixir/reader.ex
Adam d8719b6d48 Phases 1-7: Complete CljElixir compiler through Malli schema adapter
Bootstrap compiler (reader, analyzer, transformer, compiler, Mix plugin),
core protocols (16 protocols for Map/List/Tuple/BitString), PersistentVector
(bit-partitioned trie), domain tools (clojurify/elixirify), BEAM concurrency
(receive, spawn, GenServer), control flow & macros (threading, try/catch,
destructuring, defmacro with quasiquote/auto-gensym), and Malli schema
adapter (m/=> specs, auto @type, recursive schemas, cross-references).

537 compiler tests + 55 Malli unit tests + 15 integration tests = 607 total.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:38:22 -04:00

648 lines
22 KiB
Elixir

defmodule CljElixir.Reader do
  @moduledoc """
  Reader for CljElixir: turns source text into CljElixir AST forms.

  Reading happens in two phases:

  1. Tokenizer — converts source text into a flat list of tokens
  2. Parser — recursive descent over the token list, producing CljElixir AST nodes

  ## AST representation

  Literals represent themselves: integers, floats, strings, booleans, nil,
  atoms (keywords). Compound forms are tagged tuples:

      {:symbol, meta, name}
      {:list, meta, [elements]}
      {:vector, meta, [elements]}
      {:map, meta, [k1, v1, k2, v2, ...]}
      {:set, meta, [elements]}
      {:tuple, meta, [elements]}
      {:regex, meta, pattern}
      {:quote, meta, form}
      {:with_meta, meta, {metadata, target}}
      {:anon_fn, meta, body}
      {:quasiquote, meta, form}
      {:unquote, meta, form}
      {:splice_unquote, meta, form}
      {:deref, meta, form}
  """

  alias CljElixir.Reader.Token

  # ── Public API ──────────────────────────────────────────────────────

  @doc """
  Read a string of CljElixir source into a list of AST forms.

  Returns `{:ok, [form]}` on success, `{:error, message}` on failure.
  """
  @spec read_string(String.t()) :: {:ok, list()} | {:error, String.t()}
  def read_string(source) when is_binary(source) do
    # A tokenizer error falls straight through the `with` unchanged.
    with {:ok, tokens} <- tokenize(source) do
      parse_all(tokens, [])
    end
  end
# ════════════════════════════════════════════════════════════════════
# TOKENIZER
# ════════════════════════════════════════════════════════════════════

# Tokenize source text into a flat list of `Token` structs,
# starting the position counters at line 1, column 1.
@doc false
def tokenize(source) do
  source
  |> String.to_charlist()
  |> tokenize_loop(1, 1, [])
end
# ── Tokenizer main loop ─────────────────────────────────────────────
# Walks the source charlist one (or two) characters at a time, tracking
# 1-based line/col for token metadata, and prepends each token to `acc`
# (reversed once at end of input).
#
# Clause order matters: multi-character lookaheads (#el[, #{, #(, #", ~@)
# must precede their single-character prefixes (#, ~), and the symbol /
# number clauses must come after all punctuation clauses.

# ---------- end of input ----------
defp tokenize_loop([], _line, _col, acc), do: {:ok, Enum.reverse(acc)}

# ---------- newline ----------
# A CRLF pair starts with ?\r, so it never matches the bare-\n clause;
# each of LF, CRLF and lone CR advances `line` once and resets `col` to 1.
defp tokenize_loop([?\n | rest], line, _col, acc),
  do: tokenize_loop(rest, line + 1, 1, acc)

defp tokenize_loop([?\r, ?\n | rest], line, _col, acc),
  do: tokenize_loop(rest, line + 1, 1, acc)

defp tokenize_loop([?\r | rest], line, _col, acc),
  do: tokenize_loop(rest, line + 1, 1, acc)

# ---------- whitespace / commas ----------
# Commas are treated as whitespace, Clojure-style.
defp tokenize_loop([c | rest], line, col, acc) when c in [?\s, ?\t, ?,],
  do: tokenize_loop(rest, line, col + 1, acc)

# ---------- comments ----------
defp tokenize_loop([?; | rest], line, _col, acc) do
  rest = skip_comment(rest)
  # skip_comment stops at (but does not consume) the newline or EOF.
  # Let the main loop's newline handler increment line/col.
  # (The col=1 passed here is provisional; the newline clause resets it anyway.)
  tokenize_loop(rest, line, 1, acc)
end

# ---------- strings ----------
# Token records the opening quote's position; the loop resumes at the
# line/col just past the closing quote (strings may span lines).
defp tokenize_loop([?" | rest], line, col, acc) do
  case read_string_literal(rest, line, col + 1, []) do
    {:ok, value, rest2, end_line, end_col} ->
      token = %Token{type: :string, value: value, line: line, col: col}
      tokenize_loop(rest2, end_line, end_col, [token | acc])

    {:error, msg} ->
      {:error, msg}
  end
end

# ---------- dispatch sequences: #{ #el[ #( #" ----------
# BEAM tuple opener #el[ — four characters consumed at once.
defp tokenize_loop([?#, ?e, ?l, ?[ | rest], line, col, acc) do
  token = %Token{type: :hash_el_lbracket, value: "#el[", line: line, col: col}
  tokenize_loop(rest, line, col + 4, [token | acc])
end

# Set opener #{.
defp tokenize_loop([?#, ?{ | rest], line, col, acc) do
  token = %Token{type: :hash_lbrace, value: "\#{", line: line, col: col}
  tokenize_loop(rest, line, col + 2, [token | acc])
end

# Anonymous-function opener #(.
defp tokenize_loop([?#, ?( | rest], line, col, acc) do
  token = %Token{type: :hash_lparen, value: "#(", line: line, col: col}
  tokenize_loop(rest, line, col + 2, [token | acc])
end

# Regex literal #"..." — same scanner as strings, different token type.
defp tokenize_loop([?#, ?" | rest], line, col, acc) do
  case read_string_literal(rest, line, col + 2, []) do
    {:ok, value, rest2, end_line, end_col} ->
      token = %Token{type: :hash_string, value: value, line: line, col: col}
      tokenize_loop(rest2, end_line, end_col, [token | acc])

    {:error, msg} ->
      {:error, msg}
  end
end

# ---------- splice-unquote ~@ (must come before unquote ~) ----------
defp tokenize_loop([?~, ?@ | rest], line, col, acc) do
  token = %Token{type: :splice_unquote, value: "~@", line: line, col: col}
  tokenize_loop(rest, line, col + 2, [token | acc])
end

# ---------- unquote ~ ----------
defp tokenize_loop([?~ | rest], line, col, acc) do
  token = %Token{type: :unquote, value: "~", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- delimiters ----------
defp tokenize_loop([?( | rest], line, col, acc) do
  token = %Token{type: :lparen, value: "(", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

defp tokenize_loop([?) | rest], line, col, acc) do
  token = %Token{type: :rparen, value: ")", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

defp tokenize_loop([?[ | rest], line, col, acc) do
  token = %Token{type: :lbracket, value: "[", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

defp tokenize_loop([?] | rest], line, col, acc) do
  token = %Token{type: :rbracket, value: "]", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

defp tokenize_loop([?{ | rest], line, col, acc) do
  token = %Token{type: :lbrace, value: "{", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

defp tokenize_loop([?} | rest], line, col, acc) do
  token = %Token{type: :rbrace, value: "}", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- quote ' ----------
defp tokenize_loop([?' | rest], line, col, acc) do
  token = %Token{type: :quote, value: "'", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- quasiquote ` ----------
defp tokenize_loop([?` | rest], line, col, acc) do
  token = %Token{type: :quasiquote, value: "`", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- metadata ^ ----------
defp tokenize_loop([?^ | rest], line, col, acc) do
  token = %Token{type: :meta, value: "^", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- deref @ ----------
defp tokenize_loop([?@ | rest], line, col, acc) do
  token = %Token{type: :deref, value: "@", line: line, col: col}
  tokenize_loop(rest, line, col + 1, [token | acc])
end

# ---------- keywords ----------
# NOTE(review): read_keyword returns only an end column; a quoted
# keyword (:"...") containing a newline would keep the pre-keyword
# `line` — confirm multi-line quoted keywords are unsupported.
defp tokenize_loop([?: | rest], line, col, acc) do
  case read_keyword(rest, line, col) do
    {:ok, kw_value, rest2, end_col} ->
      token = %Token{type: :keyword, value: kw_value, line: line, col: col}
      tokenize_loop(rest2, line, end_col, [token | acc])

    {:error, msg} ->
      {:error, msg}
  end
end

# ---------- negative numbers: -<digit> ----------
# Since whitespace is always consumed before reaching tokenize_loop,
# a standalone `-` followed by a digit is always a negative number literal.
# The `-` inside symbol names (like `my-func`) is consumed by the symbol reader
# and never reaches this clause as a standalone character.
defp tokenize_loop([?- | rest], line, col, acc) do
  if starts_with_digit?(rest) do
    # read_number always succeeds when the head is a digit, so the bare
    # match is safe; the token keeps the position of the `-` sign.
    {:ok, token, rest2, end_col} = read_number(rest, line, col + 1, [?-])
    token = %{token | line: line, col: col}
    tokenize_loop(rest2, line, end_col, [token | acc])
  else
    # It's a symbol starting with -
    case read_symbol([?- | rest], line, col) do
      {:ok, token, rest2, end_col} ->
        tokenize_loop(rest2, line, end_col, [token | acc])
    end
  end
end

# ---------- numbers ----------
defp tokenize_loop([c | _] = chars, line, col, acc) when c in ?0..?9 do
  {:ok, token, rest2, end_col} = read_number(chars, line, col, [])
  tokenize_loop(rest2, line, end_col, [token | acc])
end

# ---------- symbols (and true/false/nil) ----------
# NOTE(review): `/` is not in this start set, so a standalone `/`
# (the division symbol in Clojure) falls to the catch-all error
# below — confirm this is intended for CljElixir.
defp tokenize_loop([c | _] = chars, line, col, acc)
     when c in ?a..?z or c in ?A..?Z or
            c == ?_ or c == ?* or c == ?! or c == ?? or
            c == ?< or c == ?> or c == ?= or c == ?+ or
            c == ?. or c == ?& or c == ?% do
  case read_symbol(chars, line, col) do
    {:ok, token, rest, end_col} ->
      tokenize_loop(rest, line, end_col, [token | acc])
  end
end

# ---------- catch-all: unexpected character ----------
defp tokenize_loop([c | _], line, col, _acc) do
  {:error, "Unexpected character '#{<<c::utf8>>}' at line #{line}, col #{col}"}
end
# ── Tokenizer helpers ───────────────────────────────────────────────

# Punctuation allowed anywhere in a symbol past its first character
# (broader than the start set: adds digits, `-`, `/`, `#`).
@symbol_punct ~c"_*!?<>=+-/.%&#"

# True when `c` may continue a symbol name.
defp symbol_continue_char?(c),
  do: c in ?a..?z or c in ?A..?Z or c in ?0..?9 or c in @symbol_punct

# True when the remaining input begins with an ASCII digit.
defp starts_with_digit?(chars),
  do: match?([d | _] when d in ?0..?9, chars)

# Advance past a `;` comment, stopping at (not consuming) the line end
# so the main loop's newline clauses do the line accounting.
defp skip_comment(chars) do
  case chars do
    [] -> []
    [?\n | _] -> chars
    [?\r | _] -> chars
    [_ | tail] -> skip_comment(tail)
  end
end
# ── String literal reader ──────────────────────────────────────────
# Scans until the closing quote, handling the escapes \" \\ \n \t \r.
# Branch order is significant: escape pairs must be tried before the
# generic single-character branch, which treats an unrecognized
# backslash as a literal character.
# NOTE(review): only a raw \n advances `line`; a lone \r inside a
# string does not — confirm CR-only newlines in strings are out of scope.
defp read_string_literal(chars, line, col, acc) do
  case chars do
    [] ->
      {:error, "Unterminated string starting at line #{line}"}

    [?" | rest] ->
      {:ok, IO.chardata_to_string(Enum.reverse(acc)), rest, line, col + 1}

    [?\\, ?" | rest] ->
      read_string_literal(rest, line, col + 2, [?" | acc])

    [?\\, ?\\ | rest] ->
      read_string_literal(rest, line, col + 2, [?\\ | acc])

    [?\\, ?n | rest] ->
      read_string_literal(rest, line, col + 2, [?\n | acc])

    [?\\, ?t | rest] ->
      read_string_literal(rest, line, col + 2, [?\t | acc])

    [?\\, ?r | rest] ->
      read_string_literal(rest, line, col + 2, [?\r | acc])

    [?\n | rest] ->
      read_string_literal(rest, line + 1, 1, [?\n | acc])

    [c | rest] ->
      read_string_literal(rest, line, col + 1, [c | acc])
  end
end
# ── Keyword reader ─────────────────────────────────────────────────

# Quoted keyword :"some-name" — the name is an arbitrary string literal.
# NOTE(review): atoms are created dynamically from source text; this is
# the usual trade-off for a reader consuming trusted compiler input.
defp read_keyword([?" | rest], line, col) do
  with {:ok, value, rest2, _end_line, end_col} <- read_string_literal(rest, line, col + 2, []) do
    {:ok, String.to_atom(value), rest2, end_col}
  end
end

# Bare keyword: :name, :my-key, :ok
defp read_keyword(chars, _line, col) do
  case take_keyword_chars(chars, []) do
    {[], _rest} ->
      {:error, "Expected keyword name after ':'"}

    {name_chars, rest} ->
      name = name_chars |> Enum.reverse() |> IO.chardata_to_string()
      # End column accounts for the leading ':' plus the name itself.
      {:ok, String.to_atom(name), rest, col + 1 + length(name_chars)}
  end
end

# Punctuation permitted in bare keyword names.
@keyword_punct ~c"_-!?./*+><=&#"

defp take_keyword_chars([c | rest], acc)
     when c in ?a..?z or c in ?A..?Z or c in ?0..?9 or c in @keyword_punct,
     do: take_keyword_chars(rest, [c | acc])

defp take_keyword_chars(rest, acc), do: {acc, rest}
# ── Number reader ──────────────────────────────────────────────────

# Reads an integer or float token starting at `chars`.
#
# `prefix` is a (reversed) list of already-consumed characters — `[?-]`
# for negative numbers, `[]` otherwise. Its length is subtracted when
# computing end_col because `col` already points past the prefix.
defp read_number(chars, line, col, prefix) do
  {int_chars, rest} = take_digits(chars, prefix)

  case rest do
    # A dot belongs to the number only when a digit follows; otherwise
    # the dot is left in the stream for the next token.
    [?., d | _] when d in ?0..?9 ->
      [?. | after_dot] = rest
      {frac_chars, rest2} = take_digits(after_dot, [?. | int_chars])
      finish_number(:float, frac_chars, rest2, line, col, prefix)

    _ ->
      finish_number(:integer, int_chars, rest, line, col, prefix)
  end
end

# Builds the number token from the reversed accumulated characters.
# Factored out of read_number, which previously duplicated this logic
# verbatim in both its "dot without digit" and "no dot" branches.
defp finish_number(type, rev_chars, rest, line, col, prefix) do
  str = IO.chardata_to_string(Enum.reverse(rev_chars))

  value =
    case type do
      :float ->
        {v, ""} = Float.parse(str)
        v

      :integer ->
        {v, ""} = Integer.parse(str)
        v
    end

  end_col = col + String.length(str) - length(prefix)
  {:ok, %Token{type: type, value: value, line: line, col: col}, rest, end_col}
end

# Accumulate consecutive ASCII digits (reversed) onto `acc`.
defp take_digits([c | rest], acc) when c in ?0..?9,
  do: take_digits(rest, [c | acc])

defp take_digits(rest, acc), do: {acc, rest}
# ── Symbol reader ──────────────────────────────────────────────────

# Reads a symbol token; the reserved words `true`, `false` and `nil`
# become dedicated boolean/nil tokens instead of symbols.
defp read_symbol(chars, line, col) do
  {collected, rest} = take_symbol_chars(chars, [])
  name = collected |> Enum.reverse() |> IO.chardata_to_string()

  token =
    case name do
      "true" -> %Token{type: :boolean, value: true, line: line, col: col}
      "false" -> %Token{type: :boolean, value: false, line: line, col: col}
      "nil" -> %Token{type: :nil, value: nil, line: line, col: col}
      _ -> %Token{type: :symbol, value: name, line: line, col: col}
    end

  {:ok, token, rest, col + String.length(name)}
end

# Accumulates symbol characters; the first character must satisfy the
# stricter start set, later ones the continue set (which adds digits,
# `-`, `/` and `#`).
defp take_symbol_chars([], acc), do: {acc, []}

defp take_symbol_chars([c | rest] = chars, acc) do
  allowed? =
    case acc do
      [] -> symbol_start_char?(c)
      _ -> symbol_continue_char?(c)
    end

  if allowed?, do: take_symbol_chars(rest, [c | acc]), else: {acc, chars}
end

# Characters that may begin a symbol.
defp symbol_start_char?(c),
  do: c in ?a..?z or c in ?A..?Z or c in ~c"_*!?<>=+-.&%"
# ════════════════════════════════════════════════════════════════════
# PARSER — Recursive Descent
# ════════════════════════════════════════════════════════════════════

# Parse every top-level form until the token list is exhausted;
# a parse error at any form aborts and is returned unchanged.
defp parse_all([], acc), do: {:ok, Enum.reverse(acc)}

defp parse_all(tokens, acc) do
  with {:ok, form, rest} <- parse_form(tokens) do
    parse_all(rest, [form | acc])
  end
end
# ── Parse a single form ────────────────────────────────────────────

# Token types whose `value` field is already the finished AST literal.
# (The tokenizer always sets value: nil on :nil tokens, so it is safe
# to include :nil here and pass the value through.)
@literal_tokens [:integer, :float, :string, :keyword, :boolean, :nil]

# Literals: the token value passes through unchanged.
defp parse_form([%Token{type: type, value: v} | rest]) when type in @literal_tokens,
  do: {:ok, v, rest}

# Symbol.
defp parse_form([%Token{type: :symbol, value: name, line: l, col: c} | rest]),
  do: {:ok, {:symbol, %{line: l, col: c}, name}, rest}

# Collections — list ( ... ), vector [ ... ], map { ... }, set #{ ... },
# BEAM tuple #el[ ... ]. These five clauses previously duplicated the
# same parse-until-closer body; it now lives in parse_collection/5.
defp parse_form([%Token{type: :lparen, line: l, col: c} | rest]),
  do: parse_collection(rest, :list, :rparen, l, c)

defp parse_form([%Token{type: :lbracket, line: l, col: c} | rest]),
  do: parse_collection(rest, :vector, :rbracket, l, c)

defp parse_form([%Token{type: :lbrace, line: l, col: c} | rest]),
  do: parse_collection(rest, :map, :rbrace, l, c)

defp parse_form([%Token{type: :hash_lbrace, line: l, col: c} | rest]),
  do: parse_collection(rest, :set, :rbrace, l, c)

defp parse_form([%Token{type: :hash_el_lbracket, line: l, col: c} | rest]),
  do: parse_collection(rest, :tuple, :rbracket, l, c)

# Anonymous function #( ... ) — the elements become a :list body node
# carrying the same metadata as the :anon_fn wrapper.
defp parse_form([%Token{type: :hash_lparen, line: l, col: c} | rest]) do
  with {:ok, elements, rest2} <- parse_until(rest, :rparen) do
    meta = %{line: l, col: c}
    {:ok, {:anon_fn, meta, {:list, meta, elements}}, rest2}
  end
end

# Regex #"..." — the pattern is kept as the raw string.
defp parse_form([%Token{type: :hash_string, value: pattern, line: l, col: c} | rest]),
  do: {:ok, {:regex, %{line: l, col: c}, pattern}, rest}

# Reader prefixes — each wraps the next parsed form in a tagged node.
# These five clauses previously duplicated the same parse-next-form
# body; it now lives in parse_prefix/4.
defp parse_form([%Token{type: :quote, line: l, col: c} | rest]),
  do: parse_prefix(rest, :quote, l, c)

defp parse_form([%Token{type: :quasiquote, line: l, col: c} | rest]),
  do: parse_prefix(rest, :quasiquote, l, c)

defp parse_form([%Token{type: :unquote, line: l, col: c} | rest]),
  do: parse_prefix(rest, :unquote, l, c)

defp parse_form([%Token{type: :splice_unquote, line: l, col: c} | rest]),
  do: parse_prefix(rest, :splice_unquote, l, c)

defp parse_form([%Token{type: :deref, line: l, col: c} | rest]),
  do: parse_prefix(rest, :deref, l, c)

# Metadata ^meta target — read the metadata value, then the target form.
defp parse_form([%Token{type: :meta, line: l, col: c} | rest]) do
  with {:ok, meta_form, rest2} <- parse_meta_value(rest, l, c),
       {:ok, target, rest3} <- parse_form(rest2) do
    {:ok, {:with_meta, %{line: l, col: c}, {meta_form, target}}, rest3}
  end
end

# Unexpected token (e.g. a stray closing delimiter at top level).
defp parse_form([%Token{type: type, line: l, col: c} | _]),
  do: {:error, "Unexpected token #{type} at line #{l}, col #{c}"}

defp parse_form([]),
  do: {:error, "Unexpected end of input"}

# Shared body for collection forms: parse elements up to `closer`
# and wrap them in a `{tag, meta, elements}` node.
defp parse_collection(tokens, tag, closer, l, c) do
  with {:ok, elements, rest} <- parse_until(tokens, closer) do
    {:ok, {tag, %{line: l, col: c}, elements}, rest}
  end
end

# Shared body for one-form prefixes (quote, quasiquote, unquote,
# splice-unquote, deref): parse the next form and tag it.
defp parse_prefix(tokens, tag, l, c) do
  with {:ok, form, rest} <- parse_form(tokens) do
    {:ok, {tag, %{line: l, col: c}, form}, rest}
  end
end
# ── Parse helpers ──────────────────────────────────────────────────

# Parse forms until a token of type `closer` appears; returns the
# collected elements (in source order) and the tokens after the closer.
defp parse_until(tokens, closer), do: parse_until_loop(tokens, closer, [])

defp parse_until_loop([], closer, _acc),
  do: {:error, "Unexpected end of input, expected '#{delimiter_name(closer)}'"}

# The repeated `closer` binding requires the head token's type to equal
# the closer argument (replaces the explicit `when type == closer` guard).
defp parse_until_loop([%Token{type: closer} | rest], closer, acc),
  do: {:ok, Enum.reverse(acc), rest}

defp parse_until_loop(tokens, closer, acc) do
  with {:ok, form, rest} <- parse_form(tokens) do
    parse_until_loop(rest, closer, [form | acc])
  end
end
# ── Metadata value (the form immediately after ^) ──────────────────

# ^{...} — explicit metadata map.
defp parse_meta_value([%Token{type: :lbrace, line: l, col: c} | rest], _ml, _mc) do
  with {:ok, elements, rest2} <- parse_until(rest, :rbrace) do
    {:ok, {:map, %{line: l, col: c}, elements}, rest2}
  end
end

# ^:keyword — shorthand for ^{:keyword true}.
defp parse_meta_value([%Token{type: :keyword, value: kw, line: l, col: c} | rest], _ml, _mc),
  do: {:ok, {:map, %{line: l, col: c}, [kw, true]}, rest}

# ^symbol — the symbol form itself becomes the metadata value.
# NOTE(review): presumably a later phase rewrites this into {:tag symbol};
# the reader does not wrap it in a map here — confirm downstream handling.
defp parse_meta_value([%Token{type: :symbol} | _] = tokens, _ml, _mc),
  do: parse_form(tokens)

defp parse_meta_value(_tokens, ml, mc),
  do: {:error, "Expected metadata value (map, keyword, or symbol) at line #{ml}, col #{mc}"}

# Printable name of a closing-delimiter token type, for error messages.
defp delimiter_name(:rparen), do: ")"
defp delimiter_name(:rbracket), do: "]"
defp delimiter_name(:rbrace), do: "}"
end