(ns agent.syntax
  "Regex-based syntax highlighting for code blocks and diffs.
   Single-pass tokenizer using java.util.regex.Matcher.lookingAt()."
  (:require [clojure.string :as str])
  (:import [java.util.regex Pattern Matcher]))

;; ============================================================
;; Color Palette (ANSI 256)
;; ============================================================

(def ^:private colors
  {:string   "\033[38;5;108m"
   :comment  "\033[38;5;245m"
   :keyword  "\033[38;5;176m"
   :number   "\033[38;5;216m"
   :builtin  "\033[38;5;75m"
   :constant "\033[38;5;216m"
   :type     "\033[38;5;180m"
   :clj-kw   "\033[38;5;73m"
   :param    "\033[38;5;208m"})

;; ============================================================
;; Tokenizer Engine
;; ============================================================

(defn- highlight-line*
  "Walk left-to-right through `line`. For each position, try rules in order;
   the first rule that matches a NON-EMPTY token at that position wins.

   `rules` is a vector of [compiled-Pattern color-or-fn]. When color-or-fn is a
   function, it receives the matched text and returns an ANSI color string (or
   nil to emit the text uncolored).
   `default-fg` is the ANSI code appended after every colored token so that
   the following unhighlighted text returns to the caller's foreground color.

   Returns the highlighted string. Characters no rule matches are copied
   through unchanged."
  [^String line rules ^String default-fg]
  (let [len           (.length line)
        n-rules       (count rules)
        sb            (StringBuilder.)
        matcher-cache (object-array n-rules)]
    ;; Pre-create one matcher per rule; each is re-aimed with .region below.
    ;; Transparent bounds let \b and lookarounds see the characters outside
    ;; the current region, so a construct such as \btrue\b is judged against
    ;; the real neighbouring text instead of treating `pos` as start-of-input.
    (dotimes [i n-rules]
      (let [[^Pattern pat _] (nth rules i)]
        (aset matcher-cache i
              (doto (.matcher pat line)
                (.useTransparentBounds true)))))
    (loop [pos 0]
      (if (>= pos len)
        (.toString sb)
        (let [next-pos
              (loop [ri 0]
                (when (< ri n-rules)
                  (let [^Matcher m (aget matcher-cache ri)]
                    (.region m pos len)
                    ;; Ignore zero-length matches: accepting one would leave
                    ;; `pos` unchanged and spin forever.
                    (if (and (.lookingAt m) (> (.end m) pos))
                      (let [[_ color-or-fn] (nth rules ri)
                            text  (.group m)
                            color (if (fn? color-or-fn)
                                    (color-or-fn text)
                                    color-or-fn)]
                        (if color
                          (do (.append sb color)
                              (.append sb text)
                              (.append sb default-fg))
                          (.append sb text))
                        (.end m))
                      (recur (inc ri))))))]
          (if next-pos
            (recur (long next-pos))
            ;; No rule applies here: emit the character as-is and move on.
            (do (.append sb (.charAt line pos))
                (recur (inc pos)))))))))

;; ============================================================
;; Language: Clojure
;; ============================================================

(def ^:private clj-special-forms
  #{"def" "defn" "defn-" "defmacro" "defmethod" "defmulti" "defonce"
    "defprotocol" "defrecord" "deftype" "defstruct" "definline" "definterface"
    "fn" "fn*" "if" "if-let" "if-not" "if-some"
    "when" "when-let" "when-not" "when-first" "when-some"
    "do" "let" "letfn" "binding" "loop" "recur"
    "cond" "condp" "cond->" "cond->>" "case"
    "try" "catch" "finally" "throw"
    "quote" "var" "import" "require" "use" "refer" "ns"
    "and" "or" "not"
    "doseq" "dotimes" "doto" "dorun" "doall" "for" "while"
    "new" "set!" "monitor-enter" "monitor-exit"
    "->" "->>" "as->" "some->" "some->>"})

(def ^:private clj-builtins
  #{"map" "filter" "reduce" "apply" "partial" "comp" "juxt" "complement"
    "mapv" "filterv" "mapcat" "keep" "remove"
    "first" "second" "last" "rest" "next" "cons" "conj" "into"
    "assoc" "dissoc" "update" "get" "get-in" "assoc-in" "update-in"
    "select-keys" "merge" "merge-with"
    "atom" "deref" "reset!" "swap!" "compare-and-set!"
    "str" "subs" "format" "name" "keyword" "symbol"
    "println" "print" "prn" "pr" "pr-str" "prn-str"
    "count" "empty?" "seq" "seq?" "sequential?"
    "vec" "vector" "vector?" "list" "list?"
    "set" "hash-set" "sorted-set" "hash-map" "sorted-map" "zipmap"
    "frequencies" "group-by"
    "keys" "vals" "contains?" "find"
    "range" "repeat" "repeatedly" "iterate" "cycle"
    "interleave" "interpose"
    "take" "drop" "take-while" "drop-while" "split-at" "split-with"
    "partition" "partition-by" "partition-all"
    "concat" "flatten" "distinct" "sort" "sort-by" "reverse" "shuffle"
    "every?" "some" "not-every?" "not-any?"
    "identity" "constantly"
    "inc" "dec" "+" "-" "*" "/" "mod" "rem" "quot"
    "=" "==" "not=" "<" ">" "<=" ">="
    "zero?" "pos?" "neg?" "even?" "odd?"
    "number?" "integer?" "nil?" "true?" "false?"
    "string?" "keyword?" "symbol?" "map?" "coll?" "fn?"
    "type" "class" "instance?" "satisfies?" "extends?"
    "meta" "with-meta" "vary-meta"
    "read-string" "slurp" "spit"
    "re-find" "re-matches" "re-seq" "re-pattern"
    "future" "promise" "deliver" "realized?" "pmap"
    "resolve" "ns-resolve" "eval"
    "max" "min" "abs" "rand" "rand-int"
    "nth" "nfirst" "nnext" "fnext" "ffirst"
    "not-empty" "bounded-count"
    "transduce" "sequence" "volatile!" "vswap!" "vreset!"
    "reduced" "reduced?" "unreduced" "ensure-reduced"
    "ex-info" "ex-data" "ex-message"})

(def ^:private clj-constants
  #{"nil" "true" "false"})

(defn- clj-classify
  "Pick a color for a Clojure symbol token: constant, special form/macro,
   builtin, or nil (leave uncolored)."
  [text]
  (cond
    (contains? clj-constants text)     (:constant colors)
    (contains? clj-special-forms text) (:keyword colors)
    (contains? clj-builtins text)      (:builtin colors)
    :else nil))

(def ^:private clj-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [[";.*" (:comment colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["#\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ;; Character literals. Named chars first, then \uXXXX / \oNNN, then
         ;; any single non-whitespace char so that \" and \; are consumed here
         ;; instead of opening a bogus string or comment.
         ["\\\\(?:newline|space|tab|backspace|formfeed|return|u[0-9a-fA-F]{4}|o[0-7]{1,3}|\\S)" (:string colors)]
         [":[a-zA-Z_*+!?<>=/.\\-][a-zA-Z0-9_*+!?<>=/.\\-:#]*" (:clj-kw colors)]
         ["-?0[xX][0-9a-fA-F]+" (:number colors)]
         ["-?\\d+\\.\\d+" (:number colors)]
         ["-?\\d+/\\d+" (:number colors)]
         ["-?\\d+" (:number colors)]
         ["##(?:Inf|-Inf|NaN)" (:constant colors)]
         ["[a-zA-Z_*+!?<>=/.\\-][a-zA-Z0-9_*+!?<>=/.\\-:#]*" clj-classify]]))

;; ============================================================
;; Language: JavaScript / TypeScript
;; ============================================================

(def ^:private js-keywords
  #{"async" "await" "break" "case" "catch" "class" "const" "continue"
    "debugger" "default" "delete" "do" "else" "export" "extends" "finally"
    "for" "from" "function" "if" "import" "in" "instanceof" "let" "new" "of"
    "return" "static" "super" "switch" "this" "throw" "try" "typeof" "var"
    "void" "while" "with" "yield"
    ;; TS extras
    "type" "interface" "enum" "namespace" "declare" "implements" "abstract"
    "as" "readonly" "keyof" "infer"})

(def ^:private js-builtins
  #{"console" "Math" "JSON" "Object" "Array" "String" "Number" "Boolean"
    "Promise" "Map" "Set" "WeakMap" "WeakSet" "Symbol" "Proxy" "Reflect"
    "parseInt" "parseFloat" "isNaN" "isFinite"
    "undefined" "NaN" "Infinity"
    "require" "module" "exports" "process" "Buffer" "global" "window"
    "document"})

(def ^:private js-constants
  #{"true" "false" "null" "undefined" "NaN" "Infinity"})

(defn- js-classify
  "Pick a color for a JS/TS identifier: constant, keyword, builtin,
   capitalised name of length >= 2 (treated as a type), or nil."
  [text]
  (cond
    (contains? js-constants text) (:constant colors)
    (contains? js-keywords text)  (:keyword colors)
    (contains? js-builtins text)  (:builtin colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private js-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["//.*" (:comment colors)]
         ["/\\*[\\s\\S]*?\\*/" (:comment colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["`(?:[^`\\\\]|\\\\.)*`" (:string colors)]
         ["/(?![*/])(?:[^/\\\\]|\\\\.)+/[gimsuy]*" (:string colors)]
         ["@[a-zA-Z_][a-zA-Z0-9_]*" (:param colors)]
         ["0[xX][0-9a-fA-F]+" (:number colors)]
         ["\\d+\\.\\d+(?:[eE][+-]?\\d+)?" (:number colors)]
         ["\\d+" (:number colors)]
         ["[a-zA-Z_$][a-zA-Z0-9_$]*" js-classify]]))

;; ============================================================
;; Language: Python
;; ============================================================

(def ^:private py-keywords
  #{"and" "as" "assert" "async" "await" "break" "class" "continue" "def"
    "del" "elif" "else" "except" "finally" "for" "from" "global" "if"
    "import" "in" "is" "lambda" "nonlocal" "not" "or" "pass" "raise"
    "return" "try" "while" "with" "yield" "match" "case"})

(def ^:private py-builtins
  #{"print" "len" "range" "int" "str" "float" "list" "dict" "set" "tuple"
    "bool" "type" "isinstance" "issubclass" "hasattr" "getattr" "setattr"
    "super" "property" "staticmethod" "classmethod"
    "enumerate" "zip" "map" "filter" "sorted" "reversed"
    "any" "all" "min" "max" "sum" "abs" "round"
    "input" "open" "repr" "id" "hash" "callable" "iter" "next"
    "ValueError" "TypeError" "KeyError" "IndexError" "RuntimeError"
    "Exception" "StopIteration" "AttributeError" "ImportError" "OSError"
    "self" "cls"})

(def ^:private py-constants
  #{"True" "False" "None"})

(defn- py-classify
  "Pick a color for a Python identifier: constant, keyword, builtin,
   capitalised name of length >= 2 (treated as a type), or nil."
  [text]
  (cond
    (contains? py-constants text) (:constant colors)
    (contains? py-keywords text)  (:keyword colors)
    (contains? py-builtins text)  (:builtin colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private py-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["#.*" (:comment colors)]
         ["\"\"\"[\\s\\S]*?\"\"\"" (:string colors)]
         ["'''[\\s\\S]*?'''" (:string colors)]
         ["f\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["f'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["@[a-zA-Z_][a-zA-Z0-9_.]*" (:param colors)]
         ["0[xX][0-9a-fA-F]+" (:number colors)]
         ["\\d+\\.\\d+(?:[eE][+-]?\\d+)?" (:number colors)]
         ["\\d+" (:number colors)]
         ["[a-zA-Z_][a-zA-Z0-9_]*" py-classify]]))

;; ============================================================
;; Language: Java
;; ============================================================

(def ^:private java-keywords
  #{"abstract" "assert" "boolean" "break" "byte" "case" "catch" "char"
    "class" "const" "continue" "default" "do" "double" "else" "enum"
    "extends" "final" "finally" "float" "for" "goto" "if" "implements"
    "import" "instanceof" "int" "interface" "long" "native" "new" "package"
    "private" "protected" "public" "return" "short" "static" "strictfp"
    "super" "switch" "synchronized" "this" "throw" "throws" "transient"
    "try" "var" "void" "volatile" "while" "yield"
    "record" "sealed" "permits" "non-sealed"})

(def ^:private java-constants
  #{"true" "false" "null"})

(defn- java-classify
  "Pick a color for a Java identifier: constant, keyword,
   capitalised name of length >= 2 (treated as a type), or nil."
  [text]
  (cond
    (contains? java-constants text) (:constant colors)
    (contains? java-keywords text)  (:keyword colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private java-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["//.*" (:comment colors)]
         ["/\\*[\\s\\S]*?\\*/" (:comment colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["@[a-zA-Z_][a-zA-Z0-9_]*" (:param colors)]
         ["0[xX][0-9a-fA-F]+[lL]?" (:number colors)]
         ["\\d+\\.\\d+[fFdD]?" (:number colors)]
         ["\\d+[lLfFdD]?" (:number colors)]
         ["[a-zA-Z_$][a-zA-Z0-9_$]*" java-classify]]))

;; ============================================================
;; Language: Kotlin
;; ============================================================

(def ^:private kt-keywords
  #{"abstract" "annotation" "as" "break" "by" "catch" "class" "companion"
    "const" "constructor" "continue" "crossinline" "data" "do" "else" "enum"
    "expect" "external" "final" "finally" "for" "fun" "get" "if" "import"
    "in" "infix" "init" "inline" "inner" "interface" "internal" "is"
    "lateinit" "noinline" "object" "open" "operator" "out" "override"
    "package" "private" "protected" "public" "reified" "return" "sealed"
    "set" "super" "suspend" "tailrec" "this" "throw" "try" "typealias"
    "val" "var" "vararg" "when" "where" "while" "yield"})

(def ^:private kt-builtins
  #{"println" "print" "listOf" "mutableListOf" "mapOf" "mutableMapOf"
    "setOf" "mutableSetOf" "arrayOf" "intArrayOf" "emptyList" "emptyMap"
    "require" "check" "error" "TODO" "repeat" "run" "with" "apply" "also"
    "let" "takeIf" "takeUnless" "lazy" "coroutineScope" "launch" "async"
    "String" "Int" "Long" "Double" "Float" "Boolean" "Char" "Unit" "Any"
    "Nothing"})

(def ^:private kt-constants
  #{"true" "false" "null"})

(defn- kt-classify
  "Pick a color for a Kotlin identifier: constant, keyword, builtin,
   capitalised name of length >= 2 (treated as a type), or nil."
  [text]
  (cond
    (contains? kt-constants text) (:constant colors)
    (contains? kt-keywords text)  (:keyword colors)
    (contains? kt-builtins text)  (:builtin colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private kt-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["//.*" (:comment colors)]
         ["/\\*[\\s\\S]*?\\*/" (:comment colors)]
         ["\"\"\"[\\s\\S]*?\"\"\"" (:string colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["@[a-zA-Z_][a-zA-Z0-9_]*" (:param colors)]
         ["0[xX][0-9a-fA-F]+[lL]?" (:number colors)]
         ["\\d+\\.\\d+[fFdD]?" (:number colors)]
         ["\\d+[lLfFdD]?" (:number colors)]
         ["[a-zA-Z_][a-zA-Z0-9_]*" kt-classify]]))

;; ============================================================
;; Language: Rust
;; ============================================================

(def ^:private rust-keywords
  #{"as" "async" "await" "break" "const" "continue" "crate" "dyn" "else"
    "enum" "extern" "fn" "for" "if" "impl" "in" "let" "loop" "match" "mod"
    "move" "mut" "pub" "ref" "return" "self" "Self" "static" "struct"
    "super" "trait" "type" "unsafe" "use" "where" "while" "yield"
    "macro_rules"})

(def ^:private rust-builtins
  #{"println" "eprintln" "format" "vec" "panic" "assert" "assert_eq"
    "assert_ne" "debug_assert" "todo" "unimplemented" "unreachable" "cfg"
    "derive" "include" "include_str" "env" "concat" "stringify"
    "Some" "None" "Ok" "Err" "Box" "Rc" "Arc" "Vec" "String" "Option"
    "Result" "HashMap" "HashSet" "BTreeMap" "BTreeSet"
    "Iterator" "IntoIterator" "From" "Into" "TryFrom" "TryInto"
    "Display" "Debug" "Clone" "Copy" "Default"
    "PartialEq" "Eq" "PartialOrd" "Ord" "Hash"
    "Send" "Sync" "Sized" "Drop" "Fn" "FnMut" "FnOnce"})

(def ^:private rust-constants
  #{"true" "false"})

(defn- rust-classify
  "Pick a color for a Rust identifier: constant, keyword, builtin,
   capitalised name of length >= 2 (treated as a type), or nil."
  [text]
  (cond
    (contains? rust-constants text) (:constant colors)
    (contains? rust-keywords text)  (:keyword colors)
    (contains? rust-builtins text)  (:builtin colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private rust-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["//.*" (:comment colors)]
         ["/\\*[\\s\\S]*?\\*/" (:comment colors)]
         ["r#\"[^\"]*\"#" (:string colors)]
         ["r\"[^\"]*\"" (:string colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ;; lifetimes -- the possessive `*+` plus `(?!')` refuses to match
         ;; when the name is immediately closed by a quote, so a char literal
         ;; such as 'a' falls through to the next rule instead of being split
         ;; into lifetime `'a` and a stray `'`.
         ["'[a-zA-Z_][a-zA-Z0-9_]*+(?!')" (:param colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)] ;; char literals
         ;; macros -- `(?!=)` keeps `a!=b` from being read as macro `a!`
         ["[a-zA-Z_][a-zA-Z0-9_]*!(?!=)" (:builtin colors)]
         ["0[xX][0-9a-fA-F_]+" (:number colors)]
         ["0[bB][01_]+" (:number colors)]
         ["0[oO][0-7_]+" (:number colors)]
         ["\\d[\\d_]*\\.\\d[\\d_]*(?:[eE][+-]?\\d+)?(?:f32|f64)?" (:number colors)]
         ["\\d[\\d_]*(?:u8|u16|u32|u64|u128|usize|i8|i16|i32|i64|i128|isize|f32|f64)?" (:number colors)]
         ["[a-zA-Z_][a-zA-Z0-9_]*" rust-classify]]))

;; ============================================================
;; Language: Bash
;; ============================================================

(def ^:private bash-keywords
  #{"if" "then" "else" "elif" "fi" "for" "while" "until" "do" "done"
    "case" "esac" "in" "function" "select" "time" "coproc"
    "return" "exit" "break" "continue" "shift" "trap"
    "local" "export" "declare" "typeset" "readonly" "unset"})

(def ^:private bash-builtins
  #{"echo" "printf" "read" "cd" "pwd" "ls" "cp" "mv" "rm" "mkdir" "rmdir"
    "cat" "grep" "sed" "awk" "find" "sort" "uniq" "wc" "head" "tail"
    "chmod" "chown" "curl" "wget" "tar" "gzip" "gunzip" "zip" "unzip"
    "git" "docker" "make" "ssh" "scp" "rsync"
    "test" "true" "false" "source" "eval" "exec" "set" "env"})

(defn- bash-classify
  "Pick a color for a shell word: keyword, common command/builtin, or nil."
  [text]
  (cond
    (contains? bash-keywords text) (:keyword colors)
    (contains? bash-builtins text) (:builtin colors)
    :else nil))

(def ^:private bash-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["#.*" (:comment colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'[^']*'" (:string colors)]
         ["\\$\\{[^}]+\\}" (:param colors)]
         ["\\$[a-zA-Z_][a-zA-Z0-9_]*" (:param colors)]
         ["\\$[0-9@#?!$*-]" (:param colors)]
         ["\\d+" (:number colors)]
         ["[a-zA-Z_][a-zA-Z0-9_]*" bash-classify]]))

;; ============================================================
;; Language: JSON
;; ============================================================

(def ^:private json-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["\"(?:[^\"\\\\]|\\\\.)*\"\\s*:" (:clj-kw colors)] ;; keys
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["-?\\d+\\.\\d+(?:[eE][+-]?\\d+)?" (:number colors)]
         ["-?\\d+" (:number colors)]
         ["\\b(?:true|false)\\b" (:constant colors)]
         ["\\bnull\\b" (:constant colors)]]))

;; ============================================================
;; Language: Generic (Go, C, C++, Ruby, CSS, etc.)
;; ============================================================

(def ^:private generic-keywords
  #{"if" "else" "for" "while" "do" "switch" "case" "default" "break"
    "continue" "return" "goto" "try" "catch" "throw" "finally"
    "class" "struct" "enum" "interface" "extends" "implements"
    "public" "private" "protected" "static" "const" "final" "abstract"
    "virtual" "override" "new" "delete" "this" "self" "super"
    "import" "export" "package" "module" "use" "require" "include"
    "void" "int" "long" "float" "double" "char" "bool" "string"
    "var" "let" "val" "def" "fn" "func" "fun" "function"
    "type" "typedef" "namespace" "template" "typename"
    "async" "await" "yield" "defer" "select" "chan" "go"
    "begin" "end" "then" "elsif" "unless" "rescue" "ensure" "raise"})

(def ^:private generic-constants
  #{"true" "false" "nil" "null" "none" "None" "True" "False" "NULL"
    "undefined" "NaN" "Infinity"})

(defn- generic-classify
  "Pick a color for an identifier in a language without dedicated rules:
   constant, keyword, capitalised name of length >= 2 (treated as a type),
   or nil (leave uncolored)."
  [text]
  (cond
    (contains? generic-constants text) (:constant colors)
    (contains? generic-keywords text)  (:keyword colors)
    (and (>= (count text) 2)
         (Character/isUpperCase (.charAt ^String text 0))) (:type colors)
    :else nil))

(def ^:private generic-rules
  (mapv (fn [[re c]] [(Pattern/compile re) c])
        [["//.*" (:comment colors)]
         ["#.*" (:comment colors)]
         ["/\\*[\\s\\S]*?\\*/" (:comment colors)]
         ["\"(?:[^\"\\\\]|\\\\.)*\"" (:string colors)]
         ["'(?:[^'\\\\]|\\\\.)*'" (:string colors)]
         ["`(?:[^`\\\\]|\\\\.)*`" (:string colors)]
         ["@[a-zA-Z_][a-zA-Z0-9_]*" (:param colors)]
         ["0[xX][0-9a-fA-F]+" (:number colors)]
         ["\\d+\\.\\d+(?:[eE][+-]?\\d+)?" (:number colors)]
         ["\\d+" (:number colors)]
         ["[a-zA-Z_][a-zA-Z0-9_]*" generic-classify]]))

;; ============================================================
;; Language Registry
;; ============================================================

(def ^:private lang-rules
  {:clojure    clj-rules
   :javascript js-rules
   :python     py-rules
   :java       java-rules
   :kotlin     kt-rules
   :rust       rust-rules
   :bash       bash-rules
   :json       json-rules
   :generic    generic-rules})

(def ^:private fence-tag->lang
  {"clojure" :clojure "clj" :clojure "cljs" :clojure "edn" :clojure
   "javascript" :javascript "js" :javascript
   "typescript" :javascript "ts" :javascript
   "jsx" :javascript "tsx" :javascript
   "python" :python "py" :python
   "java" :java
   "kotlin" :kotlin "kt" :kotlin
   "rust" :rust "rs" :rust
   "bash" :bash "sh" :bash "shell" :bash "zsh" :bash
   "json" :json "jsonc" :json
   "go" :generic
   "c" :generic "cpp" :generic "c++" :generic
   "ruby" :generic "rb" :generic
   "css" :generic "scss" :generic "less" :generic
   "html" :generic "xml" :generic "svg" :generic
   "yaml" :generic "yml" :generic "toml" :generic
   "sql" :generic "graphql" :generic "gql" :generic
   "lua" :generic "perl" :generic "r" :generic
   "swift" :generic "scala" :generic "groovy" :generic
   "haskell" :generic "hs" :generic
   "elixir" :generic "ex" :generic
   "erlang" :generic "erl" :generic
   "zig" :generic "nim" :generic
   "ocaml" :generic "ml" :generic
   "dart" :generic "php" :generic
   "dockerfile" :generic "makefile" :generic
   "diff" :generic "patch" :generic})

(def ^:private ext->lang
  {".clj" :clojure ".cljs" :clojure ".cljc" :clojure ".edn" :clojure
   ".bb" :clojure
   ".js" :javascript ".jsx" :javascript ".ts" :javascript
   ".tsx" :javascript ".mjs" :javascript
   ".py" :python ".pyw" :python
   ".java" :java
   ".kt" :kotlin ".kts" :kotlin
   ".rs" :rust
   ".sh" :bash ".bash" :bash ".zsh" :bash
   ".json" :json ".jsonc" :json
   ".go" :generic
   ".c" :generic ".h" :generic ".cpp" :generic ".hpp" :generic
   ".cc" :generic
   ".rb" :generic
   ".css" :generic ".scss" :generic ".less" :generic
   ".html" :generic ".xml" :generic ".svg" :generic
   ".yaml" :generic ".yml" :generic ".toml" :generic
   ".sql" :generic ".lua" :generic ".pl" :generic ".r" :generic
   ".swift" :generic ".scala" :generic ".groovy" :generic
   ".hs" :generic ".ex" :generic ".exs" :generic ".erl" :generic
   ".zig" :generic ".nim" :generic ".ml" :generic
   ".dart" :generic ".php" :generic})

;; ============================================================
;; Public API
;; ============================================================

(defn lang-for-fence
  "Map a code fence tag (e.g. \"clojure\", \"js\") to a language keyword.
   The tag is trimmed and lower-cased before lookup. Returns nil for a nil
   or unknown tag."
  [tag]
  (when tag
    (get fence-tag->lang (str/lower-case (str/trim tag)))))

(defn lang-for-ext
  "Map a file extension (e.g. \".clj\", \".rs\") to a language keyword.
   The extension must include the leading dot; it is trimmed and lower-cased
   before lookup, matching `lang-for-fence`. Returns nil for a nil or unknown
   extension."
  [ext]
  (when ext
    (get ext->lang (str/lower-case (str/trim ext)))))

(defn highlight-line
  "Syntax-highlight a single line of code. Returns string with ANSI fg codes.

   `line`       — the text to highlight. nil is returned as nil.
   `lang`       — keyword like :clojure, :javascript, etc. When nil or not a
                  registered language, `line` is returned unchanged.
   `default-fg` — ANSI code for unhighlighted text (\"\" or nil for terminal
                  default, or e.g. \"\\033[38;5;210m\" for diff removed lines).

   Caller should append \\033[0m after the returned string."
  [line lang default-fg]
  (let [rules (get lang-rules lang)]
    ;; Guard on `line` as well: the tokenizer calls (.length line) and would
    ;; throw a NullPointerException on nil.
    (if (and rules line)
      (str default-fg (highlight-line* line rules (or default-fg "")))
      line)))