formatter: support comments better (including block comments) and constant pair formatting (#2745)

2024-11-23 06:09:57 +00:00 · 2023-06-18 16:19:35 -05:00 · 2023-06-18 16:19:35 -05:00 · a8a5f1e745
commit a8a5f1e745
parent c41a66829f
16 changed files with 3610 additions and 1374 deletions
--- a/Taskfile.yml
+++ b/Taskfile.yml
@ -129,6 +129,12 @@ tasks:
    desc: Just an example to show it running
    cmds:
      - "{{.TYPESEARCH_BIN_RELEASE_DIR}}/type_searcher --output-path ./search-results.json --game {{.GAME}} --fields '[{\"type\":\"int16\",\"offset\":2},{\"type\":\"int16\",\"offset\":4}]'"
+  update-treesitter:
+    desc: Updates locally built tree-sitter rules
+    cmds:
+      - cd ../tree-sitter-opengoal && yarn gen
+      - python ./scripts/tasks/cp.py --src "../tree-sitter-opengoal/src/*" --dest "./third-party/tree-sitter/tree-sitter-opengoal"
+      - python ./scripts/tasks/cp.py --src "../tree-sitter-opengoal/grammar.js" --dest "./third-party/tree-sitter/tree-sitter-opengoal"
  # TESTS
  offline-tests: # ran by jenkins
    cmds:
--- a/common/formatter/formatter.cpp
+++ b/common/formatter/formatter.cpp
@ -15,14 +15,6 @@ extern "C" {
 extern const TSLanguage* tree_sitter_opengoal();
 }

-// TODO - incoporate some rules from zprint
-// https://github.com/kkinnear/zprint/blob/main/doc/types/classic.md
-// as well as maybe adjust the default rules to incorporate line length
-// https://github.com/kkinnear/zprint/blob/main/doc/options/indent.md
-// TODO - block comments seem to have an issue being parsed properly, also it basically needs the
-// code for flexibly wrapping a block of code in configurable symbols (parens, block comment braces,
-// etc)
-
 std::string apply_formatting(const FormatterTreeNode& curr_node,
                             std::string output,
                             int tree_depth = 0) {
@ -38,7 +30,6 @@ std::string apply_formatting(const FormatterTreeNode& curr_node,
    curr_form += curr_node.token.value();
    return curr_form;
  }
-  // TODO - this might have some issues for non-list top level elements (ie. comments)
  if (!curr_node.metadata.is_top_level) {
    curr_form += "(";
  }
--- a/common/formatter/formatter_tree.cpp
+++ b/common/formatter/formatter_tree.cpp
@ -4,6 +4,8 @@

 #include "config/rule_config.h"

+#include "third-party/fmt/core.h"
+
 namespace formatter {
 const std::shared_ptr<IndentationRule> default_rule = std::make_shared<IndentationRule>();
 }
@ -11,13 +13,13 @@ const std::shared_ptr<IndentationRule> default_rule = std::make_shared<Indentati
 std::string get_source_code(const std::string& source, const TSNode& node) {
  uint32_t start = ts_node_start_byte(node);
  uint32_t end = ts_node_end_byte(node);
-  // TODO - comments end with a \n, this is likely a tree-sitter grammar problem
-  return str_util::rtrim(source.substr(start, end - start));
+  return source.substr(start, end - start);
 }

 int num_blank_lines_following_node(const std::string& source, const TSNode& node) {
-  int num_lines = 0;
+  int num_lines = -1;  // The first new-line encountered is not a blank line
  uint32_t cursor = ts_node_end_byte(node);
+  // TODO - this breaks on lines with whitespace as well, should probably seek past that!
  while (cursor < source.length() && source.at(cursor) == '\n') {
    num_lines++;
    cursor++;
@ -25,9 +27,10 @@ int num_blank_lines_following_node(const std::string& source, const TSNode& node
  return num_lines;
 }

-// Check if the original source only has whitespace up to a new-line after it's token
+// Check if the original source only has whitespace up to a new-line before its token
 bool node_preceeded_by_only_whitespace(const std::string& source, const TSNode& node) {
-  uint32_t pos = ts_node_start_byte(node);
+  // NOTE - this returns incorrectly because we skip brackets in lists, we'll see if that matters
+  int32_t pos = ts_node_start_byte(node) - 1;
  while (pos > 0) {
    const auto& c = source.at(pos);
    if (c == '\n') {
@ -43,8 +46,40 @@ bool node_preceeded_by_only_whitespace(const std::string& source, const TSNode&

 FormatterTreeNode::FormatterTreeNode(const std::string& source, const TSNode& node)
    : token(get_source_code(source, node)) {
-  // Set any metadata based on the value of the token
+  metadata.node_type = ts_node_type(node);
  metadata.is_comment = str_util::starts_with(str_util::ltrim(token.value()), ";");
+  // Do some formatting on block-comments text
+  // TODO - this should go into a formatting rule
+  if (str_util::starts_with(str_util::ltrim(token.value()), "#|")) {
+    metadata.is_comment = true;
+    // Normalize block comments, remove any trailing or leading whitespace
+    // Only allow annotations on the first line, like #|@file
+    // Don't mess with internal indentation as the user might intend it to be a certain way.
+    std::string new_token = "";
+    std::string comment_contents = "";
+    bool seek_until_whitespace = str_util::starts_with(token.value(), "#|@");
+    int chars_seeked = 0;
+    for (const auto& c : token.value()) {
+      if (c == '\n' || (seek_until_whitespace && (c == ' ' || c == '\t')) ||
+          (!seek_until_whitespace && (c != '#' && c != '|'))) {
+        break;
+      }
+      chars_seeked++;
+      new_token += c;
+    }
+    // Remove the first line content and any leading whitespace
+    comment_contents = str_util::ltrim_newlines(token.value().substr(chars_seeked));
+    // Remove trailing whitespace
+    comment_contents = str_util::rtrim(comment_contents);
+    // remove |#
+    comment_contents.pop_back();
+    comment_contents.pop_back();
+    comment_contents = str_util::rtrim(comment_contents);
+    new_token += fmt::format("\n{}\n|#", comment_contents);
+    token = new_token;
+  }
+
+  // Set any metadata based on the value of the token
  metadata.num_blank_lines_following = num_blank_lines_following_node(source, node);
  metadata.is_inline = !node_preceeded_by_only_whitespace(source, node);
 };
@ -100,6 +135,12 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source
  FormatterTreeNode list_node;
  if (curr_node_type == "list_lit") {
    list_node = FormatterTreeNode();
+  } else if (curr_node_type == "str_lit") {
+    // Strings are a special case: they are literals and essentially tokens, but the grammar can
+    // also detect format specifiers inside of them. That is useful for semantic highlighting but
+    // doesn't matter for formatting, so for strings we treat them as if they are a single token
+    tree_node.refs.push_back(FormatterTreeNode(source, curr_node));
+    return;
  }
  for (size_t i = 0; i < ts_node_child_count(curr_node); i++) {
    const auto child_node = ts_node_child(curr_node, i);
--- a/common/formatter/formatter_tree.h
+++ b/common/formatter/formatter_tree.h
@ -34,6 +34,7 @@ extern const std::shared_ptr<IndentationRule> default_indentation_rule;
 class FormatterTreeNode {
 public:
  struct Metadata {
+    std::string node_type;
    bool is_top_level = false;
    bool is_comment = false;
    bool is_inline = false;
--- a/common/formatter/formatting_rules.cpp
+++ b/common/formatter/formatting_rules.cpp
@ -1,5 +1,7 @@
 #include "formatting_rules.h"

+#include <set>
+
 #include "common/util/string_util.h"

 void formatter_rules::blank_lines::separate_by_newline(std::string& curr_text,
@ -23,6 +25,29 @@ void formatter_rules::blank_lines::separate_by_newline(std::string& curr_text,
  curr_text += "\n";
 }

+// Tree-sitter node types that are allowed to be the second (value) element of a constant pair,
+// these are compared against a node's `metadata.node_type`
+// TODO - probably need to include quoted literals as well, though the grammar currently does not
+// differentiate between a quoted symbol and a quoted form
+const std::set<std::string> constant_pair_types = {"kwd_lit",  "num_lit",  "str_lit", "char_lit",
+                                                   "null_lit", "bool_lit", "sym_lit"};
+
+// Returns true when `node` is the value half of a constant pair, meaning the element directly
+// before it in `containing_node.refs` is a keyword (`kwd_lit`) and `node` itself has one of the
+// `constant_pair_types`
+//
+// `index` is used as the position of `node` within `containing_node.refs`. Any index that has no
+// valid predecessor (index <= 0, or past the end of the list) yields false rather than letting
+// `refs.at()` throw
+bool formatter_rules::constant_pairs::is_element_second_in_constant_pair(
+    const FormatterTreeNode& containing_node,
+    const FormatterTreeNode& node,
+    const int index) {
+  // An element can only be second in a pair if something precedes it in the list, this also
+  // covers the case of an empty list
+  if (index <= 0 || static_cast<size_t>(index) > containing_node.refs.size()) {
+    return false;
+  }
+  // Ensure that a keyword came before hand
+  if (containing_node.refs.at(index - 1).metadata.node_type != "kwd_lit") {
+    return false;
+  }
+  // Check the type of the element
+  return constant_pair_types.count(node.metadata.node_type) > 0;
+}
+
 void IndentationRule::append_newline(std::string& curr_text,
                                     const FormatterTreeNode& node,
                                     const FormatterTreeNode& containing_node,
@ -34,6 +59,11 @@ void IndentationRule::append_newline(std::string& curr_text,
      (node.metadata.is_comment && node.metadata.is_inline)) {
    return;
  }
+  // Check if it's a constant pair
+  if (formatter_rules::constant_pairs::is_element_second_in_constant_pair(containing_node, node,
+                                                                          index)) {
+    return;
+  }
  curr_text = str_util::rtrim(curr_text) + "\n";
 }

@ -45,6 +75,12 @@ void IndentationRule::indent_token(std::string& curr_text,
  if (node.metadata.is_top_level) {
    return;
  }
+  // If the element is the second element in a constant pair, that means we did not append a
+  // new-line before hand so we require no indentation (it's inline with the previous element)
+  if (formatter_rules::constant_pairs::is_element_second_in_constant_pair(containing_node, node,
+                                                                          index)) {
+    return;
+  }
  if (containing_node.metadata.multiple_elements_first_line) {
    if (index > 1) {
      // Only apply indentation if we are about to print a normal text token
@ -87,6 +123,11 @@ void InnerIndentationRule::append_newline(std::string& curr_text,
  if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) {
    return;
  }
+  // Check if it's a constant pair
+  if (formatter_rules::constant_pairs::is_element_second_in_constant_pair(containing_node, node,
+                                                                          index)) {
+    return;
+  }
  if (!node.metadata.was_on_first_line_of_form) {
    curr_text = str_util::rtrim(curr_text) + "\n";
  }
@ -100,6 +141,12 @@ void InnerIndentationRule::indent_token(std::string& curr_text,
  if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) {
    return;
  }
+  // If the element is the second element in a constant pair, that means we did not append a
+  // new-line before hand so we require no indentation (it's inline with the previous element)
+  if (formatter_rules::constant_pairs::is_element_second_in_constant_pair(containing_node, node,
+                                                                          index)) {
+    return;
+  }
  // We only new-line elements if they were not originally on the first line
  if (!node.metadata.was_on_first_line_of_form) {
    curr_text += str_util::repeat(depth * 2, " ");
--- a/common/formatter/formatting_rules.h
+++ b/common/formatter/formatting_rules.h
@ -10,7 +10,7 @@ namespace formatter_rules {
 // The formatter will try to collapse as much space as possible in the top-level, this means
 // separating forms by a single empty blank line
 //
-// The exception is comments, top level comments will retain their following blank lines from the
+// The exception are comments, top level comments will retain their following blank lines from the
 // original source
 // - this could be none, in the case where a comment is directly next to a form (like this one!)
 //   - you don't want them to be separated!
@ -27,12 +27,44 @@ void separate_by_newline(std::string& curr_text,
 }

 // TODO - nothing here yet, in the future:
-// - if/when the formatter is concerned with line length, there are implications here
 // - align consecutive comment lines
+// - if/when the formatter is concerned with line length, there are implications here
 //
 // Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/comments.md
 namespace comments {}

+// Paired elements in a list will be kept in-line rather than the default new-line indentation
+// For example:
+// (:msg "hello world" :delay 100 :fn (lambda () (+ 1 1)))
+// Would typically become:
+// (:msg
+//  "hello world"
+//  :delay
+//  100
+//  :fn
+//    (lambda ()
+//      (+ 1 1)))
+// But with constant pairs:
+// (:msg "hello world"
+//  :delay 100
+//  :fn
+//    (lambda ()
+//      (+ 1 1)))
+//
+// Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/constantpairs.md
+namespace constant_pairs {
+// Determines if the given element is the second element in a constant pair, if it is then we would
+// usually want to elide the new-line in whatever code that applies it
+//
+// This is true if:
+// - the element is not the first element of its containing node
+// - the element is preceded by a keyword
+// - the element is a:
+//   - keyword, symbol, string, number, character, boolean, or `none`
+//
+// `index` is expected to be the position of `node` within `containing_node`
+bool is_element_second_in_constant_pair(const FormatterTreeNode& containing_node,
+                                        const FormatterTreeNode& node,
+                                        const int index);
+}  // namespace constant_pairs
 }  // namespace formatter_rules

 // Indentation rules are heavily inspired by the descriptions here
--- a/common/util/string_util.cpp
+++ b/common/util/string_util.cpp
@ -24,6 +24,23 @@ bool ends_with(const std::string& s, const std::string& suffix) {
         0 == s.compare(s.size() - suffix.size(), suffix.size(), suffix);
 }

+// Left-trims any leading whitespace up to and including the final leading newline, the
+// indentation of the first line that has content is kept
+// For example:
+// " \n\n  hello world" => "  hello world"
+// An empty or whitespace-only string results in ""
+std::string ltrim_newlines(const std::string& s) {
+  const size_t first_content = s.find_first_not_of(WHITESPACE);
+  if (first_content == std::string::npos) {
+    return "";
+  }
+  if (first_content == 0) {
+    // No leading whitespace at all, nothing to trim
+    return s;
+  }
+  // Search backwards from just before the first non-whitespace character, the newline found there
+  // (if any) is the final leading newline and the result starts right after it
+  const size_t last_newline = s.rfind('\n', first_content - 1);
+  return (last_newline == std::string::npos) ? s : s.substr(last_newline + 1);
+}
+
 std::string ltrim(const std::string& s) {
  size_t start = s.find_first_not_of(WHITESPACE);
  return (start == std::string::npos) ? "" : s.substr(start);
--- a/common/util/string_util.h
+++ b/common/util/string_util.h
@ -7,6 +7,7 @@ namespace str_util {
 bool contains(const std::string& s, const std::string& substr);
 bool starts_with(const std::string& s, const std::string& prefix);
 bool ends_with(const std::string& s, const std::string& suffix);
+std::string ltrim_newlines(const std::string& s);
 std::string ltrim(const std::string& s);
 std::string rtrim(const std::string& s);
 std::string trim(const std::string& s);
--- a/scripts/tasks/cp.py
+++ b/scripts/tasks/cp.py
@ -0,0 +1,31 @@
+# Windows doesn't have the GNU toolchain by default, making it hard to write Taskfiles that are cross-platform
+# This "solves" that
+
+import argparse
+import glob
+parser = argparse.ArgumentParser()
+parser.add_argument("--src")
+parser.add_argument("--dest")
+args = parser.parse_args()
+
+import shutil
+import os
+
+def copy_with_glob(source_glob, destination):
+    # Expand the glob pattern
+    paths = glob.glob(source_glob, recursive=True)
+
+    for path in paths:
+        # Get the destination path by joining the destination directory with the relative path
+        relative_path = os.path.relpath(path, os.path.dirname(source_glob))
+        dest_path = os.path.join(destination, relative_path)
+
+        # Check if the path is a file or a directory
+        if os.path.isfile(path):
+            # Copy the file
+            shutil.copy2(path, dest_path)
+        elif os.path.isdir(path):
+            # Copy the directory and its contents recursively
+            shutil.copytree(path, dest_path, dirs_exist_ok=True)
+
+copy_with_glob(args.src, args.dest)
--- a/test/common/formatter/corpus/comments.test.gc
+++ b/test/common/formatter/corpus/comments.test.gc
@ -1,5 +1,5 @@
 ===
-Top-Level Comment
+Comment - Top-Level
 ===

 ;; test
@ -11,7 +11,22 @@ Top-Level Comment
 (println "test")

 ===
-Inline Comment
+Comment - Within Form
+===
+
+(println
+  ;; test
+ "test")
+
+---
+
+(println
+ ;; test
+ "test")
+
+
+===
+Comment - Inline
 ===

 (println "test") ;; test
@ -21,9 +36,17 @@ Inline Comment
 (println "test") ;; test

 ===
-TODO - Block Comment
+Block Comment
 ===

+#|
+  block comment
+  test|#
+
+(println "test")
+
+---
+
 #|
  block comment
  test
@ -31,12 +54,71 @@ TODO - Block Comment

 (println "test")

+===
+Block Comment - Single Line
+===
+
+#|block comment|#
+
 ---

 #|
+block comment
+|#

- |#
+===
+Block Comment - Don't Allow Content on Opening Brace
+===
+
+#| block comment
+ test
+|#

 (println "test")

+---

+#|
+ block comment
+ test
+|#
+
+(println "test")
+
+===
+Block Comment - Allow Annotations
+===
+
+#|@file block comment
+ test
+|#
+
+(println "test")
+
+---
+
+#|@file
+ block comment
+ test
+|#
+
+(println "test")
+
+===
+Block Comment - In Form
+===
+
+(println
+  #| block comment
+ test
+|#
+"test")
+
+---
+
+(println
+ #|
+ block comment
+ test
+|#
+ "test")
--- a/test/common/formatter/corpus/constant-pairs.test.gc
+++ b/test/common/formatter/corpus/constant-pairs.test.gc
@ -0,0 +1,51 @@
+===
+One Pair
+===
+
+(:hello
+
+"world")
+
+---
+
+(:hello "world")
+
+===
+Not a Valid Constant
+===
+
+(:hello
+
+(println "hello world"))
+
+---
+
+(:hello
+ (println "hello world"))
+
+===
+Two Pairs
+===
+
+(:hello
+
+"world" :test 123)
+
+---
+
+(:hello "world"
+ :test 123)
+
+===
+Pair Mixture
+===
+
+(:hello
+
+"world" "not-a-pair" :test 123)
+
+---
+
+(:hello "world"
+ "not-a-pair"
+ :test 123)
--- a/test/common/formatter/test_formatter.cpp
+++ b/test/common/formatter/test_formatter.cpp
@ -36,9 +36,12 @@ struct TestDefinition {
 };

 std::vector<TestDefinition> get_test_definitions(const fs::path& file_path) {
+  std::vector<TestDefinition> tests;
  // Read in the file, and run the test
  const auto contents = str_util::split(file_util::read_text_file(file_path));
-  std::vector<TestDefinition> tests;
+  if (contents.empty() || (contents.size() == 1 && contents.at(0).empty())) {
+    return tests;
+  }
  TestDefinition curr_test;
  size_t i = 0;
  while (i < contents.size()) {
--- a/third-party/tree-sitter/tree-sitter-opengoal/grammar.js
+++ b/third-party/tree-sitter/tree-sitter-opengoal/grammar.js
@ -0,0 +1,295 @@
+// Heavily stripped down - https://github.com/sogaiu/tree-sitter-clojure/
+// With some features taken from https://github.com/theHamsta/tree-sitter-commonlisp/blob/master/grammar.js
+
+// java.lang.Character.isWhitespace
+//
+// Space Separator (Zs) but NOT including (U+00A0, U+2007, U+202F)
+//   U+0020, U+1680, U+2000, U+2001, U+2002, U+2003, U+2004, U+2005,
+//   U+2006, U+2008, U+2009, U+200A, U+205F, U+3000
+// Line Separator (Zl)
+//   U+2028
+// Paragraph Separator (Zp)
+//   U+2029
+// Horizontal Tabulation
+//   \t
+// Line Feed
+//   \n
+// Vertical Tabulation
+//   U+000B
+// Form Feed
+//   \f
+// Carriage Return
+//   \r
+// File Separator
+//   U+001C
+// Group Separator
+//   U+001D
+// Record Separator
+//   U+001E
+// Unit Separator
+//   U+001F
+const WHITESPACE_CHAR =
+  /[\f\n\r\t \u000B\u001C\u001D\u001E\u001F\u2028\u2029\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2008\u2009\u200a\u205f\u3000]/;
+
+const WHITESPACE =
+  token(repeat1(WHITESPACE_CHAR));
+
+// A line comment: a `;` followed by every character up to (but not including) the next newline
+const COMMENT =
+  token(/(;)[^\n]*/);
+
+// A block comment is a single token: `#|`, then one or more characters that are neither `#` nor
+// `|`, then `|#`
+// NOTE - because both characters are excluded from the body, a comment containing a `#` or `|`
+// (as well as an empty `#||#`) will not match this token
+const BLOCK_COMMENT =
+  token(seq('#|', repeat1(/[^#|]/), '|#'));
+
+const DIGIT =
+  /[0-9]/;
+
+const HEX_DIGIT =
+  /[0-9a-fA-F]/;
+
+const BINARY_DIGIT =
+  /[0-1]/;
+
+const HEX_NUMBER =
+  seq("#x",
+    repeat1(HEX_DIGIT));
+
+const BINARY_NUMBER =
+  seq("#b",
+    repeat1(BINARY_DIGIT));
+
+const FLOAT =
+  seq(repeat1(DIGIT),
+    optional(seq(".",
+      repeat(DIGIT))));
+
+const INTEGER =
+  seq(repeat1(DIGIT));
+
+// TODO - does OG support negative hex/binary?
+const NUMBER =
+  token(prec(10, seq(optional(/[+-]/),
+    choice(HEX_NUMBER,
+      BINARY_NUMBER,
+      FLOAT,
+      INTEGER))));
+
+const NULL =
+  token('none');
+
+// While technically anything other than #f is true, conventionally speaking #t is used to indicate true
+const BOOLEAN =
+  token(choice('#f',
+    '#t'));
+
+const KEYWORD_HEAD =
+  /[^\f\n\r\t ()\[\]{}"@~^;`\\,:/\u000B\u001C\u001D\u001E\u001F\u2028\u2029\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2008\u2009\u200a\u205f\u3000]/;
+
+const KEYWORD_BODY =
+  choice(/[:']/, KEYWORD_HEAD);
+
+const KEYWORD = token(seq(":", KEYWORD_HEAD, repeat(KEYWORD_BODY)))
+
+const ANY_CHAR =
+  /.|\n/;
+
+const CHARACTER =
+  token(seq("#\\",
+    choice(ANY_CHAR, "\\s", "\\n", "\\t")));
+
+// \u000B => <vertical tab>
+// \u001C => <file separator>
+// \u001D => <group separator>
+// \u001E => <record separator>
+// \u001F => <unit separator>
+// \u2028 => <line separator>
+// \u2029 => <paragraph separator>
+// \u1680 => <ogham space mark>
+// \u2000 => <en quad>
+// \u2001 => <em quad>
+// \u2002 => <en space>
+// \u2003 => <em space>
+// \u2004 => <three-per-em space>
+// \u2005 => <four-per-em space>
+// \u2006 => <six-per-em space>
+// \u2008 => <punctuation space>
+// \u2009 => <thin space>
+// \u200a => <hair space>
+// \u205f => <medium mathematical space>
+// \u3000 => <ideographic space>
+const SYMBOL_HEAD =
+  /[^\f\n\r\t \/()\[\]{}"@~^;`\\,:#'0-9\u000B\u001C\u001D\u001E\u001F\u2028\u2029\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2008\u2009\u200a\u205f\u3000]/;
+
+const SYMBOL_BODY =
+  choice(SYMBOL_HEAD,
+    /[:#'0-9]/);
+
+const SYMBOL =
+  token(seq(SYMBOL_HEAD,
+    repeat(SYMBOL_BODY)));
+
+module.exports = grammar({
+  name: 'opengoal',
+
+  extras: $ =>
+    [],
+
+  conflicts: $ =>
+    [],
+
+  inline: $ =>
+    [$._kwd_unqualified,
+    $._sym_unqualified],
+
+  rules: {
+    // THIS MUST BE FIRST -- even though this doesn't look like it matters
+    source: $ =>
+      repeat(choice($._form,
+        $._gap)),
+
+    _gap: $ =>
+      choice($._ws,
+        $.comment,
+        $.block_comment),
+
+    _ws: $ =>
+      WHITESPACE,
+
+    comment: $ =>
+      COMMENT,
+
+    block_comment: $ => BLOCK_COMMENT,
+
+    _form: $ =>
+      choice($.num_lit, // atom-ish
+        $.kwd_lit,
+        $.str_lit,
+        $.char_lit,
+        $.null_lit,
+        $.bool_lit,
+        $.sym_lit,
+        // basic collection-ish
+        $.list_lit,
+        // some other reader macros
+        $.quoting_lit,
+        $.quasi_quoting_lit,
+        $.unquote_splicing_lit,
+        $.unquoting_lit),
+
+    num_lit: $ =>
+      NUMBER,
+
+    kwd_lit: $ => KEYWORD,
+
+    // https://opengoal.dev/docs/reference/lib/#format
+    // TODO - a lot of this might be irrelevant or not comprehensive in terms of OpenGOAL's `format`,
+    // but to be honest, most of these rare features are never used
+    _format_token: $ => choice(alias(NUMBER, $.num_lit), seq("'", alias(/./, $.char_lit))),
+    format_prefix_parameters: _ => choice('v', 'V', '#'),
+    format_modifiers: $ => seq(repeat(choice($._format_token, ',')), choice('@', '@:', ':', ':@')),
+    format_directive_type: $ => choice(
+      seq(optional(field('repetitions', $._format_token)), choice('~', '%', '&', '|')),
+      /[cC]/,
+      /\^/,
+      '\n',
+      '\r',
+      /[pP]/,
+      /[iI]/,
+      /[wW]/,
+      /[aA]/,
+      '_',
+      /[()]/,
+      /[{}]/,
+      /[\[\]]/,
+      /[<>]/,
+      ';',
+      seq(field('numberOfArgs', $._format_token), '*'),
+      '?',
+      "Newline",
+      seq(repeat(choice($._format_token, ',')), /[$rRbBdDgGxXeEoOsStTfF]/),
+    ),
+    format_specifier: $ =>
+      prec.left(seq(
+        '~',
+        optional($.format_prefix_parameters),
+        optional($.format_modifiers),
+        prec(5, $.format_directive_type),
+      )),
+
+    str_lit: $ =>
+      seq(
+        '"',
+        repeat(choice(
+          token.immediate(prec(1, /[^\\~"]+/)),
+          token.immediate(seq(/\\./)),
+          $.format_specifier,
+        )),
+        optional('~'),
+        '"',
+      ),
+
+    char_lit: $ =>
+      CHARACTER,
+
+    null_lit: $ =>
+      NULL,
+
+    bool_lit: $ =>
+      BOOLEAN,
+
+    sym_lit: $ =>
+      seq(choice($._sym_unqualified)),
+
+    _sym_unqualified: $ =>
+      field('name', alias(choice("/", SYMBOL),
+        $.sym_name)),
+
+    list_lit: $ =>
+      seq($._bare_list_lit),
+
+    _bare_list_lit: $ =>
+      seq(field('open', "("),
+        repeat(choice(field('value', $._form),
+          $._gap)),
+        field('close', ")")),
+
+    quoting_lit: $ =>
+      seq(field('marker', "'"),
+        repeat($._gap),
+        field('value', $._form)),
+
+    quasi_quoting_lit: $ =>
+      seq(field('marker', "`"),
+        repeat($._gap),
+        field('value', $._form)),
+
+    unquote_splicing_lit: $ =>
+      seq(field('marker', ",@"),
+        repeat($._gap),
+        field('value', $._form)),
+
+    unquoting_lit: $ =>
+      seq(field('marker', ","),
+        repeat($._gap),
+        field('value', $._form)),
+
+    // TODO - consider having ones for defun, defmethod, defstate, etc
+    // defun_keyword: _ => prec(10, clSymbol(choice('defun', 'defmacro', 'defgeneric', 'defmethod'))),
+
+    // defun_header: $ =>
+    //   prec(PREC.SPECIAL, choice(
+    //     seq(field('keyword', $.defun_keyword),
+    //       repeat($._gap),
+    //       choice($.unquoting_lit, $.unquote_splicing_lit)
+    //     ),
+    //     seq(field('keyword', $.defun_keyword),
+    //       repeat($._gap),
+    //       field('function_name', $._form),
+    //       optional(field('specifier', seq(repeat($._gap), choice($.kwd_lit, $.sym_lit)))),
+    //       repeat($._gap),
+    //       field('lambda_list', choice($.list_lit, $.unquoting_lit))),
+    //     seq(field('keyword', alias('lambda', $.defun_keyword)),
+    //       repeat($._gap),
+    //       field('lambda_list', choice($.list_lit, $.unquoting_lit)))
+    //   )),
+  }
+});
--- a/third-party/tree-sitter/tree-sitter-opengoal/grammar.json
+++ b/third-party/tree-sitter/tree-sitter-opengoal/grammar.json
@ -30,7 +30,7 @@
        },
        {
          "type": "SYMBOL",
-          "name": "comment_multiline"
+          "name": "block_comment"
        }
      ]
    },
@ -48,51 +48,31 @@
      "type": "TOKEN",
      "content": {
        "type": "PATTERN",
-        "value": "(;).*\\n?"
+        "value": "(;)[^\\n]*"
      }
    },
-    "comment_multiline": {
-      "type": "SEQ",
-      "members": [
-        {
-          "type": "TOKEN",
-          "content": {
+    "block_comment": {
+      "type": "TOKEN",
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {
            "type": "STRING",
            "value": "#|"
-          }
-        },
-        {
-          "type": "REPEAT",
-          "content": {
-            "type": "CHOICE",
-            "members": [
-              {
-                "type": "PATTERN",
-                "value": "[^|#]+"
-              },
-              {
-                "type": "PATTERN",
-                "value": "#[^|]"
-              },
-              {
-                "type": "PATTERN",
-                "value": "[^#]\\|"
-              },
-              {
-                "type": "PATTERN",
-                "value": "[\\n\\r]+"
-              }
-            ]
-          }
-        },
-        {
-          "type": "TOKEN",
-          "content": {
+          },
+          {
+            "type": "REPEAT1",
+            "content": {
+              "type": "PATTERN",
+              "value": "[^#|]"
+            }
+          },
+          {
            "type": "STRING",
            "value": "|#"
          }
-        }
-      ]
+        ]
+      }
    },
    "_form": {
      "type": "CHOICE",
@ -257,126 +237,491 @@
      }
    },
    "kwd_lit": {
-      "type": "CHOICE",
-      "members": [
-        {
-          "type": "SYMBOL",
-          "name": "_kwd_unqualified"
-        }
-      ]
-    },
-    "_kwd_unqualified": {
-      "type": "PREC",
-      "value": 1,
-      "content": {
-        "type": "SEQ",
-        "members": [
-          {
-            "type": "FIELD",
-            "name": "marker",
-            "content": {
-              "type": "SYMBOL",
-              "name": "_kwd_marker"
-            }
-          },
-          {
-            "type": "FIELD",
-            "name": "name",
-            "content": {
-              "type": "ALIAS",
-              "content": {
-                "type": "TOKEN",
-                "content": {
-                  "type": "SEQ",
-                  "members": [
-                    {
-                      "type": "PATTERN",
-                      "value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
-                    },
-                    {
-                      "type": "REPEAT",
-                      "content": {
-                        "type": "CHOICE",
-                        "members": [
-                          {
-                            "type": "PATTERN",
-                            "value": "[:']"
-                          },
-                          {
-                            "type": "PATTERN",
-                            "value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
-                          }
-                        ]
-                      }
-                    }
-                  ]
-                }
-              },
-              "named": true,
-              "value": "kwd_name"
-            }
-          }
-        ]
-      }
-    },
-    "_kwd_marker": {
-      "type": "CHOICE",
-      "members": [
-        {
-          "type": "TOKEN",
-          "content": {
-            "type": "STRING",
-            "value": ":"
-          }
-        }
-      ]
-    },
-    "str_lit": {
      "type": "TOKEN",
      "content": {
        "type": "SEQ",
        "members": [
          {
            "type": "STRING",
-            "value": "\""
+            "value": ":"
+          },
+          {
+            "type": "PATTERN",
+            "value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
          },
          {
            "type": "REPEAT",
            "content": {
-              "type": "PATTERN",
-              "value": "[^\"\\\\]"
-            }
-          },
-          {
-            "type": "REPEAT",
-            "content": {
-              "type": "SEQ",
+              "type": "CHOICE",
              "members": [
                {
-                  "type": "STRING",
-                  "value": "\\"
+                  "type": "PATTERN",
+                  "value": "[:']"
                },
                {
                  "type": "PATTERN",
-                  "value": "."
-                },
-                {
-                  "type": "REPEAT",
-                  "content": {
-                    "type": "PATTERN",
-                    "value": "[^\"\\\\]"
-                  }
+                  "value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
                }
              ]
            }
-          },
-          {
-            "type": "STRING",
-            "value": "\""
          }
        ]
      }
    },
+    "_format_token": {
+      "type": "CHOICE",
+      "members": [
+        {
+          "type": "ALIAS",
+          "content": {
+            "type": "TOKEN",
+            "content": {
+              "type": "PREC",
+              "value": 10,
+              "content": {
+                "type": "SEQ",
+                "members": [
+                  {
+                    "type": "CHOICE",
+                    "members": [
+                      {
+                        "type": "PATTERN",
+                        "value": "[+-]"
+                      },
+                      {
+                        "type": "BLANK"
+                      }
+                    ]
+                  },
+                  {
+                    "type": "CHOICE",
+                    "members": [
+                      {
+                        "type": "SEQ",
+                        "members": [
+                          {
+                            "type": "STRING",
+                            "value": "#x"
+                          },
+                          {
+                            "type": "REPEAT1",
+                            "content": {
+                              "type": "PATTERN",
+                              "value": "[0-9a-fA-F]"
+                            }
+                          }
+                        ]
+                      },
+                      {
+                        "type": "SEQ",
+                        "members": [
+                          {
+                            "type": "STRING",
+                            "value": "#b"
+                          },
+                          {
+                            "type": "REPEAT1",
+                            "content": {
+                              "type": "PATTERN",
+                              "value": "[0-1]"
+                            }
+                          }
+                        ]
+                      },
+                      {
+                        "type": "SEQ",
+                        "members": [
+                          {
+                            "type": "REPEAT1",
+                            "content": {
+                              "type": "PATTERN",
+                              "value": "[0-9]"
+                            }
+                          },
+                          {
+                            "type": "CHOICE",
+                            "members": [
+                              {
+                                "type": "SEQ",
+                                "members": [
+                                  {
+                                    "type": "STRING",
+                                    "value": "."
+                                  },
+                                  {
+                                    "type": "REPEAT",
+                                    "content": {
+                                      "type": "PATTERN",
+                                      "value": "[0-9]"
+                                    }
+                                  }
+                                ]
+                              },
+                              {
+                                "type": "BLANK"
+                              }
+                            ]
+                          }
+                        ]
+                      },
+                      {
+                        "type": "SEQ",
+                        "members": [
+                          {
+                            "type": "REPEAT1",
+                            "content": {
+                              "type": "PATTERN",
+                              "value": "[0-9]"
+                            }
+                          }
+                        ]
+                      }
+                    ]
+                  }
+                ]
+              }
+            }
+          },
+          "named": true,
+          "value": "num_lit"
+        },
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "STRING",
+              "value": "'"
+            },
+            {
+              "type": "ALIAS",
+              "content": {
+                "type": "PATTERN",
+                "value": "."
+              },
+              "named": true,
+              "value": "char_lit"
+            }
+          ]
+        }
+      ]
+    },
+    "format_prefix_parameters": {
+      "type": "CHOICE",
+      "members": [
+        {
+          "type": "STRING",
+          "value": "v"
+        },
+        {
+          "type": "STRING",
+          "value": "V"
+        },
+        {
+          "type": "STRING",
+          "value": "#"
+        }
+      ]
+    },
+    "format_modifiers": {
+      "type": "SEQ",
+      "members": [
+        {
+          "type": "REPEAT",
+          "content": {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "SYMBOL",
+                "name": "_format_token"
+              },
+              {
+                "type": "STRING",
+                "value": ","
+              }
+            ]
+          }
+        },
+        {
+          "type": "CHOICE",
+          "members": [
+            {
+              "type": "STRING",
+              "value": "@"
+            },
+            {
+              "type": "STRING",
+              "value": "@:"
+            },
+            {
+              "type": "STRING",
+              "value": ":"
+            },
+            {
+              "type": "STRING",
+              "value": ":@"
+            }
+          ]
+        }
+      ]
+    },
+    "format_directive_type": {
+      "type": "CHOICE",
+      "members": [
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "CHOICE",
+              "members": [
+                {
+                  "type": "FIELD",
+                  "name": "repetitions",
+                  "content": {
+                    "type": "SYMBOL",
+                    "name": "_format_token"
+                  }
+                },
+                {
+                  "type": "BLANK"
+                }
+              ]
+            },
+            {
+              "type": "CHOICE",
+              "members": [
+                {
+                  "type": "STRING",
+                  "value": "~"
+                },
+                {
+                  "type": "STRING",
+                  "value": "%"
+                },
+                {
+                  "type": "STRING",
+                  "value": "&"
+                },
+                {
+                  "type": "STRING",
+                  "value": "|"
+                }
+              ]
+            }
+          ]
+        },
+        {
+          "type": "PATTERN",
+          "value": "[cC]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "\\^"
+        },
+        {
+          "type": "STRING",
+          "value": "\n"
+        },
+        {
+          "type": "STRING",
+          "value": "\r"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[pP]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[iI]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[wW]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[aA]"
+        },
+        {
+          "type": "STRING",
+          "value": "_"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[()]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[{}]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[\\[\\]]"
+        },
+        {
+          "type": "PATTERN",
+          "value": "[<>]"
+        },
+        {
+          "type": "STRING",
+          "value": ";"
+        },
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "FIELD",
+              "name": "numberOfArgs",
+              "content": {
+                "type": "SYMBOL",
+                "name": "_format_token"
+              }
+            },
+            {
+              "type": "STRING",
+              "value": "*"
+            }
+          ]
+        },
+        {
+          "type": "STRING",
+          "value": "?"
+        },
+        {
+          "type": "STRING",
+          "value": "Newline"
+        },
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "REPEAT",
+              "content": {
+                "type": "CHOICE",
+                "members": [
+                  {
+                    "type": "SYMBOL",
+                    "name": "_format_token"
+                  },
+                  {
+                    "type": "STRING",
+                    "value": ","
+                  }
+                ]
+              }
+            },
+            {
+              "type": "PATTERN",
+              "value": "[$rRbBdDgGxXeEoOsStTfF]"
+            }
+          ]
+        }
+      ]
+    },
+    "format_specifier": {
+      "type": "PREC_LEFT",
+      "value": 0,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {
+            "type": "STRING",
+            "value": "~"
+          },
+          {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "SYMBOL",
+                "name": "format_prefix_parameters"
+              },
+              {
+                "type": "BLANK"
+              }
+            ]
+          },
+          {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "SYMBOL",
+                "name": "format_modifiers"
+              },
+              {
+                "type": "BLANK"
+              }
+            ]
+          },
+          {
+            "type": "PREC",
+            "value": 5,
+            "content": {
+              "type": "SYMBOL",
+              "name": "format_directive_type"
+            }
+          }
+        ]
+      }
+    },
+    "str_lit": {
+      "type": "SEQ",
+      "members": [
+        {
+          "type": "STRING",
+          "value": "\""
+        },
+        {
+          "type": "REPEAT",
+          "content": {
+            "type": "CHOICE",
+            "members": [
+              {
+                "type": "IMMEDIATE_TOKEN",
+                "content": {
+                  "type": "PREC",
+                  "value": 1,
+                  "content": {
+                    "type": "PATTERN",
+                    "value": "[^\\\\~\"]+"
+                  }
+                }
+              },
+              {
+                "type": "IMMEDIATE_TOKEN",
+                "content": {
+                  "type": "SEQ",
+                  "members": [
+                    {
+                      "type": "PATTERN",
+                      "value": "\\\\."
+                    }
+                  ]
+                }
+              },
+              {
+                "type": "SYMBOL",
+                "name": "format_specifier"
+              }
+            ]
+          }
+        },
+        {
+          "type": "CHOICE",
+          "members": [
+            {
+              "type": "STRING",
+              "value": "~"
+            },
+            {
+              "type": "BLANK"
+            }
+          ]
+        },
+        {
+          "type": "STRING",
+          "value": "\""
+        }
+      ]
+    },
    "char_lit": {
      "type": "TOKEN",
      "content": {
@ -661,7 +1006,7 @@
  "precedences": [],
  "externals": [],
  "inline": [
-    "_kwd_unqualified",
+    "ReferenceError",
    "_sym_unqualified"
  ],
  "supertypes": []
--- a/third-party/tree-sitter/tree-sitter-opengoal/node-types.json
+++ b/third-party/tree-sitter/tree-sitter-opengoal/node-types.json
@ -1,33 +1,105 @@
 [
  {
-    "type": "comment_multiline",
-    "named": true,
-    "fields": {}
-  },
-  {
-    "type": "kwd_lit",
+    "type": "format_directive_type",
    "named": true,
    "fields": {
-      "marker": {
-        "multiple": false,
-        "required": true,
+      "numberOfArgs": {
+        "multiple": true,
+        "required": false,
        "types": [
          {
-            "type": ":",
+            "type": "'",
            "named": false
+          },
+          {
+            "type": "char_lit",
+            "named": true
+          },
+          {
+            "type": "num_lit",
+            "named": true
          }
        ]
      },
-      "name": {
-        "multiple": false,
-        "required": true,
+      "repetitions": {
+        "multiple": true,
+        "required": false,
        "types": [
          {
-            "type": "kwd_name",
+            "type": "'",
+            "named": false
+          },
+          {
+            "type": "char_lit",
+            "named": true
+          },
+          {
+            "type": "num_lit",
            "named": true
          }
        ]
      }
+    },
+    "children": {
+      "multiple": true,
+      "required": false,
+      "types": [
+        {
+          "type": "char_lit",
+          "named": true
+        },
+        {
+          "type": "num_lit",
+          "named": true
+        }
+      ]
+    }
+  },
+  {
+    "type": "format_modifiers",
+    "named": true,
+    "fields": {},
+    "children": {
+      "multiple": true,
+      "required": false,
+      "types": [
+        {
+          "type": "char_lit",
+          "named": true
+        },
+        {
+          "type": "num_lit",
+          "named": true
+        }
+      ]
+    }
+  },
+  {
+    "type": "format_prefix_parameters",
+    "named": true,
+    "fields": {}
+  },
+  {
+    "type": "format_specifier",
+    "named": true,
+    "fields": {},
+    "children": {
+      "multiple": true,
+      "required": true,
+      "types": [
+        {
+          "type": "format_directive_type",
+          "named": true
+        },
+        {
+          "type": "format_modifiers",
+          "named": true
+        },
+        {
+          "type": "format_prefix_parameters",
+          "named": true
+        }
+      ]
    }
  },
  {
@ -114,16 +186,21 @@
      "required": false,
      "types": [
        {
-          "type": "comment",
+          "type": "block_comment",
          "named": true
        },
        {
-          "type": "comment_multiline",
+          "type": "comment",
          "named": true
        }
      ]
    }
  },
+  {
+    "type": "num_lit",
+    "named": true,
+    "fields": {}
+  },
  {
    "type": "quasi_quoting_lit",
    "named": true,
@ -198,11 +275,11 @@
      "required": false,
      "types": [
        {
-          "type": "comment",
+          "type": "block_comment",
          "named": true
        },
        {
-          "type": "comment_multiline",
+          "type": "comment",
          "named": true
        }
      ]
@ -282,11 +359,11 @@
      "required": false,
      "types": [
        {
-          "type": "comment",
+          "type": "block_comment",
          "named": true
        },
        {
-          "type": "comment_multiline",
+          "type": "comment",
          "named": true
        }
      ]
@ -300,6 +377,10 @@
      "multiple": true,
      "required": false,
      "types": [
+        {
+          "type": "block_comment",
+          "named": true
+        },
        {
          "type": "bool_lit",
          "named": true
@ -312,10 +393,6 @@
          "type": "comment",
          "named": true
        },
-        {
-          "type": "comment_multiline",
-          "named": true
-        },
        {
          "type": "kwd_lit",
          "named": true
@ -359,6 +436,21 @@
      ]
    }
  },
+  {
+    "type": "str_lit",
+    "named": true,
+    "fields": {},
+    "children": {
+      "multiple": true,
+      "required": false,
+      "types": [
+        {
+          "type": "format_specifier",
+          "named": true
+        }
+      ]
+    }
+  },
  {
    "type": "sym_lit",
    "named": true,
@ -449,11 +541,11 @@
      "required": false,
      "types": [
        {
-          "type": "comment",
+          "type": "block_comment",
          "named": true
        },
        {
-          "type": "comment_multiline",
+          "type": "comment",
          "named": true
        }
      ]
@ -533,18 +625,38 @@
      "required": false,
      "types": [
        {
-          "type": "comment",
+          "type": "block_comment",
          "named": true
        },
        {
-          "type": "comment_multiline",
+          "type": "comment",
          "named": true
        }
      ]
    }
  },
  {
-    "type": "#|",
+    "type": "\n",
+    "named": false
+  },
+  {
+    "type": "\r",
+    "named": false
+  },
+  {
+    "type": "\"",
+    "named": false
+  },
+  {
+    "type": "#",
+    "named": false
+  },
+  {
+    "type": "%",
+    "named": false
+  },
+  {
+    "type": "&",
    "named": false
  },
  {
@ -559,6 +671,10 @@
    "type": ")",
    "named": false
  },
+  {
+    "type": "*",
+    "named": false
+  },
  {
    "type": ",",
    "named": false
@ -571,10 +687,46 @@
    "type": ":",
    "named": false
  },
+  {
+    "type": ":@",
+    "named": false
+  },
+  {
+    "type": ";",
+    "named": false
+  },
+  {
+    "type": "?",
+    "named": false
+  },
+  {
+    "type": "@",
+    "named": false
+  },
+  {
+    "type": "@:",
+    "named": false
+  },
+  {
+    "type": "Newline",
+    "named": false
+  },
+  {
+    "type": "V",
+    "named": false
+  },
+  {
+    "type": "_",
+    "named": false
+  },
  {
    "type": "`",
    "named": false
  },
+  {
+    "type": "block_comment",
+    "named": true
+  },
  {
    "type": "bool_lit",
    "named": true
@ -588,27 +740,27 @@
    "named": true
  },
  {
-    "type": "kwd_name",
+    "type": "kwd_lit",
    "named": true
  },
  {
    "type": "null_lit",
    "named": true
  },
-  {
-    "type": "num_lit",
-    "named": true
-  },
-  {
-    "type": "str_lit",
-    "named": true
-  },
  {
    "type": "sym_name",
    "named": true
  },
  {
-    "type": "|#",
+    "type": "v",
+    "named": false
+  },
+  {
+    "type": "|",
+    "named": false
+  },
+  {
+    "type": "~",
    "named": false
  }
 ]
--- a/third-party/tree-sitter/tree-sitter-opengoal/parser.c
+++ b/third-party/tree-sitter/tree-sitter-opengoal/parser.c