feat: add external scanner, template literals, and Rune-specific constructs

- Remove raw string handling from scanner, make scanner stateless
- Add template_content external token for backtick template scanning
- Add template_literal and interpolation grammar rules
- Add select_expression with select_arm for async select blocks
- Add object_literal (#{ key: value }) syntax
- Add is_expression and is_not_expression type check operators
- Verify yield_expression works as expected
This commit is contained in:
2026-03-27 10:19:00 +01:00
parent fbbe1be791
commit c4d53c9ea7
5 changed files with 28936 additions and 24428 deletions

View File

@@ -35,14 +35,12 @@ module.exports = grammar({
externals: $ => [
$.string_content,
$._raw_string_literal_start,
$.raw_string_literal_content,
$._raw_string_literal_end,
$.float_literal,
$._outer_block_doc_comment_marker,
$._inner_block_doc_comment_marker,
$._block_comment_content,
$._line_doc_content,
$.template_content,
$._error_sentinel,
],
@@ -299,6 +297,9 @@ module.exports = grammar({
$.self,
$.scoped_identifier,
prec(1, $.macro_invocation),
$.object_literal,
$.is_expression,
$.is_not_expression,
),
_expression: $ => choice(
@@ -314,6 +315,7 @@ module.exports = grammar({
$.while_expression,
$.loop_expression,
$.for_expression,
$.select_expression,
),
macro_invocation: $ => seq(
@@ -604,6 +606,67 @@ module.exports = grammar({
'}',
),
// Section - Rune-specific constructs
// Backtick-delimited template literal. Between the two backticks the body is
// any sequence (possibly empty, because of repeat) of:
//   - template_content: raw text, tokenized by the external scanner
//   - escape_sequence:  backslash escapes, handled by the regular lexer rule
//   - interpolation:    an embedded `${ ... }` expression
template_literal: $ => seq(
'`',
repeat(choice(
$.template_content,
$.escape_sequence,
$.interpolation,
)),
'`',
),
// `${ expression }` segment used inside template_literal. The embedded
// expression is exposed to consumers through the `expression` field.
interpolation: $ => seq(
'${',
field('expression', $._expression),
'}',
),
// `select { arm, arm, ... }` block. The braces may be empty; arms are
// separated by commas and a single trailing comma is accepted.
select_expression: $ => seq(
'select',
'{',
sepBy(',', $.select_arm),
optional(','),
'}',
),
// One arm of a select block, written `pattern = value => body`.
//   - pattern: pattern the result of `value` is bound to
//   - value:   the expression being selected on; ranges are excluded here
//              (it uses _expression_except_range)
//   - body:    expression evaluated for this arm
// Declared right-associative with the default precedence.
// NOTE(review): only the `pattern = value => body` form is matched; if Rune
// also permits a `default => body` arm it is not covered by this rule --
// confirm against the language reference.
select_arm: $ => prec.right(seq(
field('pattern', $._pattern),
'=',
field('value', $._expression_except_range),
'=>',
field('body', $._expression),
)),
// Object literal, written `#{ key: value, ... }`. The entry list may be empty
// and a single trailing comma is accepted. Note that '#' and '{' are declared
// as two separate tokens rather than one '#{' token.
object_literal: $ => seq(
'#',
'{',
sepBy(',', $.object_entry),
optional(','),
'}',
),
// Single `key: value` pair inside an object_literal. The key must be a plain
// identifier; the value is any expression.
// NOTE(review): string-literal keys and shorthand entries without `: value`
// do not parse with this rule -- confirm whether Rune source needs them.
object_entry: $ => seq(
field('key', $.identifier),
':',
field('value', $._expression),
),
// Type-check operator, written `left is Type`. Left-associative at the same
// precedence level as the comparison operators (PREC.comparative). The right
// operand is restricted to an identifier or a scoped identifier.
is_expression: $ => prec.left(PREC.comparative, seq(
field('left', $._expression),
'is',
field('right', choice($.identifier, $.scoped_identifier)),
)),
// Negated type-check operator, written `left is not Type`. Same associativity,
// precedence and operand restrictions as the plain `is` form; 'is' and 'not'
// are matched as two consecutive keyword tokens.
is_not_expression: $ => prec.left(PREC.comparative, seq(
field('left', $._expression),
'is',
'not',
field('right', choice($.identifier, $.scoped_identifier)),
)),
// Section - Patterns
_pattern: $ => choice(
@@ -713,6 +776,7 @@ module.exports = grammar({
$.boolean_literal,
$.integer_literal,
$.float_literal,
$.template_literal,
),
_literal_pattern: $ => choice(

View File

@@ -1289,6 +1289,18 @@
"type": "SYMBOL",
"name": "macro_invocation"
}
},
{
"type": "SYMBOL",
"name": "object_literal"
},
{
"type": "SYMBOL",
"name": "is_expression"
},
{
"type": "SYMBOL",
"name": "is_not_expression"
}
]
},
@@ -1335,6 +1347,10 @@
{
"type": "SYMBOL",
"name": "for_expression"
},
{
"type": "SYMBOL",
"name": "select_expression"
}
]
},
@@ -3310,6 +3326,330 @@
}
]
},
"template_literal": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "`"
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "template_content"
},
{
"type": "SYMBOL",
"name": "escape_sequence"
},
{
"type": "SYMBOL",
"name": "interpolation"
}
]
}
},
{
"type": "STRING",
"value": "`"
}
]
},
"interpolation": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "${"
},
{
"type": "FIELD",
"name": "expression",
"content": {
"type": "SYMBOL",
"name": "_expression"
}
},
{
"type": "STRING",
"value": "}"
}
]
},
"select_expression": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "select"
},
{
"type": "STRING",
"value": "{"
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "select_arm"
},
{
"type": "REPEAT",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": ","
},
{
"type": "SYMBOL",
"name": "select_arm"
}
]
}
}
]
},
{
"type": "BLANK"
}
]
},
{
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": ","
},
{
"type": "BLANK"
}
]
},
{
"type": "STRING",
"value": "}"
}
]
},
"select_arm": {
"type": "PREC_RIGHT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "pattern",
"content": {
"type": "SYMBOL",
"name": "_pattern"
}
},
{
"type": "STRING",
"value": "="
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_expression_except_range"
}
},
{
"type": "STRING",
"value": "=>"
},
{
"type": "FIELD",
"name": "body",
"content": {
"type": "SYMBOL",
"name": "_expression"
}
}
]
}
},
"object_literal": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#"
},
{
"type": "STRING",
"value": "{"
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "object_entry"
},
{
"type": "REPEAT",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": ","
},
{
"type": "SYMBOL",
"name": "object_entry"
}
]
}
}
]
},
{
"type": "BLANK"
}
]
},
{
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": ","
},
{
"type": "BLANK"
}
]
},
{
"type": "STRING",
"value": "}"
}
]
},
"object_entry": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "key",
"content": {
"type": "SYMBOL",
"name": "identifier"
}
},
{
"type": "STRING",
"value": ":"
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_expression"
}
}
]
},
"is_expression": {
"type": "PREC_LEFT",
"value": 4,
"content": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "left",
"content": {
"type": "SYMBOL",
"name": "_expression"
}
},
{
"type": "STRING",
"value": "is"
},
{
"type": "FIELD",
"name": "right",
"content": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "identifier"
},
{
"type": "SYMBOL",
"name": "scoped_identifier"
}
]
}
}
]
}
},
"is_not_expression": {
"type": "PREC_LEFT",
"value": 4,
"content": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "left",
"content": {
"type": "SYMBOL",
"name": "_expression"
}
},
{
"type": "STRING",
"value": "is"
},
{
"type": "STRING",
"value": "not"
},
{
"type": "FIELD",
"name": "right",
"content": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "identifier"
},
{
"type": "SYMBOL",
"name": "scoped_identifier"
}
]
}
}
]
}
},
"_pattern": {
"type": "CHOICE",
"members": [
@@ -3900,6 +4240,10 @@
{
"type": "SYMBOL",
"name": "float_literal"
},
{
"type": "SYMBOL",
"name": "template_literal"
}
]
},
@@ -4464,18 +4808,6 @@
"type": "SYMBOL",
"name": "string_content"
},
{
"type": "SYMBOL",
"name": "_raw_string_literal_start"
},
{
"type": "SYMBOL",
"name": "raw_string_literal_content"
},
{
"type": "SYMBOL",
"name": "_raw_string_literal_end"
},
{
"type": "SYMBOL",
"name": "float_literal"
@@ -4496,6 +4828,10 @@
"type": "SYMBOL",
"name": "_line_doc_content"
},
{
"type": "SYMBOL",
"name": "template_content"
},
{
"type": "SYMBOL",
"name": "_error_sentinel"

View File

@@ -117,6 +117,14 @@
"type": "index_expression",
"named": true
},
{
"type": "is_expression",
"named": true
},
{
"type": "is_not_expression",
"named": true
},
{
"type": "loop_expression",
"named": true
@@ -129,6 +137,10 @@
"type": "match_expression",
"named": true
},
{
"type": "object_literal",
"named": true
},
{
"type": "parenthesized_expression",
"named": true
@@ -194,6 +206,10 @@
{
"type": "string_literal",
"named": true
},
{
"type": "template_literal",
"named": true
}
]
},
@@ -728,6 +744,14 @@
"type": "index_expression",
"named": true
},
{
"type": "is_expression",
"named": true
},
{
"type": "is_not_expression",
"named": true
},
{
"type": "loop_expression",
"named": true
@@ -740,6 +764,10 @@
"type": "match_expression",
"named": true
},
{
"type": "object_literal",
"named": true
},
{
"type": "parenthesized_expression",
"named": true
@@ -1058,6 +1086,10 @@
{
"type": "_expression",
"named": true
},
{
"type": "select_expression",
"named": true
}
]
}
@@ -1365,6 +1397,82 @@
"named": true,
"fields": {}
},
{
"type": "interpolation",
"named": true,
"fields": {
"expression": {
"multiple": false,
"required": true,
"types": [
{
"type": "_expression",
"named": true
}
]
}
}
},
{
"type": "is_expression",
"named": true,
"fields": {
"left": {
"multiple": false,
"required": true,
"types": [
{
"type": "_expression",
"named": true
}
]
},
"right": {
"multiple": false,
"required": true,
"types": [
{
"type": "identifier",
"named": true
},
{
"type": "scoped_identifier",
"named": true
}
]
}
}
},
{
"type": "is_not_expression",
"named": true,
"fields": {
"left": {
"multiple": false,
"required": true,
"types": [
{
"type": "_expression",
"named": true
}
]
},
"right": {
"multiple": false,
"required": true,
"types": [
{
"type": "identifier",
"named": true
},
{
"type": "scoped_identifier",
"named": true
}
]
}
}
},
{
"type": "let_chain",
"named": true,
@@ -1540,6 +1648,10 @@
{
"type": "_expression",
"named": true
},
{
"type": "select_expression",
"named": true
}
]
}
@@ -1689,6 +1801,47 @@
]
}
},
{
"type": "object_entry",
"named": true,
"fields": {
"key": {
"multiple": false,
"required": true,
"types": [
{
"type": "identifier",
"named": true
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "_expression",
"named": true
}
]
}
}
},
{
"type": "object_literal",
"named": true,
"fields": {},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "object_entry",
"named": true
}
]
}
},
{
"type": "or_pattern",
"named": true,
@@ -1991,6 +2144,181 @@
}
}
},
{
"type": "select_arm",
"named": true,
"fields": {
"body": {
"multiple": false,
"required": true,
"types": [
{
"type": "_expression",
"named": true
}
]
},
"pattern": {
"multiple": false,
"required": true,
"types": [
{
"type": "_pattern",
"named": true
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "_literal",
"named": true
},
{
"type": "array_expression",
"named": true
},
{
"type": "assignment_expression",
"named": true
},
{
"type": "async_block",
"named": true
},
{
"type": "await_expression",
"named": true
},
{
"type": "binary_expression",
"named": true
},
{
"type": "block",
"named": true
},
{
"type": "break_expression",
"named": true
},
{
"type": "call_expression",
"named": true
},
{
"type": "closure_expression",
"named": true
},
{
"type": "compound_assignment_expr",
"named": true
},
{
"type": "continue_expression",
"named": true
},
{
"type": "field_expression",
"named": true
},
{
"type": "for_expression",
"named": true
},
{
"type": "identifier",
"named": true
},
{
"type": "if_expression",
"named": true
},
{
"type": "index_expression",
"named": true
},
{
"type": "is_expression",
"named": true
},
{
"type": "is_not_expression",
"named": true
},
{
"type": "loop_expression",
"named": true
},
{
"type": "macro_invocation",
"named": true
},
{
"type": "match_expression",
"named": true
},
{
"type": "object_literal",
"named": true
},
{
"type": "parenthesized_expression",
"named": true
},
{
"type": "return_expression",
"named": true
},
{
"type": "scoped_identifier",
"named": true
},
{
"type": "self",
"named": true
},
{
"type": "struct_expression",
"named": true
},
{
"type": "tuple_expression",
"named": true
},
{
"type": "unary_expression",
"named": true
},
{
"type": "while_expression",
"named": true
},
{
"type": "yield_expression",
"named": true
}
]
}
}
},
{
"type": "select_expression",
"named": true,
"fields": {},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "select_arm",
"named": true
}
]
}
},
{
"type": "shorthand_field_initializer",
"named": true,
@@ -2168,6 +2496,29 @@
]
}
},
{
"type": "template_literal",
"named": true,
"fields": {},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "escape_sequence",
"named": true
},
{
"type": "interpolation",
"named": true
},
{
"type": "template_content",
"named": true
}
]
}
},
{
"type": "tuple_expression",
"named": true,
@@ -2525,6 +2876,10 @@
"type": "#",
"named": false
},
{
"type": "${",
"named": false
},
{
"type": "%",
"named": false
@@ -2697,6 +3052,10 @@
"type": "_",
"named": false
},
{
"type": "`",
"named": false
},
{
"type": "as",
"named": false
@@ -2777,6 +3136,10 @@
"type": "integer_literal",
"named": true
},
{
"type": "is",
"named": false
},
{
"type": "let",
"named": false
@@ -2793,6 +3156,10 @@
"type": "mod",
"named": false
},
{
"type": "not",
"named": false
},
{
"type": "pub",
"named": false
@@ -2801,6 +3168,10 @@
"type": "return",
"named": false
},
{
"type": "select",
"named": false
},
{
"type": "self",
"named": true
@@ -2821,6 +3192,10 @@
"type": "super",
"named": true
},
{
"type": "template_content",
"named": true
},
{
"type": "true",
"named": false

52382
src/parser.c

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +1,25 @@
#include "tree_sitter/alloc.h"
#include "tree_sitter/parser.h"
#include <wctype.h>
enum TokenType {
STRING_CONTENT,
RAW_STRING_LITERAL_START,
RAW_STRING_LITERAL_CONTENT,
RAW_STRING_LITERAL_END,
FLOAT_LITERAL,
BLOCK_OUTER_DOC_MARKER,
BLOCK_INNER_DOC_MARKER,
BLOCK_OUTER_DOC_COMMENT_MARKER,
BLOCK_INNER_DOC_COMMENT_MARKER,
BLOCK_COMMENT_CONTENT,
LINE_DOC_CONTENT,
ERROR_SENTINEL
TEMPLATE_CONTENT,
ERROR_SENTINEL,
};
typedef struct {
uint8_t opening_hash_count;
} Scanner;
// Creates the external scanner payload. This variant allocates nothing and
// hands back NULL as the payload pointer.
// Declared with an explicit (void) parameter list so the definition is a real
// prototype and calls with stray arguments are rejected by the compiler.
void *tree_sitter_rune_external_scanner_create(void) { return NULL; }
void *tree_sitter_rune_external_scanner_create() { return ts_calloc(1, sizeof(Scanner)); }
// Releases the external scanner payload. Intentionally a no-op: the payload
// is neither read nor freed.
void tree_sitter_rune_external_scanner_destroy(void *payload) {
    (void)payload;
}
void tree_sitter_rune_external_scanner_destroy(void *payload) { ts_free((Scanner *)payload); }
// Serializes scanner state into `buffer` and returns the number of bytes
// written. Nothing is stored: the buffer is left untouched and the reported
// length is always 0.
unsigned tree_sitter_rune_external_scanner_serialize(void *payload, char *buffer) {
    (void)payload;
    (void)buffer;
    return 0U;
}
unsigned tree_sitter_rune_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = (Scanner *)payload;
buffer[0] = (char)scanner->opening_hash_count;
return 1;
}
void tree_sitter_rune_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = (Scanner *)payload;
scanner->opening_hash_count = 0;
if (length == 1) {
Scanner *scanner = (Scanner *)payload;
scanner->opening_hash_count = buffer[0];
}
}
// Restores scanner state from `buffer` (`length` bytes). Intentionally a
// no-op: none of the arguments are read or modified.
void tree_sitter_rune_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
    (void)payload;
    (void)buffer;
    (void)length;
}
// Reports whether `c` is an underscore or a digit (as classified by
// iswdigit).
static inline bool is_num_char(int32_t c) {
    if (c == '_') {
        return true;
    }
    return iswdigit(c) != 0;
}
@@ -62,63 +44,32 @@ static inline bool process_string(TSLexer *lexer) {
return has_content;
}
static inline bool scan_raw_string_start(Scanner *scanner, TSLexer *lexer) {
if (lexer->lookahead == 'b' || lexer->lookahead == 'c') {
advance(lexer);
}
if (lexer->lookahead != 'r') {
return false;
}
advance(lexer);
uint8_t opening_hash_count = 0;
while (lexer->lookahead == '#') {
advance(lexer);
opening_hash_count++;
}
if (lexer->lookahead != '"') {
return false;
}
advance(lexer);
scanner->opening_hash_count = opening_hash_count;
lexer->result_symbol = RAW_STRING_LITERAL_START;
return true;
}
static inline bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
for (;;) {
if (lexer->eof(lexer)) {
return false;
}
if (lexer->lookahead == '"') {
lexer->mark_end(lexer);
advance(lexer);
unsigned hash_count = 0;
while (lexer->lookahead == '#' && hash_count < scanner->opening_hash_count) {
static bool scan_template_content(TSLexer *lexer) {
lexer->result_symbol = TEMPLATE_CONTENT;
bool has_content = false;
while (true) {
lexer->mark_end(lexer);
switch (lexer->lookahead) {
case '`':
return has_content;
case '\0':
return false;
case '$':
advance(lexer);
hash_count++;
}
if (hash_count == scanner->opening_hash_count) {
lexer->result_symbol = RAW_STRING_LITERAL_CONTENT;
return true;
}
} else {
advance(lexer);
if (lexer->lookahead == '{') {
return has_content;
}
has_content = true;
break;
case '\\':
return has_content;
default:
advance(lexer);
has_content = true;
}
}
}
static inline bool scan_raw_string_end(Scanner *scanner, TSLexer *lexer) {
advance(lexer);
for (unsigned i = 0; i < scanner->opening_hash_count; i++) {
advance(lexer);
}
lexer->result_symbol = RAW_STRING_LITERAL_END;
return true;
}
static inline bool process_float_literal(TSLexer *lexer) {
lexer->result_symbol = FLOAT_LITERAL;
@@ -245,28 +196,19 @@ static inline void process_continuing(BlockCommentProcessing *processing, char c
static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbols) {
char first = (char)lexer->lookahead;
// The first character is stored so we can safely advance inside
// these if blocks. However, because we only store one, we can only
// safely advance 1 time. Since there's a chance that an advance could
// happen in one state, we must advance in all states to ensure that
// the program ends up in a sane state prior to processing the block
// comment if need be.
if (valid_symbols[BLOCK_INNER_DOC_MARKER] && first == '!') {
lexer->result_symbol = BLOCK_INNER_DOC_MARKER;
if (valid_symbols[BLOCK_INNER_DOC_COMMENT_MARKER] && first == '!') {
lexer->result_symbol = BLOCK_INNER_DOC_COMMENT_MARKER;
advance(lexer);
return true;
}
if (valid_symbols[BLOCK_OUTER_DOC_MARKER] && first == '*') {
if (valid_symbols[BLOCK_OUTER_DOC_COMMENT_MARKER] && first == '*') {
advance(lexer);
lexer->mark_end(lexer);
// If the next token is a / that means that it's an empty block comment.
if (lexer->lookahead == '/') {
return false;
}
// If the next token is a * that means that this isn't a BLOCK_OUTER_DOC_MARKER
// as BLOCK_OUTER_DOC_MARKER's only have 2 * not 3 or more.
if (lexer->lookahead != '*') {
lexer->result_symbol = BLOCK_OUTER_DOC_MARKER;
lexer->result_symbol = BLOCK_OUTER_DOC_COMMENT_MARKER;
return true;
}
} else {
@@ -275,13 +217,10 @@ static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbo
if (valid_symbols[BLOCK_COMMENT_CONTENT]) {
BlockCommentProcessing processing = {Continuing, 1};
// Manually set the current state based on the first character
switch (first) {
case '*':
processing.state = LeftAsterisk;
if (lexer->lookahead == '/') {
// This case can happen in an empty doc block comment
// like /*!*/. The comment has no contents, so bail.
return false;
}
break;
@@ -293,17 +232,7 @@ static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbo
break;
}
// For the purposes of actually parsing rust code, this
// is incorrect as it considers an unterminated block comment
// to be an error. However, for the purposes of syntax highlighting
// this should be considered successful as otherwise you are not able
// to syntax highlight a block of code prior to closing the
// block comment
while (!lexer->eof(lexer) && processing.nestingDepth != 0) {
// Set first to the current lookahead as that is the second character
// as we force an advance in the above code when we are checking if we
// need to handle a block comment inner or outer doc comment signifier
// node
first = (char)lexer->lookahead;
switch (processing.state) {
case LeftForwardSlash:
@@ -332,34 +261,19 @@ static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbo
}
bool tree_sitter_rune_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
// The documentation states that if the lexical analysis fails for some reason
// they will mark every state as valid and pass it to the external scanner
// However, we can't do anything to help them recover in that case so we
// should just fail.
/*
link: https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
If a syntax error is encountered during regular parsing, Tree-sitters
first action during error recovery will be to call the external scanners
scan function with all tokens marked valid. The scanner should detect this
case and handle it appropriately. One simple method of detection is to add
an unused token to the end of the externals array, for example
externals: $ => [$.token1, $.token2, $.error_sentinel],
then check whether that token is marked valid to determine whether
Tree-sitter is in error correction mode.
*/
if (valid_symbols[ERROR_SENTINEL]) {
return false;
}
Scanner *scanner = (Scanner *)payload;
if (valid_symbols[BLOCK_COMMENT_CONTENT] || valid_symbols[BLOCK_INNER_DOC_MARKER] ||
valid_symbols[BLOCK_OUTER_DOC_MARKER]) {
if (valid_symbols[BLOCK_COMMENT_CONTENT] || valid_symbols[BLOCK_INNER_DOC_COMMENT_MARKER] ||
valid_symbols[BLOCK_OUTER_DOC_COMMENT_MARKER]) {
return process_block_comment(lexer, valid_symbols);
}
if (valid_symbols[TEMPLATE_CONTENT]) {
return scan_template_content(lexer);
}
if (valid_symbols[STRING_CONTENT] && !valid_symbols[FLOAT_LITERAL]) {
return process_string(lexer);
}
@@ -372,19 +286,6 @@ bool tree_sitter_rune_external_scanner_scan(void *payload, TSLexer *lexer, const
skip(lexer);
}
if (valid_symbols[RAW_STRING_LITERAL_START] &&
(lexer->lookahead == 'r' || lexer->lookahead == 'b' || lexer->lookahead == 'c')) {
return scan_raw_string_start(scanner, lexer);
}
if (valid_symbols[RAW_STRING_LITERAL_CONTENT]) {
return scan_raw_string_content(scanner, lexer);
}
if (valid_symbols[RAW_STRING_LITERAL_END] && lexer->lookahead == '"') {
return scan_raw_string_end(scanner, lexer);
}
if (valid_symbols[FLOAT_LITERAL] && iswdigit(lexer->lookahead)) {
return process_float_literal(lexer);
}