From ac4081f0f9c75cf2d91751b3c387710e6039e2cf Mon Sep 17 00:00:00 2001 From: David Roe Date: Tue, 1 Dec 2020 15:13:19 +0000 Subject: [PATCH] update tree sitter to 0.17.3 --- alloc.h | 48 +- api.h | 14 +- array.h | 143 ++- atomic.h | 18 +- bits.h | 15 +- get_changed_ranges.c | 28 +- language.c | 12 +- language.h | 199 +++- lexer.c | 4 +- node.c | 8 +- parser.c | 259 +++-- parser.h | 53 +- query.c | 2522 ++++++++++++++++++++++++++++++++++-------- reusable_node.h | 29 +- stack.c | 18 +- subtree.c | 252 +++-- subtree.h | 56 +- tree_cursor.c | 212 ++-- tree_cursor.h | 12 +- vendor.sh | 2 +- 20 files changed, 3041 insertions(+), 863 deletions(-) diff --git a/alloc.h b/alloc.h index c8fe6c6e..6e22a0ab 100644 --- a/alloc.h +++ b/alloc.h @@ -17,58 +17,64 @@ void *ts_record_realloc(void *, size_t); void ts_record_free(void *); bool ts_toggle_allocation_recording(bool); -static inline void *ts_malloc(size_t size) { - return ts_record_malloc(size); -} - -static inline void *ts_calloc(size_t count, size_t size) { - return ts_record_calloc(count, size); -} +#define ts_malloc ts_record_malloc +#define ts_calloc ts_record_calloc +#define ts_realloc ts_record_realloc +#define ts_free ts_record_free -static inline void *ts_realloc(void *buffer, size_t size) { - return ts_record_realloc(buffer, size); -} +#else -static inline void ts_free(void *buffer) { - ts_record_free(buffer); -} +// Allow clients to override allocation functions -#else +#ifndef ts_malloc +#define ts_malloc ts_malloc_default +#endif +#ifndef ts_calloc +#define ts_calloc ts_calloc_default +#endif +#ifndef ts_realloc +#define ts_realloc ts_realloc_default +#endif +#ifndef ts_free +#define ts_free ts_free_default +#endif #include static inline bool ts_toggle_allocation_recording(bool value) { + (void)value; return false; } -static inline void *ts_malloc(size_t size) { + +static inline void *ts_malloc_default(size_t size) { void *result = malloc(size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed 
to allocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); exit(1); } return result; } -static inline void *ts_calloc(size_t count, size_t size) { +static inline void *ts_calloc_default(size_t count, size_t size) { void *result = calloc(count, size); if (count > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %lu bytes", count * size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); exit(1); } return result; } -static inline void *ts_realloc(void *buffer, size_t size) { +static inline void *ts_realloc_default(void *buffer, size_t size) { void *result = realloc(buffer, size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to reallocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); exit(1); } return result; } -static inline void ts_free(void *buffer) { +static inline void ts_free_default(void *buffer) { free(buffer); } diff --git a/api.h b/api.h index 9d832e6e..caa05f52 100644 --- a/api.h +++ b/api.h @@ -21,7 +21,7 @@ extern "C" { * The Tree-sitter library is generally backwards-compatible with languages * generated using older CLI versions, but is not forwards-compatible. */ -#define TREE_SITTER_LANGUAGE_VERSION 11 +#define TREE_SITTER_LANGUAGE_VERSION 12 /** * The earliest ABI version that is supported by the current version of the @@ -130,6 +130,7 @@ typedef enum { TSQueryErrorNodeType, TSQueryErrorField, TSQueryErrorCapture, + TSQueryErrorStructure, } TSQueryError; /********************/ @@ -219,8 +220,8 @@ const TSRange *ts_parser_included_ranges( * following three fields: * 1. `read`: A function to retrieve a chunk of text at a given byte offset * and (row, column) position. The function should return a pointer to the - * text and write its length to the the `bytes_read` pointer. 
The parser - * does not take ownership of this buffer; it just borrows it until it has + * text and write its length to the `bytes_read` pointer. The parser does + * not take ownership of this buffer; it just borrows it until it has * finished reading it. The function should write a zero value to the * `bytes_read` pointer to indicate the end of the document. * 2. `payload`: An arbitrary pointer that will be passed to each invocation @@ -718,6 +719,11 @@ const TSQueryPredicateStep *ts_query_predicates_for_pattern( uint32_t *length ); +bool ts_query_step_is_definite( + const TSQuery *self, + uint32_t byte_offset +); + /** * Get the name and length of one of the query's captures, or one of the * query's string literals. Each capture and string is associated with a @@ -759,7 +765,7 @@ void ts_query_disable_pattern(TSQuery *, uint32_t); * to start running a given query on a given syntax node. Then, there are * two options for consuming the results of the query: * 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the - * the *matches* in the order that they were found. Each match contains the + * *matches* in the order that they were found. Each match contains the * index of the pattern that matched, and an array of captures. Because * multiple patterns can match the same set of nodes, one match may contain * captures that appear *before* some of the captures from a previous match. 
diff --git a/array.h b/array.h index bc77e687..5ff5580a 100644 --- a/array.h +++ b/array.h @@ -12,9 +12,9 @@ extern "C" { #include #include "./alloc.h" -#define Array(T) \ - struct { \ - T *contents; \ +#define Array(T) \ + struct { \ + T *contents; \ uint32_t size; \ uint32_t capacity; \ } @@ -37,35 +37,94 @@ extern "C" { #define array_reserve(self, new_capacity) \ array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity) -#define array_erase(self, index) \ - array__erase((VoidArray *)(self), array__elem_size(self), index) - +// Free any memory allocated for this array. #define array_delete(self) array__delete((VoidArray *)self) #define array_push(self, element) \ (array__grow((VoidArray *)(self), 1, array__elem_size(self)), \ (self)->contents[(self)->size++] = (element)) +// Increase the array's size by a given number of elements, reallocating +// if necessary. New elements are zero-initialized. #define array_grow_by(self, count) \ (array__grow((VoidArray *)(self), count, array__elem_size(self)), \ memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \ (self)->size += (count)) #define array_push_all(self, other) \ - array_splice((self), (self)->size, 0, (other)->size, (other)->contents) - -#define array_splice(self, index, old_count, new_count, new_contents) \ - array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \ - new_count, new_contents) - + array_extend((self), (other)->size, (other)->contents) + +// Append `count` elements to the end of the array, reading their values from the +// `contents` pointer. +#define array_extend(self, count, contents) \ + array__splice( \ + (VoidArray *)(self), array__elem_size(self), (self)->size, \ + 0, count, contents \ + ) + +// Remove `old_count` elements from the array starting at the given `index`. At +// the same index, insert `new_count` new elements, reading their values from the +// `new_contents` pointer. 
+#define array_splice(self, index, old_count, new_count, new_contents) \ + array__splice( \ + (VoidArray *)(self), array__elem_size(self), index, \ + old_count, new_count, new_contents \ + ) + +// Insert one `element` into the array at the given `index`. #define array_insert(self, index, element) \ array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &element) +// Remove one `element` from the array at the given `index`. +#define array_erase(self, index) \ + array__erase((VoidArray *)(self), array__elem_size(self), index) + #define array_pop(self) ((self)->contents[--(self)->size]) #define array_assign(self, other) \ array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self)) +#define array_swap(self, other) \ + array__swap((VoidArray *)(self), (VoidArray *)(other)) + +// Search a sorted array for a given `needle` value, using the given `compare` +// callback to determine the order. +// +// If an existing element is found to be equal to `needle`, then the `index` +// out-parameter is set to the existing value's index, and the `exists` +// out-parameter is set to true. Otherwise, `index` is set to an index where +// `needle` should be inserted in order to preserve the sorting, and `exists` +// is set to false. +#define array_search_sorted_with(self, compare, needle, index, exists) \ + array__search_sorted(self, 0, compare, , needle, index, exists) + +// Search a sorted array for a given `needle` value, using integer comparisons +// of a given struct field (specified with a leading dot) to determine the order. +// +// See also `array_search_sorted_with`. +#define array_search_sorted_by(self, field, needle, index, exists) \ + array__search_sorted(self, 0, _compare_int, field, needle, index, exists) + +// Insert a given `value` into a sorted array, using the given `compare` +// callback to determine the order. 
+#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned index, exists; \ + array_search_sorted_with(self, compare, &(value), &index, &exists); \ + if (!exists) array_insert(self, index, value); \ + } while (0) + +// Insert a given `value` into a sorted array, using integer comparisons of +// a given struct field (specified with a leading dot) to determine the order. +// +// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned index, exists; \ + array_search_sorted_by(self, field, (value) field, &index, &exists); \ + if (!exists) array_insert(self, index, value); \ + } while (0) + // Private typedef Array(void) VoidArray; @@ -93,7 +152,7 @@ static inline void array__reserve(VoidArray *self, size_t element_size, uint32_t if (self->contents) { self->contents = ts_realloc(self->contents, new_capacity * element_size); } else { - self->contents = ts_calloc(new_capacity, element_size); + self->contents = ts_malloc(new_capacity * element_size); } self->capacity = new_capacity; } @@ -105,6 +164,12 @@ static inline void array__assign(VoidArray *self, const VoidArray *other, size_t memcpy(self->contents, other->contents, self->size * element_size); } +static inline void array__swap(VoidArray *self, VoidArray *other) { + VoidArray swap = *other; + *other = *self; + *self = swap; +} + static inline void array__grow(VoidArray *self, size_t count, size_t element_size) { size_t new_size = self->size + count; if (new_size > self->capacity) { @@ -126,15 +191,55 @@ static inline void array__splice(VoidArray *self, size_t element_size, array__reserve(self, element_size, new_size); char *contents = (char *)self->contents; - if (self->size > old_end) - memmove(contents + new_end * element_size, contents + old_end * element_size, - (self->size - old_end) * element_size); - if (new_count > 0) - memcpy((contents + index * element_size), elements, - new_count * element_size); + if (self->size > old_end) { + 
memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } self->size += new_count - old_count; } +// A binary search routine, based on Rust's `std::slice::binary_search_by`. +#define array__search_sorted(self, start, compare, suffix, needle, index, exists) \ + do { \ + *(index) = start; \ + *(exists) = false; \ + uint32_t size = (self)->size - *(index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(index)] suffix), (needle)); \ + if (comparison == 0) *(exists) = true; \ + else if (comparison < 0) *(index) += 1; \ + } while (0) + +// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +// parameter by reference in order to work with the generic sorting function above. 
+#define _compare_int(a, b) ((int)*(a) - (int)(b)) + #ifdef __cplusplus } #endif diff --git a/atomic.h b/atomic.h index 7bd0e850..16573242 100644 --- a/atomic.h +++ b/atomic.h @@ -3,7 +3,23 @@ #include -#ifdef _WIN32 +#ifdef __TINYC__ + +static inline size_t atomic_load(const volatile size_t *p) { + return *p; +} + +static inline uint32_t atomic_inc(volatile uint32_t *p) { + *p += 1; + return *p; +} + +static inline uint32_t atomic_dec(volatile uint32_t *p) { + *p-= 1; + return *p; +} + +#elif defined(_WIN32) #include diff --git a/bits.h b/bits.h index ce7a7155..ca8caf30 100644 --- a/bits.h +++ b/bits.h @@ -7,7 +7,20 @@ static inline uint32_t bitmask_for_index(uint16_t id) { return (1u << (31 - id)); } -#if defined _WIN32 && !defined __GNUC__ +#ifdef __TINYC__ + +// Algorithm taken from the Hacker's Delight book +// See also https://graphics.stanford.edu/~seander/bithacks.html +static inline uint32_t count_leading_zeros(uint32_t x) { + int count = 0; + if (x == 0) return 32; + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + count = (((x + (x >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24; + return count; +} + +#elif defined _WIN32 && !defined __GNUC__ #include diff --git a/get_changed_ranges.c b/get_changed_ranges.c index 5bd1d814..b8915544 100644 --- a/get_changed_ranges.c +++ b/get_changed_ranges.c @@ -146,17 +146,21 @@ static bool iterator_tree_is_visible(const Iterator *self) { if (ts_subtree_visible(*entry.subtree)) return true; if (self->cursor.stack.size > 1) { Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree; - const TSSymbol *alias_sequence = ts_language_alias_sequence( + return ts_language_alias_at( self->language, - parent.ptr->production_id - ); - return alias_sequence && alias_sequence[entry.structural_child_index] != 0; + parent.ptr->production_id, + entry.structural_child_index + ) != 0; } return false; } -static void iterator_get_visible_state(const Iterator *self, Subtree *tree, - 
TSSymbol *alias_symbol, uint32_t *start_byte) { +static void iterator_get_visible_state( + const Iterator *self, + Subtree *tree, + TSSymbol *alias_symbol, + uint32_t *start_byte +) { uint32_t i = self->cursor.stack.size - 1; if (self->in_padding) { @@ -169,13 +173,11 @@ static void iterator_get_visible_state(const Iterator *self, Subtree *tree, if (i > 0) { const Subtree *parent = self->cursor.stack.contents[i - 1].subtree; - const TSSymbol *alias_sequence = ts_language_alias_sequence( + *alias_symbol = ts_language_alias_at( self->language, - parent->ptr->production_id + parent->ptr->production_id, + entry.structural_child_index ); - if (alias_sequence) { - *alias_symbol = alias_sequence[entry.structural_child_index]; - } } if (ts_subtree_visible(*entry.subtree) || *alias_symbol) { @@ -203,7 +205,7 @@ static bool iterator_descend(Iterator *self, uint32_t goal_position) { Length position = entry.position; uint32_t structural_child_index = 0; for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) { - const Subtree *child = &entry.subtree->ptr->children[i]; + const Subtree *child = &ts_subtree_children(*entry.subtree)[i]; Length child_left = length_add(position, ts_subtree_padding(*child)); Length child_right = length_add(child_left, ts_subtree_size(*child)); @@ -258,7 +260,7 @@ static void iterator_advance(Iterator *self) { Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); uint32_t structural_child_index = entry.structural_child_index; if (!ts_subtree_extra(*entry.subtree)) structural_child_index++; - const Subtree *next_child = &parent->ptr->children[child_index]; + const Subtree *next_child = &ts_subtree_children(*parent)[child_index]; array_push(&self->cursor.stack, ((TreeCursorEntry){ .subtree = next_child, diff --git a/language.c b/language.c index e240ef2a..9ccf2bc3 100644 --- a/language.c +++ b/language.c @@ -33,8 +33,8 @@ void ts_language_table_entry( assert(symbol < self->token_count); uint32_t 
action_index = ts_language_lookup(self, state, symbol); const TSParseActionEntry *entry = &self->parse_actions[action_index]; - result->action_count = entry->count; - result->is_reusable = entry->reusable; + result->action_count = entry->entry.count; + result->is_reusable = entry->entry.reusable; result->actions = (const TSParseAction *)(entry + 1); } } @@ -72,8 +72,10 @@ const char *ts_language_symbol_name( return "ERROR"; } else if (symbol == ts_builtin_sym_error_repeat) { return "_ERROR"; - } else { + } else if (symbol < ts_language_symbol_count(self)) { return self->symbol_names[symbol]; + } else { + return NULL; } } @@ -87,7 +89,7 @@ TSSymbol ts_language_symbol_for_name( uint32_t count = ts_language_symbol_count(self); for (TSSymbol i = 0; i < count; i++) { TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); - if (!metadata.visible || metadata.named != is_named) continue; + if ((!metadata.visible && !metadata.supertype) || metadata.named != is_named) continue; const char *symbol_name = self->symbol_names[i]; if (!strncmp(symbol_name, string, length) && !symbol_name[length]) { if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { @@ -119,7 +121,7 @@ const char *ts_language_field_name_for_id( TSFieldId id ) { uint32_t count = ts_language_field_count(self); - if (count) { + if (count && id <= count) { return self->field_names[id]; } else { return NULL; diff --git a/language.h b/language.h index c72f5da8..984bd7ed 100644 --- a/language.h +++ b/language.h @@ -12,6 +12,8 @@ extern "C" { #define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10 #define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11 #define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11 +#define TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT 12 +#define TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP 12 typedef struct { const TSParseAction *actions; @@ -19,6 +21,22 @@ typedef struct { bool is_reusable; } TableEntry; +typedef struct { + const TSLanguage *language; + 
const uint16_t *data; + const uint16_t *group_end; + TSStateId state; + uint16_t table_value; + uint16_t section_index; + uint16_t group_count; + bool is_small_state; + + const TSParseAction *actions; + TSSymbol symbol; + TSStateId next_state; + uint16_t action_count; +} LookaheadIterator; + void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); @@ -29,32 +47,45 @@ static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymb return 0 < symbol && symbol < self->external_token_count + 1; } -static inline const TSParseAction *ts_language_actions(const TSLanguage *self, - TSStateId state, - TSSymbol symbol, - uint32_t *count) { +static inline const TSParseAction *ts_language_actions( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol, + uint32_t *count +) { TableEntry entry; ts_language_table_entry(self, state, symbol, &entry); *count = entry.action_count; return entry.actions; } -static inline bool ts_language_has_actions(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { +static inline bool ts_language_has_actions( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { TableEntry entry; ts_language_table_entry(self, state, symbol, &entry); return entry.action_count > 0; } -static inline bool ts_language_has_reduce_action(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { +static inline bool ts_language_has_reduce_action( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { TableEntry entry; ts_language_table_entry(self, state, symbol, &entry); return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; } +// Lookup the table value for a given symbol and state. +// +// For non-terminal symbols, the table value represents a successor state. +// For terminal symbols, it represents an index in the actions table. 
+// For 'large' parse states, this is a direct lookup. For 'small' parse +// states, this requires searching through the symbol groups to find +// the given symbol. static inline uint16_t ts_language_lookup( const TSLanguage *self, TSStateId state, @@ -66,8 +97,8 @@ static inline uint16_t ts_language_lookup( ) { uint32_t index = self->small_parse_table_map[state - self->large_state_count]; const uint16_t *data = &self->small_parse_table[index]; - uint16_t section_count = *(data++); - for (unsigned i = 0; i < section_count; i++) { + uint16_t group_count = *(data++); + for (unsigned i = 0; i < group_count; i++) { uint16_t section_value = *(data++); uint16_t symbol_count = *(data++); for (unsigned i = 0; i < symbol_count; i++) { @@ -80,9 +111,90 @@ static inline uint16_t ts_language_lookup( } } -static inline TSStateId ts_language_next_state(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { +// Iterate over all of the symbols that are valid in the given state. +// +// For 'large' parse states, this just requires iterating through +// all possible symbols and checking the parse table for each one. +// For 'small' parse states, this exploits the structure of the +// table to only visit the valid symbols. 
+static inline LookaheadIterator ts_language_lookaheads( + const TSLanguage *self, + TSStateId state +) { + bool is_small_state = + self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES && + state >= self->large_state_count; + const uint16_t *data; + const uint16_t *group_end = NULL; + uint16_t group_count = 0; + if (is_small_state) { + uint32_t index = self->small_parse_table_map[state - self->large_state_count]; + data = &self->small_parse_table[index]; + group_end = data + 1; + group_count = *data; + } else { + data = &self->parse_table[state * self->symbol_count] - 1; + } + return (LookaheadIterator) { + .language = self, + .data = data, + .group_end = group_end, + .group_count = group_count, + .is_small_state = is_small_state, + .symbol = UINT16_MAX, + .next_state = 0, + }; +} + +static inline bool ts_lookahead_iterator_next(LookaheadIterator *self) { + // For small parse states, valid symbols are listed explicitly, + // grouped by their value. There's no need to look up the actions + // again until moving to the next group. + if (self->is_small_state) { + self->data++; + if (self->data == self->group_end) { + if (self->group_count == 0) return false; + self->group_count--; + self->table_value = *(self->data++); + unsigned symbol_count = *(self->data++); + self->group_end = self->data + symbol_count; + self->symbol = *self->data; + } else { + self->symbol = *self->data; + return true; + } + } + + // For large parse states, iterate through every symbol until one + // is found that has valid actions. + else { + do { + self->data++; + self->symbol++; + if (self->symbol >= self->language->symbol_count) return false; + self->table_value = *self->data; + } while (!self->table_value); + } + + // Depending on if the symbols is terminal or non-terminal, the table value either + // represents a list of actions or a successor state. 
+ if (self->symbol < self->language->token_count) { + const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value]; + self->action_count = entry->entry.count; + self->actions = (const TSParseAction *)(entry + 1); + self->next_state = 0; + } else { + self->action_count = 0; + self->next_state = self->table_value; + } + return true; +} + +static inline TSStateId ts_language_next_state( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { return 0; } else if (symbol < self->token_count) { @@ -90,8 +202,8 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self, const TSParseAction *actions = ts_language_actions(self, state, symbol, &count); if (count > 0) { TSParseAction action = actions[count - 1]; - if (action.type == TSParseActionTypeShift || action.type == TSParseActionTypeRecover) { - return action.params.state; + if (action.type == TSParseActionTypeShift) { + return action.params.shift.extra ? state : action.params.shift.state; } } return 0; @@ -100,9 +212,10 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self, } } -static inline const bool * -ts_language_enabled_external_tokens(const TSLanguage *self, - unsigned external_scanner_state) { +static inline const bool *ts_language_enabled_external_tokens( + const TSLanguage *self, + unsigned external_scanner_state +) { if (external_scanner_state == 0) { return NULL; } else { @@ -110,13 +223,25 @@ ts_language_enabled_external_tokens(const TSLanguage *self, } } -static inline const TSSymbol * -ts_language_alias_sequence(const TSLanguage *self, uint32_t production_id) { - return production_id > 0 ? - self->alias_sequences + production_id * self->max_alias_sequence_length : +static inline const TSSymbol *ts_language_alias_sequence( + const TSLanguage *self, + uint32_t production_id +) { + return production_id ? 
+ &self->alias_sequences[production_id * self->max_alias_sequence_length] : NULL; } +static inline TSSymbol ts_language_alias_at( + const TSLanguage *self, + uint32_t production_id, + uint32_t child_index +) { + return production_id ? + self->alias_sequences[production_id * self->max_alias_sequence_length + child_index] : + 0; +} + static inline void ts_language_field_map( const TSLanguage *self, uint32_t production_id, @@ -134,6 +259,32 @@ static inline void ts_language_field_map( *end = &self->field_map_entries[slice.index] + slice.length; } +static inline void ts_language_aliases_for_symbol( + const TSLanguage *self, + TSSymbol original_symbol, + const TSSymbol **start, + const TSSymbol **end +) { + *start = &self->public_symbol_map[original_symbol]; + *end = *start + 1; + + if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP) return; + + unsigned i = 0; + for (;;) { + TSSymbol symbol = self->alias_map[i++]; + if (symbol == 0 || symbol > original_symbol) break; + uint16_t count = self->alias_map[i++]; + if (symbol == original_symbol) { + *start = &self->alias_map[i]; + *end = &self->alias_map[i + count]; + break; + } + i += count; + } +} + + #ifdef __cplusplus } #endif diff --git a/lexer.c b/lexer.c index 3f8a4c0a..08e90a8c 100644 --- a/lexer.c +++ b/lexer.c @@ -73,7 +73,6 @@ static void ts_lexer__get_chunk(Lexer *self) { // code that spans the current position. static void ts_lexer__get_lookahead(Lexer *self) { uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start; - const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; uint32_t size = self->chunk_size - position_in_chunk; if (size == 0) { @@ -82,6 +81,7 @@ static void ts_lexer__get_lookahead(Lexer *self) { return; } + const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 ? 
ts_decode_utf8 : ts_decode_utf16; @@ -203,7 +203,7 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { // Is the lexer at a boundary between two disjoint included ranges of // source code? This is exposed as an API because some languages' external -// scanners need to perform custom actions at these bounaries. +// scanners need to perform custom actions at these boundaries. static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { const Lexer *self = (const Lexer *)_self; if (self->current_included_range_index < self->included_range_count) { diff --git a/node.c b/node.c index b03e2fc9..9ce0f0b3 100644 --- a/node.c +++ b/node.c @@ -79,7 +79,7 @@ static inline bool ts_node_child_iterator_next( TSNode *result ) { if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false; - const Subtree *child = &self->parent.ptr->children[self->child_index]; + const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; TSSymbol alias_symbol = 0; if (!ts_subtree_extra(*child)) { if (self->alias_sequence) { @@ -150,7 +150,9 @@ static inline TSNode ts_node__child( while (ts_node_child_iterator_next(&iterator, &child)) { if (ts_node__is_relevant(child, include_anonymous)) { if (index == child_index) { - ts_tree_set_cached_parent(self.tree, &child, &self); + if (ts_node__is_relevant(self, true)) { + ts_tree_set_cached_parent(self.tree, &child, &self); + } return child; } index++; @@ -176,7 +178,7 @@ static bool ts_subtree_has_trailing_empty_descendant( Subtree other ) { for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) { - Subtree child = self.ptr->children[i]; + Subtree child = ts_subtree_children(self)[i]; if (ts_subtree_total_bytes(child) > 0) break; if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) { return true; diff --git a/parser.c b/parser.c index e10e83d6..e8955682 100644 --- a/parser.c +++ b/parser.c @@ -60,8 +60,9 @@ struct TSParser { const TSLanguage *language; 
ReduceActionSet reduce_actions; Subtree finished_tree; - SubtreeHeapData scratch_tree_data; - MutableSubtree scratch_tree; + SubtreeArray trailing_extras; + SubtreeArray trailing_extras2; + SubtreeArray scratch_trees; TokenCache token_cache; ReusableNode reusable_node; void *external_scanner_payload; @@ -101,9 +102,10 @@ typedef struct { static const char *ts_string_input_read( void *_self, uint32_t byte, - TSPoint _, + TSPoint pt, uint32_t *length ) { + (void)pt; TSStringInput *self = (TSStringInput *)_self; if (byte >= self->length) { *length = 0; @@ -154,7 +156,7 @@ static bool ts_parser__breakdown_top_of_stack( Subtree parent = *array_front(&slice.subtrees); for (uint32_t j = 0, n = ts_subtree_child_count(parent); j < n; j++) { - Subtree child = parent.ptr->children[j]; + Subtree child = ts_subtree_children(parent)[j]; pending = ts_subtree_child_count(child) > 0; if (ts_subtree_is_error(child)) { @@ -210,6 +212,7 @@ static ErrorComparison ts_parser__compare_versions( ErrorStatus a, ErrorStatus b ) { + (void)self; if (!a.is_in_error && b.is_in_error) { if (a.cost < b.cost) { return ErrorComparisonTakeLeft; @@ -290,6 +293,7 @@ static bool ts_parser__better_version_exists( return true; case ErrorComparisonPreferRight: if (ts_stack_can_merge(self->stack, i, version)) return true; + break; default: break; } @@ -324,6 +328,12 @@ static bool ts_parser__can_reuse_first_leaf( TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; + // At the end of a non-terminal extra node, the lexer normally returns + // NULL, which indicates that the parser should look for a reduce action + // at symbol `0`. Avoid reusing tokens in this situation to ensure that + // the same thing happens when incrementally reparsing. + if (current_lex_mode.lex_state == (uint16_t)(-1)) return false; + // If the token was created in a state with the same set of lookaheads, it is reusable. 
if ( table_entry->action_count > 0 && @@ -347,10 +357,14 @@ static Subtree ts_parser__lex( StackVersion version, TSStateId parse_state ) { + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + if (lex_mode.lex_state == (uint16_t)-1) { + LOG("no_lookahead_after_non_terminal_extra"); + return NULL_SUBTREE; + } + Length start_position = ts_stack_position(self->stack, version); Subtree external_token = ts_stack_last_external_token(self->stack, version); - TSLexMode lex_mode = self->language->lex_modes[parse_state]; - if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE; const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state @@ -593,6 +607,10 @@ static Subtree ts_parser__reuse_node( uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node); uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result); + // Do not reuse an EOF node if the included ranges array has changes + // later on in the file. + if (ts_subtree_is_eof(result)) end_byte_offset = UINT32_MAX; + if (byte_offset > position) { LOG("before_reusable_node symbol:%s", TREE_NAME(result)); break; @@ -655,6 +673,10 @@ static Subtree ts_parser__reuse_node( return NULL_SUBTREE; } +// Determine if a given tree should be replaced by an alternative tree. +// +// The decision is based on the trees' error costs (if any), their dynamic precedence, +// and finally, as a default, by a recursive comparison of the trees' symbols. static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) { if (!left.ptr) return true; if (!right.ptr) return false; @@ -700,6 +722,33 @@ static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) } } +// Determine if a given tree's children should be replaced by an alternative +// array of children. 
+static bool ts_parser__select_children( + TSParser *self, + Subtree left, + const SubtreeArray *children +) { + array_assign(&self->scratch_trees, children); + + // Create a temporary subtree using the scratch trees array. This node does + // not perform any allocation except for possibly growing the array to make + // room for its own heap data. The scratch tree is never explicitly released, + // so the same 'scratch trees' array can be reused again later. + MutableSubtree scratch_tree = ts_subtree_new_node( + ts_subtree_symbol(left), + &self->scratch_trees, + 0, + self->language + ); + + return ts_parser__select_tree( + self, + left, + ts_subtree_from_mut(scratch_tree) + ); +} + static void ts_parser__shift( TSParser *self, StackVersion version, @@ -725,22 +774,6 @@ static void ts_parser__shift( } } -static bool ts_parser__replace_children( - TSParser *self, - MutableSubtree *tree, - SubtreeArray *children -) { - *self->scratch_tree.ptr = *tree->ptr; - self->scratch_tree.ptr->child_count = 0; - ts_subtree_set_children(self->scratch_tree, children->contents, children->size, self->language); - if (ts_parser__select_tree(self, ts_subtree_from_mut(*tree), ts_subtree_from_mut(self->scratch_tree))) { - *tree->ptr = *self->scratch_tree.ptr; - return true; - } else { - return false; - } -} - static StackVersion ts_parser__reduce( TSParser *self, StackVersion version, @@ -749,20 +782,26 @@ static StackVersion ts_parser__reduce( int dynamic_precedence, uint16_t production_id, bool is_fragile, - bool is_extra + bool end_of_non_terminal_extra ) { uint32_t initial_version_count = ts_stack_version_count(self->stack); - uint32_t removed_version_count = 0; - StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + // Pop the given number of nodes from the given version of the parse stack. + // If stack versions have previously merged, then there may be more than one + // path back through the stack. 
For each path, create a new parent node to + // contain the popped children, and push it onto the stack in place of the + // children. + StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + uint32_t removed_version_count = 0; for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; - // Error recovery can sometimes cause lots of stack versions to merge, - // such that a single pop operation can produce a lots of slices. - // Avoid creating too many stack versions in that situation. - if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + // This is where new versions are added to the parse stack. The versions + // will all be sorted and truncated at the end of the outer parsing loop. + // Allow the maximum version count to be temporarily exceeded, but only + // by a limited threshold. + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; @@ -779,11 +818,9 @@ static StackVersion ts_parser__reduce( // node. They will be re-pushed onto the stack after the parent node is // created and pushed. 
SubtreeArray children = slice.subtrees; - while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { - children.size--; - } + ts_subtree_array_remove_trailing_extras(&children, &self->trailing_extras); - MutableSubtree parent = ts_subtree_new_node(&self->tree_pool, + MutableSubtree parent = ts_subtree_new_node( symbol, &children, production_id, self->language ); @@ -797,24 +834,30 @@ static StackVersion ts_parser__reduce( i++; SubtreeArray children = next_slice.subtrees; - while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { - children.size--; - } + ts_subtree_array_remove_trailing_extras(&children, &self->trailing_extras2); - if (ts_parser__replace_children(self, &parent, &children)) { - ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); - slice = next_slice; + if (ts_parser__select_children( + self, + ts_subtree_from_mut(parent), + &children + )) { + ts_subtree_array_clear(&self->tree_pool, &self->trailing_extras); + ts_subtree_release(&self->tree_pool, ts_subtree_from_mut(parent)); + array_swap(&self->trailing_extras, &self->trailing_extras2); + parent = ts_subtree_new_node( + symbol, &children, production_id, self->language + ); } else { + array_clear(&self->trailing_extras2); ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); } } - parent.ptr->dynamic_precedence += dynamic_precedence; - parent.ptr->production_id = production_id; - TSStateId state = ts_stack_state(self->stack, slice_version); TSStateId next_state = ts_language_next_state(self->language, state, symbol); - if (is_extra) parent.ptr->extra = true; + if (end_of_non_terminal_extra && next_state == state) { + parent.ptr->extra = true; + } if (is_fragile || pop.size > 1 || initial_version_count > 1) { parent.ptr->fragile_left = true; parent.ptr->fragile_right = true; @@ -822,12 +865,13 @@ static StackVersion ts_parser__reduce( } else { parent.ptr->parse_state = state; } + parent.ptr->dynamic_precedence += 
dynamic_precedence; // Push the parent node onto the stack, along with any extra tokens that // were previously on top of the stack. ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), false, next_state); - for (uint32_t j = parent.ptr->child_count; j < slice.subtrees.size; j++) { - ts_stack_push(self->stack, slice_version, slice.subtrees.contents[j], false, next_state); + for (uint32_t j = 0; j < self->trailing_extras.size; j++) { + ts_stack_push(self->stack, slice_version, self->trailing_extras.contents[j], false, next_state); } for (StackVersion j = 0; j < slice_version; j++) { @@ -859,22 +903,22 @@ static void ts_parser__accept( Subtree root = NULL_SUBTREE; for (uint32_t j = trees.size - 1; j + 1 > 0; j--) { - Subtree child = trees.contents[j]; - if (!ts_subtree_extra(child)) { - assert(!child.data.is_inline); - uint32_t child_count = ts_subtree_child_count(child); + Subtree tree = trees.contents[j]; + if (!ts_subtree_extra(tree)) { + assert(!tree.data.is_inline); + uint32_t child_count = ts_subtree_child_count(tree); + const Subtree *children = ts_subtree_children(tree); for (uint32_t k = 0; k < child_count; k++) { - ts_subtree_retain(child.ptr->children[k]); + ts_subtree_retain(children[k]); } - array_splice(&trees, j, 1, child_count, child.ptr->children); + array_splice(&trees, j, 1, child_count, children); root = ts_subtree_from_mut(ts_subtree_new_node( - &self->tree_pool, - ts_subtree_symbol(child), + ts_subtree_symbol(tree), &trees, - child.ptr->production_id, + tree.ptr->production_id, self->language )); - ts_subtree_release(&self->tree_pool, child); + ts_subtree_release(&self->tree_pool, tree); break; } } @@ -941,16 +985,17 @@ static bool ts_parser__do_all_potential_reductions( switch (action.type) { case TSParseActionTypeShift: case TSParseActionTypeRecover: - if (!action.params.extra && !action.params.repetition) has_shift_action = true; + if (!action.params.shift.extra && !action.params.shift.repetition) has_shift_action = true; 
break; case TSParseActionTypeReduce: - if (action.params.child_count > 0) + if (action.params.reduce.child_count > 0) ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ - .symbol = action.params.symbol, - .count = action.params.child_count, - .dynamic_precedence = action.params.dynamic_precedence, - .production_id = action.params.production_id, + .symbol = action.params.reduce.symbol, + .count = action.params.reduce.child_count, + .dynamic_precedence = action.params.reduce.dynamic_precedence, + .production_id = action.params.reduce.production_id, }); + break; default: break; } @@ -1013,7 +1058,9 @@ static void ts_parser__handle_error( TSStateId state_after_missing_symbol = ts_language_next_state( self->language, state, missing_symbol ); - if (state_after_missing_symbol == 0) continue; + if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { + continue; + } if (ts_language_has_reduce_action( self->language, @@ -1097,7 +1144,7 @@ static bool ts_parser__recover_to_state( Subtree error_tree = error_trees.contents[0]; uint32_t error_child_count = ts_subtree_child_count(error_tree); if (error_child_count > 0) { - array_splice(&slice.subtrees, 0, 0, error_child_count, error_tree.ptr->children); + array_splice(&slice.subtrees, 0, 0, error_child_count, ts_subtree_children(error_tree)); for (unsigned j = 0; j < error_child_count; j++) { ts_subtree_retain(slice.subtrees.contents[j]); } @@ -1105,22 +1152,21 @@ static bool ts_parser__recover_to_state( ts_subtree_array_delete(&self->tree_pool, &error_trees); } - SubtreeArray trailing_extras = ts_subtree_array_remove_trailing_extras(&slice.subtrees); + ts_subtree_array_remove_trailing_extras(&slice.subtrees, &self->trailing_extras); if (slice.subtrees.size > 0) { - Subtree error = ts_subtree_new_error_node(&self->tree_pool, &slice.subtrees, true, self->language); + Subtree error = ts_subtree_new_error_node(&slice.subtrees, true, self->language); ts_stack_push(self->stack, slice.version, error, 
false, goal_state); } else { array_delete(&slice.subtrees); } - for (unsigned j = 0; j < trailing_extras.size; j++) { - Subtree tree = trailing_extras.contents[j]; + for (unsigned j = 0; j < self->trailing_extras.size; j++) { + Subtree tree = self->trailing_extras.contents[j]; ts_stack_push(self->stack, slice.version, tree, false, goal_state); } previous_version = slice.version; - array_delete(&trailing_extras); } return previous_version != STACK_VERSION_NONE; @@ -1193,7 +1239,7 @@ static void ts_parser__recover( } } - // In the process of attemping to recover, some stack versions may have been created + // In the process of attempting to recover, some stack versions may have been created // and subsequently halted. Remove those versions. for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { if (!ts_stack_is_active(self->stack, i)) { @@ -1217,7 +1263,7 @@ static void ts_parser__recover( if (ts_subtree_is_eof(lookahead)) { LOG("recover_eof"); SubtreeArray children = array_new(); - Subtree parent = ts_subtree_new_error_node(&self->tree_pool, &children, false, self->language); + Subtree parent = ts_subtree_new_error_node(&children, false, self->language); ts_stack_push(self->stack, version, parent, false, 1); ts_parser__accept(self, version, lookahead); return; @@ -1238,7 +1284,7 @@ static void ts_parser__recover( // be counted in error cost calculations. 
unsigned n; const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n); - if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.extra) { + if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.shift.extra) { MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); ts_subtree_set_extra(&mutable_lookahead); lookahead = ts_subtree_from_mut(mutable_lookahead); @@ -1250,7 +1296,6 @@ static void ts_parser__recover( array_reserve(&children, 1); array_push(&children, lookahead); MutableSubtree error_repeat = ts_subtree_new_node( - &self->tree_pool, ts_builtin_sym_error_repeat, &children, 0, @@ -1279,7 +1324,6 @@ static void ts_parser__recover( ts_stack_renumber_version(self->stack, pop.contents[0].version, version); array_push(&pop.contents[0].subtrees, ts_subtree_from_mut(error_repeat)); error_repeat = ts_subtree_new_node( - &self->tree_pool, ts_builtin_sym_error_repeat, &pop.contents[0].subtrees, 0, @@ -1325,23 +1369,26 @@ static bool ts_parser__advance( ); } - // Otherwise, re-run the lexer. - if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); - if (lookahead.ptr) { - ts_parser__set_cached_token(self, position, last_external_token, lookahead); - ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); - } + bool needs_lex = !lookahead.ptr; + for (;;) { + // Otherwise, re-run the lexer. + if (needs_lex) { + needs_lex = false; + lookahead = ts_parser__lex(self, version, state); + + if (lookahead.ptr) { + ts_parser__set_cached_token(self, position, last_external_token, lookahead); + ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); + } - // When parsing a non-terminal extra, a null lookahead indicates the - // end of the rule. The reduction is stored in the EOF table entry. - // After the reduction, the lexer needs to be run again. 
- else { - ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); + // When parsing a non-terminal extra, a null lookahead indicates the + // end of the rule. The reduction is stored in the EOF table entry. + // After the reduction, the lexer needs to be run again. + else { + ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); + } } - } - for (;;) { // If a cancellation flag or a timeout was provided, then check every // time a fixed number of parse actions has been processed. if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { @@ -1367,9 +1414,9 @@ static bool ts_parser__advance( switch (action.type) { case TSParseActionTypeShift: { - if (action.params.repetition) break; + if (action.params.shift.repetition) break; TSStateId next_state; - if (action.params.extra) { + if (action.params.shift.extra) { // TODO: remove when TREE_SITTER_LANGUAGE_VERSION 9 is out. if (state == ERROR_STATE) continue; @@ -1377,7 +1424,7 @@ static bool ts_parser__advance( next_state = state; LOG("shift_extra"); } else { - next_state = action.params.state; + next_state = action.params.shift.state; LOG("shift state:%u", next_state); } @@ -1386,19 +1433,19 @@ static bool ts_parser__advance( next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead)); } - ts_parser__shift(self, version, next_state, lookahead, action.params.extra); + ts_parser__shift(self, version, next_state, lookahead, action.params.shift.extra); if (did_reuse) reusable_node_advance(&self->reusable_node); return true; } case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; - bool is_extra = lookahead.ptr == NULL; - LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count); + bool end_of_non_terminal_extra = lookahead.ptr == NULL; + LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count); StackVersion 
reduction_version = ts_parser__reduce( - self, version, action.params.symbol, action.params.child_count, - action.params.dynamic_precedence, action.params.production_id, - is_fragile, is_extra + self, version, action.params.reduce.symbol, action.params.reduce.child_count, + action.params.reduce.dynamic_precedence, action.params.reduce.production_id, + is_fragile, end_of_non_terminal_extra ); if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; @@ -1438,8 +1485,10 @@ static bool ts_parser__advance( // (and completing the non-terminal extra rule) run the lexer again based // on the current parse state. if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); + needs_lex = true; + continue; } + ts_language_table_entry( self->language, state, @@ -1449,6 +1498,11 @@ static bool ts_parser__advance( continue; } + if (!lookahead.ptr) { + ts_stack_pause(self->stack, version, ts_builtin_sym_end); + return true; + } + // If there were no parse actions for the current lookahead token, then // it is not valid in this state. If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that @@ -1486,6 +1540,9 @@ static bool ts_parser__advance( // push each of its children. Then try again to process the current // lookahead. 
if (ts_parser__breakdown_top_of_stack(self, version)) { + state = ts_stack_state(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + needs_lex = true; continue; } @@ -1603,8 +1660,8 @@ static unsigned ts_parser__condense_stack(TSParser *self) { static bool ts_parser_has_outstanding_parse(TSParser *self) { return ( - self->lexer.current_position.bytes > 0 || - ts_stack_state(self->stack, 0) != 1 + ts_stack_state(self->stack, 0) != 1 || + ts_stack_node_count_since_error(self->stack, 0) != 0 ); } @@ -1625,7 +1682,6 @@ TSParser *ts_parser_new(void) { self->end_clock = clock_null(); self->operation_count = 0; self->old_tree = NULL_SUBTREE; - self->scratch_tree.ptr = &self->scratch_tree_data; self->included_range_differences = (TSRangeArray) array_new(); self->included_range_difference_index = 0; ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); @@ -1651,6 +1707,9 @@ void ts_parser_delete(TSParser *self) { ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); ts_subtree_pool_delete(&self->tree_pool); reusable_node_delete(&self->reusable_node); + array_delete(&self->trailing_extras); + array_delete(&self->trailing_extras2); + array_delete(&self->scratch_trees); ts_free(self); } diff --git a/parser.h b/parser.h index 9df91f8c..c5a788ff 100644 --- a/parser.h +++ b/parser.h @@ -35,6 +35,7 @@ typedef uint16_t TSStateId; typedef struct { bool visible : 1; bool named : 1; + bool supertype: 1; } TSSymbolMetadata; typedef struct TSLexer TSLexer; @@ -62,13 +63,13 @@ typedef struct { TSStateId state; bool extra : 1; bool repetition : 1; - }; + } shift; struct { TSSymbol symbol; int16_t dynamic_precedence; uint8_t child_count; uint8_t production_id; - }; + } reduce; } params; TSParseActionType type : 4; } TSParseAction; @@ -83,7 +84,7 @@ typedef union { struct { uint8_t count; bool reusable : 1; - }; + } entry; } TSParseActionEntry; struct TSLanguage { @@ -119,6 +120,8 @@ struct TSLanguage { const uint16_t *small_parse_table; const 
uint32_t *small_parse_table_map; const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + uint32_t state_count; }; /* @@ -167,22 +170,28 @@ struct TSLanguage { #define ACTIONS(id) id -#define SHIFT(state_value) \ - { \ - { \ - .type = TSParseActionTypeShift, \ - .params = {.state = state_value}, \ - } \ +#define SHIFT(state_value) \ + { \ + { \ + .params = { \ + .shift = { \ + .state = state_value \ + } \ + }, \ + .type = TSParseActionTypeShift \ + } \ } #define SHIFT_REPEAT(state_value) \ { \ { \ - .type = TSParseActionTypeShift, \ .params = { \ - .state = state_value, \ - .repetition = true \ + .shift = { \ + .state = state_value, \ + .repetition = true \ + } \ }, \ + .type = TSParseActionTypeShift \ } \ } @@ -194,20 +203,26 @@ struct TSLanguage { #define SHIFT_EXTRA() \ { \ { \ - .type = TSParseActionTypeShift, \ - .params = {.extra = true} \ + .params = { \ + .shift = { \ + .extra = true \ + } \ + }, \ + .type = TSParseActionTypeShift \ } \ } #define REDUCE(symbol_val, child_count_val, ...) \ { \ { \ - .type = TSParseActionTypeReduce, \ .params = { \ - .symbol = symbol_val, \ - .child_count = child_count_val, \ - __VA_ARGS__ \ - } \ + .reduce = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }, \ + .type = TSParseActionTypeReduce \ } \ } diff --git a/query.c b/query.c index 29ddc028..68894262 100644 --- a/query.c +++ b/query.c @@ -8,12 +8,22 @@ #include "./unicode.h" #include +// #define DEBUG_ANALYZE_QUERY +// #define LOG(...) fprintf(stderr, __VA_ARGS__) +#define LOG(...) + +#define MAX_CAPTURE_LIST_COUNT 32 +#define MAX_STEP_CAPTURE_COUNT 3 +#define MAX_STATE_PREDECESSOR_COUNT 100 +#define MAX_ANALYSIS_STATE_DEPTH 12 + /* * Stream - A sequence of unicode characters derived from a UTF8 string. * This struct is used in parsing queries from S-expressions. 
*/ typedef struct { const char *input; + const char *start; const char *end; int32_t next; uint8_t next_size; @@ -25,21 +35,30 @@ typedef struct { * represented as a sequence of these steps. Fields: * * - `symbol` - The grammar symbol to match. A zero value represents the - * wildcard symbol, '*'. + * wildcard symbol, '_'. * - `field` - The field name to match. A zero value means that a field name * was not specified. - * - `capture_id` - An integer representing the name of the capture associated - * with this node in the pattern. A `NONE` value means this node is not - * captured in this pattern. + * - `capture_ids` - An array of integers representing the names of captures + * associated with this node in the pattern, terminated by a `NONE` value. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. + * - `alternative_index` - The index of a different query step that serves as + * an alternative to this step. */ typedef struct { TSSymbol symbol; + TSSymbol supertype_symbol; TSFieldId field; - uint16_t capture_id; - uint16_t depth: 15; + uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; + uint16_t alternative_index; + uint16_t depth; bool contains_captures: 1; + bool is_immediate: 1; + bool is_last_child: 1; + bool is_pass_through: 1; + bool is_dead_end: 1; + bool alternative_is_immediate: 1; + bool is_definite: 1; } QueryStep; /* @@ -74,33 +93,114 @@ typedef struct { uint16_t pattern_index; } PatternEntry; +typedef struct { + Slice steps; + Slice predicate_steps; + uint32_t start_byte; +} QueryPattern; + +typedef struct { + uint32_t byte_offset; + uint16_t step_index; +} StepOffset; + /* * QueryState - The state of an in-progress match of a particular pattern * in a query. While executing, a `TSQueryCursor` must keep track of a number * of possible in-progress matches. Each of those possible matches is - * represented as one of these states. + * represented as one of these states. 
Fields: + * - `id` - A numeric id that is exposed to the public API. This allows the + * caller to remove a given match, preventing any more of its captures + * from being returned. + * - `start_depth` - The depth in the tree where the first step of the state's + * pattern was matched. + * - `pattern_index` - The pattern that the state is matching. + * - `consumed_capture_count` - The number of captures from this match that + * have already been returned. + * - `capture_list_id` - A numeric id that can be used to retrieve the state's + * list of captures from the `CaptureListPool`. + * - `seeking_immediate_match` - A flag that indicates that the state's next + * step must be matched by the very next sibling. This is used when + * processing repetitions. + * - `has_in_progress_alternatives` - A flag that indicates that there are + * other states that have the same captures as this state, but are at + * different steps in their pattern. This means that in order to obey the + * 'longest-match' rule, this state should not be returned as a match until + * it is clear that there can be no longer match. */ typedef struct { + uint32_t id; uint16_t start_depth; - uint16_t pattern_index; uint16_t step_index; - uint16_t capture_count; + uint16_t pattern_index; uint16_t capture_list_id; - uint16_t consumed_capture_count; - uint32_t id; + uint16_t consumed_capture_count: 12; + bool seeking_immediate_match: 1; + bool has_in_progress_alternatives: 1; + bool dead: 1; + bool needs_parent: 1; } QueryState; +typedef Array(TSQueryCapture) CaptureList; + /* * CaptureListPool - A collection of *lists* of captures. Each QueryState - * needs to maintain its own list of captures. They are all represented as - * slices of one shared array. The CaptureListPool keeps track of which - * parts of the shared array are currently in use by a QueryState. + * needs to maintain its own list of captures.
To avoid repeated allocations, + * the pool reuses a fixed set of capture lists, and keeps track of which ones + * are currently in use. */ typedef struct { - Array(TSQueryCapture) list; + CaptureList list[MAX_CAPTURE_LIST_COUNT]; + CaptureList empty_list; uint32_t usage_map; } CaptureListPool; +/* + * AnalysisState - The state needed for walking the parse table when analyzing + * a query pattern, to determine at which steps the pattern might fail to match. + */ +typedef struct { + TSStateId parse_state; + TSSymbol parent_symbol; + uint16_t child_index; + TSFieldId field_id: 15; + bool done: 1; +} AnalysisStateEntry; + +typedef struct { + AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH]; + uint16_t depth; + uint16_t step_index; +} AnalysisState; + +typedef Array(AnalysisState) AnalysisStateSet; + +/* + * AnalysisSubgraph - A subset of the states in the parse table that are used + * in constructing nodes with a certain symbol. Each state is accompanied by + * some information about the possible node that could be produced in + * downstream states. + */ +typedef struct { + TSStateId state; + uint8_t production_id; + uint8_t child_index: 7; + bool done: 1; +} AnalysisSubgraphNode; + +typedef struct { + TSSymbol symbol; + Array(TSStateId) start_states; + Array(AnalysisSubgraphNode) nodes; +} AnalysisSubgraph; + +/* + * StatePredecessorMap - A map that stores the predecessors of each parse state. + */ +typedef struct { + TSStateId *contents; +} StatePredecessorMap; + /* * TSQuery - A tree query, compiled from a string of S-expressions. The query * itself is immutable.
The mutable state used in the process of executing the @@ -112,10 +212,10 @@ struct TSQuery { Array(QueryStep) steps; Array(PatternEntry) pattern_map; Array(TSQueryPredicateStep) predicate_steps; - Array(Slice) predicates_by_pattern; - Array(uint32_t) start_bytes_by_pattern; + Array(QueryPattern) patterns; + Array(StepOffset) step_offsets; + Array(char) string_buffer; const TSLanguage *language; - uint16_t max_capture_count; uint16_t wildcard_root_pattern_count; TSSymbol *symbol_map; }; @@ -136,16 +236,14 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; bool ascending; + bool halted; }; static const TSQueryError PARENT_DONE = -1; -static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; +static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX; static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; -static const uint16_t MAX_STATE_COUNT = 32; - -// #define LOG(...) fprintf(stderr, __VA_ARGS__) -#define LOG(...) +static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1; /********** * Stream @@ -183,21 +281,22 @@ static Stream stream_new(const char *string, uint32_t length) { Stream self = { .next = 0, .input = string, + .start = string, .end = string + length, }; stream_advance(&self); return self; } -static void stream_skip_whitespace(Stream *stream) { +static void stream_skip_whitespace(Stream *self) { for (;;) { - if (iswspace(stream->next)) { - stream_advance(stream); - } else if (stream->next == ';') { + if (iswspace(self->next)) { + stream_advance(self); + } else if (self->next == ';') { // skip over comments - stream_advance(stream); - while (stream->next && stream->next != '\n') { - if (!stream_advance(stream)) break; + stream_advance(self); + while (self->next && self->next != '\n') { + if (!stream_advance(self)) break; } } else { break; @@ -205,8 +304,8 @@ static void stream_skip_whitespace(Stream *stream) { } } -static bool stream_is_ident_start(Stream *stream) { - return iswalnum(stream->next) || stream->next 
== '_' || stream->next == '-'; +static bool stream_is_ident_start(Stream *self) { + return iswalnum(self->next) || self->next == '_' || self->next == '-'; } static void stream_scan_identifier(Stream *stream) { @@ -222,30 +321,42 @@ static void stream_scan_identifier(Stream *stream) { ); } +static uint32_t stream_offset(Stream *self) { + return self->input - self->start; +} + /****************** * CaptureListPool ******************/ -static CaptureListPool capture_list_pool_new() { +static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { - .list = array_new(), + .empty_list = array_new(), .usage_map = UINT32_MAX, }; } -static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { +static void capture_list_pool_reset(CaptureListPool *self) { self->usage_map = UINT32_MAX; - uint32_t total_size = MAX_STATE_COUNT * list_size; - array_reserve(&self->list, total_size); - self->list.size = total_size; + for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { + array_clear(&self->list[i]); + } } static void capture_list_pool_delete(CaptureListPool *self) { - array_delete(&self->list); + for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { + array_delete(&self->list[i]); + } +} + +static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { + if (id >= MAX_CAPTURE_LIST_COUNT) return &self->empty_list; + return &self->list[id]; } -static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { - return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; +static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { + assert(id < MAX_CAPTURE_LIST_COUNT); + return &self->list[id]; } static bool capture_list_pool_is_empty(const CaptureListPool *self) { @@ -258,12 +369,15 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { // the leading zeros in the usage map. An id of zero corresponds to the // highest-order bit in the bitmask. 
uint16_t id = count_leading_zeros(self->usage_map); - if (id == 32) return NONE; + if (id >= MAX_CAPTURE_LIST_COUNT) return NONE; self->usage_map &= ~bitmask_for_index(id); + array_clear(&self->list[id]); return id; } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + if (id >= MAX_CAPTURE_LIST_COUNT) return; + array_clear(&self->list[id]); self->usage_map |= bitmask_for_index(id); } @@ -271,7 +385,7 @@ static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { * SymbolTable **************/ -static SymbolTable symbol_table_new() { +static SymbolTable symbol_table_new(void) { return (SymbolTable) { .characters = array_new(), .slices = array_new(), @@ -326,6 +440,169 @@ static uint16_t symbol_table_insert_name( return self->slices.size - 1; } +/************ + * QueryStep + ************/ + +static QueryStep query_step__new( + TSSymbol symbol, + uint16_t depth, + bool is_immediate +) { + return (QueryStep) { + .symbol = symbol, + .depth = depth, + .field = 0, + .capture_ids = {NONE, NONE, NONE}, + .alternative_index = NONE, + .contains_captures = false, + .is_last_child = false, + .is_pass_through = false, + .is_dead_end = false, + .is_definite = false, + .is_immediate = is_immediate, + .alternative_is_immediate = false, + }; +} + +static void query_step__add_capture(QueryStep *self, uint16_t capture_id) { + for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { + if (self->capture_ids[i] == NONE) { + self->capture_ids[i] = capture_id; + break; + } + } +} + +static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) { + for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { + if (self->capture_ids[i] == capture_id) { + self->capture_ids[i] = NONE; + while (i + 1 < MAX_STEP_CAPTURE_COUNT) { + if (self->capture_ids[i + 1] == NONE) break; + self->capture_ids[i] = self->capture_ids[i + 1]; + self->capture_ids[i + 1] = NONE; + i++; + } + break; + } + } +} + +/********************** + * StatePredecessorMap + 
**********************/ + +static inline StatePredecessorMap state_predecessor_map_new(const TSLanguage *language) { + return (StatePredecessorMap) { + .contents = ts_calloc(language->state_count * (MAX_STATE_PREDECESSOR_COUNT + 1), sizeof(TSStateId)), + }; +} + +static inline void state_predecessor_map_delete(StatePredecessorMap *self) { + ts_free(self->contents); +} + +static inline void state_predecessor_map_add( + StatePredecessorMap *self, + TSStateId state, + TSStateId predecessor +) { + unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1); + TSStateId *count = &self->contents[index]; + if (*count == 0 || (*count < MAX_STATE_PREDECESSOR_COUNT && self->contents[index + *count] != predecessor)) { + (*count)++; + self->contents[index + *count] = predecessor; + } +} + +static inline const TSStateId *state_predecessor_map_get( + const StatePredecessorMap *self, + TSStateId state, + unsigned *count +) { + unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1); + *count = self->contents[index]; + return &self->contents[index + 1]; +} + +/**************** + * AnalysisState + ****************/ + +static unsigned analysis_state__recursion_depth(const AnalysisState *self) { + unsigned result = 0; + for (unsigned i = 0; i < self->depth; i++) { + TSSymbol symbol = self->stack[i].parent_symbol; + for (unsigned j = 0; j < i; j++) { + if (self->stack[j].parent_symbol == symbol) { + result++; + break; + } + } + } + return result; +} + +static inline int analysis_state__compare_position( + const AnalysisState *self, + const AnalysisState *other +) { + for (unsigned i = 0; i < self->depth; i++) { + if (i >= other->depth) return -1; + if (self->stack[i].child_index < other->stack[i].child_index) return -1; + if (self->stack[i].child_index > other->stack[i].child_index) return 1; + } + if (self->depth < other->depth) return 1; + return 0; +} + +static inline int analysis_state__compare( + const AnalysisState *self, + const AnalysisState *other +) { + int result = 
analysis_state__compare_position(self, other); + if (result != 0) return result; + for (unsigned i = 0; i < self->depth; i++) { + if (self->stack[i].parent_symbol < other->stack[i].parent_symbol) return -1; + if (self->stack[i].parent_symbol > other->stack[i].parent_symbol) return 1; + if (self->stack[i].parse_state < other->stack[i].parse_state) return -1; + if (self->stack[i].parse_state > other->stack[i].parse_state) return 1; + if (self->stack[i].field_id < other->stack[i].field_id) return -1; + if (self->stack[i].field_id > other->stack[i].field_id) return 1; + } + if (self->step_index < other->step_index) return -1; + if (self->step_index > other->step_index) return 1; + return 0; +} + +static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) { + return &self->stack[self->depth - 1]; +} + +static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol symbol) { + for (unsigned i = 0; i < self->depth; i++) { + if (self->stack[i].parent_symbol == symbol) return true; + } + return false; +} + +/*********************** + * AnalysisSubgraphNode + ***********************/ + +static inline int analysis_subgraph_node__compare(const AnalysisSubgraphNode *self, const AnalysisSubgraphNode *other) { + if (self->state < other->state) return -1; + if (self->state > other->state) return 1; + if (self->child_index < other->child_index) return -1; + if (self->child_index > other->child_index) return 1; + if (self->done < other->done) return -1; + if (self->done > other->done) return 1; + if (self->production_id < other->production_id) return -1; + if (self->production_id > other->production_id) return 1; + return 0; +} + /********* * Query *********/ @@ -387,33 +664,728 @@ static inline bool ts_query__pattern_map_search( static inline void ts_query__pattern_map_insert( TSQuery *self, TSSymbol symbol, - uint32_t start_step_index + uint32_t start_step_index, + uint32_t pattern_index ) { uint32_t index; ts_query__pattern_map_search(self, symbol, 
&index); + + // Ensure that the entries are sorted not only by symbol, but also + // by pattern_index. This way, states for earlier patterns will be + // initiated first, which allows the ordering of the states array + // to be maintained more efficiently. + while (index < self->pattern_map.size) { + PatternEntry *entry = &self->pattern_map.contents[index]; + if ( + self->steps.contents[entry->step_index].symbol == symbol && + entry->pattern_index < pattern_index + ) { + index++; + } else { + break; + } + } + array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, - .pattern_index = self->pattern_map.size, + .pattern_index = pattern_index, })); } +static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { + // Identify all of the patterns in the query that have child patterns, both at the + // top level and nested within other larger patterns. Record the step index where + // each pattern starts. + Array(uint32_t) parent_step_indices = array_new(); + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + if (i + 1 < self->steps.size) { + QueryStep *next_step = &self->steps.contents[i + 1]; + if ( + step->symbol != WILDCARD_SYMBOL && + step->symbol != NAMED_WILDCARD_SYMBOL && + next_step->depth > step->depth && + next_step->depth != PATTERN_DONE_MARKER + ) { + array_push(&parent_step_indices, i); + } + } + if (step->depth > 0) { + step->is_definite = true; + } + } + + // For every parent symbol in the query, initialize an 'analysis subgraph'. + // This subgraph lists all of the states in the parse table that are directly + // involved in building subtrees for this symbol. + // + // In addition to the parent symbols in the query, construct subgraphs for all + // of the hidden symbols in the grammar, because these might occur within + // one of the parent nodes, such that their children appear to belong to the + // parent. 
+ Array(AnalysisSubgraph) subgraphs = array_new(); + for (unsigned i = 0; i < parent_step_indices.size; i++) { + uint32_t parent_step_index = parent_step_indices.contents[i]; + TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; + AnalysisSubgraph subgraph = { .symbol = parent_symbol }; + array_insert_sorted_by(&subgraphs, .symbol, subgraph); + } + for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) { + if (!ts_language_symbol_metadata(self->language, sym).visible) { + AnalysisSubgraph subgraph = { .symbol = sym }; + array_insert_sorted_by(&subgraphs, .symbol, subgraph); + } + } + + // Scan the parse table to find the data needed to populate these subgraphs. + // Collect three things during this scan: + // 1) All of the parse states where one of these symbols can start. + // 2) All of the parse states where one of these symbols can end, along + // with information about the node that would be created. + // 3) A list of predecessor states for each state. 
+ StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language); + for (TSStateId state = 1; state < self->language->state_count; state++) { + unsigned subgraph_index, exists; + LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state); + while (ts_lookahead_iterator_next(&lookahead_iterator)) { + if (lookahead_iterator.action_count) { + for (unsigned i = 0; i < lookahead_iterator.action_count; i++) { + const TSParseAction *action = &lookahead_iterator.actions[i]; + if (action->type == TSParseActionTypeReduce) { + const TSSymbol *aliases, *aliases_end; + ts_language_aliases_for_symbol( + self->language, + action->params.reduce.symbol, + &aliases, + &aliases_end + ); + for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) { + array_search_sorted_by( + &subgraphs, + .symbol, + *symbol, + &subgraph_index, + &exists + ); + if (exists) { + AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) { + array_push(&subgraph->nodes, ((AnalysisSubgraphNode) { + .state = state, + .production_id = action->params.reduce.production_id, + .child_index = action->params.reduce.child_count, + .done = true, + })); + } + } + } + } else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) { + TSStateId next_state = action->params.shift.state; + state_predecessor_map_add(&predecessor_map, next_state, state); + } + } + } else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != state) { + state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state); + const TSSymbol *aliases, *aliases_end; + ts_language_aliases_for_symbol( + self->language, + lookahead_iterator.symbol, + &aliases, + &aliases_end + ); + for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) { + array_search_sorted_by( + &subgraphs, + .symbol, + *symbol, + &subgraph_index, + &exists + ); + if 
(exists) { + AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + if ( + subgraph->start_states.size == 0 || + *array_back(&subgraph->start_states) != state + ) + array_push(&subgraph->start_states, state); + } + } + } + } + } + + // For each subgraph, compute the preceding states by walking backward + // from the end states using the predecessor map. + Array(AnalysisSubgraphNode) next_nodes = array_new(); + for (unsigned i = 0; i < subgraphs.size; i++) { + AnalysisSubgraph *subgraph = &subgraphs.contents[i]; + if (subgraph->nodes.size == 0) { + array_delete(&subgraph->start_states); + array_erase(&subgraphs, i); + i--; + continue; + } + array_assign(&next_nodes, &subgraph->nodes); + while (next_nodes.size > 0) { + AnalysisSubgraphNode node = array_pop(&next_nodes); + if (node.child_index > 1) { + unsigned predecessor_count; + const TSStateId *predecessors = state_predecessor_map_get( + &predecessor_map, + node.state, + &predecessor_count + ); + for (unsigned j = 0; j < predecessor_count; j++) { + AnalysisSubgraphNode predecessor_node = { + .state = predecessors[j], + .child_index = node.child_index - 1, + .production_id = node.production_id, + .done = false, + }; + unsigned index, exists; + array_search_sorted_with( + &subgraph->nodes, analysis_subgraph_node__compare, &predecessor_node, + &index, &exists + ); + if (!exists) { + array_insert(&subgraph->nodes, index, predecessor_node); + array_push(&next_nodes, predecessor_node); + } + } + } + } + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("\nSubgraphs:\n"); + for (unsigned i = 0; i < subgraphs.size; i++) { + AnalysisSubgraph *subgraph = &subgraphs.contents[i]; + printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol)); + for (unsigned j = 0; j < subgraph->start_states.size; j++) { + printf( + " {state: %u}\n", + subgraph->start_states.contents[j] + ); + } + for (unsigned j = 0; j < subgraph->nodes.size; j++) { + AnalysisSubgraphNode *node = 
&subgraph->nodes.contents[j]; + printf( + " {state: %u, child_index: %u, production_id: %u, done: %d}\n", + node->state, node->child_index, node->production_id, node->done + ); + } + printf("\n"); + } + #endif + + // For each non-terminal pattern, determine if the pattern can successfully match, + // and identify all of the possible children within the pattern where matching could fail. + bool result = true; + AnalysisStateSet states = array_new(); + AnalysisStateSet next_states = array_new(); + AnalysisStateSet deeper_states = array_new(); + Array(uint16_t) final_step_indices = array_new(); + for (unsigned i = 0; i < parent_step_indices.size; i++) { + uint16_t parent_step_index = parent_step_indices.contents[i]; + uint16_t parent_depth = self->steps.contents[parent_step_index].depth; + TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; + if (parent_symbol == ts_builtin_sym_error) continue; + + // Find the subgraph that corresponds to this pattern's root symbol. If the pattern's + // root symbols is not a non-terminal, then return an error. + unsigned subgraph_index, exists; + array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); + if (!exists) { + unsigned first_child_step_index = parent_step_index + 1; + uint32_t i, exists; + array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &i, &exists); + assert(exists); + *error_offset = self->step_offsets.contents[i].byte_offset; + result = false; + break; + } + + // Initialize an analysis state at every parse state in the table where + // this parent symbol can occur. 
+ AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + array_clear(&states); + array_clear(&deeper_states); + for (unsigned j = 0; j < subgraph->start_states.size; j++) { + TSStateId parse_state = subgraph->start_states.contents[j]; + array_push(&states, ((AnalysisState) { + .step_index = parent_step_index + 1, + .stack = { + [0] = { + .parse_state = parse_state, + .parent_symbol = parent_symbol, + .child_index = 0, + .field_id = 0, + .done = false, + }, + }, + .depth = 1, + })); + } + + // Walk the subgraph for this non-terminal, tracking all of the possible + // sequences of progress within the pattern. + bool can_finish_pattern = false; + bool did_exceed_max_depth = false; + unsigned recursion_depth_limit = 0; + unsigned prev_final_step_count = 0; + array_clear(&final_step_indices); + for (;;) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Final step indices:"); + for (unsigned j = 0; j < final_step_indices.size; j++) { + printf(" %4u", final_step_indices.contents[j]); + } + printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol)); + for (unsigned j = 0; j < states.size; j++) { + AnalysisState *state = &states.contents[j]; + printf(" %3u: step: %u, stack: [", j, state->step_index); + for (unsigned k = 0; k < state->depth; k++) { + printf( + " {%s, child: %u, state: %4u", + self->language->symbol_names[state->stack[k].parent_symbol], + state->stack[k].child_index, + state->stack[k].parse_state + ); + if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); + if (state->stack[k].done) printf(", DONE"); + printf("}"); + } + printf(" ]\n"); + } + #endif + + if (states.size == 0) { + if (deeper_states.size > 0 && final_step_indices.size > prev_final_step_count) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); + #endif + + prev_final_step_count = final_step_indices.size; + recursion_depth_limit++; + 
AnalysisStateSet _states = states; + states = deeper_states; + deeper_states = _states; + continue; + } + + break; + } + + array_clear(&next_states); + for (unsigned j = 0; j < states.size; j++) { + AnalysisState * const state = &states.contents[j]; + + // For efficiency, it's important to avoid processing the same analysis state more + // than once. To achieve this, keep the states in order of ascending position within + // their hypothetical syntax trees. In each iteration of this loop, start by advancing + // the states that have made the least progress. Avoid advancing states that have already + // made more progress. + if (next_states.size > 0) { + int comparison = analysis_state__compare_position(state, array_back(&next_states)); + if (comparison == 0) { + array_insert_sorted_with(&next_states, analysis_state__compare, *state); + continue; + } else if (comparison > 0) { + while (j < states.size) { + array_push(&next_states, states.contents[j]); + j++; + } + break; + } + } + + const TSStateId parse_state = analysis_state__top(state)->parse_state; + const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; + const TSFieldId parent_field_id = analysis_state__top(state)->field_id; + const unsigned child_index = analysis_state__top(state)->child_index; + const QueryStep * const step = &self->steps.contents[state->step_index]; + + unsigned subgraph_index, exists; + array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); + if (!exists) continue; + const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + + // Follow every possible path in the parse table, but only visit states that + // are part of the subgraph for the current symbol. 
+ LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); + while (ts_lookahead_iterator_next(&lookahead_iterator)) { + TSSymbol sym = lookahead_iterator.symbol; + + TSStateId next_parse_state; + if (lookahead_iterator.action_count) { + const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; + if (action->type == TSParseActionTypeShift && !action->params.shift.extra) { + next_parse_state = action->params.shift.state; + } else { + continue; + } + } else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != parse_state) { + next_parse_state = lookahead_iterator.next_state; + } else { + continue; + } + + AnalysisSubgraphNode successor = { + .state = next_parse_state, + .child_index = child_index + 1, + }; + unsigned node_index; + array_search_sorted_with( + &subgraph->nodes, + analysis_subgraph_node__compare, &successor, + &node_index, &exists + ); + while (node_index < subgraph->nodes.size) { + AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; + if (node->state != successor.state || node->child_index != successor.child_index) break; + + // Use the subgraph to determine what alias and field will eventually be applied + // to this child node. + TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); + TSSymbol visible_symbol = alias + ? alias + : self->language->symbol_metadata[sym].visible + ? 
self->language->public_symbol_map[sym] + : 0; + TSFieldId field_id = parent_field_id; + if (!field_id) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); + for (; field_map != field_map_end; field_map++) { + if (!field_map->inherited && field_map->child_index == child_index) { + field_id = field_map->field_id; + break; + } + } + } + + AnalysisState next_state = *state; + analysis_state__top(&next_state)->child_index++; + analysis_state__top(&next_state)->parse_state = successor.state; + if (node->done) analysis_state__top(&next_state)->done = true; + + // Determine if this hypothetical child node would match the current step + // of the query pattern. + bool does_match = false; + if (visible_symbol) { + does_match = true; + if (step->symbol == NAMED_WILDCARD_SYMBOL) { + if (!self->language->symbol_metadata[visible_symbol].named) does_match = false; + } else if (step->symbol != WILDCARD_SYMBOL) { + if (step->symbol != visible_symbol) does_match = false; + } + if (step->field && step->field != field_id) { + does_match = false; + } + if ( + step->supertype_symbol && + !analysis_state__has_supertype(state, step->supertype_symbol) + ) does_match = false; + } + + // If this is a hidden child, then push a new entry to the stack, in order to + // walk through the children of this child. 
+ else if (sym >= self->language->token_count) { + if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { + did_exceed_max_depth = true; + continue; + } + + next_state.depth++; + analysis_state__top(&next_state)->parse_state = parse_state; + analysis_state__top(&next_state)->child_index = 0; + analysis_state__top(&next_state)->parent_symbol = sym; + analysis_state__top(&next_state)->field_id = field_id; + analysis_state__top(&next_state)->done = false; + + if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { + array_insert_sorted_with(&deeper_states, analysis_state__compare, next_state); + continue; + } + } + + // Pop from the stack when this state reached the end of its current syntax node. + while (next_state.depth > 0 && analysis_state__top(&next_state)->done) { + next_state.depth--; + } + + // If this hypothetical child did match the current step of the query pattern, + // then advance to the next step at the current depth. This involves skipping + // over any descendant steps of the current child. + const QueryStep *next_step = step; + if (does_match) { + for (;;) { + next_state.step_index++; + next_step = &self->steps.contents[next_state.step_index]; + if ( + next_step->depth == PATTERN_DONE_MARKER || + next_step->depth <= parent_depth + 1 + ) break; + } + } + + for (;;) { + // If this state can make further progress, then add it to the states for the next iteration. + // Otherwise, record the fact that matching can fail at this step of the pattern. 
+ if (!next_step->is_dead_end) { + bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1; + if (did_finish_pattern) can_finish_pattern = true; + if (did_finish_pattern || next_state.depth == 0) { + array_insert_sorted_by(&final_step_indices, , next_state.step_index); + } else { + array_insert_sorted_with(&next_states, analysis_state__compare, next_state); + } + } + + // If the state has advanced to a step with an alternative step, then add another state at + // that alternative step to the next iteration. + if ( + does_match && + next_step->alternative_index != NONE && + next_step->alternative_index > next_state.step_index + ) { + next_state.step_index = next_step->alternative_index; + next_step = &self->steps.contents[next_state.step_index]; + } else { + break; + } + } + } + } + } + + AnalysisStateSet _states = states; + states = next_states; + next_states = _states; + } + + // Mark as indefinite any step where a match terminated. + // Later, this property will be propagated to all of the step's predecessors. + for (unsigned j = 0; j < final_step_indices.size; j++) { + uint32_t final_step_index = final_step_indices.contents[j]; + QueryStep *step = &self->steps.contents[final_step_index]; + if ( + step->depth != PATTERN_DONE_MARKER && + step->depth > parent_depth && + !step->is_dead_end + ) { + step->is_definite = false; + } + } + + if (did_exceed_max_depth) { + for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) { + QueryStep *step = &self->steps.contents[j]; + if ( + step->depth <= parent_depth || + step->depth == PATTERN_DONE_MARKER + ) break; + if (!step->is_dead_end) { + step->is_definite = false; + } + } + } + + // If this pattern cannot match, store the pattern index so that it can be + // returned to the caller. 
+ if (result && !can_finish_pattern && !did_exceed_max_depth) { + assert(final_step_indices.size > 0); + uint16_t impossible_step_index = *array_back(&final_step_indices); + uint32_t i, exists; + array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists); + assert(exists); + *error_offset = self->step_offsets.contents[i].byte_offset; + result = false; + break; + } + } + + // Mark as indefinite any step with captures that are used in predicates. + Array(uint16_t) predicate_capture_ids = array_new(); + for (unsigned i = 0; i < self->patterns.size; i++) { + QueryPattern *pattern = &self->patterns.contents[i]; + + // Gather all of the captures that are used in predicates for this pattern. + array_clear(&predicate_capture_ids); + for ( + unsigned start = pattern->predicate_steps.offset, + end = start + pattern->predicate_steps.length, + j = start; j < end; j++ + ) { + TSQueryPredicateStep *step = &self->predicate_steps.contents[j]; + if (step->type == TSQueryPredicateStepTypeCapture) { + array_insert_sorted_by(&predicate_capture_ids, , step->value_id); + } + } + + // Find all of the steps that have these captures. + for ( + unsigned start = pattern->steps.offset, + end = start + pattern->steps.length, + j = start; j < end; j++ + ) { + QueryStep *step = &self->steps.contents[j]; + for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) { + uint16_t capture_id = step->capture_ids[k]; + if (capture_id == NONE) break; + unsigned index, exists; + array_search_sorted_by(&predicate_capture_ids, , capture_id, &index, &exists); + if (exists) { + step->is_definite = false; + break; + } + } + } + } + + // Propagate indefiniteness backwards. + bool done = self->steps.size == 0; + while (!done) { + done = true; + for (unsigned i = self->steps.size - 1; i > 0; i--) { + QueryStep *step = &self->steps.contents[i]; + + // Determine if this step is definite or has definite alternatives. 
+ bool is_definite = false; + for (;;) { + if (step->is_definite) { + is_definite = true; + break; + } + if (step->alternative_index == NONE || step->alternative_index < i) { + break; + } + step = &self->steps.contents[step->alternative_index]; + } + + // If not, mark its predecessor as indefinite. + if (!is_definite) { + QueryStep *prev_step = &self->steps.contents[i - 1]; + if ( + !prev_step->is_dead_end && + prev_step->depth != PATTERN_DONE_MARKER && + prev_step->is_definite + ) { + prev_step->is_definite = false; + done = false; + } + } + } + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("Steps:\n"); + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + if (step->depth == PATTERN_DONE_MARKER) { + printf(" %u: DONE\n", i); + } else { + printf( + " %u: {symbol: %s, field: %s, is_definite: %d}\n", + i, + (step->symbol == WILDCARD_SYMBOL || step->symbol == NAMED_WILDCARD_SYMBOL) + ? "ANY" + : ts_language_symbol_name(self->language, step->symbol), + (step->field ? 
ts_language_field_name_for_id(self->language, step->field) : "-"), + step->is_definite + ); + } + } + #endif + + // Cleanup + for (unsigned i = 0; i < subgraphs.size; i++) { + array_delete(&subgraphs.contents[i].start_states); + array_delete(&subgraphs.contents[i].nodes); + } + array_delete(&subgraphs); + array_delete(&next_nodes); + array_delete(&states); + array_delete(&next_states); + array_delete(&deeper_states); + array_delete(&final_step_indices); + array_delete(&parent_step_indices); + array_delete(&predicate_capture_ids); + state_predecessor_map_delete(&predecessor_map); + + return result; +} + static void ts_query__finalize_steps(TSQuery *self) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; uint32_t depth = step->depth; - if (step->capture_id != NONE) { + if (step->capture_ids[0] != NONE) { step->contains_captures = true; } else { step->contains_captures = false; for (unsigned j = i + 1; j < self->steps.size; j++) { QueryStep *s = &self->steps.contents[j]; if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; - if (s->capture_id != NONE) step->contains_captures = true; + if (s->capture_ids[0] != NONE) step->contains_captures = true; } } } } +static TSQueryError ts_query__parse_string_literal( + TSQuery *self, + Stream *stream +) { + const char *string_start = stream->input; + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + const char *prev_position = stream->input; + + bool is_escaped = false; + array_clear(&self->string_buffer); + for (;;) { + if (is_escaped) { + is_escaped = false; + switch (stream->next) { + case 'n': + array_push(&self->string_buffer, '\n'); + break; + case 'r': + array_push(&self->string_buffer, '\r'); + break; + case 't': + array_push(&self->string_buffer, '\t'); + break; + case '0': + array_push(&self->string_buffer, '\0'); + break; + default: + array_extend(&self->string_buffer, stream->next_size, stream->input); + break; + } + prev_position 
= stream->input + stream->next_size; + } else { + if (stream->next == '\\') { + array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + prev_position = stream->input + 1; + is_escaped = true; + } else if (stream->next == '"') { + array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + stream_advance(stream); + return TSQueryErrorNone; + } else if (stream->next == '\n') { + stream_reset(stream, string_start); + return TSQueryErrorSyntax; + } + } + if (!stream_advance(stream)) { + stream_reset(stream, string_start); + return TSQueryErrorSyntax; + } + } +} + // Parse a single predicate associated with a pattern, adding it to the // query's internal `predicate_steps` array. Predicates are arbitrary // S-expressions associated with a pattern which are meant to be handled at @@ -424,17 +1396,25 @@ static TSQueryError ts_query__parse_predicate( TSQuery *self, Stream *stream ) { - if (stream->next == ')') return PARENT_DONE; - if (stream->next != '(') return TSQueryErrorSyntax; - stream_advance(stream); + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *predicate_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - predicate_name; + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + predicate_name, + length + ); + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); stream_skip_whitespace(stream); - unsigned step_count = 0; for (;;) { if (stream->next == ')') { stream_advance(stream); stream_skip_whitespace(stream); - array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeDone, .value_id = 0, @@ -463,7 +1443,6 @@ static TSQueryError ts_query__parse_predicate( return TSQueryErrorCapture; } - array_back(&self->predicates_by_pattern)->length++; 
array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeCapture, .value_id = capture_id, @@ -472,32 +1451,17 @@ static TSQueryError ts_query__parse_predicate( // Parse a string literal else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - const char *string_content = stream->input; - while (stream->next != '"') { - if (stream->next == '\n' || !stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node + TSQueryError e = ts_query__parse_string_literal(self, stream); + if (e) return e; uint16_t id = symbol_table_insert_name( &self->predicate_values, - string_content, - length + self->string_buffer.contents, + self->string_buffer.size ); - array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); } // Parse a bare symbol @@ -510,7 +1474,6 @@ static TSQueryError ts_query__parse_predicate( symbol_start, length ); - array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, @@ -521,7 +1484,6 @@ static TSQueryError ts_query__parse_predicate( return TSQueryErrorSyntax; } - step_count++; stream_skip_whitespace(stream); } @@ -535,126 +1497,252 @@ static TSQueryError ts_query__parse_pattern( TSQuery *self, Stream *stream, uint32_t depth, - uint32_t *capture_count + bool is_immediate ) { - uint16_t starting_step_index = self->steps.size; - if (stream->next == 0) return TSQueryErrorSyntax; + if (stream->next == ')' || stream->next == ']') return PARENT_DONE; + + const uint32_t starting_step_index = self->steps.size; + + // Store the byte offset of each step in the query. 
+ if ( + self->step_offsets.size == 0 || + array_back(&self->step_offsets)->step_index != starting_step_index + ) { + array_push(&self->step_offsets, ((StepOffset) { + .step_index = starting_step_index, + .byte_offset = stream_offset(stream), + })); + } + + // An open bracket is the start of an alternation. + if (stream->next == '[') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse each branch, and add a placeholder step in between the branches. + Array(uint32_t) branch_step_indices = array_new(); + for (;;) { + uint32_t start_index = self->steps.size; + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + is_immediate + ); - // Finish the parent S-expression - if (stream->next == ')') { - return PARENT_DONE; + if (e == PARENT_DONE && stream->next == ']' && branch_step_indices.size > 0) { + stream_advance(stream); + break; + } else if (e) { + array_delete(&branch_step_indices); + return e; + } + + array_push(&branch_step_indices, start_index); + array_push(&self->steps, query_step__new(0, depth, false)); + } + (void)array_pop(&self->steps); + + // For all of the branches except for the last one, add the subsequent branch as an + // alternative, and link the end of the branch to the current end of the steps. 
+ for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { + uint32_t step_index = branch_step_indices.contents[i]; + uint32_t next_step_index = branch_step_indices.contents[i + 1]; + QueryStep *start_step = &self->steps.contents[step_index]; + QueryStep *end_step = &self->steps.contents[next_step_index - 1]; + start_step->alternative_index = next_step_index; + end_step->alternative_index = self->steps.size; + end_step->is_dead_end = true; + } + + array_delete(&branch_step_indices); } - // Parse a parenthesized node expression + // An open parenthesis can be the start of three possible constructs: + // * A grouped sequence + // * A predicate + // * A named node else if (stream->next == '(') { stream_advance(stream); stream_skip_whitespace(stream); - // Parse a nested list, which represents a pattern followed by - // zero-or-more predicates. - if (stream->next == '(' && depth == 0) { - TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count); - if (e) return e; - - // Parse the predicates. - stream_skip_whitespace(stream); + // If this parenthesis is followed by a node, then it represents a grouped sequence. + if (stream->next == '(' || stream->next == '"' || stream->next == '[') { + bool child_is_immediate = false; for (;;) { - TSQueryError e = ts_query__parse_predicate(self, stream); - if (e == PARENT_DONE) { + if (stream->next == '.') { + child_is_immediate = true; stream_advance(stream); stream_skip_whitespace(stream); - return 0; + } + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + child_is_immediate + ); + if (e == PARENT_DONE && stream->next == ')') { + stream_advance(stream); + break; } else if (e) { return e; } + + child_is_immediate = false; } } - TSSymbol symbol; - - // Parse the wildcard symbol - if (stream->next == '*') { - symbol = WILDCARD_SYMBOL; + // A dot/pound character indicates the start of a predicate. + else if (stream->next == '.' 
|| stream->next == '#') { stream_advance(stream); + return ts_query__parse_predicate(self, stream); } - // Parse a normal node name - else if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - node_name; - symbol = ts_language_symbol_for_name( - self->language, - node_name, - length, - true - ); - if (!symbol) { - stream_reset(stream, node_name); - return TSQueryErrorNodeType; + // Otherwise, this parenthesis is the start of a named node. + else { + TSSymbol symbol; + + // TODO - remove. + // For temporary backward compatibility, handle '*' as a wildcard. + if (stream->next == '*') { + symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; + stream_advance(stream); } - } else { - return TSQueryErrorSyntax; - } - // Add a step for the node. - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = symbol, - .field = 0, - .capture_id = NONE, - .contains_captures = false, - })); + // Parse a normal node name + else if (stream_is_ident_start(stream)) { + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + + // TODO - remove. + // For temporary backward compatibility, handle predicates without the leading '#' sign. + if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { + stream_reset(stream, node_name); + return ts_query__parse_predicate(self, stream); + } - // Parse the child patterns - stream_skip_whitespace(stream); - for (;;) { - TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count); - if (e == PARENT_DONE) { + // Parse the wildcard symbol + else if (length == 1 && node_name[0] == '_') { + symbol = depth > 0 ? 
NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; + } + + else { + symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + } + } else { + return TSQueryErrorSyntax; + } + + // Add a step for the node. + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); + if (ts_language_symbol_metadata(self->language, symbol).supertype) { + QueryStep *step = array_back(&self->steps); + step->supertype_symbol = step->symbol; + step->symbol = NAMED_WILDCARD_SYMBOL; + } + + stream_skip_whitespace(stream); + + if (stream->next == '/') { stream_advance(stream); - break; - } else if (e) { - return e; + if (!stream_is_ident_start(stream)) { + return TSQueryErrorSyntax; + } + + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + + QueryStep *step = array_back(&self->steps); + step->symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!step->symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + + stream_skip_whitespace(stream); + } + + // Parse the child patterns + bool child_is_immediate = false; + uint16_t child_start_step_index = self->steps.size; + for (;;) { + if (stream->next == '.') { + child_is_immediate = true; + stream_advance(stream); + stream_skip_whitespace(stream); + } + + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth + 1, + child_is_immediate + ); + if (e == PARENT_DONE && stream->next == ')') { + if (child_is_immediate) { + self->steps.contents[child_start_step_index].is_last_child = true; + } + stream_advance(stream); + break; + } else if (e) { + return e; + } + + child_is_immediate = false; } } } - // Parse a double-quoted anonymous leaf node expression - else if (stream->next == '"') { + // Parse a wildcard pattern + else if ( + stream->next == '_' || + + // TODO remove. 
+ // For temporary backward compatibility, handle '*' as a wildcard. + stream->next == '*' + ) { stream_advance(stream); + stream_skip_whitespace(stream); - // Parse the string content - const char *string_content = stream->input; - while (stream->next != '"') { - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; + // Add a step that matches any kind of node + array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); + } + + // Parse a double-quoted anonymous leaf node expression + else if (stream->next == '"') { + const char *string_start = stream->input; + TSQueryError e = ts_query__parse_string_literal(self, stream); + if (e) return e; // Add a step for the node TSSymbol symbol = ts_language_symbol_for_name( self->language, - string_content, - length, + self->string_buffer.contents, + self->string_buffer.size, false ); if (!symbol) { - stream_reset(stream, string_content); + stream_reset(stream, string_start + 1); return TSQueryErrorNodeType; } - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = symbol, - .field = 0, - .capture_id = NONE, - .contains_captures = false, - })); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); } // Parse a field-prefixed pattern @@ -673,8 +1761,12 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); // Parse the pattern - uint32_t step_index = self->steps.size; - TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count); + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + is_immediate + ); if (e == PARENT_DONE) return TSQueryErrorSyntax; if (e) return e; @@ -688,21 +1780,22 @@ static TSQueryError ts_query__parse_pattern( stream->input = field_name; return TSQueryErrorField; } - 
self->steps.contents[step_index].field = field_id; - } - // Parse a wildcard pattern - else if (stream->next == '*') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Add a step that matches any kind of node - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = WILDCARD_SYMBOL, - .field = 0, - .contains_captures = false, - })); + uint32_t step_index = starting_step_index; + QueryStep *step = &self->steps.contents[step_index]; + for (;;) { + step->field = field_id; + if ( + step->alternative_index != NONE && + step->alternative_index > step_index && + step->alternative_index < self->steps.size + ) { + step_index = step->alternative_index; + step = &self->steps.contents[step_index]; + } else { + break; + } + } } else { @@ -711,26 +1804,86 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); - // Parse an '@'-prefixed capture pattern - if (stream->next == '@') { - stream_advance(stream); + // Parse suffixes modifiers for this pattern + for (;;) { + QueryStep *step = &self->steps.contents[starting_step_index]; - // Parse the capture name - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; + // Parse the one-or-more operator. 
+ if (stream->next == '+') { + stream_advance(stream); + stream_skip_whitespace(stream); - // Add the capture id to the first step of the pattern - uint16_t capture_id = symbol_table_insert_name( - &self->captures, - capture_name, - length - ); - self->steps.contents[starting_step_index].capture_id = capture_id; - (*capture_count)++; + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_pass_through = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + } - stream_skip_whitespace(stream); + // Parse the zero-or-more repetition operator. + else if (stream->next == '*') { + stream_advance(stream); + stream_skip_whitespace(stream); + + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_pass_through = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + + while (step->alternative_index != NONE) { + step = &self->steps.contents[step->alternative_index]; + } + step->alternative_index = self->steps.size; + } + + // Parse the optional operator. 
+ else if (stream->next == '?') { + stream_advance(stream); + stream_skip_whitespace(stream); + + while (step->alternative_index != NONE) { + step = &self->steps.contents[step->alternative_index]; + } + step->alternative_index = self->steps.size; + } + + // Parse an '@'-prefixed capture pattern + else if (stream->next == '@') { + stream_advance(stream); + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + stream_skip_whitespace(stream); + + // Add the capture id to the first step of the pattern + uint16_t capture_id = symbol_table_insert_name( + &self->captures, + capture_name, + length + ); + + uint32_t step_index = starting_step_index; + for (;;) { + query_step__add_capture(step, capture_id); + if ( + step->alternative_index != NONE && + step->alternative_index > step_index && + step->alternative_index < self->steps.size + ) { + step_index = step->alternative_index; + step = &self->steps.contents[step_index]; + } else { + break; + } + } + } + + // No more suffix modifiers + else { + break; + } } return 0; @@ -777,55 +1930,89 @@ TSQuery *ts_query_new( .captures = symbol_table_new(), .predicate_values = symbol_table_new(), .predicate_steps = array_new(), - .predicates_by_pattern = array_new(), + .patterns = array_new(), + .step_offsets = array_new(), + .string_buffer = array_new(), .symbol_map = symbol_map, .wildcard_root_pattern_count = 0, - .max_capture_count = 0, .language = language, }; // Parse all of the S-expressions in the given string. 
Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); - uint32_t start_step_index; while (stream.input < stream.end) { - start_step_index = self->steps.size; - uint32_t capture_count = 0; - array_push(&self->start_bytes_by_pattern, stream.input - source); - array_push(&self->predicates_by_pattern, ((Slice) { - .offset = self->predicate_steps.size, - .length = 0, + uint32_t pattern_index = self->patterns.size; + uint32_t start_step_index = self->steps.size; + uint32_t start_predicate_step_index = self->predicate_steps.size; + array_push(&self->patterns, ((QueryPattern) { + .steps = (Slice) {.offset = start_step_index}, + .predicate_steps = (Slice) {.offset = start_predicate_step_index}, + .start_byte = stream_offset(&stream), })); - *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count); - array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER })); + *error_type = ts_query__parse_pattern(self, &stream, 0, false); + array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false)); + + QueryPattern *pattern = array_back(&self->patterns); + pattern->steps.length = self->steps.size - start_step_index; + pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index; // If any pattern could not be parsed, then report the error information // and terminate. if (*error_type) { - *error_offset = stream.input - source; + if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax; + *error_offset = stream_offset(&stream); ts_query_delete(self); return NULL; } - // Maintain a map that can look up patterns for a given root symbol. - ts_query__pattern_map_insert( - self, - self->steps.contents[start_step_index].symbol, - start_step_index - ); - if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { - self->wildcard_root_pattern_count++; + // Maintain a map that can look up patterns for a given root symbol. 
+ uint16_t wildcard_root_alternative_index = NONE; + for (;;) { + QueryStep *step = &self->steps.contents[start_step_index]; + + // If a pattern has a wildcard at its root, but it has a non-wildcard child, + // then optimize the matching process by skipping matching the wildcard. + // Later, during the matching process, the query cursor will check that + // there is a parent node, and capture it if necessary. + if (step->symbol == WILDCARD_SYMBOL && step->depth == 0) { + QueryStep *second_step = &self->steps.contents[start_step_index + 1]; + if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) { + wildcard_root_alternative_index = step->alternative_index; + start_step_index += 1; + step = second_step; + } + } + + ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); + if (step->symbol == WILDCARD_SYMBOL) { + self->wildcard_root_pattern_count++; + } + + // If there are alternatives or options at the root of the pattern, + // then add multiple entries to the pattern map. + if (step->alternative_index != NONE) { + start_step_index = step->alternative_index; + step->alternative_index = NONE; + } else if (wildcard_root_alternative_index != NONE) { + start_step_index = wildcard_root_alternative_index; + wildcard_root_alternative_index = NONE; + } else { + break; + } } + } - // Keep track of the maximum number of captures in pattern, because - // that numer determines how much space is needed to store each capture - // list. 
- if (capture_count > self->max_capture_count) { - self->max_capture_count = capture_count; + if (self->language->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT) { + if (!ts_query__analyze_patterns(self, error_offset)) { + *error_type = TSQueryErrorStructure; + ts_query_delete(self); + return NULL; } } ts_query__finalize_steps(self); + array_delete(&self->string_buffer); return self; } @@ -834,8 +2021,9 @@ void ts_query_delete(TSQuery *self) { array_delete(&self->steps); array_delete(&self->pattern_map); array_delete(&self->predicate_steps); - array_delete(&self->predicates_by_pattern); - array_delete(&self->start_bytes_by_pattern); + array_delete(&self->patterns); + array_delete(&self->step_offsets); + array_delete(&self->string_buffer); symbol_table_delete(&self->captures); symbol_table_delete(&self->predicate_values); ts_free(self->symbol_map); @@ -844,7 +2032,7 @@ void ts_query_delete(TSQuery *self) { } uint32_t ts_query_pattern_count(const TSQuery *self) { - return self->predicates_by_pattern.size; + return self->patterns.size; } uint32_t ts_query_capture_count(const TSQuery *self) { @@ -876,8 +2064,11 @@ const TSQueryPredicateStep *ts_query_predicates_for_pattern( uint32_t pattern_index, uint32_t *step_count ) { - Slice slice = self->predicates_by_pattern.contents[pattern_index]; + Slice slice = self->patterns.contents[pattern_index].predicate_steps; *step_count = slice.length; + if (self->predicate_steps.contents == NULL) { + return NULL; + } return &self->predicate_steps.contents[slice.offset]; } @@ -885,7 +2076,24 @@ uint32_t ts_query_start_byte_for_pattern( const TSQuery *self, uint32_t pattern_index ) { - return self->start_bytes_by_pattern.contents[pattern_index]; + return self->patterns.contents[pattern_index].start_byte; +} + +bool ts_query_step_is_definite( + const TSQuery *self, + uint32_t byte_offset +) { + uint32_t step_index = UINT32_MAX; + for (unsigned i = 0; i < self->step_offsets.size; i++) { + StepOffset *step_offset = 
&self->step_offsets.contents[i]; + if (step_offset->byte_offset > byte_offset) break; + step_index = step_offset->step_index; + } + if (step_index < self->steps.size) { + return self->steps.contents[step_index].is_definite; + } else { + return false; + } } void ts_query_disable_capture( @@ -899,9 +2107,7 @@ void ts_query_disable_capture( if (id != -1) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; - if (step->capture_id == id) { - step->capture_id = NONE; - } + query_step__remove_capture(step, id); } ts_query__finalize_steps(self); } @@ -926,10 +2132,11 @@ void ts_query_disable_pattern( * QueryCursor ***************/ -TSQueryCursor *ts_query_cursor_new() { +TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .ascending = false, + .halted = false, .states = array_new(), .finished_states = array_new(), .capture_list_pool = capture_list_pool_new(), @@ -938,8 +2145,8 @@ TSQueryCursor *ts_query_cursor_new() { .start_point = {0, 0}, .end_point = POINT_MAX, }; - array_reserve(&self->states, MAX_STATE_COUNT); - array_reserve(&self->finished_states, MAX_STATE_COUNT); + array_reserve(&self->states, 8); + array_reserve(&self->finished_states, 8); return self; } @@ -959,10 +2166,11 @@ void ts_query_cursor_exec( array_clear(&self->states); array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); - capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); + capture_list_pool_reset(&self->capture_list_pool); self->next_state_id = 0; self->depth = 0; self->ascending = false; + self->halted = false; self->query = query; } @@ -998,25 +2206,34 @@ static bool ts_query_cursor__first_in_progress_capture( TSQueryCursor *self, uint32_t *state_index, uint32_t *byte_offset, - uint32_t *pattern_index + uint32_t *pattern_index, + bool *is_definite ) { bool result = false; + *state_index = UINT32_MAX; + *byte_offset = UINT32_MAX; + 
*pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; - if (state->capture_count > 0) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - uint32_t capture_byte = ts_node_start_byte(captures[0].node); + if (state->dead) continue; + const CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + if (captures->size > state->consumed_capture_count) { + uint32_t capture_byte = ts_node_start_byte(captures->contents[state->consumed_capture_count].node); if ( !result || capture_byte < *byte_offset || - ( - capture_byte == *byte_offset && - state->pattern_index < *pattern_index - ) + (capture_byte == *byte_offset && state->pattern_index < *pattern_index) ) { + QueryStep *step = &self->query->steps.contents[state->step_index]; + if (is_definite) { + *is_definite = step->is_definite; + } else if (step->is_definite) { + continue; + } + result = true; *state_index = i; *byte_offset = capture_byte; @@ -1027,126 +2244,324 @@ static bool ts_query_cursor__first_in_progress_capture( return result; } -static bool ts_query__cursor_add_state( +// Determine which node is first in a depth-first traversal +int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { + if (left.id != right.id) { + uint32_t left_start = ts_node_start_byte(left); + uint32_t right_start = ts_node_start_byte(right); + if (left_start < right_start) return -1; + if (left_start > right_start) return 1; + uint32_t left_node_count = ts_node_end_byte(left); + uint32_t right_node_count = ts_node_end_byte(right); + if (left_node_count > right_node_count) return -1; + if (left_node_count < right_node_count) return 1; + } + return 0; +} + +// Determine if either state contains a superset of the other state's captures. 
+void ts_query_cursor__compare_captures( TSQueryCursor *self, - const PatternEntry *pattern + QueryState *left_state, + QueryState *right_state, + bool *left_contains_right, + bool *right_contains_left ) { - uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. - if (list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if (ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - )) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - list_id = self->states.contents[state_index].capture_list_id; - array_erase(&self->states, state_index); + const CaptureList *left_captures = capture_list_pool_get( + &self->capture_list_pool, + left_state->capture_list_id + ); + const CaptureList *right_captures = capture_list_pool_get( + &self->capture_list_pool, + right_state->capture_list_id + ); + *left_contains_right = true; + *right_contains_left = true; + unsigned i = 0, j = 0; + for (;;) { + if (i < left_captures->size) { + if (j < right_captures->size) { + TSQueryCapture *left = &left_captures->contents[i]; + TSQueryCapture *right = &right_captures->contents[j]; + if (left->node.id == right->node.id && left->index == right->index) { + i++; + j++; + } else { + switch (ts_query_cursor__compare_nodes(left->node, right->node)) { + case -1: + *right_contains_left = false; + i++; + break; + case 1: + *left_contains_right = false; + j++; + break; + default: + *right_contains_left = false; + *left_contains_right = false; + i++; + j++; + break; + } + } + } else { + *right_contains_left = false; + break; + } } else { - LOG(" too many finished states.\n"); - return false; + if (j < right_captures->size) { + *left_contains_right = false; + } + break; + } + } +} + +static void 
ts_query_cursor__add_state( + TSQueryCursor *self, + const PatternEntry *pattern +) { + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + uint32_t start_depth = self->depth - step->depth; + + // Keep the states array in ascending order of start_depth and pattern_index, + // so that it can be processed more efficiently elsewhere. Usually, there is + // no work to do here because of two facts: + // * States with lower start_depth are naturally added first due to the + // order in which nodes are visited. + // * Earlier patterns are naturally added first because of the ordering of the + // pattern_map data structure that's used to initiate matches. + // + // This loop is only needed in cases where two conditions hold: + // * A pattern consists of more than one sibling node, so that its states + // remain in progress after exiting the node that started the match. + // * The first node in the pattern matches against multiple nodes at the + // same depth. + // + // An example of this is the pattern '((comment)* (function))'. If multiple + // `comment` nodes appear in a row, then we may initiate a new state for this + // pattern while another state for the same pattern is already in progress. + // If there are multiple patterns like this in a query, then this loop will + // need to execute in order to keep the states ordered by pattern_index. + uint32_t index = self->states.size; + while (index > 0) { + QueryState *prev_state = &self->states.contents[index - 1]; + if (prev_state->start_depth < start_depth) break; + if (prev_state->start_depth == start_depth) { + if (prev_state->pattern_index < pattern->pattern_index) break; + if (prev_state->pattern_index == pattern->pattern_index) { + // Avoid inserting an unnecessary duplicate state, which would be + // immediately pruned by the longest-match criteria. + if (prev_state->step_index == pattern->step_index) return; + } } + index--; } - LOG(" start state. 
pattern:%u\n", pattern->pattern_index); - array_push(&self->states, ((QueryState) { - .capture_list_id = list_id, + LOG( + " start state. pattern:%u, step:%u\n", + pattern->pattern_index, + pattern->step_index + ); + array_insert(&self->states, index, ((QueryState) { + .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, - .start_depth = self->depth, - .capture_count = 0, + .start_depth = start_depth, .consumed_capture_count = 0, + .seeking_immediate_match = true, + .has_in_progress_alternatives = false, + .needs_parent = step->depth == 1, + .dead = false, })); - return true; } -static QueryState *ts_query__cursor_copy_state( +// Acquire a capture list for this state. If there are no capture lists left in the +// pool, this will steal the capture list from another existing state, and mark that +// other state as 'dead'. +static CaptureList *ts_query_cursor__prepare_to_capture( TSQueryCursor *self, - const QueryState *state + QueryState *state, + unsigned state_index_to_preserve ) { - uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); - if (new_list_id == NONE) return NULL; - array_push(&self->states, *state); - QueryState *new_state = array_back(&self->states); - new_state->capture_list_id = new_list_id; - TSQueryCapture *old_captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - TSQueryCapture *new_captures = capture_list_pool_get( - &self->capture_list_pool, - new_list_id - ); - memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); - return new_state; + if (state->capture_list_id == NONE) { + state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. 
+ if (state->capture_list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if ( + ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index, + NULL + ) && + state_index != state_index_to_preserve + ) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + QueryState *other_state = &self->states.contents[state_index]; + state->capture_list_id = other_state->capture_list_id; + other_state->capture_list_id = NONE; + other_state->dead = true; + CaptureList *list = capture_list_pool_get_mut( + &self->capture_list_pool, + state->capture_list_id + ); + array_clear(list); + return list; + } else { + LOG(" ran out of capture lists"); + return NULL; + } + } + } + return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); +} + +static void ts_query_cursor__capture( + TSQueryCursor *self, + QueryState *state, + QueryStep *step, + TSNode node +) { + if (state->dead) return; + CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); + if (!capture_list) { + state->dead = true; + return; + } + + for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { + uint16_t capture_id = step->capture_ids[j]; + if (step->capture_ids[j] == NONE) break; + array_push(capture_list, ((TSQueryCapture) { node, capture_id })); + LOG( + " capture node. type:%s, pattern:%u, capture_id:%u, capture_count:%u\n", + ts_node_type(node), + state->pattern_index, + capture_id, + capture_list->size + ); + } +} + +// Duplicate the given state and insert the newly-created state immediately after +// the given state in the `states` array. Ensures that the given state reference is +// still valid, even if the states array is reallocated. 
+static QueryState *ts_query_cursor__copy_state( + TSQueryCursor *self, + QueryState **state_ref +) { + const QueryState *state = *state_ref; + uint32_t state_index = state - self->states.contents; + QueryState copy = *state; + copy.capture_list_id = NONE; + + // If the state has captures, copy its capture list. + if (state->capture_list_id != NONE) { + CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index); + if (!new_captures) return NULL; + const CaptureList *old_captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + array_push_all(new_captures, old_captures); + } + + array_insert(&self->states, state_index + 1, copy); + *state_ref = &self->states.contents[state_index]; + return &self->states.contents[state_index + 1]; } // Walk the tree, processing patterns until at least one pattern finishes, // If one or more patterns finish, return `true` and store their states in the // `finished_states` array. Multiple patterns can finish on the same node. If // there are no more matches, return `false`. -static inline bool ts_query_cursor__advance(TSQueryCursor *self) { - do { +static inline bool ts_query_cursor__advance( + TSQueryCursor *self, + bool stop_on_definite_step +) { + bool did_match = false; + for (;;) { + if (self->halted) { + while (self->states.size > 0) { + QueryState state = array_pop(&self->states); + capture_list_pool_release( + &self->capture_list_pool, + state.capture_list_id + ); + } + } + + if (did_match || self->halted) return did_match; + + // Exit the current node. if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); - // When leaving a node, remove any unfinished states whose next step - // needed to match something within that node. + // Leave this node by stepping to its next sibling or to its parent. 
+ if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = false; + } else if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; + } else { + LOG("halt at root"); + self->halted = true; + } + + // After leaving a node, remove any states that cannot make further progress. uint32_t deleted_count = 0; for (unsigned i = 0, n = self->states.size; i < n; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; - if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + // If a state completed its pattern inside of this node, but was deferred from finishing + // in order to search for longer matches, mark it as finished. + if (step->depth == PATTERN_DONE_MARKER) { + if (state->start_depth > self->depth || self->halted) { + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + did_match = true; + deleted_count++; + continue; + } + } + + // If a state needed to match something within this node, then remove that state + // as it has failed to match. + else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { LOG( " failed to match. 
pattern:%u, step:%u\n", state->pattern_index, state->step_index ); - capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); deleted_count++; - } else if (deleted_count > 0) { + continue; + } + + if (deleted_count > 0) { self->states.contents[i - deleted_count] = *state; } } - self->states.size -= deleted_count; + } - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { - return self->finished_states.size > 0; - } - } else { - bool can_have_later_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = ts_tree_cursor_current_status( - &self->cursor, - &can_have_later_siblings, - &can_have_later_siblings_with_this_field - ); + // Enter a new node. + else { + // If this node is before the selected range, then avoid descending into it. TSNode node = ts_tree_cursor_current_node(&self->cursor); - TSSymbol symbol = ts_node_symbol(node); - if (symbol != ts_builtin_sym_error && self->query->symbol_map) { - symbol = self->query->symbol_map[symbol]; - } - - // If this node is before the selected range, then avoid descending - // into it. if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) @@ -1161,17 +2576,40 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if ( self->end_byte <= ts_node_start_byte(node) || point_lte(self->end_point, ts_node_start_point(node)) - ) return false; + ) { + LOG("halt at end of range"); + self->halted = true; + continue; + } + // Get the properties of the current node. 
+ TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); + if (symbol != ts_builtin_sym_error && self->query->symbol_map) { + symbol = self->query->symbol_map[symbol]; + } + bool has_later_siblings; + bool has_later_named_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = 0; + TSSymbol supertypes[8] = {0}; + unsigned supertype_count = 8; + ts_tree_cursor_current_status( + &self->cursor, + &field_id, + &has_later_siblings, + &has_later_named_siblings, + &can_have_later_siblings_with_this_field, + supertypes, + &supertype_count + ); LOG( - "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n", + "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, - self->finished_states.size, - can_have_later_siblings, - can_have_later_siblings_with_this_field + self->finished_states.size ); // Add new states for any patterns whose root node is a wildcard. @@ -1182,7 +2620,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + if (step->supertype_symbol && !supertype_count) continue; + ts_query_cursor__add_state(self, pattern); } // Add new states for any patterns whose root node matches this node. @@ -1194,7 +2633,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. 
if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + ts_query_cursor__add_state(self, pattern); // Advance to the next pattern whose root node matches this node. i++; @@ -1205,19 +2644,40 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // Update all of the in-progress states with current node. - for (unsigned i = 0, n = self->states.size; i < n; i++) { + for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; + state->has_in_progress_alternatives = false; + copy_count = 0; // Check that the node matches all of the criteria for the next - // step of the pattern.if ( + // step of the pattern. if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; // Determine if this node matches this step of the pattern, and also // if this node can have later siblings that match this step of the // pattern. 
- bool node_does_match = !step->symbol || step->symbol == symbol; - bool later_sibling_can_match = can_have_later_siblings; + bool node_does_match = + step->symbol == symbol || + step->symbol == WILDCARD_SYMBOL || + (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); + bool later_sibling_can_match = has_later_siblings; + if ((step->is_immediate && is_named) || state->seeking_immediate_match) { + later_sibling_can_match = false; + } + if (step->is_last_child && has_later_named_siblings) { + node_does_match = false; + } + if (step->supertype_symbol) { + bool has_supertype = false; + for (unsigned j = 0; j < supertype_count; j++) { + if (supertypes[j] == step->supertype_symbol) { + has_supertype = true; + break; + } + } + if (!has_supertype) node_does_match = false; + } if (step->field) { if (step->field == field_id) { if (!can_have_later_siblings_with_this_field) { @@ -1228,6 +2688,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } + // Remove states immediately if it is ever clear that they cannot match. if (!node_does_match) { if (!later_sibling_can_match) { LOG( @@ -1241,76 +2702,210 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); array_erase(&self->states, i); i--; - n--; } continue; } - // Some patterns can match their root node in multiple ways, - // capturing different children. If this pattern step could match - // later children within the same parent, then this query state - // cannot simply be updated in place. It must be split into two - // states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later - // siblings. - QueryState *next_state = state; - if ( - step->depth > 0 && - step->contains_captures && - later_sibling_can_match - ) { - QueryState *copy = ts_query__cursor_copy_state(self, state); - if (copy) { + // Some patterns can match their root node in multiple ways, capturing different + // children. 
If this pattern step could match later children within the same + // parent, then this query state cannot simply be updated in place. It must be + // split into two states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later siblings. + if (later_sibling_can_match && step->contains_captures) { + if (ts_query_cursor__copy_state(self, &state)) { LOG( - " split state. pattern:%u, step:%u\n", - copy->pattern_index, - copy->step_index + " split state for capture. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index ); - next_state = copy; + copy_count++; + } + } + + // If this pattern started with a wildcard, such that the pattern map + // actually points to the *second* step of the pattern, then check + // that the node has a parent, and capture the parent node if necessary. + if (state->needs_parent) { + TSNode parent = ts_tree_cursor_parent_node(&self->cursor); + if (ts_node_is_null(parent)) { + LOG(" missing parent node\n"); + state->dead = true; } else { - LOG(" canot split state.\n"); + state->needs_parent = false; + QueryStep *skipped_wildcard_step = step; + do { + skipped_wildcard_step--; + } while ( + skipped_wildcard_step->is_dead_end || + skipped_wildcard_step->is_pass_through || + skipped_wildcard_step->depth > 0 + ); + if (skipped_wildcard_step->capture_ids[0] != NONE) { + LOG(" capture wildcard parent\n"); + ts_query_cursor__capture( + self, + state, + skipped_wildcard_step, + parent + ); + } } } + // If the current node is captured in this pattern, add it to the capture list. + if (step->capture_ids[0] != NONE) { + ts_query_cursor__capture(self, state, step, node); + } + + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } + + // Advance this state to the next step of its pattern. + state->step_index++; + state->seeking_immediate_match = false; LOG( " advance state. 
pattern:%u, step:%u\n", - next_state->pattern_index, - next_state->step_index + state->pattern_index, + state->step_index ); - // If the current node is captured in this pattern, add it to the - // capture list. - if (step->capture_id != NONE) { - LOG( - " capture node. pattern:%u, capture_id:%u\n", - next_state->pattern_index, - step->capture_id - ); - TSQueryCapture *capture_list = capture_list_pool_get( - &self->capture_list_pool, - next_state->capture_list_id - ); - capture_list[next_state->capture_count++] = (TSQueryCapture) { - node, - step->capture_id - }; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (stop_on_definite_step && next_step->is_definite) did_match = true; + + // If this state's next step has an alternative step, then copy the state in order + // to pursue both alternatives. The alternative step itself may have an alternative, + // so this is an iterative process. + unsigned end_index = i + 1; + for (unsigned j = i; j < end_index; j++) { + QueryState *state = &self->states.contents[j]; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->alternative_index != NONE) { + // A "dead-end" step exists only to add a non-sequential jump into the step sequence, + // via its alternative index. When a state reaches a dead-end step, it jumps straight + // to the step's alternative. + if (next_step->is_dead_end) { + state->step_index = next_step->alternative_index; + j--; + continue; + } + + // A "pass-through" step exists only to add a branch into the step sequence, + // via its alternative_index. When a state reaches a pass-through step, it splits + // in order to process the alternative step, and then it advances to the next step. + if (next_step->is_pass_through) { + state->step_index++; + j--; + } + + QueryState *copy = ts_query_cursor__copy_state(self, &state); + if (copy) { + LOG( + " split state for branch.
pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", + copy->pattern_index, + copy->step_index, + next_step->alternative_index, + next_step->alternative_is_immediate, + capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + ); + end_index++; + copy_count++; + copy->step_index = next_step->alternative_index; + if (next_step->alternative_is_immediate) { + copy->seeking_immediate_match = true; + } + } + } } + } - // If the pattern is now done, then remove it from the list of - // in-progress states, and add it to the list of finished states. - next_state->step_index++; - QueryStep *next_step = step + 1; - if (next_step->depth == PATTERN_DONE_MARKER) { - LOG(" finish pattern %u\n", next_state->pattern_index); + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } - next_state->id = self->next_state_id++; - array_push(&self->finished_states, *next_state); - if (next_state == state) { - array_erase(&self->states, i); - i--; - n--; - } else { - self->states.size--; + // Enforce the longest-match criteria. When a query pattern contains optional or + // repeated nodes, this is necessary to avoid multiple redundant states, where + // one state has a strict subset of another state's captures. + bool did_remove = false; + for (unsigned j = i + 1; j < self->states.size; j++) { + QueryState *other_state = &self->states.contents[j]; + + // Query states are kept in ascending order of start_depth and pattern_index. + // Since the longest-match criteria is only used for deduping matches of the same + // pattern and root node, we only need to perform pairwise comparisons within a + // small slice of the states array.
+ if ( + other_state->start_depth != state->start_depth || + other_state->pattern_index != state->pattern_index + ) break; + + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; + } + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + i--; + did_remove = true; + break; + } + state->has_in_progress_alternatives = true; + } + } + + // If the state is at the end of its pattern, remove it from the list + // of in-progress states and add it to the list of finished states. + if (!did_remove) { + LOG( + " keep state.
pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", + state->pattern_index, + state->start_depth, + state->step_index, + capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size + ); + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->depth == PATTERN_DONE_MARKER) { + if (state->has_in_progress_alternatives) { + LOG(" defer finishing pattern %u\n", state->pattern_index); + } else { + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + array_erase(&self->states, state - self->states.contents); + did_match = true; + i--; + } } } } @@ -1322,9 +2917,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { self->ascending = true; } } - } while (self->finished_states.size == 0); - - return true; + } } bool ts_query_cursor_next_match( @@ -1332,7 +2925,7 @@ bool ts_query_cursor_next_match( TSQueryMatch *match ) { if (self->finished_states.size == 0) { - if (!ts_query_cursor__advance(self)) { + if (!ts_query_cursor__advance(self, false)) { return false; } } @@ -1340,11 +2933,12 @@ bool ts_query_cursor_next_match( QueryState *state = &self->finished_states.contents[0]; match->id = state->id; match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); + match->captures = captures->contents; + match->capture_count = captures->size; capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->finished_states, 0); return true; @@ -1372,97 +2966,105 @@ bool ts_query_cursor_next_capture( TSQueryMatch *match, uint32_t *capture_index ) { + // The goal here is to return captures in order, even though they may not + // be discovered in order, because patterns can overlap. 
Search for matches + // until there is a finished capture that is before any unfinished capture. for (;;) { - // The goal here is to return captures in order, even though they may not - // be discovered in order, because patterns can overlap. If there are any - // finished patterns, then try to find one that contains a capture that - // is *definitely* before any capture in an *unfinished* pattern. - if (self->finished_states.size > 0) { - // First, identify the position of the earliest capture in an unfinished - // match. For a finished capture to be returned, it must be *before* - // this position. - uint32_t first_unfinished_capture_byte = UINT32_MAX; - uint32_t first_unfinished_pattern_index = UINT32_MAX; - uint32_t first_unfinished_state_index; - ts_query_cursor__first_in_progress_capture( - self, - &first_unfinished_state_index, - &first_unfinished_capture_byte, - &first_unfinished_pattern_index - ); + // First, find the earliest capture in an unfinished match. + uint32_t first_unfinished_capture_byte; + uint32_t first_unfinished_pattern_index; + uint32_t first_unfinished_state_index; + bool first_unfinished_state_is_definite = false; + ts_query_cursor__first_in_progress_capture( + self, + &first_unfinished_state_index, + &first_unfinished_capture_byte, + &first_unfinished_pattern_index, + &first_unfinished_state_is_definite + ); - // Find the earliest capture in a finished match. 
- int first_finished_state_index = -1; - uint32_t first_finished_capture_byte = first_unfinished_capture_byte; - uint32_t first_finished_pattern_index = first_unfinished_pattern_index; - for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; - if (state->capture_count > state->consumed_capture_count) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - uint32_t capture_byte = ts_node_start_byte( - captures[state->consumed_capture_count].node - ); - if ( - capture_byte < first_finished_capture_byte || - ( - capture_byte == first_finished_capture_byte && - state->pattern_index < first_finished_pattern_index - ) - ) { - first_finished_state_index = i; - first_finished_capture_byte = capture_byte; - first_finished_pattern_index = state->pattern_index; - } - } else { - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->finished_states, i); - i--; + // Then find the earliest capture in a finished match. It must occur + // before the first capture in an *unfinished* match. 
+ QueryState *first_finished_state = NULL; + uint32_t first_finished_capture_byte = first_unfinished_capture_byte; + uint32_t first_finished_pattern_index = first_unfinished_pattern_index; + for (unsigned i = 0; i < self->finished_states.size; i++) { + QueryState *state = &self->finished_states.contents[i]; + const CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + if (captures->size > state->consumed_capture_count) { + uint32_t capture_byte = ts_node_start_byte( + captures->contents[state->consumed_capture_count].node + ); + if ( + capture_byte < first_finished_capture_byte || + ( + capture_byte == first_finished_capture_byte && + state->pattern_index < first_finished_pattern_index + ) + ) { + first_finished_state = state; + first_finished_capture_byte = capture_byte; + first_finished_pattern_index = state->pattern_index; } - } - - // If there is finished capture that is clearly before any unfinished - // capture, then return its match, and its capture index. Internally - // record the fact that the capture has been 'consumed'. - if (first_finished_state_index != -1) { - QueryState *state = &self->finished_states.contents[ - first_finished_state_index - ]; - match->id = state->id; - match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + } else { + capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); - *capture_index = state->consumed_capture_count; - state->consumed_capture_count++; - return true; + array_erase(&self->finished_states, i); + i--; } + } - if (capture_list_pool_is_empty(&self->capture_list_pool)) { - LOG( - " abandon state. 
index:%u, pattern:%u, offset:%u.\n", - first_unfinished_state_index, - first_unfinished_pattern_index, - first_unfinished_capture_byte - ); - capture_list_pool_release( - &self->capture_list_pool, - self->states.contents[first_unfinished_state_index].capture_list_id - ); - array_erase(&self->states, first_unfinished_state_index); - } + // If there is a finished capture that is clearly before any unfinished + // capture, then return its match, and its capture index. Internally + // record the fact that the capture has been 'consumed'. + QueryState *state; + if (first_finished_state) { + state = first_finished_state; + } else if (first_unfinished_state_is_definite) { + state = &self->states.contents[first_unfinished_state_index]; + } else { + state = NULL; + } + + if (state) { + match->id = state->id; + match->pattern_index = state->pattern_index; + const CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + match->captures = captures->contents; + match->capture_count = captures->size; + *capture_index = state->consumed_capture_count; + state->consumed_capture_count++; + return true; + } + + if (capture_list_pool_is_empty(&self->capture_list_pool)) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + first_unfinished_state_index, + first_unfinished_pattern_index, + first_unfinished_capture_byte + ); + capture_list_pool_release( + &self->capture_list_pool, + self->states.contents[first_unfinished_state_index].capture_list_id + ); + array_erase(&self->states, first_unfinished_state_index); } // If there are no finished matches that are ready to be returned, then // continue finding more matches.
- if (!ts_query_cursor__advance(self)) return false; + if ( + !ts_query_cursor__advance(self, true) && + self->finished_states.size == 0 + ) return false; } } diff --git a/reusable_node.h b/reusable_node.h index 9cba9519..63fe3c1a 100644 --- a/reusable_node.h +++ b/reusable_node.h @@ -20,15 +20,6 @@ static inline void reusable_node_clear(ReusableNode *self) { self->last_external_token = NULL_SUBTREE; } -static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { - reusable_node_clear(self); - array_push(&self->stack, ((StackEntry) { - .tree = tree, - .child_index = 0, - .byte_offset = 0, - })); -} - static inline Subtree reusable_node_tree(ReusableNode *self) { return self->stack.size > 0 ? self->stack.contents[self->stack.size - 1].tree @@ -62,7 +53,7 @@ static inline void reusable_node_advance(ReusableNode *self) { } while (ts_subtree_child_count(tree) <= next_index); array_push(&self->stack, ((StackEntry) { - .tree = tree.ptr->children[next_index], + .tree = ts_subtree_children(tree)[next_index], .child_index = next_index, .byte_offset = byte_offset, })); @@ -72,7 +63,7 @@ static inline bool reusable_node_descend(ReusableNode *self) { StackEntry last_entry = *array_back(&self->stack); if (ts_subtree_child_count(last_entry.tree) > 0) { array_push(&self->stack, ((StackEntry) { - .tree = last_entry.tree.ptr->children[0], + .tree = ts_subtree_children(last_entry.tree)[0], .child_index = 0, .byte_offset = last_entry.byte_offset, })); @@ -86,3 +77,19 @@ static inline void reusable_node_advance_past_leaf(ReusableNode *self) { while (reusable_node_descend(self)) {} reusable_node_advance(self); } + +static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { + reusable_node_clear(self); + array_push(&self->stack, ((StackEntry) { + .tree = tree, + .child_index = 0, + .byte_offset = 0, + })); + + // Never reuse the root node, because it has a non-standard internal structure + // due to transformations that are applied when it is accepted: 
adding the EOF + // child and any extra children. + if (!reusable_node_descend(self)) { + reusable_node_clear(self); + } +} diff --git a/stack.c b/stack.c index ade15775..cc728b05 100644 --- a/stack.c +++ b/stack.c @@ -288,7 +288,7 @@ inline StackSliceArray stack__iter(Stack *self, StackVersion version, bool include_subtrees = false; if (goal_subtree_count >= 0) { include_subtrees = true; - array_reserve(&iterator.subtrees, goal_subtree_count); + array_reserve(&iterator.subtrees, ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree)); } array_push(&self->iterators, iterator); @@ -304,8 +304,9 @@ inline StackSliceArray stack__iter(Stack *self, StackVersion version, if (should_pop) { SubtreeArray subtrees = iterator->subtrees; - if (!should_stop) + if (!should_stop) { ts_subtree_array_copy(subtrees, &subtrees); + } ts_subtree_array_reverse(&subtrees); ts_stack__add_slice( self, @@ -480,6 +481,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c } inline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { + (void)payload; if (iterator->subtree_count >= 1) { if (iterator->is_pending) { return StackActionPop | StackActionStop; @@ -532,6 +534,7 @@ SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) { } inline StackAction pop_all_callback(void *payload, const StackIterator *iterator) { + (void)payload; return iterator->node->link_count == 0 ? 
StackActionPop : StackActionNone; } @@ -569,7 +572,12 @@ void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_dep }; array_init(session.summary); stack__iter(self, version, summarize_stack_callback, &session, -1); - self->heads.contents[version].summary = session.summary; + StackHead *head = &self->heads.contents[version]; + if (head->summary) { + array_delete(head->summary); + ts_free(head->summary); + } + head->summary = session.summary; } StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { @@ -741,6 +749,10 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) ts_stack_error_cost(self, i) ); + if (head->summary) { + fprintf(f, "\nsummary_size: %u", head->summary->size); + } + if (head->last_external_token.ptr) { const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state; const char *data = ts_external_scanner_state_data(state); diff --git a/subtree.c b/subtree.c index b98f1723..e90dc9d7 100644 --- a/subtree.c +++ b/subtree.c @@ -21,7 +21,7 @@ typedef struct { #define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX #define TS_MAX_TREE_POOL_SIZE 32 -static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}}; +static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; // ExternalScannerState @@ -80,26 +80,33 @@ void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) { } } -void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) { +void ts_subtree_array_clear(SubtreePool *pool, SubtreeArray *self) { for (uint32_t i = 0; i < self->size; i++) { ts_subtree_release(pool, self->contents[i]); } - array_delete(self); + array_clear(self); } -SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *self) { - SubtreeArray result = array_new(); +void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) { + ts_subtree_array_clear(pool, self); + array_delete(self); +} - uint32_t i = self->size - 1; - 
for (; i + 1 > 0; i--) { - Subtree child = self->contents[i]; - if (!ts_subtree_extra(child)) break; - array_push(&result, child); +void ts_subtree_array_remove_trailing_extras( + SubtreeArray *self, + SubtreeArray *destination +) { + array_clear(destination); + while (self->size > 0) { + Subtree last = self->contents[self->size - 1]; + if (ts_subtree_extra(last)) { + self->size--; + array_push(destination, last); + } else { + break; + } } - - self->size = i + 1; - ts_subtree_array_reverse(&result); - return result; + ts_subtree_array_reverse(destination); } void ts_subtree_array_reverse(SubtreeArray *self) { @@ -208,7 +215,7 @@ Subtree ts_subtree_new_leaf( .has_external_tokens = has_external_tokens, .is_missing = false, .is_keyword = is_keyword, - .first_leaf = {.symbol = 0, .parse_state = 0}, + {{.first_leaf = {.symbol = 0, .parse_state = 0}}} }; return (Subtree) {.ptr = data}; } @@ -247,28 +254,45 @@ Subtree ts_subtree_new_error( return result; } -MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) { - if (self.data.is_inline) return (MutableSubtree) {self.data}; - if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self); - - SubtreeHeapData *result = ts_subtree_pool_allocate(pool); - memcpy(result, self.ptr, sizeof(SubtreeHeapData)); - if (result->child_count > 0) { - result->children = ts_calloc(self.ptr->child_count, sizeof(Subtree)); - memcpy(result->children, self.ptr->children, result->child_count * sizeof(Subtree)); - for (uint32_t i = 0; i < result->child_count; i++) { - ts_subtree_retain(result->children[i]); +// Clone a subtree. 
+MutableSubtree ts_subtree_clone(Subtree self) { + size_t alloc_size = ts_subtree_alloc_size(self.ptr->child_count); + Subtree *new_children = ts_malloc(alloc_size); + Subtree *old_children = ts_subtree_children(self); + memcpy(new_children, old_children, alloc_size); + SubtreeHeapData *result = (SubtreeHeapData *)&new_children[self.ptr->child_count]; + if (self.ptr->child_count > 0) { + for (uint32_t i = 0; i < self.ptr->child_count; i++) { + ts_subtree_retain(new_children[i]); } - } else if (result->has_external_tokens) { - result->external_scanner_state = ts_external_scanner_state_copy(&self.ptr->external_scanner_state); + } else if (self.ptr->has_external_tokens) { + result->external_scanner_state = ts_external_scanner_state_copy( + &self.ptr->external_scanner_state + ); } result->ref_count = 1; - ts_subtree_release(pool, self); return (MutableSubtree) {.ptr = result}; } -static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLanguage *language, - MutableSubtreeArray *stack) { +// Get mutable version of a subtree. +// +// This takes ownership of the subtree. If the subtree has only one owner, +// this will directly convert it into a mutable version. Otherwise, it will +// perform a copy. 
+MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) { + if (self.data.is_inline) return (MutableSubtree) {self.data}; + if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self); + MutableSubtree result = ts_subtree_clone(self); + ts_subtree_release(pool, self); + return result; +} + +static void ts_subtree__compress( + MutableSubtree self, + unsigned count, + const TSLanguage *language, + MutableSubtreeArray *stack +) { unsigned initial_stack_size = stack->size; MutableSubtree tree = self; @@ -276,7 +300,7 @@ static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLa for (unsigned i = 0; i < count; i++) { if (tree.ptr->ref_count > 1 || tree.ptr->child_count < 2) break; - MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); + MutableSubtree child = ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]); if ( child.data.is_inline || child.ptr->child_count < 2 || @@ -284,7 +308,7 @@ static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLa child.ptr->symbol != symbol ) break; - MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[0]); + MutableSubtree grandchild = ts_subtree_to_mut_unsafe(ts_subtree_children(child)[0]); if ( grandchild.data.is_inline || grandchild.ptr->child_count < 2 || @@ -292,20 +316,20 @@ static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLa grandchild.ptr->symbol != symbol ) break; - tree.ptr->children[0] = ts_subtree_from_mut(grandchild); - child.ptr->children[0] = grandchild.ptr->children[grandchild.ptr->child_count - 1]; - grandchild.ptr->children[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child); + ts_subtree_children(tree)[0] = ts_subtree_from_mut(grandchild); + ts_subtree_children(child)[0] = ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1]; + ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child); array_push(stack, tree); tree = 
grandchild; } while (stack->size > initial_stack_size) { tree = array_pop(stack); - MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); - MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[child.ptr->child_count - 1]); - ts_subtree_set_children(grandchild, grandchild.ptr->children, grandchild.ptr->child_count, language); - ts_subtree_set_children(child, child.ptr->children, child.ptr->child_count, language); - ts_subtree_set_children(tree, tree.ptr->children, tree.ptr->child_count, language); + MutableSubtree child = ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]); + MutableSubtree grandchild = ts_subtree_to_mut_unsafe(ts_subtree_children(child)[child.ptr->child_count - 1]); + ts_subtree_summarize_children(grandchild, language); + ts_subtree_summarize_children(child, language); + ts_subtree_summarize_children(tree, language); } } @@ -320,8 +344,8 @@ void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *langu MutableSubtree tree = array_pop(&pool->tree_stack); if (tree.ptr->repeat_depth > 0) { - Subtree child1 = tree.ptr->children[0]; - Subtree child2 = tree.ptr->children[tree.ptr->child_count - 1]; + Subtree child1 = ts_subtree_children(tree)[0]; + Subtree child2 = ts_subtree_children(tree)[tree.ptr->child_count - 1]; long repeat_delta = (long)ts_subtree_repeat_depth(child1) - (long)ts_subtree_repeat_depth(child2); if (repeat_delta > 0) { unsigned n = repeat_delta; @@ -333,7 +357,7 @@ void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *langu } for (uint32_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = tree.ptr->children[i]; + Subtree child = ts_subtree_children(tree)[i]; if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) { array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); } @@ -341,17 +365,13 @@ void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *langu } } -void ts_subtree_set_children( - MutableSubtree 
self, Subtree *children, uint32_t child_count, const TSLanguage *language +// Assign all of the node's properties that depend on its children. +void ts_subtree_summarize_children( + MutableSubtree self, + const TSLanguage *language ) { assert(!self.data.is_inline); - if (self.ptr->child_count > 0 && children != self.ptr->children) { - ts_free(self.ptr->children); - } - - self.ptr->child_count = child_count; - self.ptr->children = children; self.ptr->named_child_count = 0; self.ptr->visible_child_count = 0; self.ptr->error_cost = 0; @@ -360,12 +380,13 @@ void ts_subtree_set_children( self.ptr->has_external_tokens = false; self.ptr->dynamic_precedence = 0; - uint32_t non_extra_index = 0; + uint32_t structural_index = 0; const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id); uint32_t lookahead_end_byte = 0; + const Subtree *children = ts_subtree_children(self); for (uint32_t i = 0; i < self.ptr->child_count; i++) { - Subtree child = self.ptr->children[i]; + Subtree child = children[i]; if (i == 0) { self.ptr->padding = ts_subtree_padding(child); @@ -384,18 +405,29 @@ void ts_subtree_set_children( self.ptr->error_cost += ts_subtree_error_cost(child); } + uint32_t grandchild_count = ts_subtree_child_count(child); + if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) { + if (!ts_subtree_extra(child) && !(ts_subtree_is_error(child) && grandchild_count == 0)) { + if (ts_subtree_visible(child)) { + self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE; + } else if (grandchild_count > 0) { + self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count; + } + } + } + self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child); self.ptr->node_count += ts_subtree_node_count(child); - if (alias_sequence && alias_sequence[non_extra_index] != 0 && !ts_subtree_extra(child)) { + if (alias_sequence && alias_sequence[structural_index] != 0 && !ts_subtree_extra(child)) { 
self.ptr->visible_child_count++; - if (ts_language_symbol_metadata(language, alias_sequence[non_extra_index]).named) { + if (ts_language_symbol_metadata(language, alias_sequence[structural_index]).named) { self.ptr->named_child_count++; } } else if (ts_subtree_visible(child)) { self.ptr->visible_child_count++; if (ts_subtree_named(child)) self.ptr->named_child_count++; - } else if (ts_subtree_child_count(child) > 0) { + } else if (grandchild_count > 0) { self.ptr->visible_child_count += child.ptr->visible_child_count; self.ptr->named_child_count += child.ptr->named_child_count; } @@ -407,7 +439,7 @@ void ts_subtree_set_children( self.ptr->parse_state = TS_TREE_STATE_NONE; } - if (!ts_subtree_extra(child)) non_extra_index++; + if (!ts_subtree_extra(child)) structural_index++; } self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes; @@ -417,22 +449,11 @@ void ts_subtree_set_children( ERROR_COST_PER_RECOVERY + ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes + ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row; - for (uint32_t i = 0; i < self.ptr->child_count; i++) { - Subtree child = self.ptr->children[i]; - uint32_t grandchild_count = ts_subtree_child_count(child); - if (ts_subtree_extra(child)) continue; - if (ts_subtree_is_error(child) && grandchild_count == 0) continue; - if (ts_subtree_visible(child)) { - self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE; - } else if (grandchild_count > 0) { - self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count; - } - } } if (self.ptr->child_count > 0) { - Subtree first_child = self.ptr->children[0]; - Subtree last_child = self.ptr->children[self.ptr->child_count - 1]; + Subtree first_child = children[0]; + Subtree last_child = children[self.ptr->child_count - 1]; self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child); self.ptr->first_leaf.parse_state = ts_subtree_leaf_parse_state(first_child); @@ -455,52 +476,82 @@ void 
ts_subtree_set_children( } } -MutableSubtree ts_subtree_new_node(SubtreePool *pool, TSSymbol symbol, - SubtreeArray *children, unsigned production_id, - const TSLanguage *language) { +// Create a new parent node with the given children. +// +// This takes ownership of the children array. +MutableSubtree ts_subtree_new_node( + TSSymbol symbol, + SubtreeArray *children, + unsigned production_id, + const TSLanguage *language +) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); bool fragile = symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat; - SubtreeHeapData *data = ts_subtree_pool_allocate(pool); + + // Allocate the node's data at the end of the array of children. + size_t new_byte_size = ts_subtree_alloc_size(children->size); + if (children->capacity * sizeof(Subtree) < new_byte_size) { + children->contents = ts_realloc(children->contents, new_byte_size); + children->capacity = new_byte_size / sizeof(Subtree); + } + SubtreeHeapData *data = (SubtreeHeapData *)&children->contents[children->size]; + *data = (SubtreeHeapData) { .ref_count = 1, .symbol = symbol, - .production_id = production_id, + .child_count = children->size, .visible = metadata.visible, .named = metadata.named, .has_changes = false, .fragile_left = fragile, .fragile_right = fragile, .is_keyword = false, - .node_count = 0, - .first_leaf = {.symbol = 0, .parse_state = 0}, + {{ + .node_count = 0, + .production_id = production_id, + .first_leaf = {.symbol = 0, .parse_state = 0}, + }} }; MutableSubtree result = {.ptr = data}; - ts_subtree_set_children(result, children->contents, children->size, language); + ts_subtree_summarize_children(result, language); return result; } -Subtree ts_subtree_new_error_node(SubtreePool *pool, SubtreeArray *children, - bool extra, const TSLanguage *language) { +// Create a new error node contaning the given children. +// +// This node is treated as 'extra'. 
Its children are prevented from having +// having any effect on the parse state. +Subtree ts_subtree_new_error_node( + SubtreeArray *children, + bool extra, + const TSLanguage *language +) { MutableSubtree result = ts_subtree_new_node( - pool, ts_builtin_sym_error, children, 0, language + ts_builtin_sym_error, children, 0, language ); result.ptr->extra = extra; return ts_subtree_from_mut(result); } -Subtree ts_subtree_new_missing_leaf(SubtreePool *pool, TSSymbol symbol, Length padding, - const TSLanguage *language) { +// Create a new 'missing leaf' node. +// +// This node is treated as 'extra'. Its children are prevented from having +// having any effect on the parse state. +Subtree ts_subtree_new_missing_leaf( + SubtreePool *pool, + TSSymbol symbol, + Length padding, + const TSLanguage *language +) { Subtree result = ts_subtree_new_leaf( pool, symbol, padding, length_zero(), 0, 0, false, false, language ); - if (result.data.is_inline) { result.data.is_missing = true; } else { ((SubtreeHeapData *)result.ptr)->is_missing = true; } - return result; } @@ -523,19 +574,22 @@ void ts_subtree_release(SubtreePool *pool, Subtree self) { while (pool->tree_stack.size > 0) { MutableSubtree tree = array_pop(&pool->tree_stack); if (tree.ptr->child_count > 0) { + Subtree *children = ts_subtree_children(tree); for (uint32_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = tree.ptr->children[i]; + Subtree child = children[i]; if (child.data.is_inline) continue; assert(child.ptr->ref_count > 0); if (atomic_dec((volatile uint32_t *)&child.ptr->ref_count) == 0) { array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); } } - ts_free(tree.ptr->children); - } else if (tree.ptr->has_external_tokens) { - ts_external_scanner_state_delete(&tree.ptr->external_scanner_state); + ts_free(children); + } else { + if (tree.ptr->has_external_tokens) { + ts_external_scanner_state_delete(&tree.ptr->external_scanner_state); + } + ts_subtree_pool_free(pool, tree.ptr); } - 
ts_subtree_pool_free(pool, tree.ptr); } } @@ -562,7 +616,7 @@ bool ts_subtree_eq(Subtree self, Subtree other) { if (self.ptr->named_child_count != other.ptr->named_child_count) return false; for (uint32_t i = 0; i < self.ptr->child_count; i++) { - if (!ts_subtree_eq(self.ptr->children[i], other.ptr->children[i])) { + if (!ts_subtree_eq(ts_subtree_children(self)[i], ts_subtree_children(other)[i])) { return false; } } @@ -576,8 +630,8 @@ int ts_subtree_compare(Subtree left, Subtree right) { if (ts_subtree_child_count(left) < ts_subtree_child_count(right)) return -1; if (ts_subtree_child_count(right) < ts_subtree_child_count(left)) return 1; for (uint32_t i = 0, n = ts_subtree_child_count(left); i < n; i++) { - Subtree left_child = left.ptr->children[i]; - Subtree right_child = right.ptr->children[i]; + Subtree left_child = ts_subtree_children(left)[i]; + Subtree right_child = ts_subtree_children(right)[i]; switch (ts_subtree_compare(left_child, right_child)) { case -1: return -1; case 1: return 1; @@ -693,7 +747,7 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool Length child_left, child_right = length_zero(); for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; i++) { - Subtree *child = &result.ptr->children[i]; + Subtree *child = &ts_subtree_children(*entry.tree)[i]; Length child_size = ts_subtree_total_size(*child); child_left = child_right; child_right = length_add(child_left, child_size); @@ -748,7 +802,7 @@ Subtree ts_subtree_last_external_token(Subtree tree) { if (!ts_subtree_has_external_tokens(tree)) return NULL_SUBTREE; while (tree.ptr->child_count > 0) { for (uint32_t i = tree.ptr->child_count - 1; i + 1 > 0; i--) { - Subtree child = tree.ptr->children[i]; + Subtree child = ts_subtree_children(tree)[i]; if (ts_subtree_has_external_tokens(child)) { tree = child; break; @@ -851,7 +905,7 @@ static size_t ts_subtree__write_to_string( uint32_t structural_child_index = 0; for (uint32_t i = 0; i < 
self.ptr->child_count; i++) { - Subtree child = self.ptr->children[i]; + Subtree child = ts_subtree_children(self)[i]; if (ts_subtree_extra(child)) { cursor += ts_subtree__write_to_string( child, *writer, limit, @@ -900,7 +954,7 @@ char *ts_subtree_string( language, include_all, 0, false, ROOT_FIELD ) + 1; - char *result = malloc(size * sizeof(char)); + char *result = ts_malloc(size * sizeof(char)); ts_subtree__write_to_string( self, result, size, language, include_all, @@ -948,7 +1002,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, language->max_alias_sequence_length * ts_subtree_production_id(*self); for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) { - const Subtree *child = &self->ptr->children[i]; + const Subtree *child = &ts_subtree_children(*self)[i]; TSSymbol alias_symbol = 0; if (!ts_subtree_extra(*child) && child_info_offset) { alias_symbol = language->alias_sequences[child_info_offset]; diff --git a/subtree.h b/subtree.h index 178eb4e9..899d592f 100644 --- a/subtree.h +++ b/subtree.h @@ -14,12 +14,19 @@ extern "C" { #include "api.h" #include "parser.h" -static const TSStateId TS_TREE_STATE_NONE = USHRT_MAX; +#define TS_TREE_STATE_NONE USHRT_MAX #define NULL_SUBTREE ((Subtree) {.ptr = NULL}) -typedef union Subtree Subtree; -typedef union MutableSubtree MutableSubtree; - +// The serialized state of an external scanner. +// +// Every time an external token subtree is created after a call to an +// external scanner, the scanner's `serialize` function is called to +// retrieve a serialized copy of its state. The bytes are then copied +// onto the subtree itself so that the scanner's state can later be +// restored using its `deserialize` function. +// +// Small byte arrays are stored inline, and long ones are allocated +// separately on the heap. typedef struct { union { char *long_data; @@ -28,6 +35,10 @@ typedef struct { uint32_t length; } ExternalScannerState; +// A compact representation of a subtree. 
+// +// This representation is used for small leaf nodes that are not +// errors, and were not created by an external scanner. typedef struct { bool is_inline : 1; bool visible : 1; @@ -45,6 +56,11 @@ typedef struct { uint16_t parse_state; } SubtreeInlineData; +// A heap-allocated representation of a subtree. +// +// This representation is used for parent nodes, external tokens, +// errors, and other leaf nodes whose data is too large to fit into +// the inlinen representation. typedef struct { volatile uint32_t ref_count; Length padding; @@ -68,7 +84,6 @@ typedef struct { union { // Non-terminal subtrees (`child_count > 0`) struct { - Subtree *children; uint32_t visible_child_count; uint32_t named_child_count; uint32_t node_count; @@ -89,15 +104,17 @@ typedef struct { }; } SubtreeHeapData; -union Subtree { +// The fundamental building block of a syntax tree. +typedef union { SubtreeInlineData data; const SubtreeHeapData *ptr; -}; +} Subtree; -union MutableSubtree { +// Like Subtree, but mutable. 
+typedef union { SubtreeInlineData data; SubtreeHeapData *ptr; -}; +} MutableSubtree; typedef Array(Subtree) SubtreeArray; typedef Array(MutableSubtree) MutableSubtreeArray; @@ -111,8 +128,9 @@ void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsign const char *ts_external_scanner_state_data(const ExternalScannerState *); void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); +void ts_subtree_array_clear(SubtreePool *, SubtreeArray *); void ts_subtree_array_delete(SubtreePool *, SubtreeArray *); -SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *); +void ts_subtree_array_remove_trailing_extras(SubtreeArray *, SubtreeArray *); void ts_subtree_array_reverse(SubtreeArray *); SubtreePool ts_subtree_pool_new(uint32_t capacity); @@ -125,8 +143,8 @@ Subtree ts_subtree_new_leaf( Subtree ts_subtree_new_error( SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage * ); -MutableSubtree ts_subtree_new_node(SubtreePool *, TSSymbol, SubtreeArray *, unsigned, const TSLanguage *); -Subtree ts_subtree_new_error_node(SubtreePool *, SubtreeArray *, bool, const TSLanguage *); +MutableSubtree ts_subtree_new_node(TSSymbol, SubtreeArray *, unsigned, const TSLanguage *); +Subtree ts_subtree_new_error_node(SubtreeArray *, bool, const TSLanguage *); Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, const TSLanguage *); MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree); void ts_subtree_retain(Subtree); @@ -134,7 +152,8 @@ void ts_subtree_release(SubtreePool *, Subtree); bool ts_subtree_eq(Subtree, Subtree); int ts_subtree_compare(Subtree, Subtree); void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); -void ts_subtree_set_children(MutableSubtree, Subtree *, uint32_t, const TSLanguage *); +void ts_subtree_summarize(MutableSubtree, const Subtree *, uint32_t, const TSLanguage *); +void ts_subtree_summarize_children(MutableSubtree, const TSLanguage *); void 
ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all); @@ -156,6 +175,17 @@ static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE #undef SUBTREE_GET +// Get the size needed to store a heap-allocated subtree with the given +// number of children. +static inline size_t ts_subtree_alloc_size(uint32_t child_count) { + return child_count * sizeof(Subtree) + sizeof(SubtreeHeapData); +} + +// Get a subtree's children, which are allocated immediately before the +// tree's own heap data. +#define ts_subtree_children(self) \ + ((self).data.is_inline ? NULL : (Subtree *)((self).ptr) - (self).ptr->child_count) + static inline void ts_subtree_set_extra(MutableSubtree *self) { if (self->data.is_inline) { self->data.extra = true; diff --git a/tree_cursor.c b/tree_cursor.c index 4ca83264..b9856240 100644 --- a/tree_cursor.c +++ b/tree_cursor.c @@ -38,7 +38,7 @@ static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self, TreeCursorEntry *result, bool *visible) { if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false; - const Subtree *child = &self->parent.ptr->children[self->child_index]; + const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; *result = (TreeCursorEntry) { .subtree = child, .position = self->position, @@ -56,7 +56,7 @@ static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self, self->child_index++; if (self->child_index < self->parent.ptr->child_count) { - Subtree next_child = self->parent.ptr->children[self->child_index]; + Subtree next_child = ts_subtree_children(self->parent)[self->child_index]; self->position = length_add(self->position, ts_subtree_padding(next_child)); } @@ -205,19 +205,21 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { TreeCursor *self = (TreeCursor 
*)_self; for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) { TreeCursorEntry *entry = &self->stack.contents[i]; - bool is_aliased = false; - if (i > 0) { - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - is_aliased = alias_sequence && alias_sequence[entry->structural_child_index]; - } - if (ts_subtree_visible(*entry->subtree) || is_aliased) { + if (ts_subtree_visible(*entry->subtree)) { self->stack.size = i + 1; return true; } + if (i > 0 && !ts_subtree_extra(*entry->subtree)) { + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + if (ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + entry->structural_child_index + )) { + self->stack.size = i + 1; + return true; + } + } } return false; } @@ -226,15 +228,13 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { const TreeCursor *self = (const TreeCursor *)_self; TreeCursorEntry *last_entry = array_back(&self->stack); TSSymbol alias_symbol = 0; - if (self->stack.size > 1) { + if (self->stack.size > 1 && !ts_subtree_extra(*last_entry->subtree)) { TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2]; - const TSSymbol *alias_sequence = ts_language_alias_sequence( + alias_symbol = ts_language_alias_at( self->tree->language, - parent_entry->subtree->ptr->production_id + parent_entry->subtree->ptr->production_id, + last_entry->structural_child_index ); - if (alias_sequence && !ts_subtree_extra(*last_entry->subtree)) { - alias_symbol = alias_sequence[last_entry->structural_child_index]; - } } return ts_node_new( self->tree, @@ -244,14 +244,23 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { ); } -TSFieldId ts_tree_cursor_current_status( +// Private - Get various facts about the current node that are needed +// when executing tree queries. 
+void ts_tree_cursor_current_status( const TSTreeCursor *_self, - bool *can_have_later_siblings, - bool *can_have_later_siblings_with_this_field + TSFieldId *field_id, + bool *has_later_siblings, + bool *has_later_named_siblings, + bool *can_have_later_siblings_with_this_field, + TSSymbol *supertypes, + unsigned *supertype_count ) { const TreeCursor *self = (const TreeCursor *)_self; - TSFieldId result = 0; - *can_have_later_siblings = false; + unsigned max_supertypes = *supertype_count; + *field_id = 0; + *supertype_count = 0; + *has_later_siblings = false; + *has_later_named_siblings = false; *can_have_later_siblings_with_this_field = false; // Walk up the tree, visiting the current node and its invisible ancestors, @@ -260,54 +269,130 @@ TSFieldId ts_tree_cursor_current_status( TreeCursorEntry *entry = &self->stack.contents[i]; TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + const TSSymbol *alias_sequence = ts_language_alias_sequence( + self->tree->language, + parent_entry->subtree->ptr->production_id + ); + + #define subtree_symbol(subtree, structural_child_index) \ + (( \ + !ts_subtree_extra(subtree) && \ + alias_sequence && \ + alias_sequence[structural_child_index] \ + ) ? \ + alias_sequence[structural_child_index] : \ + ts_subtree_symbol(subtree)) + // Stop walking up when a visible ancestor is found. 
- if (i != self->stack.size - 1) { - if (ts_subtree_visible(*entry->subtree)) break; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - if (alias_sequence && alias_sequence[entry->structural_child_index]) { - break; - } + TSSymbol entry_symbol = subtree_symbol( + *entry->subtree, + entry->structural_child_index + ); + TSSymbolMetadata entry_metadata = ts_language_symbol_metadata( + self->tree->language, + entry_symbol + ); + if (i != self->stack.size - 1 && entry_metadata.visible) break; + + // Record any supertypes + if (entry_metadata.supertype && *supertype_count < max_supertypes) { + supertypes[*supertype_count] = entry_symbol; + (*supertype_count)++; } - if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) { - *can_have_later_siblings = true; + // Determine if the current node has later siblings. + if (!*has_later_siblings) { + unsigned sibling_count = parent_entry->subtree->ptr->child_count; + unsigned structural_child_index = entry->structural_child_index; + if (!ts_subtree_extra(*entry->subtree)) structural_child_index++; + for (unsigned j = entry->child_index + 1; j < sibling_count; j++) { + Subtree sibling = ts_subtree_children(*parent_entry->subtree)[j]; + TSSymbolMetadata sibling_metadata = ts_language_symbol_metadata( + self->tree->language, + subtree_symbol(sibling, structural_child_index) + ); + if (sibling_metadata.visible) { + *has_later_siblings = true; + if (*has_later_named_siblings) break; + if (sibling_metadata.named) { + *has_later_named_siblings = true; + break; + } + } else if (ts_subtree_visible_child_count(sibling) > 0) { + *has_later_siblings = true; + if (*has_later_named_siblings) break; + if (sibling.ptr->named_child_count > 0) { + *has_later_named_siblings = true; + break; + } + } + if (!ts_subtree_extra(sibling)) structural_child_index++; + } } - if (ts_subtree_extra(*entry->subtree)) break; + #undef subtree_symbol - const 
TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map( - self->tree->language, - parent_entry->subtree->ptr->production_id, - &field_map, &field_map_end - ); + if (!ts_subtree_extra(*entry->subtree)) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self->tree->language, + parent_entry->subtree->ptr->production_id, + &field_map, &field_map_end + ); - // Look for a field name associated with the current node. - if (!result) { - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (!i->inherited && i->child_index == entry->structural_child_index) { - result = i->field_id; - *can_have_later_siblings_with_this_field = false; - break; + // Look for a field name associated with the current node. + if (!*field_id) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (!i->inherited && i->child_index == entry->structural_child_index) { + *field_id = i->field_id; + break; + } } } - } - // Determine if there other later siblings with the same field name. - if (result) { - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (i->field_id == result && i->child_index > entry->structural_child_index) { - *can_have_later_siblings_with_this_field = true; - break; + // Determine if the current node can have later siblings with the same field name. 
+ if (*field_id) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (i->field_id == *field_id) { + if ( + i->child_index > entry->structural_child_index || + (i->child_index == entry->structural_child_index && *has_later_named_siblings) + ) { + *can_have_later_siblings_with_this_field = true; + break; + } + } } } } } +} - return result; +TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + for (int i = (int)self->stack.size - 2; i >= 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + bool is_visible = true; + TSSymbol alias_symbol = 0; + if (i > 0) { + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + alias_symbol = ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + entry->structural_child_index + ); + is_visible = (alias_symbol != 0) || ts_subtree_visible(*entry->subtree); + } + if (is_visible) { + return ts_node_new( + self->tree, + entry->subtree, + entry->position, + alias_symbol + ); + } + } + return ts_node_new(NULL, NULL, length_zero(), 0); } TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { @@ -321,13 +406,14 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { // Stop walking up when another visible node is found. 
if (i != self->stack.size - 1) { if (ts_subtree_visible(*entry->subtree)) break; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - if (alias_sequence && alias_sequence[entry->structural_child_index]) { - break; - } + if ( + !ts_subtree_extra(*entry->subtree) && + ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + entry->structural_child_index + ) + ) break; } if (ts_subtree_extra(*entry->subtree)) break; diff --git a/tree_cursor.h b/tree_cursor.h index 5a39dd27..69647d1d 100644 --- a/tree_cursor.h +++ b/tree_cursor.h @@ -16,6 +16,16 @@ typedef struct { } TreeCursor; void ts_tree_cursor_init(TreeCursor *, TSNode); -TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *); +void ts_tree_cursor_current_status( + const TSTreeCursor *, + TSFieldId *, + bool *, + bool *, + bool *, + TSSymbol *, + unsigned * +); + +TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); #endif // TREE_SITTER_TREE_CURSOR_H_ diff --git a/vendor.sh b/vendor.sh index 51dfe1c2..7dd68182 100755 --- a/vendor.sh +++ b/vendor.sh @@ -4,7 +4,7 @@ set -e -sitter_version=0.16.5 +sitter_version=0.17.3 grammars=( "bash;v0.16.1;parser.c;scanner.cc" "c-sharp;v0.16.1;parser.c;scanner.c"