From d4f5a4bb50023b8f3356c2ea7b07447f4cb21d78 Mon Sep 17 00:00:00 2001 From: Gabriel Dos Reis Date: Fri, 26 Jul 2024 03:32:49 +0900 Subject: [PATCH] Simplify native reading (#54) * Simplify native reading Simplify the operations of reading and tokenizing an input source file that is either Boot or Spad. * Fix thinko * Use `std::numeric_limits` to express intent --- src/include/open-axiom/InputFragment | 12 ++- src/include/open-axiom/token | 130 ++++++++++++++++----------- src/io/InputFragment.cxx | 11 ++- src/syntax/Parser.cxx | 14 +-- src/syntax/token.cxx | 10 ++- 5 files changed, 117 insertions(+), 60 deletions(-) diff --git a/src/include/open-axiom/InputFragment b/src/include/open-axiom/InputFragment index 67863e44e..2837729f4 100644 --- a/src/include/open-axiom/InputFragment +++ b/src/include/open-axiom/InputFragment @@ -1,5 +1,5 @@ // -*- C++ -*- -// Copyright (C) 2014-2015, Gabriel Dos Reis. +// Copyright (C) 2014-2024, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -127,6 +127,16 @@ namespace OpenAxiom { }; std::ostream& operator<<(std::ostream&, const Fragment&); + + // A prose is the contents of an input source file organized as a + // sequence of fragments. + // Note: a prose is defined as movable, but not a copyable type. + struct Prose : std::vector { + Prose() = default; + Prose(Prose&&) = default; + }; + + Prose read_source(std::istream&); } #endif // OPENAXIOM_INPUTFRAGMENT_included diff --git a/src/include/open-axiom/token b/src/include/open-axiom/token index 319c670fb..bd4b5ea6b 100644 --- a/src/include/open-axiom/token +++ b/src/include/open-axiom/token @@ -1,5 +1,5 @@ // -*- C++ -*- -// Copyright (C) 2013-2022, Gabriel Dos Reis. +// Copyright (C) 2013-2024, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -35,6 +35,7 @@ #define OPENAXIOM_TOKEN_included #include +#include #include #include #include @@ -62,6 +63,7 @@ namespace OpenAxiom { std::ostream& operator<<(std::ostream&, TokenCategory); // The abstract value associated with a token. + // Note: All token values are within the bound of an 8-bit integer. enum class TokenValue : std::uint8_t { #undef OPENAXIOM_DEFINE_TOKEN #define OPENAXIOM_DEFINE_TOKEN(T, ...) T, @@ -69,10 +71,10 @@ namespace OpenAxiom { #undef OPENAXIOM_DEFINE_TOKEN EndOfStream // end of token stream }; + static_assert(TokenValue::EndOfStream <= TokenValue{std::numeric_limits::max()}); std::ostream& operator<<(std::ostream&, TokenValue); - enum class TokenIndex : std::uint32_t { }; constexpr TokenValue value(TokenIndex t) { @@ -120,20 +122,21 @@ namespace OpenAxiom { // token stream. The tokens are of type indicated by Tok. template struct Tokenizer { - Tokenizer(Fragment& f) + Tokenizer(const Fragment& f) : frag(f), pos{ 0, frag.front().indent } { indents.push(pos); } - bool eos() const { + bool eos() const + { return pos.line >= frag.size(); } Tok get(Language = Language::Spad); private: - Fragment& frag; + const Fragment& frag; FragmentCursor pos; std::stack indents; @@ -149,32 +152,37 @@ namespace OpenAxiom { bool separator_or_punctuator(std::uint8_t); template - inline void comment_token(T& t, TokenValue v) { + inline void comment_token(T& t, TokenValue v) + { t.category = TokenCategory::Comment; t.value = v; } template - inline T& formatting_token(T& t, TokenValue v) { + inline T& formatting_token(T& t, TokenValue v) + { t.category = TokenCategory::Formatting; t.value = v; return t; } template - inline void operator_token(T& t, TokenValue v) { + inline void operator_token(T& t, TokenValue v) + { t.category = TokenCategory::Operator; t.value = v; } template - inline void punctuator_token(T& t, TokenValue v) { + inline void punctuator_token(T& t, TokenValue v) + { t.category = TokenCategory::Punctuator; t.value = v; } template - inline T& eos_token(T& t) { + inline T& eos_token(T& t) + { t.category = TokenCategory::EOS; t.value = TokenValue::EndOfStream; t.end = t.start; @@ -191,7 +199,8 @@ namespace OpenAxiom { } template - void junk(L& line, ColumnIndex& idx, T& t) { + void junk(L& line, ColumnIndex& idx, T& t) + { while (idx < line.size() and not separator_or_punctuator(line[idx])) ++idx; t.category = TokenCategory::Junk; @@ -206,7 +215,8 @@ namespace OpenAxiom { } template - void string_literal(Fragment& frag, FragmentCursor& pos, Tok& t) { + void string_literal(const Fragment& frag, FragmentCursor& pos, Tok& t) + { bool done = false; bool escape = false; while (frag.covering(pos) && not done) { @@ -231,12 +241,14 @@ namespace OpenAxiom { } template - void skip_to_end_of_integer(L& line, ColumnIndex& idx) { + void skip_to_end_of_integer(L& line, ColumnIndex& idx) + { while (idx < line.size() and isdigit(line[idx])) ++idx; } - static bool next_line(Fragment& frag, FragmentCursor& pos) { + static bool next_line(const Fragment& frag, FragmentCursor& pos) + { if (++pos.line < frag.size()) { pos.column = frag(pos).indent; return true; @@ -246,13 +258,15 @@ namespace OpenAxiom { } template - void integer(L& line, ColumnIndex& idx, T& t) { + void integer(L& line, ColumnIndex& idx, T& t) + { skip_to_end_of_integer(line, idx); t.category = TokenCategory::Integer; } template - T& number(L& line, ColumnIndex& idx, T& t) { + T& number(L& line, ColumnIndex& idx, T& t) + { integer(line, idx, t); if (idx >= line.size() or line[idx] != '.') return t; @@ -274,33 +288,39 @@ namespace OpenAxiom { } inline bool - identifier_head(std::uint8_t c) { + identifier_head(std::uint8_t c) + { return isalpha(c) or c == '%' or c == '_'; } inline bool - identifier_part(uint8_t c) { + identifier_part(uint8_t c) + { return identifier_head(c) or isdigit(c); } inline bool - identifier_suffix(std::uint8_t c) { + identifier_suffix(std::uint8_t c) + { return c == '!' or c == '?'; } - inline bool internal_prefix(std::uint8_t c) { + inline bool internal_prefix(std::uint8_t c) + { return c == '%' or c == '$'; } template inline void - skip_prefix(L& line, ColumnIndex& idx, std::uint8_t c) { + skip_prefix(L& line, ColumnIndex& idx, std::uint8_t c) + { while (idx < line.size() and line[idx] == c) ++idx; } template - T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) { + T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) + { t.category = TokenCategory::Identifier; ColumnIndex start = --idx; // idx was ahead by 1. @@ -328,7 +348,8 @@ namespace OpenAxiom { } template - void left_paren_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void left_paren_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { punctuator_token(t, TokenValue::OpenParen); if (frag.covering(pos) and frag[pos] == '|') { ++pos; @@ -337,7 +358,8 @@ namespace OpenAxiom { } template - void left_brace_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void left_brace_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { punctuator_token(t, TokenValue::OpenBrace); if (frag.covering(pos) and frag[pos] == '|') { ++pos; @@ -346,7 +368,8 @@ namespace OpenAxiom { } template - void left_bracket_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void left_bracket_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { punctuator_token(t, TokenValue::OpenBracket); if (frag.covering(pos) and frag[pos] == '|') { ++pos; @@ -355,7 +378,8 @@ namespace OpenAxiom { } template - void colon_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void colon_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Colon); if (frag.covering(pos)) switch (frag[pos]) { @@ -367,7 +391,8 @@ namespace OpenAxiom { } template - void star_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void star_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Star); if (frag.covering(pos) and frag[pos] == '*') { t.value = TokenValue::StarStar; @@ -376,7 +401,8 @@ namespace OpenAxiom { } template - void slash_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void slash_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Slash); if (frag.covering(pos)) switch (frag[pos]) { @@ -387,7 +413,8 @@ namespace OpenAxiom { } template - void backslash_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void backslash_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Backslash); if (frag.covering(pos)) switch (frag[pos]) { @@ -398,7 +425,8 @@ namespace OpenAxiom { } template - void less_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void less_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Less); if (frag.covering(pos)) switch (frag[pos]) { @@ -416,7 +444,8 @@ namespace OpenAxiom { } template - void equal_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void equal_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Eq); if (frag.covering(pos)) switch (frag[pos]) { @@ -433,7 +462,8 @@ namespace OpenAxiom { } template - void tilde_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void tilde_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Tilde); if (frag.covering(pos) and frag[pos] == '=') { t.value = TokenValue::TildeEq; @@ -442,7 +472,8 @@ namespace OpenAxiom { } template - void greater_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void greater_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Greater); if (frag.covering(pos)) switch (frag[pos]) { @@ -452,7 +483,8 @@ namespace OpenAxiom { } template - void bar_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void bar_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { punctuator_token(t, TokenValue::Bar); if (frag.covering(pos)) switch (frag[pos]) { @@ -464,7 +496,8 @@ namespace OpenAxiom { } template - void minus_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void minus_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Minus); if (frag.covering(pos)) switch (frag[pos]) { @@ -478,7 +511,8 @@ namespace OpenAxiom { template - void plus_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void plus_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Plus); if (frag.covering(pos)) switch (frag[pos]) { @@ -498,7 +532,8 @@ namespace OpenAxiom { } template - void dot_et_al(Fragment& frag, FragmentCursor& pos, Tok& t) { + void dot_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t) + { operator_token(t, TokenValue::Dot); if (frag.covering(pos) and frag[pos] == '.') { t.value = TokenValue::DotDot; @@ -507,8 +542,8 @@ namespace OpenAxiom { } template - void - dollar_et_al(Fragment& frag, FragmentCursor& pos, Tok& t, Language dialect) { + void dollar_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t, Language dialect) + { if (dialect != Language::Boot or not frag.covering(pos) or separator_or_punctuator(frag[pos])) operator_token(t, TokenValue::Dollar); @@ -517,8 +552,8 @@ namespace OpenAxiom { } template - void - sharp_et_al(Fragment& frag, FragmentCursor& pos, Tok& t, Language dialect) { + void sharp_et_al(const Fragment& frag, FragmentCursor& pos, Tok& t, Language dialect) + { if (dialect != Language::Lisp) operator_token(t, TokenValue::Sharp); else if (frag.covering(pos)) @@ -534,7 +569,8 @@ namespace OpenAxiom { } template - Tok Tokenizer::finish(Tok& t, Language dialect) { + Tok Tokenizer::finish(Tok& t, Language dialect) + { switch (auto c = frag.advance(pos)) { case '#': sharp_et_al(frag, pos, t, dialect); break; case '@': operator_token(t, TokenValue::At); break; @@ -625,14 +661,8 @@ namespace OpenAxiom { } // -- Token streams. - template - struct TokenStream : std::vector { - explicit TokenStream(Fragment& f, Language dialect = Language::Spad) { - Tokenizer lex { f }; - while (auto t = lex.get(dialect)) - this->push_back(t); - } - }; + std::vector words(const Fragment&, Language); + } #endif // OPENAXIOM_TOKEN_included diff --git a/src/io/InputFragment.cxx b/src/io/InputFragment.cxx index 595b88b1d..6e7bc421a 100644 --- a/src/io/InputFragment.cxx +++ b/src/io/InputFragment.cxx @@ -1,5 +1,5 @@ // -*- C++ -*- -// Copyright (C) 2014-2017, Gabriel Dos Reis. +// Copyright (C) 2014-2024, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -148,4 +148,13 @@ namespace OpenAxiom { } return fragment; } + + Prose read_source(std::istream& is) + { + Prose text { }; + SourceInput src { is }; + while (auto f = src.get()) + text.push_back(f); + return text; + } } diff --git a/src/syntax/Parser.cxx b/src/syntax/Parser.cxx index ada3963b4..f87f907a5 100644 --- a/src/syntax/Parser.cxx +++ b/src/syntax/Parser.cxx @@ -1,5 +1,5 @@ // -*- C++ -*- -// Copyright (C) 2014-2017, Gabriel Dos Reis. +// Copyright (C) 2014-2024, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -48,10 +48,10 @@ namespace { using namespace OpenAxiom; - using TokenSequence = TokenStream; + using TokenSequence = std::vector; struct ParsingContext { - explicit ParsingContext(TokenSequence& ts) + explicit ParsingContext(const TokenSequence& ts) : tokens{ ts }, position{ } { } @@ -68,7 +68,7 @@ namespace { void advance() { ++position; } private: - TokenSequence& tokens; + const TokenSequence& tokens; TokenSequence::size_type position; }; @@ -150,12 +150,12 @@ namespace { << "## Output: " << out.path << '\n'; SourceInput src { in.stream }; - while (auto f = src.get()) { + for (auto& f : read_source(in.stream)) + { out.stream << "================================================\n"; out.stream << f; try { - TokenSequence ts { f, Language::Boot }; - for (auto& t : ts) { + for (auto& t : words(f, Language::Boot)) { out.stream << '\t'; format_token(f, t, out.stream); switch (t.category) { diff --git a/src/syntax/token.cxx b/src/syntax/token.cxx index 01985bd8a..854716bef 100644 --- a/src/syntax/token.cxx +++ b/src/syntax/token.cxx @@ -1,4 +1,4 @@ -// Copyright (C) 2013-2014, Gabriel Dos Reis. +// Copyright (C) 2013-2024, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -103,4 +103,12 @@ namespace OpenAxiom { return os; } + std::vector words(const Fragment& f, Language lang) + { + std::vector v { }; + Tokenizer lex { f }; + while (auto t = lex.get(lang)) + v.push_back(t); + return v; + } }