Skip to content

Commit

Permalink
WIP: add TOKENIZE_STREAM
Browse files Browse the repository at this point in the history
  • Loading branch information
fingolfin committed Jan 28, 2021
1 parent 2d3da44 commit 8bca6c6
Show file tree
Hide file tree
Showing 6 changed files with 408 additions and 12 deletions.
190 changes: 190 additions & 0 deletions lib/tokenizer.g
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Lookup table from compressed scanner id (see COMPRESS_SCANNER_ID) to the
# name of the corresponding scanner symbol; filled in further below.
ID_TO_SYMBOL_LIST:=[];
# Record mapping each scanner symbol name to the text attribute that is
# emitted before printing a token of that kind.
SYMBOL_COLORS:=rec();
# COMPRESS_SCANNER_ID( <id> )
#
# Maps a scanner symbol id to a small positive integer usable as a list
# index. The id is split into its remainder modulo 8 and the quotient
# id div 8. A positive quotient is replaced by its 2-adic valuation plus
# one; the two parts are then recombined and shifted by one so that the
# result is at least 1 for non-negative ids.
COMPRESS_SCANNER_ID := function(id)
    local lowBits, highPart;
    highPart := QuoInt(id, 8);
    lowBits := id mod 8;
    if highPart <= 0 then
        # nothing to compress: the quotient is used unchanged
        return lowBits + highPart * 8 + 1;
    fi;
    return lowBits + (PValuation(highPart, 2) + 1) * 8 + 1;
end;

# Register every scanner symbol: remember its name under its compressed id
# and give it the neutral colour; specific colours are assigned afterwards.
for name in RecNames(SCANNER_SYMBOLS) do
    id := SCANNER_SYMBOLS.(name);
    ID_TO_SYMBOL_LIST[COMPRESS_SCANNER_ID(id)] := name;
    SYMBOL_COLORS.(name) := TextAttr.reset;
od;

# ID_TO_SYMBOL( <id> )
#
# Returns the symbol name that was registered for the scanner id <id>.
ID_TO_SYMBOL := function(id)
    return ID_TO_SYMBOL_LIST[COMPRESS_SCANNER_ID(id)];
end;

# Assign a text attribute to every symbol class. The table is applied inside
# an anonymous function so that the loop variables stay local.
CallFuncList(function()
    local table, entry, symName;
    table := [
        # identifiers and keywords (including the word operators)
        [ TextAttr.1, [
            "S_IDENT", "S_UNBIND", "S_ISBOUND", "S_TRYNEXT", "S_INFO",
            "S_ASSERT", "S_READWRITE", "S_READONLY", "S_REC", "S_FUNCTION",
            "S_LOCAL", "S_END", "S_IF", "S_FOR", "S_WHILE", "S_REPEAT",
            "S_ATOMIC", "S_THEN", "S_ELIF", "S_ELSE", "S_FI", "S_DO", "S_OD",
            "S_UNTIL", "S_BREAK", "S_RETURN", "S_QUIT", "S_QQUIT",
            "S_CONTINUE",
            "S_MOD", "S_IN", "S_NOT", "S_AND", "S_OR" ] ],

        # brackets, parens, ...
        [ TextAttr.5, [
            "S_LBRACK", "S_LBRACE", "S_BLBRACK", "S_RBRACK", "S_RBRACE",
            "S_DOT", "S_BDOT", "S_LPAREN", "S_RPAREN", "S_COMMA", "S_DOTDOT",
            "S_COLON", "S_DOTDOTDOT", "S_SEMICOLON", "S_DUALSEMICOLON" ] ],

        # constants
        [ TextAttr.4, [
            "S_INT", "S_FLOAT", "S_TRUE", "S_FALSE", "S_CHAR" ] ],

        # strings
        [ TextAttr.3, [ "S_STRING" ] ],

        # operators
        [ TextAttr.2, [
            "S_MULT", "S_DIV", "S_POW", "S_PLUS", "S_MINUS", "S_EQ", "S_LT",
            "S_GT", "S_NE", "S_LE", "S_GE", "S_ASSIGN" ] ],
    ];
    for entry in table do
        for symName in entry[2] do
            SYMBOL_COLORS.(symName) := entry[1];
        od;
    od;
end, []);


# ExtractRangeFromLines( <lines>, <startline>, <startpos>, <endline>, <endpos> )
#
# Returns the text between two positions of a list of lines. The start
# position is exclusive (extraction begins at column <startpos>+1 of line
# <startline>) and the end position is inclusive (up to column <endpos> of
# line <endline>). Lines are joined with '\n' when the range spans several.
ExtractRangeFromLines := function(lines, startline, startpos, endline, endpos)
    local result, row;

    # the whole range lies within a single line
    if startline = endline then
        return lines[startline]{[startpos+1 .. endpos]};
    fi;

    # remainder of the first line
    result := lines[startline]{[startpos+1 .. Length(lines[startline])]};
    Add(result, '\n');

    # all lines strictly between the first and the last one
    row := startline + 1;
    while row < endline do
        Append(result, lines[row]);
        Add(result, '\n');
        row := row + 1;
    od;

    # leading part of the last line
    Append(result, lines[endline]{[1 .. endpos]});
    return result;
end;


# TOKENIZE_STRING( <str> )
#
# Runs TOKENIZE_STREAM on the GAP source code contained in the string <str>
# and echoes the input to the terminal with colours: the text preceding a
# token (whitespace, comments) is printed with TextAttr.6, the token itself
# with the attribute registered for its symbol in SYMBOL_COLORS.
#
# Returns the result of TOKENIZE_STREAM, a list of per-statement lists. In
# each complete token entry the trailing numeric symbol id is replaced by the
# symbol name, and for every token other than EOF the token text is appended.
#
# <str> itself is not modified.
TOKENIZE_STRING := function(str)
    local res, stat, token, symbol, lines, text, sep1, sep2;

    # Terminate the input with a newline. Build a new string rather than
    # calling Add on <str>: that would change the caller's object and fail
    # for immutable strings.
    str := Concatenation(str, "\n");

    # Optional delimiters printed around every chunk; set them to e.g. "<"
    # and ">" to make token boundaries visible while debugging.
    sep1 := "";
    sep2 := "";

    lines := SplitString(str, "\n");
    res := TOKENIZE_STREAM(InputTextString(str));
    for stat in res do
        if not IsList(stat) then continue; fi;
        for token in stat do
            if not IsList(token) or Length(token) = 0 then continue; fi;
            if token[1] = "ERROR" then
                Print("\nEncountered an error: ", token[2], "\n");
                continue;
            fi;

            # A complete token entry has exactly 8 fields:
            #   [ "token", line, pos,   (where the preceding text starts)
            #              line, pos,   (where the token starts)
            #              line, pos,   (where the token ends)
            #              symbol id ]
            # Skip anything else *before* touching the entry, so that a
            # truncated entry is neither modified nor looked up.
            if Length(token) <> 8 then continue; fi;

            # replace the numeric symbol id by the symbol name
            symbol := token[8];
            token[8] := ID_TO_SYMBOL(symbol);

            if symbol = SCANNER_SYMBOLS.S_EOF then
                Print("\n\n-- EOF --\n");
                continue;
            fi;

            # text between the previous token and this one
            text := ExtractRangeFromLines(lines, token[2], token[3], token[4], token[5]);
            if Length(text) > 0 then
                Print(TextAttr.6, sep1, text, sep2, TextAttr.reset);
            fi;

            # text of the token itself
            text := ExtractRangeFromLines(lines, token[4], token[5], token[6], token[7]);
            if Length(text) > 0 then
                Print(SYMBOL_COLORS.(token[8]), sep1, text, sep2, TextAttr.reset);
            fi;
            Add(token, text);
        od;
    od;
    Print("\n");
    return res;
end;


# Ad-hoc smoke tests, executed whenever this file is read.

# Turn off print formatting for stdout so that the coloured output below is
# printed without automatic line breaks being inserted.
SetPrintFormattingStatus("*stdout*", false);

# a single complete statement
l:=TOKENIZE_STRING("1;");

# input that ends without a terminating semicolon
l:=TOKENIZE_STRING("1");

# incomplete expression -- presumably exercises the "ERROR" entries
l:=TOKENIZE_STRING("1-;");


# binary expression without whitespace
l:=TOKENIZE_STRING("1+1;");

# binary expression with whitespace between the tokens
l:=TOKENIZE_STRING("123 + 456;");

# two statements on one line
l:=TOKENIZE_STRING("1+1; x:=y-3;");

# longer literals and identifiers
l:=TOKENIZE_STRING("x:=0123 + 1234; xxxx+777777;");

# multi-line input including comments
l:=TOKENIZE_STRING("""
1+1; x:=y-3;
# This is a little test program
f := x -> x+1; # increment function
f(2);
""");
45 changes: 44 additions & 1 deletion src/read.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
#endif


// Forward declaration: ReadCommand is the shared implementation behind
// ReadEvalCommand (justTokenize = FALSE) and ReadTokenizeCommand
// (justTokenize = TRUE); it is defined after both of them.
static ExecStatus ReadCommand(Obj context,
TypInputFile * input,
BOOL justTokenize,
Obj * evalResult,
UInt * dualSemicolon);

/****************************************************************************
**
*S TRY_IF_NO_ERROR
Expand Down Expand Up @@ -2565,6 +2571,32 @@ ExecStatus ReadEvalCommand(Obj context,
TypInputFile * input,
Obj * evalResult,
UInt * dualSemicolon)
{
return ReadCommand(context, input, FALSE, evalResult, dualSemicolon);
}


/****************************************************************************
**
*F ReadTokenizeCommand( <context>, <input>, <tokens> )
**
** 'ReadTokenizeCommand' calls 'ReadCommand' on <input> with tokenization
** enabled (justTokenize = TRUE) instead of the plain evaluation mode used
** by 'ReadEvalCommand'.
**
** <tokens> is passed as the result pointer. In tokenization mode
** 'ReadCommand' reads *<tokens>: if it is non-zero that object is used as
** the token list, otherwise a new plain list is created and stored in
** *<tokens>. Callers must therefore initialise *<tokens> (e.g. to 0) before
** the call.
**
** No dual-semicolon flag is requested: a null pointer is passed for the
** <dualSemicolon> argument of 'ReadCommand'.
**
** The return value is the status returned by 'ReadCommand'.
*/
ExecStatus ReadTokenizeCommand(Obj context, TypInputFile * input, Obj * tokens)
{
return ReadCommand(context, input, TRUE, tokens, 0);
}


/****************************************************************************
**
*F ReadCommand()
**
*/
static ExecStatus ReadCommand(Obj context,
TypInputFile * input,
BOOL justTokenize,
Obj * evalResult,
UInt * dualSemicolon)
{
volatile ExecStatus type;
volatile Obj tilde;
Expand All @@ -2583,6 +2615,16 @@ ExecStatus ReadEvalCommand(Obj context,

ClearError();

if (justTokenize) {
// HACK / TODO: explain this
rs->intr.returning = STATUS_RETURN_VAL;
rs->intr.ignoring = 1;
if (*evalResult)
rs->s.tokens = *evalResult;
else
rs->s.tokens = *evalResult = NEW_PLIST(T_PLIST, 16);
}

/* get the first symbol from the input */
Match_(rs, rs->s.Symbol, "", 0);

Expand Down Expand Up @@ -2661,7 +2703,7 @@ ExecStatus ReadEvalCommand(Obj context,
*dualSemicolon = (rs->s.Symbol == S_DUALSEMICOLON);

// end the interpreter
type = IntrEnd(&rs->intr, rs->s.NrError > 0, evalResult);
type = IntrEnd(&rs->intr, rs->s.NrError > 0, justTokenize ? 0 : evalResult);

// restore the execution environment
SWITCH_TO_OLD_LVARS(oldLVars);
Expand All @@ -2687,6 +2729,7 @@ ExecStatus ReadEvalCommand(Obj context,
return type;
}


/****************************************************************************
**
*F ReadEvalFile() . . . . . . . . . . . . . . . . . . . . . . . read a file
Expand Down
8 changes: 8 additions & 0 deletions src/read.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ UInt ReadEvalCommand(Obj context,
UInt * dualSemicolon);


/****************************************************************************
**
*F ReadTokenizeCommand( <context>, <input>, <tokens> )
**
** 'ReadTokenizeCommand' reads one command from <input> in tokenization
** mode. The recorded tokens are collected in the list *<tokens>; if
** *<tokens> is 0 on entry a new plain list is created and stored there, so
** *<tokens> must be initialised before the call.
*/
UInt ReadTokenizeCommand(Obj context, TypInputFile * input, Obj * tokens);


/****************************************************************************
**
*F ReadEvalFile() . . . . . . . . . . . . . . . . . . . . . . . read a file
Expand Down
60 changes: 49 additions & 11 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "error.h"
#include "gapstate.h"
#include "gaputils.h"
#include "integer.h"
#include "io.h"
#include "lists.h"
#include "plist.h"
Expand All @@ -43,8 +44,18 @@ static void SyntaxErrorOrWarning(ScannerState * s,
Int tokenoffset)
{
GAP_ASSERT(tokenoffset >= 0 && tokenoffset <= 2);


if (s->tokens) {
// if tokenization mode, don't print the error, instead
// add it to tokens
PushPlist(s->tokens, NewPlistFromArgs(
MakeImmString("ERROR"), MakeImmString(msg),
INTOBJ_INT(error), INTOBJ_INT(tokenoffset)));
}

// do not print a message if we found one already on the current line
if (s->input->lastErrorLine != s->input->number) {
else if (s->input->lastErrorLine != s->input->number) {

// open error output
TypOutputFile output = { 0 };
Expand Down Expand Up @@ -889,6 +900,8 @@ static UInt NextSymbol(ScannerState * s)
// Record end of previous symbol's position
StoreSymbolPosition(s);

Obj tokenEntry = 0;

Char c = PEEK_CURR_CHAR(s->input);

// skip over <spaces>, <tabs>, <newlines> and comments
Expand All @@ -898,6 +911,7 @@ static UInt NextSymbol(ScannerState * s)
if (c == '%') {
// we have encountered a pragma
GetPragma(s, c);
// TODO: record token positions before/after
return S_PRAGMA;
}

Expand All @@ -909,13 +923,21 @@ static UInt NextSymbol(ScannerState * s)
// Record start of this symbol's position
StoreSymbolPosition(s);

// switch according to the character
if (IsAlpha(c)) {
return GetIdent(s, 0, c);
}
UInt symbol = S_ILLEGAL;

UInt symbol;
if (s->tokens) {
// in tokenizer mode, record the current position
tokenEntry = NEW_PLIST(T_PLIST, 3);
PushPlist(tokenEntry, MakeImmString("token"));
PushPlist(tokenEntry, INTOBJ_INT(s->SymbolStartLine[1]));
PushPlist(tokenEntry, INTOBJ_INT(s->SymbolStartPos[1]));
PushPlist(tokenEntry, INTOBJ_INT(s->SymbolStartLine[0]));
PushPlist(tokenEntry, INTOBJ_INT(s->SymbolStartPos[0]));
// push the token now, so that SyntaxError can modify it if necessary
PushPlist(s->tokens, tokenEntry);
}

// switch according to the character
switch (c) {
case '.': symbol = S_DOT; c = GET_NEXT_CHAR();
if (c == '.') { symbol = S_DOTDOT; c = GET_NEXT_CHAR();
Expand Down Expand Up @@ -964,17 +986,33 @@ static UInt NextSymbol(ScannerState * s)
case '?': symbol = S_HELP; GetHelp(s); break;
case '"': symbol = S_STRING; GetString(s); break;
case '\'': symbol = S_CHAR; GetChar(s); break;
case '\\': return GetIdent(s, 0, c);
case '_': return GetIdent(s, 0, c);
case '@': return GetIdent(s, 0, c);
case '\\': symbol = GetIdent(s, 0, c); break;
case '_': symbol = GetIdent(s, 0, c); break;
case '@': symbol = GetIdent(s, 0, c); break;

case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return GetNumber(s, 0, c);
symbol = GetNumber(s, 0, c); break;

case '\377': symbol = S_EOF; FlushRestOfInputLine(s->input); break;

default: symbol = S_ILLEGAL; GET_NEXT_CHAR(); break;
default:
if (IsAlpha(c)) {
symbol = GetIdent(s, 0, c);
}
else {
symbol = S_ILLEGAL;
GET_NEXT_CHAR();
}
break;
}

if (s->tokens) {
// in tokenizer mode, record the current position
PushPlist(tokenEntry, INTOBJ_INT(GetInputLineNumber(s->input)));
PushPlist(tokenEntry, INTOBJ_INT(GetInputLinePosition(s->input)));
PushPlist(tokenEntry, ObjInt_UInt((UInt4)symbol));
}

return symbol;
}
Loading

0 comments on commit 8bca6c6

Please sign in to comment.