Skip to content

Commit d7dee61

Browse files
committed
Allow unicode symbols in variable names enso-org#96
1 parent 05cd9fc commit d7dee61

File tree

2 files changed

+35
-20
lines changed

2 files changed

+35
-20
lines changed

syntax/text/lexer/src/Luna/Syntax/Text/Lexer/Grammar.hs

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -79,16 +79,31 @@ notNewlineStart c = c /= '\n' && c /= '\r' ; {-# INLINE notNewlineStart #-}
7979

8080
-- === Char by char checking === --
8181

82-
isDecDigitChar, isOctDigitChar, isBinDigitChar, isHexDigitChar, isIndentBodyChar :: Char -> Bool
83-
isDecDigitChar c = (c >= '0' && c <= '9') ; {-# INLINE isDecDigitChar #-}
84-
isOctDigitChar c = (c >= '0' && c <= '7') ; {-# INLINE isOctDigitChar #-}
85-
isBinDigitChar c = (c == '0' || c == '1') ; {-# INLINE isBinDigitChar #-}
86-
isHexDigitChar c = isDecDigitChar c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ; {-# INLINE isHexDigitChar #-}
87-
isIndentBodyChar c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isDecDigitChar c || c == '_' ; {-# INLINE isIndentBodyChar #-}
82+
-- Character-class predicates used for char-by-char checks in the lexer.
-- The four digit predicates compare against ASCII ranges only; the three
-- identifier predicates delegate to the `Char` module (presumably a
-- qualified import of Data.Char -- confirm) and so are Unicode-aware.
isDecDigitChar, isOctDigitChar, isBinDigitChar, isHexDigitChar, isIdentBodyChar, isVarHead, isConsHead :: Char -> Bool
83+
-- True for the ASCII decimal digits '0'..'9'.
isDecDigitChar c = (c >= '0' && c <= '9') ; {-# INLINE isDecDigitChar #-}
84+
-- True for the ASCII octal digits '0'..'7'.
isOctDigitChar c = (c >= '0' && c <= '7') ; {-# INLINE isOctDigitChar #-}
85+
-- True for the binary digits '0' and '1'.
isBinDigitChar c = (c == '0' || c == '1') ; {-# INLINE isBinDigitChar #-}
86+
-- True for ASCII hexadecimal digits: '0'..'9', 'a'..'f' and 'A'..'F'.
isHexDigitChar c = isDecDigitChar c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ; {-# INLINE isHexDigitChar #-}
87+
-- True for characters allowed after the first character of an identifier:
-- anything `Char.isAlphaNum` accepts, plus the underscore.
isIdentBodyChar c = Char.isAlphaNum c || c == '_' ; {-# INLINE isIdentBodyChar #-}
88+
-- True for characters that may start a variable name: anything
-- `Char.isLower` accepts, plus the underscore.
-- NOTE(review): assuming `Char` is Data.Char, letters that have no case
-- (e.g. Hangul syllables) satisfy neither `isVarHead` nor `isConsHead`, so
-- they are accepted inside a name (via `isIdentBodyChar`) but cannot start
-- one -- confirm this is intended.
isVarHead c = Char.isLower c || c == '_' ; {-# INLINE isVarHead #-}
89+
-- True for characters that may start a constructor name: anything
-- `Char.isUpper` accepts.
isConsHead = Char.isUpper ; {-# INLINE isConsHead #-}
8890

8991
-- The characters ! $ % & * + - / < > ? ^ ~ and backslash, as a list.
-- Presumably the set of characters that may form operator tokens -- confirm
-- against the use sites, which are not visible here.
opChars :: [Char]
9092
opChars = "!$%&*+-/<>?^~\\" ; {-# INLINE opChars #-}
9193

94+
-- === Names === --
95+
96+
-- Lexer for variable names. The name text is built from three parts and then
-- passed to `checkSpecialVar` (defined elsewhere; presumably it maps
-- keywords / wildcards to dedicated symbols -- confirm):
--   1. a run of `isIdentBodyChar` characters,
--   2. optionally a single '?' or '!' appended to that run,
--   3. optionally a suffix taken by `takeMany1 '\''` (presumably one or more
--      apostrophes -- confirm).
-- The first character is not re-validated here; the dispatch sites select
-- this lexer only when `isVarHead` holds for the peeked character.
lexVariable :: Lexer
97+
lexVariable = checkSpecialVar
98+
<$> (takeWhile isIdentBodyChar
99+
<**> (option id $ flip Text32.snoc <$> (token '?' <|> token '!'))
100+
<**> (option id $ flip (<>) <$> takeMany1 '\''))
101+
{-# INLINE lexVariable #-}
102+
103+
-- Lexer for constructor names: takes a run of `isIdentBodyChar` characters
-- and wraps the result in `Cons`. Unlike `lexVariable`, no '?' / '!' or
-- apostrophe suffix is accepted. The first character is not re-validated
-- here; the dispatch sites select this lexer only when `isConsHead` holds
-- for the peeked character.
lexConstructor :: Lexer
104+
lexConstructor = Cons <$> takeWhile isIdentBodyChar
105+
{-# INLINE lexConstructor #-}
106+
92107

93108
-- === Numbers === --
94109

@@ -299,8 +314,8 @@ symmap = Vector.generate symmapSize $ \i -> let c = Char.chr i in if
299314
| c == markerBeginChar -> lexMarker
300315

301316
-- Identifiers & Keywords
302-
| varHead c -> checkSpecialVar <$> varBody
303-
| consHead c -> Cons <$> consBody
317+
| isVarHead c -> lexVariable
318+
| isConsHead c -> lexConstructor
304319

305320
-- Operators
306321
| c == '@' -> TypeApp <$ dropToken
@@ -316,23 +331,15 @@ symmap = Vector.generate symmapSize $ \i -> let c = Char.chr i in if
316331
| c == rawStrQuote -> rawStr
317332
| c == fmtStrQuote -> fmtStr
318333
| c == natStrQuote -> natStr
319-
| decHead c -> lexNumber
334+
| isDecDigitChar c -> lexNumber
320335

321336
-- Meta
322337
| c == '#' -> handleHash =<< takeMany '#'
323338

324339
-- Utils
325340
| otherwise -> unknownCharSym c
326341

327-
where between a l r = a >= l && a <= r
328-
decHead c = between c '0' '9'
329-
varHead c = between c 'a' 'z' || c == '_'
330-
consHead c = between c 'A' 'Z'
331-
consBody = indentBaseBody
332-
varBody = indentBaseBody <**> (option id $ flip Text32.snoc <$> (token '?' <|> token '!'))
333-
<**> (option id $ flip (<>) <$> takeMany1 '\'')
334-
indentBaseBody = takeWhile isIndentBodyChar
335-
handleColons = handleReps [BlockStart, Typed]
342+
where handleColons = handleReps [BlockStart, Typed]
336343
handleDots = handleReps [Accessor , Range, Anything]
337344
handleEqs = handleReps [Assignment, Operator "=="]
338345
handleHash = handleRepsM [pure Disable, lexComment, lexConfig]
@@ -379,8 +386,14 @@ topEntryPoint :: Lexer
379386
-- Top-level lexer: obtains a character via `peekToken` (presumably without
-- consuming it -- confirm) and runs the lexer that `lexSymChar` selects for it.
topEntryPoint = peekToken >>= lexSymChar ; {-# INLINE topEntryPoint #-}
380387

381388
lexSymChar :: Char -> Lexer
382-
lexSymChar c = if chord < symmapSize then Vector.unsafeIndex symmap chord else unknownCharSym c
383-
where chord = Char.ord c
389+
-- Selects the lexer to run for a given leading character. Code points below
-- `symmapSize` are looked up in the precomputed `symmap` table; for all other
-- code points the choice is made on the fly, and only name heads are
-- recognised there (`isVarHead` / `isConsHead`). Anything else is reported
-- through `unknownCharSym`.
lexSymChar c
390+
-- fetch lexers for ASCII from precomputed cache
391+
-- (the unchecked index is guarded by the `chord < symmapSize` test)
| chord < symmapSize = Vector.unsafeIndex symmap chord
392+
-- create lexers for unicode names on the fly
393+
| isVarHead c = lexVariable
394+
| isConsHead c = lexConstructor
395+
-- characters outside the table that cannot start a name
| otherwise = unknownCharSym c
396+
-- code point of `c`, used as the table index
where chord = Char.ord c
384397
{-# INLINE lexSymChar #-}
385398

386399
lexeme :: Symbol -> Parser (Symbol, Int)

syntax/text/parser/test/spec/Luna/Test/Source/Text/ParserSpec.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,11 @@ spec = do
154154
it "one letter variable" $ shouldParseItself' expr "a" [(0,1)]
155155
it "variable with trailing apostrophe" $ shouldParseItself' expr "foo'" [(0,4)]
156156
it "variable with trailing apostrophes" $ shouldParseItself' expr "foo''" [(0,5)]
157+
it "variable with unicode symbols" $ shouldParseItself' expr "фываΧξωβ김동욱" [(0,11)]
157158
it "wildcard" $ shouldParseItself' expr "_" [(0,1)]
158159
it "simple constructors" $ shouldParseItself' expr "Vector" [(0,6)]
159160
it "constructors with arguments" $ shouldParseItself' expr "Vector x 1 z" [(0,6),(1,1),(0,8),(1,1),(0,10),(1,1),(0,12)]
161+
it "constructor with unicode symbols" $ shouldParseItself' expr "Κοηστρυκτορ" [(0,11)]
160162

161163
describe "expressions" $ do
162164
describe "applications" $ do

0 commit comments

Comments
 (0)