Skip to content

Commit

Permalink
Accept multibyte character as wide character literal
Browse files Browse the repository at this point in the history
On most Unix-like systems, a wide character literal is 32 bits wide
and encodes a Unicode code point. On Windows, it is 16 bits
wide and encodes a UTF-16 code unit. Clearly, there's a portability
issue here. Personally, I've never used wide characters in my code
because I didn't find them useful.

That being said, some header files contain wide character literals,
so we need to support them so that chibicc can include such files.

We assume that source files are always encoded in UTF-8.
  • Loading branch information
rui314 committed Sep 30, 2020
1 parent 058c3a7 commit c9937c8
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 2 deletions.
1 change: 1 addition & 0 deletions chibicc.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ int align_to(int n, int align);
//

int encode_utf8(char *buf, uint32_t c);
uint32_t decode_utf8(char **new_pos, char *p);

//
// main.c
Expand Down
5 changes: 5 additions & 0 deletions test/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ int main() {
ASSERT(0, strcmp("日本語", "\U000065E5\U0000672C\U00008A9E"));
ASSERT(0, strcmp("🌮", "\U0001F32E"));

ASSERT(-1, L'\xffffffff'>>31);
ASSERT(946, L'β');
ASSERT(12354, L'あ');
ASSERT(127843, L'🍣');

printf("OK\n");
return 0;
}
5 changes: 3 additions & 2 deletions tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ static Token *read_char_literal(char *start, char *quote) {
if (*p == '\0')
error_at(start, "unclosed char literal");

char c;
int c;
if (*p == '\\')
c = read_escaped_char(&p, p + 1);
else
c = *p++;
c = decode_utf8(&p, p);

char *end = strchr(p, '\'');
if (!end)
Expand Down Expand Up @@ -449,6 +449,7 @@ Token *tokenize(File *file) {
// Character literal
if (*p == '\'') {
cur = cur->next = read_char_literal(p, p);
cur->val = (char)cur->val;
p += cur->len;
continue;
}
Expand Down
40 changes: 40 additions & 0 deletions unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,43 @@ int encode_utf8(char *buf, uint32_t c) {
buf[3] = 0b10000000 | (c & 0b00111111);
return 4;
}

// Read a UTF-8-encoded Unicode code point from a source file.
// We assume that source files are always in UTF-8.
//
// UTF-8 is a variable-width encoding in which one code point is
// encoded in one to four bytes. One-byte sequences are identical to
// ASCII. Non-ASCII characters are encoded using more than one byte:
// a lead byte that announces the sequence length, followed by
// continuation bytes of the form 0b10xxxxxx.
//
// `p` points at the first byte of the sequence. On return, `*new_pos`
// points at the byte just past the sequence, and the decoded code
// point is returned.
//
// A stray continuation byte, a lead byte of 0b11111000 or above (which
// never occurs in UTF-8), or a missing continuation byte is reported
// with error_at() at the start of the sequence. Overlong encodings,
// surrogates and values above U+10FFFF are not checked for here.
uint32_t decode_utf8(char **new_pos, char *p) {
  // Work on unsigned bytes so that comparisons and masks do not depend
  // on whether plain `char` is signed on this platform.
  unsigned char lead = (unsigned char)*p;

  if (lead < 128) {
    *new_pos = p + 1;
    return lead;
  }

  char *start = p;
  // NOTE(review): error_at() is declared elsewhere and is expected not
  // to return; the initializers below merely keep the locals defined
  // in case it ever does.
  int len = 1;
  uint32_t c = 0;

  if (lead >= 0b11111000) {
    // 0xF8..0xFF are not valid lead bytes; without this check they
    // would be silently decoded as if they were 0xF0..0xF7.
    error_at(start, "invalid UTF-8 sequence");
  } else if (lead >= 0b11110000) {
    len = 4;
    c = lead & 0b111;
  } else if (lead >= 0b11100000) {
    len = 3;
    c = lead & 0b1111;
  } else if (lead >= 0b11000000) {
    len = 2;
    c = lead & 0b11111;
  } else {
    // 0b10xxxxxx: a continuation byte cannot start a sequence.
    error_at(start, "invalid UTF-8 sequence");
  }

  for (int i = 1; i < len; i++) {
    unsigned char b = (unsigned char)p[i];
    // Every trailing byte must be 0b10xxxxxx. A terminating NUL fails
    // this test too, so the loop never reads past the end of the text.
    if (b >> 6 != 0b10)
      error_at(start, "invalid UTF-8 sequence");
    c = (c << 6) | (b & 0b111111);
  }

  *new_pos = p + len;
  return c;
}

0 comments on commit c9937c8

Please sign in to comment.