Skip to content

Commit

Permalink
add EUC-JP encoder and bug fix.
Browse files Browse the repository at this point in the history
  • Loading branch information
mitchan0321 committed Feb 24, 2017
1 parent 7524b37 commit 5b44ce1
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 44 deletions.
11 changes: 6 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,18 @@ PREFIX = /usr/local
CC = cc

# for product build. (use BoehmGC)
CFLAGS = -Wall -O -c -g
CFLAGS = -Wall -O3 -c -g
INCLUDE = -I/usr/local/include -I.
LIB = -L/usr/lib -L/lib -L/usr/local/lib -lm -lgc -lpthread -lonigmo -lpcl -lgmp
LIB = -L/usr/lib -L/lib -L/usr/local/lib \
-lm -lgc -lpthread -lonigmo -lpcl -lgmp

# for memory debugging build.
#CFLAGS = -Wall -O -c -g -DPROF
#CFLAGS = -Wall -c -g -DPROF
#INCLUDE = -I/usr/local/include -I.
#LIB = -L/usr/local/lib -lm -lonigmo -lpcl -lgmp

# for profiling build.
#CFLAGS = -Wall -O -c -g -pg -DPROF
#CFLAGS = -Wall -c -g -pg -DPROF
#INCLUDE = -I/usr/local/include -I.
#LIB = -pg -L/usr/local/lib -lonigmo -lpcl -lgmp

Expand Down Expand Up @@ -95,7 +96,7 @@ util.o: util.c $(HDRS)
encoding.o: encoding.c $(HDRS)
$(CC) $(CFLAGS) $(INCLUDE) encoding.c -o encoding.o

encoding-table.o: encoding-table.c
encoding-table.o: encoding-table.c $(HDRS)
rm -f encoding-set-utoj.h encoding-set-jtou.h
awk -f jisconv.awk < doc/jis0208.txt
$(CC) $(CFLAGS) -O0 $(INCLUDE) encoding-table.c -o encoding-table.o
Expand Down
7 changes: 5 additions & 2 deletions bulk.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ bulk_load_file(Bulk *bulk, const char *file, int encoder) {
struct stat statbuff;
int size;
char *readbuff;
encoder_error_info error_info;
encoder_error_info *error_info;
Cell *encbuff;

if (NULL == bulk) return 0;
Expand All @@ -40,6 +40,9 @@ bulk_load_file(Bulk *bulk, const char *file, int encoder) {
if (-1 == fstat(fd, &statbuff)) goto error;
size = statbuff.st_size;

error_info = GC_MALLOC(sizeof(encoder_error_info));
ALLOC_SAFE(error_info);

bulk->length = size;
bulk->allocsize = (size+1)*sizeof(wchar_t);
bulk->pos = 0;
Expand All @@ -50,7 +53,7 @@ bulk_load_file(Bulk *bulk, const char *file, int encoder) {

if (-1 == read_size(fd, readbuff, size)) goto error;
readbuff[size] = 0;
encbuff = decode_raw_to_unicode(new_cell(to_wchar(readbuff)), encoder, &error_info);
encbuff = decode_raw_to_unicode(new_cell(to_wchar(readbuff)), encoder, error_info);
if (NULL == encbuff) goto error;
bulk->data = cell_get_addr(encbuff);
bulk->length = cell_get_length(encbuff);
Expand Down
11 changes: 8 additions & 3 deletions encoding-table.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <wchar.h>
#include <string.h>
#include <toy.h>
#include "t_gc.h"

Expand All @@ -8,8 +9,12 @@ int jisencoder_setup_done = 0;

void
JisEncoder_Setup() {
# include "encoding-set-utoj.h"
# include "encoding-set-jtou.h"

if (jisencoder_setup_done == 0) {
memset(Unicode_to_JIS0208, 0, sizeof(Unicode_to_JIS0208));
memset(JIS0208_to_Unicode, 0, sizeof(JIS0208_to_Unicode));
# include "encoding-set-utoj.h"
# include "encoding-set-jtou.h"
}

jisencoder_setup_done = 1;
}
65 changes: 52 additions & 13 deletions encoding.c
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
#include "toy.h"
#include "encoding.h"

/*
 * Encoding names as wide strings; the position of each name in this
 * table is its encoding index (noted in the per-entry comments).
 */
static wchar_t *ENCODING_NAME_DEFS[] = {
L"RAW", // index: 0 ... RAW encoding (no encoding, byte data stream)
L"UTF-8", // index: 1 ... UTF-8 encoding
L"EUC-JP", // index: 2 ... EUC-JP encoding
L"Shift-JIS", // index: 3 ... Shift-JIS encoding (not yet)
};

Cell*raw_decoder(Cell *raw, encoder_error_info *error_info);
Cell*raw_encoder(Cell *unicode, encoder_error_info *error_info);
Cell*utf8_decoder(Cell *raw, encoder_error_info *error_info);
Expand All @@ -27,6 +20,14 @@ static encoder_methods Encoder_methods[] = {
{0, 0}
};

/*
 * Encoding names as wide strings; the position of each name in this
 * table is its encoding index (noted in the per-entry comments).
 * The last entry is a null pointer.
 */
static wchar_t *ENCODING_NAME_DEFS[] = {
L"RAW", // index: 0 ... RAW encoding (no encoding, byte data stream)
L"UTF-8", // index: 1 ... UTF-8 encoding
L"EUC-JP", // index: 2 ... EUC-JP encoding
L"Shift-JIS", // index: 3 ... Shift-JIS encoding (not yet)
0
};

wchar_t*
get_encoding_name(int enc_idx) {
/* a return value of 0 means enc_idx is out of range */
Expand Down Expand Up @@ -286,7 +287,7 @@ utf8_encoder(Cell *unicode, encoder_error_info *error_info) {
}

/*
* Shift-JIS decoder/encoder.
* EUC-JP decoder/encoder.
*/
Cell*
eucjp_decoder(Cell *raw, encoder_error_info *error_info) {
Expand All @@ -302,9 +303,11 @@ eucjp_decoder(Cell *raw, encoder_error_info *error_info) {

for (i=0; i<len; ) {
c = p[i];
if (c <= 0x7f) {
if ((c >= 0) && (c <= 0x7f)) {
/* ascii */
cell_add_char(result, c);
i++;

} else if ((c >= 0xa1) && (c <= 0xfe)) {
i++;
if (i >= len) {
Expand All @@ -317,21 +320,22 @@ eucjp_decoder(Cell *raw, encoder_error_info *error_info) {
if ((c2 >= 0xa1) && (c2 <= 0xfe)) {
cr = JIS0208_to_Unicode[(((c & 0x7f) << 8) | (c2 & 0x7f)) & 0xffff];
if (cr == 0) {
/* not JISX0208 character */
/* not JIS0208 character */
cell_add_char(result, c);
cell_add_char(result, c2);
} else {
/* valid JISX0208 character */
/* valid JIS0208 character */
cell_add_char(result, cr);
}
} else {
/* invalid euc_jp 2nd byte, pass both bytes through */
cell_add_char(result, c);
cell_add_char(result, c2);
}
i++;

} else {
/* not JISX0208 character */
/* not JIS0208 character */
cell_add_char(result, c);
i++;
}
Expand All @@ -342,6 +346,41 @@ eucjp_decoder(Cell *raw, encoder_error_info *error_info) {

/*
 * Encode a Unicode cell as EUC-JP.
 *
 * Every character of `unicode` is converted and appended to a newly
 * created cell:
 *   - 0x00..0x7f is copied unchanged (ASCII);
 *   - a value below 0 or above 0xffff cannot index the conversion table
 *     and is replaced by L'?';
 *   - a value whose Unicode_to_JIS0208 entry is non-zero is emitted as
 *     two characters: the high and the low byte of that entry, each
 *     with bit 7 set;
 *   - any other value is passed through as its high byte followed by
 *     its low byte.
 *
 * `unicode` itself is not modified.  `error_info` is not used here.
 * Returns the newly built cell.
 */
Cell*
eucjp_encoder(Cell *unicode, encoder_error_info *error_info) {
    Cell *result;
    wchar_t *p, c, cr;
    int len, i;

    /* the conversion tables must be populated before the first lookup */
    JISENCODER_INIT();

    p = cell_get_addr(unicode);
    len = cell_get_length(unicode);
    result = new_cell(L"");

    for (i=0; i<len; i++) {
        c = p[i];

        if ((c >= 0x00) && (c <= 0x7f)) {
            /* ascii */
            cell_add_char(result, c);

        } else if ((c > 0xffff) || (c < 0)) {
            /* outside the range of the JIS0208 table, can't convert */
            cell_add_char(result, L'?');

        } else {
            cr = Unicode_to_JIS0208[c & 0xffff];
            if (0 == cr) {
                /* not a JIS0208 character, pass its two bytes through */
                cell_add_char(result, (c >> 8) & 0xff);
                cell_add_char(result, (c     ) & 0xff);

            } else {
                /* JIS0208 character, convert to euc_jp */
                cell_add_char(result, ((cr >> 8) & 0xff) | 0x80);
                cell_add_char(result, ((cr     ) & 0xff) | 0x80);
            }
        }
    }

    return result;
}
20 changes: 11 additions & 9 deletions encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
#include <wchar.h>
#include "cell.h"

#define ENCODING_NAME_MAX (2) // now ready encodings are;
// RAW, UTF-8 and EUC-JP.
// (index range is 0 to ENCODING_NAME_MAX)
/*
* Indicate the maximum encoding name index; range is 0 to ENCODING_NAME_MAX.
* Now ready encodings are RAW, UTF-8 and EUC-JP.
*/
#define ENCODING_NAME_MAX (2)

/* encoding index */
#define NENCODE_RAW (0)
Expand All @@ -28,14 +30,14 @@ typedef struct _encoder_error_info {
wchar_t *message;
} encoder_error_info;

wchar_t* get_encoding_name(int enc_idx);
int get_encoding_index(wchar_t *enc_name);
Cell* decode_raw_to_unicode(Cell *raw, int enc, encoder_error_info *error_info);
Cell* encode_unicode_to_raw(Cell *unicode, int enc, encoder_error_info *error_info);
wchar_t* get_encoding_name(int enc_idx);
int get_encoding_index(wchar_t *enc_name);
Cell* decode_raw_to_unicode(Cell *raw, int enc, encoder_error_info *error_info);
Cell* encode_unicode_to_raw(Cell *unicode, int enc, encoder_error_info *error_info);

/* for JIS converter */
extern wchar_t *Unicode_to_JIS0208;
extern wchar_t *JIS0208_to_Unicode;
extern wchar_t Unicode_to_JIS0208[65536];
extern wchar_t JIS0208_to_Unicode[65536];
extern int jisencoder_setup_done;
void JisEncoder_Setup();
#define JISENCODER_INIT() if (jisencoder_setup_done == 0) {JisEncoder_Setup();}
Expand Down
2 changes: 0 additions & 2 deletions global.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,4 @@ void def_global() {
const_ast = new_symbol(L"*");
const_regex_cache = L"@REGEX_CACHE";
PageSize = sysconf(_SC_PAGESIZE);
#ifdef PROF
#endif /* PROF */
}
24 changes: 15 additions & 9 deletions methods.c
Original file line number Diff line number Diff line change
Expand Up @@ -3182,7 +3182,7 @@ static Toy_File*
new_file() {
Toy_File *o;

o = GC_MALLOC_ATOMIC(sizeof(Toy_File));
o = GC_MALLOC(sizeof(Toy_File));
ALLOC_SAFE(o);
memset(o, 0, sizeof(Toy_File));

Expand Down Expand Up @@ -3356,10 +3356,13 @@ mth_file_gets(Toy_Interp *interp, Toy_Type *posargs, Hash *nameargs, int arglen)
Cell *cbuff;
int c;
int flag_nonewline=0, flag_nocontrol=0;
encoder_error_info enc_error_info;
encoder_error_info *enc_error_info;

if (arglen > 0) goto error;

enc_error_info = GC_MALLOC(sizeof(encoder_error_info));
ALLOC_SAFE(enc_error_info);

if (hash_get_and_unset_t(nameargs, const_nonewline)) {
flag_nonewline = 1;
}
Expand Down Expand Up @@ -3406,9 +3409,9 @@ mth_file_gets(Toy_Interp *interp, Toy_Type *posargs, Hash *nameargs, int arglen)
if (cell_get_length(cbuff) == 0) {
return const_Nil;
} else {
Cell *c = decode_raw_to_unicode(cbuff, f->input_encoding, &enc_error_info);
Cell *c = decode_raw_to_unicode(cbuff, f->input_encoding, enc_error_info);
if (NULL == c) {
return new_exception(TE_BADENCODEBYTE, enc_error_info.message, interp);
return new_exception(TE_BADENCODEBYTE, enc_error_info->message, interp);
}
return new_string_cell(c);
}
Expand All @@ -3431,9 +3434,9 @@ mth_file_gets(Toy_Interp *interp, Toy_Type *posargs, Hash *nameargs, int arglen)
}

if ('\n' == c) {
Cell *c = decode_raw_to_unicode(cbuff, f->input_encoding, &enc_error_info);
Cell *c = decode_raw_to_unicode(cbuff, f->input_encoding, enc_error_info);
if (NULL == c) {
return new_exception(TE_BADENCODEBYTE, enc_error_info.message, interp);
return new_exception(TE_BADENCODEBYTE, enc_error_info->message, interp);
}
return new_string_cell(c);
}
Expand All @@ -3454,10 +3457,13 @@ mth_file_puts(Toy_Interp *interp, Toy_Type *posargs, Hash *nameargs, int arglen)
int flag_nonewline = 0;
wchar_t *p;
Cell *unicode, *raw;
encoder_error_info enc_error_info;
encoder_error_info *enc_error_info;

if (arglen == 0) goto error;

enc_error_info = GC_MALLOC(sizeof(encoder_error_info));
ALLOC_SAFE(enc_error_info);

if (hash_get_and_unset_t(nameargs, const_nonewline)) {
flag_nonewline = 1;
}
Expand All @@ -3483,9 +3489,9 @@ mth_file_puts(Toy_Interp *interp, Toy_Type *posargs, Hash *nameargs, int arglen)

while (posargs) {
unicode = new_cell(to_string_call(interp, list_get_item(posargs)));
raw = encode_unicode_to_raw(unicode, f->output_encoding, &enc_error_info);
raw = encode_unicode_to_raw(unicode, f->output_encoding, enc_error_info);
if (NULL == raw) {
return new_exception(TE_BADENCODEBYTE, enc_error_info.message, interp);
return new_exception(TE_BADENCODEBYTE, enc_error_info->message, interp);
}
p = cell_get_addr(raw);
c = fputs(to_char(p), f->fd);
Expand Down
26 changes: 25 additions & 1 deletion tests/sjis.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
����������
�����‚Ă�
�Ȃɂʂ˂�
�͂Ђӂւ�
�͂Ђӂւ�
�܂݂ނ߂�
�₢�䂦��
������
Expand All @@ -22,3 +22,27 @@
��
��
��
a��i��u��e��o��
��ka��ki��ku��ke��ko
sa��si��su��se��so��
��ta��ti��tu��te��to
na��ni��nu��ne��no��
��ha��hi��fu��he��ho
ma��mi��mu��me��mo��
��ya��i��yu��e��yo
ra��ri��ru��re��ro��
wa��
��wo
xn��
a��i��u�Ke��o��
��ka�Cki��ku��ke��ko
sa��shi�ssu��se�wso�c
��ta�nchi��tu��te��to
na��ni��nu�Gne�Qno��
�tha��hi�sfu�ohe��ho
ma��mi�gmu��me��mo�r
��ya��i��yu��e��yo
ra��ri��ru��re��ro�F
wa��
��wo
xn��

0 comments on commit 5b44ce1

Please sign in to comment.