Skip to content

Change internal encoding of strings to CESU-8 #616

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 20, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions jerry-core/ecma/base/ecma-helpers-conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,42 +354,40 @@ ecma_utf8_string_to_number (const lit_utf8_byte_t *str_p, /**< utf-8 string */
return ECMA_NUMBER_ZERO;
}

lit_utf8_iterator_t iter = lit_utf8_iterator_create (str_p, str_size);
lit_utf8_byte_t *str_curr_p = (lit_utf8_byte_t *) str_p;
const lit_utf8_byte_t *str_end_p = str_p + str_size;
ecma_char_t code_unit;

while (!lit_utf8_iterator_is_eos (&iter))
while (str_curr_p < str_end_p)
{
code_unit = lit_utf8_iterator_peek_next (&iter);
code_unit = lit_utf8_peek_next (str_curr_p);
if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit))
{
lit_utf8_iterator_incr (&iter);
lit_utf8_incr (&str_curr_p);
}
else
{
break;
}
}

JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle);
const lit_utf8_byte_t *begin_p = iter.buf_p + iter.buf_pos.offset;
const lit_utf8_byte_t *begin_p = str_curr_p;
str_curr_p = (lit_utf8_byte_t *) str_end_p;

iter = lit_utf8_iterator_create (iter.buf_p + iter.buf_pos.offset, str_size - iter.buf_pos.offset);
lit_utf8_iterator_seek_eos (&iter);
while (!lit_utf8_iterator_is_bos (&iter))
while (str_curr_p > str_p)
{
code_unit = lit_utf8_iterator_peek_prev (&iter);
code_unit = lit_utf8_peek_prev (str_curr_p);
if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit))
{
lit_utf8_iterator_decr (&iter);
lit_utf8_decr (&str_curr_p);
}
else
{
break;
}
}

JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle);
const lit_utf8_byte_t *end_p = iter.buf_p + iter.buf_pos.offset - 1;
const lit_utf8_byte_t *end_p = str_curr_p - 1;

if (begin_p > end_p)
{
Expand Down
107 changes: 42 additions & 65 deletions jerry-core/ecma/base/ecma-helpers-string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
lit_utf8_size_t string_size) /**< string size */
{
JERRY_ASSERT (string_p != NULL || string_size == 0);
JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
JERRY_ASSERT (lit_is_cesu8_string_valid (string_p, string_size));

lit_magic_string_id_t magic_string_id;
if (lit_is_utf8_string_magic (string_p, string_size, &magic_string_id))
Expand Down Expand Up @@ -444,7 +444,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
} /* ecma_new_ecma_string_from_utf8 */

/**
* Allocate a new ecma-string and fill it with the utf-8 character that represents the specified code unit
* Allocate a new ecma-string and fill it with the cesu-8 character that represents the specified code unit
*
* @return pointer to ecma-string descriptor
*/
Expand Down Expand Up @@ -627,14 +627,7 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
jerry_fatal (ERR_OUT_OF_MEMORY);
}

ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);

bool is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
&& lit_is_code_unit_low_surrogate (str2_first_code_unit));

lit_utf8_size_t buffer_size = str1_size + str2_size - (lit_utf8_size_t) (is_surrogate_pair_sliced ?
LIT_UTF8_CESU8_SURROGATE_SIZE_DIF : 0);
lit_utf8_size_t buffer_size = str1_size + str2_size;

lit_utf8_byte_t *str_p = (lit_utf8_byte_t *) mem_heap_alloc_block (buffer_size, MEM_HEAP_ALLOC_SHORT_TERM);

Expand All @@ -643,23 +636,9 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
bytes_copied1 = ecma_string_to_utf8_string (string1_p, str_p, (ssize_t) str1_size);
JERRY_ASSERT (bytes_copied1 > 0);

if (!is_surrogate_pair_sliced)
{
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
JERRY_ASSERT (bytes_copied2 > 0);
}
else
{
bytes_copied2 = ecma_string_to_utf8_string (string2_p,
str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + 1,
(ssize_t) buffer_size - bytes_copied1
+ LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
JERRY_ASSERT (bytes_copied2 > 0);
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
JERRY_ASSERT (bytes_copied2 > 0);

lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
str2_first_code_unit);
lit_code_point_to_utf8 (surrogate_code_point, str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
}
ecma_string_t *str_concat_p = ecma_new_ecma_string_from_utf8 (str_p, buffer_size);

mem_heap_free_block ((void*) str_p);
Expand Down Expand Up @@ -955,7 +934,7 @@ ecma_string_get_array_index (const ecma_string_t *str_p, /**< ecma-string */
} /* ecma_string_is_array_index */

/**
* Convert ecma-string's contents to a utf-8 string and put it to the buffer.
* Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
*
* @return number of bytes actually copied to the buffer - if the string's content was copied successfully;
* otherwise (in case the size of the buffer is insufficient) - a negative number, which is calculated
Expand Down Expand Up @@ -1018,7 +997,6 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string

break;
}

case ECMA_STRING_CONTAINER_MAGIC_STRING:
{
const lit_magic_string_id_t id = string_desc_p->u.magic_string_id;
Expand Down Expand Up @@ -1491,7 +1469,7 @@ ecma_string_get_char_at_pos (const ecma_string_t *string_p, /**< ecma-string */
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
JERRY_ASSERT (sz > 0);

ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);;
ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);

MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);

Expand Down Expand Up @@ -1682,10 +1660,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
JERRY_ASSERT (end_pos <= string_length);
#endif

const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;

if (utf8_str_size)
if (start_pos < end_pos)
{
/**
* I. Dump original string to plain buffer
Expand All @@ -1701,20 +1676,22 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
/**
* II. Extract substring
*/
MEM_DEFINE_LOCAL_ARRAY (utf8_substr_buffer, utf8_str_size, lit_utf8_byte_t);
lit_utf8_byte_t *start_p = utf8_str_p;
end_pos -= start_pos;

lit_utf8_size_t utf8_substr_buffer_offset = 0;
for (ecma_length_t idx = 0; idx < span; idx++)
while (start_pos--)
{
ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
start_p += lit_get_unicode_char_size_by_utf8_first_byte (*start_p);
}

JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
lit_utf8_byte_t *end_p = start_p;
while (end_pos--)
{
end_p += lit_get_unicode_char_size_by_utf8_first_byte (*end_p);
}

ecma_string_p = ecma_new_ecma_string_from_utf8 (utf8_substr_buffer, utf8_substr_buffer_offset);
ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p, (lit_utf8_size_t) (end_p - start_p));

MEM_FINALIZE_LOCAL_ARRAY (utf8_substr_buffer);
MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);

return ecma_string_p;
Expand Down Expand Up @@ -1746,47 +1723,47 @@ ecma_string_trim (const ecma_string_t *string_p) /**< pointer to an ecma string
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
JERRY_ASSERT (sz >= 0);

lit_utf8_iterator_t front = lit_utf8_iterator_create (utf8_str_p, buffer_size);

lit_utf8_iterator_t back = lit_utf8_iterator_create (utf8_str_p, buffer_size);
lit_utf8_iterator_seek_eos (&back);

lit_utf8_iterator_pos_t start = lit_utf8_iterator_get_pos (&back);
lit_utf8_iterator_pos_t end = lit_utf8_iterator_get_pos (&front);

ecma_char_t current;
ecma_char_t ch;
lit_utf8_size_t read_size;
lit_utf8_byte_t *nonws_start_p = utf8_str_p + buffer_size;
lit_utf8_byte_t *current_p = utf8_str_p;

/* Trim front. */
while (!lit_utf8_iterator_is_eos (&front))
while (current_p < nonws_start_p)
{
current = lit_utf8_iterator_read_next (&front);
if (!lit_char_is_white_space (current)
&& !lit_char_is_line_terminator (current))
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
{
lit_utf8_iterator_decr (&front);
start = lit_utf8_iterator_get_pos (&front);
nonws_start_p = current_p;
break;
}

current_p += read_size;
}

current_p = utf8_str_p + buffer_size;

/* Trim back. */
while (!lit_utf8_iterator_is_bos (&back))
while (current_p > utf8_str_p)
{
current = lit_utf8_iterator_read_prev (&back);
if (!lit_char_is_white_space (current)
&& !lit_char_is_line_terminator (current))
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);

if (!lit_char_is_white_space (ch)
&& !lit_char_is_line_terminator (ch))
{
lit_utf8_iterator_incr (&back);
end = lit_utf8_iterator_get_pos (&back);
break;
}

current_p -= read_size;
}

/* Construct new string. */
if (end.offset > start.offset)
if (current_p > nonws_start_p)
{
ret_string_p = ecma_new_ecma_string_from_utf8 (utf8_str_p + start.offset,
(lit_utf8_size_t) (end.offset - start.offset));
ret_string_p = ecma_new_ecma_string_from_utf8 (nonws_start_p,
(lit_utf8_size_t) (current_p - nonws_start_p));
}
else
{
Expand Down
Loading