Skip to content

Commit 6a3dec5

Browse files
committed
Add API functions to create string from a valid UTF-8 string.
JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
1 parent dc5ae46 commit 6a3dec5

File tree

4 files changed

+151
-0
lines changed

4 files changed

+151
-0
lines changed

jerry-core/ecma/base/ecma-helpers-string.c

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,120 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
215215
return string_desc_p;
216216
} /* ecma_new_ecma_string_from_utf8 */
217217

218+
/**
219+
* Allocate a new ecma-string and initialize it from the utf8 string argument.
220+
* All 4-bytes long unicode sequences are converted into two 3-bytes long sequences.
221+
*
222+
* @return pointer to ecma-string descriptor
223+
*/
224+
ecma_string_t *
225+
ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string_p, /**< utf-8 string */
226+
lit_utf8_size_t string_size) /**< utf-8 string size */
227+
{
228+
JERRY_ASSERT (string_p != NULL || string_size == 0);
229+
230+
ecma_string_t *string_desc_p = NULL;
231+
232+
ecma_length_t str_length = 0;
233+
lit_utf8_size_t conv_size = 0;
234+
lit_utf8_size_t size = 0;
235+
236+
/* Calculate the required length and size information of the converted cesu-8 encoded string */
237+
while (size < string_size)
238+
{
239+
if ((string_p[size] & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
240+
{
241+
size++;
242+
}
243+
else if ((string_p[size] & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
244+
{
245+
size += 2;
246+
}
247+
else if ((string_p[size] & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
248+
{
249+
size += 3;
250+
}
251+
else
252+
{
253+
JERRY_ASSERT ((string_p[size] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
254+
size += 4;
255+
conv_size += 2;
256+
}
257+
258+
str_length++;
259+
}
260+
261+
JERRY_ASSERT (size == string_size);
262+
263+
if (conv_size == 0)
264+
{
265+
return ecma_new_ecma_string_from_utf8 (string_p, string_size);
266+
}
267+
else
268+
{
269+
conv_size += size;
270+
271+
JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
272+
273+
lit_utf8_byte_t *data_p;
274+
275+
if (likely (string_size <= UINT16_MAX))
276+
{
277+
string_desc_p = jmem_heap_alloc_block (sizeof (ecma_string_t) + conv_size);
278+
279+
string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_UTF8_STRING | ECMA_STRING_REF_ONE;
280+
string_desc_p->u.common_field = 0;
281+
string_desc_p->u.utf8_string.size = (uint16_t) conv_size;
282+
string_desc_p->u.utf8_string.length = (uint16_t) str_length;
283+
284+
data_p = (lit_utf8_byte_t *) (string_desc_p + 1);
285+
}
286+
else
287+
{
288+
string_desc_p = jmem_heap_alloc_block (sizeof (ecma_long_string_t) + conv_size);
289+
290+
string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING | ECMA_STRING_REF_ONE;
291+
string_desc_p->u.common_field = 0;
292+
string_desc_p->u.long_utf8_string_size = conv_size;
293+
294+
ecma_long_string_t *long_string_desc_p = (ecma_long_string_t *) string_desc_p;
295+
long_string_desc_p->long_utf8_string_length = str_length;
296+
297+
data_p = (lit_utf8_byte_t *) (long_string_desc_p + 1);
298+
}
299+
300+
size = 0;
301+
302+
while (size < string_size)
303+
{
304+
if ((string_p[size] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
305+
{
306+
/* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */
307+
uint32_t character = ((((uint32_t) string_p[size++]) & 0x7) << 18);
308+
character |= ((((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
309+
character |= ((((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
310+
character |= (((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK);
311+
312+
JERRY_ASSERT (character >= 0x10000);
313+
character -= 0x10000;
314+
315+
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xd800 | (character >> 10)));
316+
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK)));
317+
}
318+
else
319+
{
320+
*data_p++ = string_p[size++];
321+
}
322+
}
323+
324+
JERRY_ASSERT (size == string_size);
325+
326+
string_desc_p->hash = lit_utf8_string_calc_hash (data_p, conv_size);
327+
}
328+
329+
return string_desc_p;
330+
} /* ecma_new_ecma_string_from_utf8_converted_to_cesu8 */
331+
218332
/**
219333
* Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
220334
*

jerry-core/ecma/base/ecma-helpers.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ extern void ecma_free_value_if_not_object (ecma_value_t);
164164

165165
/* ecma-helpers-string.c */
166166
extern ecma_string_t *ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t);
167+
extern ecma_string_t *ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *, lit_utf8_size_t);
167168
extern ecma_string_t *ecma_new_ecma_string_from_code_unit (ecma_char_t);
168169
extern ecma_string_t *ecma_new_ecma_string_from_uint32 (uint32_t);
169170
extern ecma_string_t *ecma_new_ecma_string_from_number (ecma_number_t);

jerry-core/jerry-api.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ jerry_value_t jerry_create_number_infinity (bool);
248248
jerry_value_t jerry_create_number_nan (void);
249249
jerry_value_t jerry_create_null (void);
250250
jerry_value_t jerry_create_object (void);
251+
jerry_value_t jerry_create_string_from_utf8 (const jerry_char_t *);
252+
jerry_value_t jerry_create_string_sz_from_utf8 (const jerry_char_t *, jerry_size_t);
251253
jerry_value_t jerry_create_string (const jerry_char_t *);
252254
jerry_value_t jerry_create_string_sz (const jerry_char_t *, jerry_size_t);
253255
jerry_value_t jerry_create_undefined (void);

jerry-core/jerry.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,40 @@ jerry_create_object (void)
913913
return ecma_make_object_value (ecma_op_create_object_object_noarg ());
914914
} /* jerry_create_object */
915915

916+
/**
917+
* Create string from a valid UTF8 string
918+
*
919+
* Note:
920+
* returned value must be freed with jerry_release_value when it is no longer needed.
921+
*
922+
* @return value of the created string
923+
*/
924+
jerry_value_t
925+
jerry_create_string_from_utf8 (const jerry_char_t *str_p) /**< pointer to string */
926+
{
927+
return jerry_create_string_sz_from_utf8 (str_p, lit_zt_utf8_string_size ((lit_utf8_byte_t *) str_p));
928+
} /* jerry_create_string_from_utf8 */
929+
930+
/**
931+
* Create string from a valid UTF8 string
932+
*
933+
* Note:
934+
* returned value must be freed with jerry_release_value when it is no longer needed.
935+
*
936+
* @return value of the created string
937+
*/
938+
jerry_value_t
939+
jerry_create_string_sz_from_utf8 (const jerry_char_t *str_p, /**< pointer to string */
940+
jerry_size_t str_size) /**< string size */
941+
{
942+
jerry_assert_api_available ();
943+
944+
ecma_string_t *ecma_str_p = ecma_new_ecma_string_from_utf8_converted_to_cesu8 ((lit_utf8_byte_t *) str_p,
945+
(lit_utf8_size_t) str_size);
946+
947+
return ecma_make_string_value (ecma_str_p);
948+
} /* jerry_create_string_sz_from_utf8 */
949+
916950
/**
917951
* Create string from a valid CESU8 string
918952
*

0 commit comments

Comments
 (0)