@@ -215,6 +215,120 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
215
215
return string_desc_p ;
216
216
} /* ecma_new_ecma_string_from_utf8 */
217
217
218
+ /**
219
+ * Allocate a new ecma-string and initialize it from the utf8 string argument.
220
+ * All 4-bytes long unicode sequences are converted into two 3-bytes long sequences.
221
+ *
222
+ * @return pointer to ecma-string descriptor
223
+ */
224
+ ecma_string_t *
225
+ ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t * string_p , /**< utf-8 string */
226
+ lit_utf8_size_t string_size ) /**< utf-8 string size */
227
+ {
228
+ JERRY_ASSERT (string_p != NULL || string_size == 0 );
229
+
230
+ ecma_string_t * string_desc_p = NULL ;
231
+
232
+ ecma_length_t str_length = 0 ;
233
+ lit_utf8_size_t conv_size = 0 ;
234
+ lit_utf8_size_t size = 0 ;
235
+
236
+ /* Calculate the required length and size information of the converted cesu-8 encoded string */
237
+ while (size < string_size )
238
+ {
239
+ if ((string_p [size ] & LIT_UTF8_1_BYTE_MASK ) == LIT_UTF8_1_BYTE_MARKER )
240
+ {
241
+ size ++ ;
242
+ }
243
+ else if ((string_p [size ] & LIT_UTF8_2_BYTE_MASK ) == LIT_UTF8_2_BYTE_MARKER )
244
+ {
245
+ size += 2 ;
246
+ }
247
+ else if ((string_p [size ] & LIT_UTF8_3_BYTE_MASK ) == LIT_UTF8_3_BYTE_MARKER )
248
+ {
249
+ size += 3 ;
250
+ }
251
+ else
252
+ {
253
+ JERRY_ASSERT ((string_p [size ] & LIT_UTF8_4_BYTE_MASK ) == LIT_UTF8_4_BYTE_MARKER );
254
+ size += 4 ;
255
+ conv_size += 2 ;
256
+ }
257
+
258
+ str_length ++ ;
259
+ }
260
+
261
+ JERRY_ASSERT (size == string_size );
262
+
263
+ if (conv_size == 0 )
264
+ {
265
+ return ecma_new_ecma_string_from_utf8 (string_p , string_size );
266
+ }
267
+ else
268
+ {
269
+ conv_size += size ;
270
+
271
+ JERRY_ASSERT (lit_is_utf8_string_valid (string_p , string_size ));
272
+
273
+ lit_utf8_byte_t * data_p ;
274
+
275
+ if (likely (string_size <= UINT16_MAX ))
276
+ {
277
+ string_desc_p = jmem_heap_alloc_block (sizeof (ecma_string_t ) + conv_size );
278
+
279
+ string_desc_p -> refs_and_container = ECMA_STRING_CONTAINER_HEAP_UTF8_STRING | ECMA_STRING_REF_ONE ;
280
+ string_desc_p -> u .common_field = 0 ;
281
+ string_desc_p -> u .utf8_string .size = (uint16_t ) conv_size ;
282
+ string_desc_p -> u .utf8_string .length = (uint16_t ) str_length ;
283
+
284
+ data_p = (lit_utf8_byte_t * ) (string_desc_p + 1 );
285
+ }
286
+ else
287
+ {
288
+ string_desc_p = jmem_heap_alloc_block (sizeof (ecma_long_string_t ) + conv_size );
289
+
290
+ string_desc_p -> refs_and_container = ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING | ECMA_STRING_REF_ONE ;
291
+ string_desc_p -> u .common_field = 0 ;
292
+ string_desc_p -> u .long_utf8_string_size = conv_size ;
293
+
294
+ ecma_long_string_t * long_string_desc_p = (ecma_long_string_t * ) string_desc_p ;
295
+ long_string_desc_p -> long_utf8_string_length = str_length ;
296
+
297
+ data_p = (lit_utf8_byte_t * ) (long_string_desc_p + 1 );
298
+ }
299
+
300
+ size = 0 ;
301
+
302
+ while (size < string_size )
303
+ {
304
+ if ((string_p [size ] & LIT_UTF8_4_BYTE_MASK ) == LIT_UTF8_4_BYTE_MARKER )
305
+ {
306
+ /* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */
307
+ uint32_t character = ((((uint32_t ) string_p [size ++ ]) & 0x7 ) << 18 );
308
+ character |= ((((uint32_t ) string_p [size ++ ]) & LIT_UTF8_LAST_6_BITS_MASK ) << 12 );
309
+ character |= ((((uint32_t ) string_p [size ++ ]) & LIT_UTF8_LAST_6_BITS_MASK ) << 6 );
310
+ character |= (((uint32_t ) string_p [size ++ ]) & LIT_UTF8_LAST_6_BITS_MASK );
311
+
312
+ JERRY_ASSERT (character >= 0x10000 );
313
+ character -= 0x10000 ;
314
+
315
+ data_p += lit_char_to_utf8_bytes (data_p , (ecma_char_t ) (0xd800 | (character >> 10 )));
316
+ data_p += lit_char_to_utf8_bytes (data_p , (ecma_char_t ) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK )));
317
+ }
318
+ else
319
+ {
320
+ * data_p ++ = string_p [size ++ ];
321
+ }
322
+ }
323
+
324
+ JERRY_ASSERT (size == string_size );
325
+
326
+ string_desc_p -> hash = lit_utf8_string_calc_hash (data_p , conv_size );
327
+ }
328
+
329
+ return string_desc_p ;
330
+ } /* ecma_new_ecma_string_from_utf8_converted_to_cesu8 */
331
+
218
332
/**
219
333
* Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
220
334
*
0 commit comments