@@ -1342,7 +1342,7 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs;
1342
1342
return delta;
1343
1343
}
1344
1344
1345
- - (NSInteger )diff_digit16 : (unichar )c
1345
+ - (NSUInteger )diff_digit16 : (unichar )c
1346
1346
{
1347
1347
switch (c) {
1348
1348
case ' 0' : return 0 ;
@@ -1366,6 +1366,16 @@ - (NSInteger)diff_digit16:(unichar)c
1366
1366
}
1367
1367
}
1368
1368
1369
+ /**
1370
+ * Decode a percent-encoded UTF-8 string into a string of UTF-16 code units.
1371
+ * This is more permissive than `stringByRemovingPercentEncoding` because
1372
+ * that fails if the input represents invalid Unicode characters. However, different
1373
+ * diff-match-patch libraries may encode surrogate halves as if they were valid
1374
+ * Unicode code points. Therefore, instead of failing or corrupting the output, which
1375
+ * `stringByRemovingPercentEncoding` does when it inserts "(null)" in these places,
1376
+ * we can decode it anyway and then once the string is reconstructed from the diffs
1377
+ * we'll end up with valid Unicode again, after the surrogate halves are re-joined.
1378
+ */
1369
1379
- (NSString *)diff_decodeURIWithText : (NSString *)percentEncoded
1370
1380
{
1371
1381
unichar decoded[[percentEncoded length ]];
@@ -1376,74 +1386,97 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded
1376
1386
while (input < [percentEncoded length ]) {
1377
1387
unichar c = [percentEncoded characterAtIndex: input];
1378
1388
1389
+ // not a percent escape, so copy the code unit through unchanged
1379
1390
if (' %' != c) {
1380
1391
decoded[output++] = c;
1381
1392
input += 1 ;
1382
1393
continue ;
1383
1394
}
1384
1395
1385
- uint16 byte1 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+1 )]] << 4 ) +
1386
- [self diff_digit16: [percentEncoded characterAtIndex: (input+2 )]];
1396
+ NSUInteger byte1 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+1 )]] << 4 ) +
1397
+ [self diff_digit16: [percentEncoded characterAtIndex: (input+2 )]];
1387
1398
1399
+ // single-byte UTF-8 first byte has bitmask 0xxx xxxx
1388
1400
if ((byte1 & 0x80 ) == 0 ) {
1389
1401
decoded[output++] = byte1;
1390
1402
input += 3 ;
1391
1403
continue ;
1392
1404
}
1393
1405
1406
+ // at least one continuation byte
1394
1407
if (' %' != [percentEncoded characterAtIndex: (input + 3 )]) {
1395
1408
return nil ;
1396
1409
}
1397
1410
1398
- uint16 byte2 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+4 )]] << 4 ) +
1399
- [self diff_digit16: [percentEncoded characterAtIndex: (input+5 )]];
1411
+ NSUInteger byte2 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+4 )]] << 4 ) +
1412
+ [self diff_digit16: [percentEncoded characterAtIndex: (input+5 )]];
1400
1413
1414
+ // continuation bytes have bitmask 10xx xxxx
1401
1415
if ((byte2 & 0xC0 ) != 0x80 ) {
1402
1416
return nil ;
1403
1417
}
1404
1418
1419
+ // continuation bytes thus only contribute six bits each
1420
+ // these data bits are found with the bit mask xx11 1111
1405
1421
byte2 = byte2 & 0x3F ;
1406
1422
1423
+ // in two-byte sequences the first byte has bitmask 110x xxxx
1407
1424
if ((byte1 & 0xE0 ) == 0xC0 ) {
1425
+ // byte1 ___x xxxx << 6
1426
+ // byte2 __yy yyyy
1427
+ // value x xxxxyy yyyy -> 11 bits
1408
1428
decoded[output++] = ((byte1 & 0x1F ) << 6 ) | byte2;
1409
1429
input += 6 ;
1410
1430
continue ;
1411
1431
}
1412
1432
1433
+ // at least two continuation bytes
1413
1434
if (' %' != [percentEncoded characterAtIndex: (input + 6 )]) {
1414
1435
return nil ;
1415
1436
}
1416
1437
1417
- uint16 byte3 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+7 )]] << 4 ) +
1418
- [self diff_digit16: [percentEncoded characterAtIndex: (input+8 )]];
1438
+ NSUInteger byte3 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+7 )]] << 4 ) +
1439
+ [self diff_digit16: [percentEncoded characterAtIndex: (input+8 )]];
1419
1440
1420
1441
if ((byte3 & 0xC0 ) != 0x80 ) {
1421
1442
return nil ;
1422
1443
}
1423
1444
1424
1445
byte3 = byte3 & 0x3F ;
1425
1446
1447
+ // in three-byte sequences the first byte has bitmask 1110 xxxx
1426
1448
if ((byte1 & 0xF0 ) == 0xE0 ) {
1449
+ // byte1 ____ xxxx << 12
1450
+ // byte2 __yy yyyy << 6
1451
+ // byte3 __zz zzzz
1452
+ // value xxxxyy yyyyzz zzzz -> 16 bits
1427
1453
decoded[output++] = ((byte1 & 0x0F ) << 12 ) | (byte2 << 6 ) | byte3;
1428
1454
input += 9 ;
1429
1455
continue ;
1430
1456
}
1431
1457
1458
+ // three continuation bytes
1432
1459
if (' %' != [percentEncoded characterAtIndex: (input + 9 )]) {
1433
1460
return nil ;
1434
1461
}
1435
1462
1436
- uint16 byte4 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+10 )]] << 4 ) +
1437
- [self diff_digit16: [percentEncoded characterAtIndex: (input+11 )]];
1463
+ NSUInteger byte4 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+10 )]] << 4 ) +
1464
+ [self diff_digit16: [percentEncoded characterAtIndex: (input+11 )]];
1438
1465
1439
1466
if ((byte4 & 0xC0 ) != 0x80 ) {
1440
1467
return nil ;
1441
1468
}
1442
1469
1443
1470
byte4 = byte4 & 0x3F ;
1444
1471
1472
+ // in four-byte sequences the first byte has bitmask 1111 0xxx
1445
1473
if ((byte1 & 0xF8 ) == 0xF0 ) {
1446
- uint32 codePoint = ((byte1 & 0x07 ) << 0x12 ) | (byte2 << 0x0C ) | (byte3 << 0x06 ) | byte4;
1474
+ // byte1 ____ _xxx << 18
1475
+ // byte2 __yy yyyy << 12
1476
+ // byte3 __zz zzzz << 6
1477
+ // byte4 __tt tttt
1478
+ // value xxxyy yyyyzz zzzztt tttt -> 21 bits
1479
+ NSUInteger codePoint = ((byte1 & 0x07 ) << 0x12 ) | (byte2 << 0x0C ) | (byte3 << 0x06 ) | byte4;
1447
1480
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF ) {
1448
1481
codePoint -= 0x010000 ;
1449
1482
decoded[output++] = ((codePoint >> 10 ) & 0x3FF ) | 0xD800 ;
0 commit comments