Skip to content

Commit 9703fc4

Browse files
committed
ObjectiveC: Use NSUInteger and explain some bit shifting
1 parent 57c8246 commit 9703fc4

File tree

1 file changed

+43
-10
lines changed

1 file changed

+43
-10
lines changed

objectivec/DiffMatchPatch.m

+43-10
Original file line numberDiff line numberDiff line change
@@ -1342,7 +1342,7 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs;
13421342
return delta;
13431343
}
13441344

1345-
- (NSInteger)diff_digit16:(unichar)c
1345+
- (NSUInteger)diff_digit16:(unichar)c
13461346
{
13471347
switch (c) {
13481348
case '0': return 0;
@@ -1366,6 +1366,16 @@ - (NSInteger)diff_digit16:(unichar)c
13661366
}
13671367
}
13681368

1369+
/**
1370+
* Decode a percent-encoded UTF-8 string into a string of UTF-16 code units.
1371+
* This is more permissive than `stringByRemovingPercentEncoding` because
1372+
* that fails if the input represents invalid Unicode characters. However, different
1373+
* diff-match-patch libraries may encode surrogate halves as if they were valid
1374+
* Unicode code points. Therefore, instead of failing or corrupting the output, which
1375+
* `stringByRemovingPercentEncoding` does when it inserts "(null)" in these places,
1376+
* we can decode it anyway; once the string is reconstructed from the diffs,
1377+
* we'll end up with valid Unicode again, after the surrogate halves are re-joined.
1378+
*/
13691379
- (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded
13701380
{
13711381
unichar decoded[[percentEncoded length]];
@@ -1376,74 +1386,97 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded
13761386
while (input < [percentEncoded length]) {
13771387
unichar c = [percentEncoded characterAtIndex:input];
13781388

1389+
// not a percent escape, so copy the character to the output unchanged
13791390
if ('%' != c) {
13801391
decoded[output++] = c;
13811392
input += 1;
13821393
continue;
13831394
}
13841395

1385-
uint16 byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) +
1386-
[self diff_digit16:[percentEncoded characterAtIndex:(input+2)]];
1396+
NSUInteger byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) +
1397+
[self diff_digit16:[percentEncoded characterAtIndex:(input+2)]];
13871398

1399+
// single-byte UTF-8 first byte has bitmask 0xxx xxxx
13881400
if ((byte1 & 0x80) == 0) {
13891401
decoded[output++] = byte1;
13901402
input += 3;
13911403
continue;
13921404
}
13931405

1406+
// at least one continuation byte
13941407
if ('%' != [percentEncoded characterAtIndex:(input + 3)]) {
13951408
return nil;
13961409
}
13971410

1398-
uint16 byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) +
1399-
[self diff_digit16:[percentEncoded characterAtIndex:(input+5)]];
1411+
NSUInteger byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) +
1412+
[self diff_digit16:[percentEncoded characterAtIndex:(input+5)]];
14001413

1414+
// continuation bytes have bitmask 10xx xxxx
14011415
if ((byte2 & 0xC0) != 0x80) {
14021416
return nil;
14031417
}
14041418

1419+
// continuation bytes thus only contribute six bits each
1420+
// these data bits are found with the bit mask xx11 1111
14051421
byte2 = byte2 & 0x3F;
14061422

1423+
// in two-byte sequences the first byte has bitmask 110x xxxx
14071424
if ((byte1 & 0xE0) == 0xC0) {
1425+
// byte1 ___x xxxx << 6
1426+
// byte2 __yy yyyy
1427+
// value x xxxxyy yyyy -> 11 bits
14081428
decoded[output++] = ((byte1 & 0x1F) << 6) | byte2;
14091429
input += 6;
14101430
continue;
14111431
}
14121432

1433+
// at least two continuation bytes
14131434
if ('%' != [percentEncoded characterAtIndex:(input + 6)]) {
14141435
return nil;
14151436
}
14161437

1417-
uint16 byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) +
1418-
[self diff_digit16:[percentEncoded characterAtIndex:(input+8)]];
1438+
NSUInteger byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) +
1439+
[self diff_digit16:[percentEncoded characterAtIndex:(input+8)]];
14191440

14201441
if ((byte3 & 0xC0) != 0x80) {
14211442
return nil;
14221443
}
14231444

14241445
byte3 = byte3 & 0x3F;
14251446

1447+
// in three-byte sequences the first byte has bitmask 1110 xxxx
14261448
if ((byte1 & 0xF0) == 0xE0) {
1449+
// byte1 ____ xxxx << 12
1450+
// byte2 __yy yyyy << 6
1451+
// byte3 __zz zzzz
1452+
// value xxxxyy yyyyzz zzzz -> 16 bits
14271453
decoded[output++] = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
14281454
input += 9;
14291455
continue;
14301456
}
14311457

1458+
// three continuation bytes
14321459
if ('%' != [percentEncoded characterAtIndex:(input + 9)]) {
14331460
return nil;
14341461
}
14351462

1436-
uint16 byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) +
1437-
[self diff_digit16:[percentEncoded characterAtIndex:(input+11)]];
1463+
NSUInteger byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) +
1464+
[self diff_digit16:[percentEncoded characterAtIndex:(input+11)]];
14381465

14391466
if ((byte4 & 0xC0) != 0x80) {
14401467
return nil;
14411468
}
14421469

14431470
byte4 = byte4 & 0x3F;
14441471

1472+
// in four-byte sequences the first byte has bitmask 1111 0xxx
14451473
if ((byte1 & 0xF8) == 0xF0) {
1446-
uint32 codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
1474+
// byte1 ____ _xxx << 18
1475+
// byte2 __yy yyyy << 12
1476+
// byte3 __zz zzzz << 6
1477+
// byte4 __tt tttt
1478+
// value xxxyy yyyyzz zzzztt tttt -> 21 bits
1479+
NSUInteger codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
14471480
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
14481481
codePoint -= 0x010000;
14491482
decoded[output++] = ((codePoint >> 10) & 0x3FF) | 0xD800;

0 commit comments

Comments
 (0)