Only read number of bytes indicated by string data item
String parsing now reads no more bytes than indicated by the data item. For any incomplete UTF-8 code point, an exception with a corresponding message is thrown.
knutwannheden committed Oct 17, 2024
1 parent 0f75408 commit 5bfa87a
Showing 3 changed files with 72 additions and 57 deletions.
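For context, a minimal sketch (not part of this commit) of the behavior the fix produces, reusing the truncated input from the tests added below. Creating the parser directly via CBORFactory and the class name are illustrative; the repository's tests use a cborParser(...) helper from CBORTestBase instead.

    import com.fasterxml.jackson.core.JsonParser;
    import com.fasterxml.jackson.core.exc.StreamReadException;
    import com.fasterxml.jackson.dataformat.cbor.CBORFactory;

    public class TruncatedCborStringDemo {
        public static void main(String[] args) throws Exception {
            // 0x63 = CBOR text string of length 3: 'A', '-', then 0xda, which starts
            // a two-byte UTF-8 sequence whose continuation byte never arrives.
            final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
            try (JsonParser p = new CBORFactory().createParser(input)) {
                p.nextToken();  // JsonToken.VALUE_STRING
                p.getText();    // with this fix: throws StreamReadException
                                // "Truncated UTF-8 character in Unicode String value (3 bytes): ..."
            } catch (StreamReadException e) {
                System.err.println(e.getOriginalMessage());
            }
        }
    }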
@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
}
return;
}
- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
- // the longest individual unit is 4 bytes (surrogate pair) so we
- // actually need len+3 bytes to avoid bounds checks
// 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
// like Integer.MAX_VALUE
- final int needed = Math.max(len, len + 3);
final int available = _inputEnd - _inputPtr;

- if ((available >= needed)
+ if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
-     || ((_inputBuffer.length >= needed)
-         && _tryToLoadToHaveAtLeast(needed))) {
+     || ((_inputBuffer.length >= len)
+         && _tryToLoadToHaveAtLeast(len))) {
_finishShortText(len);
return;
}
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
_finishChunkedText();
return _textBuffer.contentsAsString();
}
- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
- // the longest individual unit is 4 bytes (surrogate pair) so we
- // actually need len+3 bytes to avoid bounds checks

// 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
// we get is Integer.MAX_VALUE, leading to overflow. Could change values
// to longs but simpler to truncate "needed" (will never pass following test
// due to inputBuffer never being even close to that big).

- final int needed = Math.max(len + 3, len);
final int available = _inputEnd - _inputPtr;

- if ((available >= needed)
+ if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
-     || ((_inputBuffer.length >= needed)
-         && _tryToLoadToHaveAtLeast(needed))) {
+     || ((_inputBuffer.length >= len)
+         && _tryToLoadToHaveAtLeast(len))) {
return _finishShortText(len);
}
// If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
final byte[] inputBuf = _inputBuffer;

// Let's actually do a tight loop for ASCII first:
- final int end = inPtr + len;
+ final int end = _inputPtr;

int i;
while ((i = inputBuf[inPtr]) >= 0) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
final int[] codes = UTF8_UNIT_CODES;
do {
i = inputBuf[inPtr++] & 0xFF;
- switch (codes[i]) {
- case 0:
-     break;
- case 1:
-     {
-         final int c2 = inputBuf[inPtr++];
-         if ((c2 & 0xC0) != 0x080) {
-             _reportInvalidOther(c2 & 0xFF, inPtr);
-         }
-         i = ((i & 0x1F) << 6) | (c2 & 0x3F);
-     }
-     break;
- case 2:
-     {
-         final int c2 = inputBuf[inPtr++];
-         if ((c2 & 0xC0) != 0x080) {
-             _reportInvalidOther(c2 & 0xFF, inPtr);
-         }
-         final int c3 = inputBuf[inPtr++];
-         if ((c3 & 0xC0) != 0x080) {
-             _reportInvalidOther(c3 & 0xFF, inPtr);
-         }
-         i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
-     }
-     break;
- case 3:
-     // 30-Jan-2021, tatu: TODO - validate these too?
-     i = ((i & 0x07) << 18)
-         | ((inputBuf[inPtr++] & 0x3F) << 12)
-         | ((inputBuf[inPtr++] & 0x3F) << 6)
-         | (inputBuf[inPtr++] & 0x3F);
-     // note: this is the codepoint value; need to split, too
-     i -= 0x10000;
-     outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
-     i = 0xDC00 | (i & 0x3FF);
-     break;
- default: // invalid
-     _reportInvalidInitial(i);
- }
+ int code = codes[i];
+ if (code != 0) {
+     // 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
+     // be careful wrt end-of-buffer truncated codepoints
+     if ((inPtr + code) > end) {
+         final int firstCharOffset = len - (end - inPtr) - 1;
+         _reportTruncatedUTF8InString(len, firstCharOffset, i, code);
+     }
+
+     switch (code) {
+     case 1: {
+         final int c2 = inputBuf[inPtr++];
+         if ((c2 & 0xC0) != 0x080) {
+             _reportInvalidOther(c2 & 0xFF, inPtr);
+         }
+         i = ((i & 0x1F) << 6) | (c2 & 0x3F);
+     }
+         break;
+     case 2: {
+         final int c2 = inputBuf[inPtr++];
+         if ((c2 & 0xC0) != 0x080) {
+             _reportInvalidOther(c2 & 0xFF, inPtr);
+         }
+         final int c3 = inputBuf[inPtr++];
+         if ((c3 & 0xC0) != 0x080) {
+             _reportInvalidOther(c3 & 0xFF, inPtr);
+         }
+         i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+     }
+         break;
+     case 3:
+         // 30-Jan-2021, tatu: TODO - validate these too?
+         i = ((i & 0x07) << 18)
+             | ((inputBuf[inPtr++] & 0x3F) << 12)
+             | ((inputBuf[inPtr++] & 0x3F) << 6)
+             | (inputBuf[inPtr++] & 0x3F);
+         // note: this is the codepoint value; need to split, too
+         i -= 0x10000;
+         outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
+         i = 0xDC00 | (i & 0x3FF);
+         break;
+     default: // invalid
+         _reportInvalidInitial(i);
+     }
+ }
outBuf[outPtr++] = (char) i;
} while (inPtr < end);
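The heart of the change is the bounds check added in the loop above: before decoding a multi-byte UTF-8 sequence, the parser now verifies that all continuation bytes still lie within the current string data item. A simplified standalone sketch of that check follows (an illustration, not the parser's actual code: utf8UnitCode() stands in for the UTF8_UNIT_CODES table, and a plain runtime exception stands in for the StreamReadException raised via _reportTruncatedUTF8InString):

    // Throws if the UTF-8 code point starting at buf[inPtr] would run past "end",
    // the index just past the last byte of the string value; "len" is the value's
    // declared byte length, used only for the error message.
    static void checkCodePointFits(byte[] buf, int inPtr, int end, int len) {
        final int i = buf[inPtr++] & 0xFF;
        final int code = utf8UnitCode(i);        // continuation bytes still needed
        if (code > 0 && (inPtr + code) > end) {
            // Offset of the truncated lead byte within the string value,
            // mirroring "len - (end - inPtr) - 1" in the patch.
            final int firstCharOffset = len - (end - inPtr) - 1;
            throw new IllegalStateException(String.format(
                    "Truncated UTF-8 character in Unicode String value (%d bytes): "
                    + "byte 0x%02X at offset #%d indicated %d more bytes needed",
                    len, i, firstCharOffset, code));
        }
    }

    // Stand-in for the parser's UTF8_UNIT_CODES lookup.
    static int utf8UnitCode(int b) {
        if (b < 0x80)           return 0;  // 1-byte (ASCII): no continuation bytes
        if ((b & 0xE0) == 0xC0) return 1;  // 2-byte sequence
        if ((b & 0xF0) == 0xE0) return 2;  // 3-byte sequence
        if ((b & 0xF8) == 0xF0) return 3;  // 4-byte sequence
        return -1;                         // invalid lead byte, handled elsewhere
    }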
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOException
expLen, actLen), _currToken);
}

- // @since 2.13
- /*
+ // @since 2.19
private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset,
int firstUTFByteValue, int bytesExpected)
throws IOException
{
throw _constructError(String.format(
- "Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
+ "Truncated UTF-8 character in Unicode String value (%d bytes): "
+"byte 0x%02X at offset #%d indicated %d more bytes needed",
strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected));
}
- */

// @since 2.13
private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset,
@@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception
p.getText();
fail("Should not pass");
} catch (StreamReadException e) {
verifyException(e, "Malformed UTF-8 character at the end of a");
verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");
}

}
@@ -5,6 +5,9 @@
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;

+ import java.nio.charset.StandardCharsets;
+ import java.util.Arrays;

public class ParseInvalidUTF8String236Test extends CBORTestBase
{
// [dataformats-binary#236]: Original version; broken UTF-8 all around.
@@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception
}

// Variant where the length would be valid, but the last byte is partial UTF-8
- // code point
- public void testShortString236EndsWithPartialUTF8() throws Exception
+ // code point and no more bytes are available due to end-of-stream
+ public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception
{
final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
try (CBORParser p = cborParser(input)) {
@@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception
String str = p.getText();
fail("Should have failed, did not, String = '"+str+"'");
} catch (StreamReadException e) {
verifyException(e, "Malformed UTF-8 character at the end of");
verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)");
}
}
}

+ // Variant where the length would be valid, but the last byte is partial UTF-8
+ // code point and the subsequent byte would be a valid continuation byte, but belongs to next data item
+ public void testShortString236EndsWithPartialUTF8() throws Exception
+ {
+     final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0};
+     try (CBORParser p = cborParser(input)) {
+         assertToken(JsonToken.VALUE_STRING, p.nextToken());
+         try {
+             String str = p.getText();
+             fail("Should have failed, did not, String = '"+str+"'");
+         } catch (StreamReadException e) {
+             verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)");
+         }
+     }
+ }
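For reference, a byte-by-byte reading of this new test's input (an annotation, not part of the commit), following standard CBOR major-type encoding:

    final byte[] input = {
        0x62,          // CBOR major type 3 (text string), declared length 2
        0x33,          // '3' - a complete 1-byte UTF-8 character
        (byte) 0xdb,   // 110xxxxx - lead byte of a 2-byte UTF-8 sequence, 1 more byte needed
        (byte) 0xa0    // 10xxxxxx - would be a valid continuation byte, but it is the start
                       // of the NEXT data item (an empty map), so the 2-byte string is truncated
    };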
