Skip to content

Commit

Permalink
Add pg_encoding_set_invalid()
Browse files Browse the repository at this point in the history
There are cases where we cannot / do not want to error out for invalidly
encoded input. In such cases it can be useful to replace e.g. an incomplete
multi-byte character with bytes that will trigger an error when getting
validated as part of a larger string.

Unfortunately, until now, for some encodings no such sequence existed. For
those encodings this commit removes one previously accepted input combination
- we consider that to be ok, as the chosen bytes are outside of the valid
ranges for the encodings, we just previously failed to detect that.

As we cannot add a new field to pg_wchar_table without breaking ABI, this is
implemented "in-line" in the newly added function.

Author: Noah Misch <noah@leadboat.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Backpatch-through: 13
Security: CVE-2025-1094
  • Loading branch information
anarazel committed Feb 10, 2025
1 parent 439776b commit 7d43ca6
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 1 deletion.
55 changes: 54 additions & 1 deletion src/common/wchar.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,25 @@
#include "utils/ascii.h"


/*
 * In today's multibyte encodings other than UTF8, this two-byte sequence
 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
 *
 * For historical reasons, several verifychar implementations opt to reject
 * this pair specifically.  Byte pair range constraints, in encoding
 * originator documentation, always excluded this pair.  No core conversion
 * could translate it.  However, longstanding verifychar implementations
 * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
 * pairs not valid per encoding originator documentation.  To avoid tightening
 * core or non-core conversions in a security patch, we sought this one pair.
 *
 * PQescapeString() historically used spaces for BYTE1; many other values
 * could suffice for BYTE1.
 *
 * The verifychar routines for BIG5, GBK and UHC test for exactly this pair
 * and report it as invalid; pg_encoding_set_invalid() emits it.
 */

/* Lead byte of the invalid pair, used for every encoding except UTF8. */
#define NONUTF8_INVALID_BYTE0 (0x8d)
/* Trailing byte of the invalid pair: an ASCII space, also used for UTF8. */
#define NONUTF8_INVALID_BYTE1 (' ')


/*
* Operations on multi-byte encodings are driven by a table of helper
* functions.
Expand Down Expand Up @@ -1465,6 +1484,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;

if (l == 2 &&
s[0] == NONUTF8_INVALID_BYTE0 &&
s[1] == NONUTF8_INVALID_BYTE1)
return -1;

while (--l > 0)
{
if (*++s == '\0')
Expand Down Expand Up @@ -1514,6 +1538,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;

if (l == 2 &&
s[0] == NONUTF8_INVALID_BYTE0 &&
s[1] == NONUTF8_INVALID_BYTE1)
return -1;

while (--l > 0)
{
if (*++s == '\0')
Expand Down Expand Up @@ -1563,6 +1592,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
if (len < l)
return -1;

if (l == 2 &&
s[0] == NONUTF8_INVALID_BYTE0 &&
s[1] == NONUTF8_INVALID_BYTE1)
return -1;

while (--l > 0)
{
if (*++s == '\0')
Expand Down Expand Up @@ -2007,6 +2041,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
}


/*
 * Store a two-byte sequence in dst that is invalid in the given encoding.
 *
 * The bytes written satisfy:
 *		pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
 *
 * encoding must be a multibyte encoding (maximum character length > 1);
 * this is only checked by an assertion.  dst must have room for two bytes;
 * no terminating NUL is written.
 */
void
pg_encoding_set_invalid(int encoding, char *dst)
{
	Assert(pg_encoding_max_length(encoding) > 1);

	/* UTF8 needs its own lead byte; all other encodings share one. */
	if (encoding == PG_UTF8)
		dst[0] = 0xc0;
	else
		dst[0] = NONUTF8_INVALID_BYTE0;

	/* The second byte is the same for every encoding. */
	dst[1] = NONUTF8_INVALID_BYTE1;
}

/*
*-------------------------------------------------------------------
* encoding info table
Expand Down Expand Up @@ -2128,5 +2175,11 @@ pg_encoding_max_length(int encoding)
{
Assert(PG_VALID_ENCODING(encoding));

return pg_wchar_table[encoding].maxmblen;
/*
* Check for the encoding despite the assert, due to some mingw versions
* otherwise issuing bogus warnings.
*/
return PG_VALID_ENCODING(encoding) ?
pg_wchar_table[encoding].maxmblen :
pg_wchar_table[PG_SQL_ASCII].maxmblen;
}
1 change: 1 addition & 0 deletions src/include/mb/pg_wchar.h
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,7 @@ extern int pg_valid_server_encoding_id(int encoding);
* (in addition to the ones just above). The constant tables declared
* earlier in this file are also available from libpgcommon.
*/
extern void pg_encoding_set_invalid(int encoding, char *dst);
extern int pg_encoding_mblen(int encoding, const char *mbstr);
extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
Expand Down
7 changes: 7 additions & 0 deletions src/test/regress/expected/conversion.out
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
\getenv libdir PG_LIBDIR
\getenv dlsuffix PG_DLSUFFIX
\set regresslib :libdir '/regress' :dlsuffix
CREATE FUNCTION test_enc_setup() RETURNS void
AS :'regresslib', 'test_enc_setup'
LANGUAGE C STRICT;
SELECT FROM test_enc_setup();
--
(1 row)

CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
AS :'regresslib', 'test_enc_conversion'
LANGUAGE C STRICT;
Expand Down
50 changes: 50 additions & 0 deletions src/test/regress/regress.c
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS)
PG_RETURN_NULL();
}

/*
 * One-time tests for encoding infrastructure.
 *
 * For each encoding whose maximum character length is not 1, obtain the
 * sequence produced by pg_encoding_set_invalid() and raise a WARNING for
 * every expectation that does not hold:
 *	- the sequence contains no NUL within its two bytes,
 *	- pg_encoding_mblen() reports a length of 2 for it,
 *	- pg_encoding_verifymbstr() finds no valid prefix in the two bytes,
 *	  in the first byte alone, or when the pair is followed by spaces.
 *
 * Returns void; problems are reported only through WARNINGs.
 */
PG_FUNCTION_INFO_V1(test_enc_setup);
Datum
test_enc_setup(PG_FUNCTION_ARGS)
{
	/* Test pg_encoding_set_invalid() */
	for (int enc = 0; enc < _PG_LAST_ENCODING_; enc++)
	{
		const char *encname;
		char		invalid[2];
		char		padded[16];
		int			nbytes;
		int			charlen;
		int			okprefix;

		/* encodings with one-byte characters only are not tested */
		if (pg_encoding_max_length(enc) == 1)
			continue;

		encname = pg_enc2name_tbl[enc].name;
		pg_encoding_set_invalid(enc, invalid);

		/* neither byte may be a NUL */
		nbytes = strnlen(invalid, 2);
		if (nbytes != 2)
			elog(WARNING,
				 "official invalid string for encoding \"%s\" has length %d",
				 encname, nbytes);

		/* the lead byte must announce a two-byte character */
		charlen = pg_encoding_mblen(enc, invalid);
		if (charlen != 2)
			elog(WARNING,
				 "official invalid string for encoding \"%s\" has mblen %d",
				 encname, charlen);

		/* the full pair must fail verification at offset zero */
		okprefix = pg_encoding_verifymbstr(enc, invalid, nbytes);
		if (okprefix != 0)
			elog(WARNING,
				 "official invalid string for encoding \"%s\" has valid prefix of length %d",
				 encname, okprefix);

		/* so must the lead byte on its own */
		okprefix = pg_encoding_verifymbstr(enc, invalid, 1);
		if (okprefix != 0)
			elog(WARNING,
				 "first byte of official invalid string for encoding \"%s\" has valid prefix of length %d",
				 encname, okprefix);

		/* and so must the pair when followed by a run of spaces */
		memset(padded, ' ', sizeof(padded));
		memcpy(padded, invalid, 2);
		okprefix = pg_encoding_verifymbstr(enc, padded, sizeof(padded));
		if (okprefix != 0)
			elog(WARNING,
				 "trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d",
				 encname, okprefix);
	}

	PG_RETURN_VOID();
}

/*
* Call an encoding conversion or verification function.
*
Expand Down
5 changes: 5 additions & 0 deletions src/test/regress/sql/conversion.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@

\set regresslib :libdir '/regress' :dlsuffix

CREATE FUNCTION test_enc_setup() RETURNS void
AS :'regresslib', 'test_enc_setup'
LANGUAGE C STRICT;
SELECT FROM test_enc_setup();

CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
AS :'regresslib', 'test_enc_conversion'
LANGUAGE C STRICT;
Expand Down

0 comments on commit 7d43ca6

Please sign in to comment.