Skip to content

Commit

Permalink
expr,sql: add chr function
Browse files Browse the repository at this point in the history
  • Loading branch information
andrioni committed Feb 22, 2022
1 parent a0e8d24 commit 7d44527
Show file tree
Hide file tree
Showing 9 changed files with 237 additions and 5 deletions.
3 changes: 3 additions & 0 deletions doc/user/content/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ changes that have not yet been documented.

- Change inclusive ranges of column indices in `EXPLAIN PLAN` to use `..=` instead of `..`.

- Add the [`chr`](/sql/functions#string-func) function, which converts a Unicode
code point into a string.

{{< comment >}}
Only add new release notes above this line.

Expand Down
6 changes: 6 additions & 0 deletions doc/user/data/sql_funcs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@
- signature: 'char_length(s: str) -> int'
description: Number of code points in `s`

- signature: 'chr(i: int) -> str'
description: >-
Character with the given Unicode code point.
Only supports code points that can be encoded in UTF-8.
The NULL (0) character is not allowed.
- signature: 'convert_from(b: bytea, src_encoding: text) -> text'
description: Convert data `b` from original encoding specified by `src_encoding` into `text`.

Expand Down
16 changes: 11 additions & 5 deletions src/expr/src/scalar/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3354,6 +3354,7 @@ pub enum UnaryFunc {
ByteLengthBytes,
ByteLengthString,
CharLength,
Chr(Chr),
IsLikeMatch(like_pattern::Matcher),
IsRegexpMatch(Regex),
RegexpMatch(Regex),
Expand Down Expand Up @@ -3572,7 +3573,8 @@ derive_unary!(
CastDateToTimestamp,
CastDateToTimestampTz,
CastBytesToString,
CastVarCharToString
CastVarCharToString,
Chr
);

impl UnaryFunc {
Expand Down Expand Up @@ -3742,7 +3744,8 @@ impl UnaryFunc {
| CastDateToTimestamp(_)
| CastDateToTimestampTz(_)
| CastBytesToString(_)
| CastVarCharToString(_) => unreachable!(),
| CastVarCharToString(_)
| Chr(_) => unreachable!(),
CastStringToJsonb => cast_string_to_jsonb(a, temp_storage),
CastJsonbOrNullToJsonb => Ok(cast_jsonb_or_null_to_jsonb(a)),
CastJsonbToString => Ok(cast_jsonb_to_string(a, temp_storage)),
Expand Down Expand Up @@ -3968,7 +3971,8 @@ impl UnaryFunc {
| CastDateToTimestamp(_)
| CastDateToTimestampTz(_)
| CastBytesToString(_)
| CastVarCharToString(_) => unreachable!(),
| CastVarCharToString(_)
| Chr(_) => unreachable!(),

Ascii | CharLength | BitLengthBytes | BitLengthString | ByteLengthBytes
| ByteLengthString => ScalarType::Int32.nullable(nullable),
Expand Down Expand Up @@ -4221,7 +4225,8 @@ impl UnaryFunc {
| CastDateToTimestamp(_)
| CastDateToTimestampTz(_)
| CastBytesToString(_)
| CastVarCharToString(_) => unreachable!(),
| CastVarCharToString(_)
| Chr(_) => unreachable!(),
// These return null when their input is SQL null.
CastJsonbToString | CastJsonbToInt16 | CastJsonbToInt32 | CastJsonbToInt64
| CastJsonbToFloat32 | CastJsonbToFloat64 | CastJsonbToBool => true,
Expand Down Expand Up @@ -4494,7 +4499,8 @@ impl UnaryFunc {
| CastDateToTimestamp(_)
| CastDateToTimestampTz(_)
| CastBytesToString(_)
| CastVarCharToString(_) => unreachable!(),
| CastVarCharToString(_)
| Chr(_) => unreachable!(),
CastStringToJsonb => f.write_str("strtojsonb"),
CastJsonbOrNullToJsonb => f.write_str("jsonb?tojsonb"),
CastJsonbToString => f.write_str("jsonbtostr"),
Expand Down
18 changes: 18 additions & 0 deletions src/expr/src/scalar/func/impls/int32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,21 @@ sqlfunc!(
RegType(a)
}
);

// SQL `chr`: turns an integer Unicode code point into a one-character string.
//
// Errors:
// - `CharacterTooLargeForEncoding` for negative inputs and for values that
//   `char::from_u32` rejects.
// - `NullCharacterNotPermitted` for 0.
// - `CharacterNotValidForEncoding` for the surrogate range U+D800..=U+DFFF.
sqlfunc!(
    fn chr(a: i32) -> Result<String, EvalError> {
        match u32::try_from(a) {
            // Negative inputs do not fit in a u32. Reporting them as "too large"
            // matches the behavior of Postgres 13/14 (and potentially earlier
            // versions); Postgres 15 will have a different error message for
            // negative values.
            Err(_) => Err(EvalError::CharacterTooLargeForEncoding(a)),
            Ok(0) => Err(EvalError::NullCharacterNotPermitted),
            // Surrogates get their own error: Postgres returns a different
            // error message for inputs in this range.
            Ok(0xd800..=0xdfff) => Err(EvalError::CharacterNotValidForEncoding(a)),
            Ok(scalar) => match char::from_u32(scalar) {
                Some(ch) => Ok(ch.to_string()),
                // Surrogates were handled above, so whatever `from_u32` still
                // rejects is reported as too large.
                None => Err(EvalError::CharacterTooLargeForEncoding(a)),
            },
        }
    }
);
10 changes: 10 additions & 0 deletions src/expr/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1207,6 +1207,8 @@ pub enum DomainLimit {

#[derive(Ord, PartialOrd, Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash, MzReflect)]
pub enum EvalError {
CharacterNotValidForEncoding(i32),
CharacterTooLargeForEncoding(i32),
DateBinOutOfRange(String),
DivisionByZero,
Unsupported {
Expand Down Expand Up @@ -1249,6 +1251,7 @@ pub enum EvalError {
InvalidRegexFlag(char),
InvalidParameterValue(String),
NegSqrt,
NullCharacterNotPermitted,
UnknownUnits(String),
UnsupportedUnits(String, String),
UnterminatedLikeEscapeSequence,
Expand Down Expand Up @@ -1277,6 +1280,12 @@ pub enum EvalError {
impl fmt::Display for EvalError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
EvalError::CharacterNotValidForEncoding(v) => {
write!(f, "requested character not valid for encoding: {v}")
}
EvalError::CharacterTooLargeForEncoding(v) => {
write!(f, "requested character too large for encoding: {v}")
}
EvalError::DateBinOutOfRange(message) => f.write_str(message),
EvalError::DivisionByZero => f.write_str("division by zero"),
EvalError::Unsupported { feature, issue_no } => {
Expand Down Expand Up @@ -1331,6 +1340,7 @@ impl fmt::Display for EvalError {
byte_sequence, encoding_name
),
EvalError::NegSqrt => f.write_str("cannot take square root of a negative number"),
EvalError::NullCharacterNotPermitted => f.write_str("null character not permitted"),
EvalError::InvalidRegex(e) => write!(f, "invalid regular expression: {}", e),
EvalError::InvalidRegexFlag(c) => write!(f, "invalid regular expression flag: {}", c),
EvalError::InvalidParameterValue(s) => f.write_str(s),
Expand Down
10 changes: 10 additions & 0 deletions src/pgwire/src/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use postgres::error::SqlState;
use mz_coord::session::ClientSeverity as CoordClientSeverity;
use mz_coord::session::TransactionStatus as CoordTransactionStatus;
use mz_coord::{CoordError, StartupMessage};
use mz_expr::EvalError;
use mz_pgcopy::CopyErrorNotSupportedResponse;
use mz_repr::{ColumnName, NotNullViolation, RelationDesc};

Expand Down Expand Up @@ -347,6 +348,15 @@ impl ErrorResponse {
CoordError::ConstrainedParameter { .. } => SqlState::INVALID_PARAMETER_VALUE,
CoordError::AutomaticTimestampFailure { .. } => SqlState::INTERNAL_ERROR,
CoordError::DuplicateCursor(_) => SqlState::DUPLICATE_CURSOR,
CoordError::Eval(EvalError::CharacterNotValidForEncoding(_)) => {
SqlState::PROGRAM_LIMIT_EXCEEDED
}
CoordError::Eval(EvalError::CharacterTooLargeForEncoding(_)) => {
SqlState::PROGRAM_LIMIT_EXCEEDED
}
CoordError::Eval(EvalError::NullCharacterNotPermitted) => {
SqlState::PROGRAM_LIMIT_EXCEEDED
}
CoordError::Eval(_) => SqlState::INTERNAL_ERROR,
CoordError::FixedValueParameter(_) => SqlState::INVALID_PARAMETER_VALUE,
CoordError::IdExhaustionError => SqlState::INTERNAL_ERROR,
Expand Down
3 changes: 3 additions & 0 deletions src/sql/src/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1743,6 +1743,9 @@ lazy_static! {
Ok(HirScalarExpr::literal(datum, ScalarType::String))
}), 746;
},
"chr" => Scalar {
params!(Int32) => UnaryFunc::Chr(func::Chr), 1621;
},
"date_bin" => Scalar {
params!(Interval, Timestamp) => Operation::binary(|ecx, stride, source| {
ecx.require_experimental_mode("binary date_bin")?;
Expand Down
39 changes: 39 additions & 0 deletions test/pgtest/chr.pt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Test status codes for the chr function

# NOTE: Postgres does not send the RowDescription messages for these queries,
# while we do. This is why we ignore all RowDescription messages.

# NullCharacterNotPermitted maps to 54000
send
Query {"query": "SELECT chr(0)"}
----

until ignore=RowDescription
ReadyForQuery
----
ErrorResponse {"fields":[{"typ":"S","value":"ERROR"},{"typ":"C","value":"54000"},{"typ":"M","value":"null character not permitted"}]}
ReadyForQuery {"status":"I"}


# CharacterNotValidForEncoding maps to 54000
send
Query {"query": "SELECT chr(55296)"}
----

until ignore=RowDescription
ReadyForQuery
----
ErrorResponse {"fields":[{"typ":"S","value":"ERROR"},{"typ":"C","value":"54000"},{"typ":"M","value":"requested character not valid for encoding: 55296"}]}
ReadyForQuery {"status":"I"}


# CharacterTooLargeForEncoding maps to 54000
send
Query {"query": "SELECT chr(1114112)"}
----

until ignore=RowDescription
ReadyForQuery
----
ErrorResponse {"fields":[{"typ":"S","value":"ERROR"},{"typ":"C","value":"54000"},{"typ":"M","value":"requested character too large for encoding: 1114112"}]}
ReadyForQuery {"status":"I"}
137 changes: 137 additions & 0 deletions test/sqllogictest/chr.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

# Return NULL for NULL input
query T
SELECT chr(NULL)
----
NULL

query error null character not permitted
SELECT chr(0)

# Match behavior of Postgres 14
query error requested character too large for encoding: -1
SELECT chr(-1)

# i32::MIN
query error requested character too large for encoding: -2147483648
SELECT chr(-2147483648)

# Test non-printable characters
query T
SELECT chr(1) = E'\u0001'
----
true

query T
SELECT chr(2) = E'\u0002'
----
true

query T
SELECT chr(10) = E'\u000a'
----
true

query T
SELECT chr(126)
----
~

query T
SELECT chr(127) = E'\u007f'
----
true

# Check if non-ASCII characters work
query T
SELECT chr(128) = E'\u0080'
----
true

# Test an arbitrary Basic Multilingual Plane (BMP) character
query T
SELECT chr(9233)
----

# Last code point before the surrogates
query T
SELECT chr(55295)
----

# Surrogate characters should not be encoded in UTF-8
# 55296 = U+D800
query error requested character not valid for encoding: 55296
SELECT chr(55296)

# Last surrogate character
# 57343 = U+DFFF
query error requested character not valid for encoding: 57343
SELECT chr(57343)

query T
SELECT chr(57344)
----

# Test full and half width characters
query T
SELECT chr(65318)
----

query T
SELECT chr(65383)
----

# Test supplementary multilingual plane (SMP / Plane 1) characters
query T
SELECT chr(66312)
----
𐌈

query T
SELECT chr(92330)
----
𖢪

query T
SELECT chr(128579)
----
🙃

# Test composing regional indicator symbols
query T
SELECT chr(127463) || chr(127479);
----
🇧🇷

# Test supplementary ideographic plane (SIP / Plane 2) characters
query T
SELECT chr(194564)
----
你

# Test last valid Unicode code point
query T
SELECT chr(1114111) = E'\U0010FFFF'
----
true

# First invalid code point
query error requested character too large for encoding: 1114112
SELECT chr(1114112)

# i32::MAX
query error requested character too large for encoding: 2147483647
SELECT chr(2147483647)

0 comments on commit 7d44527

Please sign in to comment.