Skip to content

Commit ca22a35

Browse files
committed
fix: Use the same logic for Spark
1 parent 3dba348 commit ca22a35

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

deepnote_toolkit/ocelots/pyspark/implementation.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,12 @@ def to_records(self, mode: Literal["json", "python"]) -> List[Dict[str, Any]]:
232232
StructField,
233233
)
234234

235+
def binary_to_string_repr(binary_data: Optional[bytearray]) -> Optional[str]:
    """Convert binary data to its Python bytes-literal text (e.g., b'hello').

    Args:
        binary_data: The raw bytes to render, or None for a missing value.

    Returns:
        The textual representation of the value as immutable ``bytes``
        (for example ``"b'hello'"``), or None when ``binary_data`` is None.
    """
    if binary_data is None:
        return None
    # repr() produces exactly the same text as str() for a bytes object, but
    # str(bytes) emits BytesWarning under `python -b` and raises under
    # `python -bb`; repr() is safe in both modes.
    return repr(bytes(binary_data))
240+
235241
def select_column(field: StructField) -> Column:
236242
col = F.col(field.name)
237243
# Numbers are already JSON-serialisable, except Decimal
@@ -240,11 +246,11 @@ def select_column(field: StructField) -> Column:
240246
):
241247
return col
242248

243-
# We slice binary field before encoding to avoid encoding potentially big blob. Round slicing to
244-
# 4 bytes to avoid breaking multi-byte sequences
249+
# We slice binary field before converting to string representation
245250
if isinstance(field.dataType, BinaryType):
246251
sliced = F.substring(F.col(field.name), 1, keep_bytes)
247-
return F.base64(sliced)
252+
binary_udf = F.udf(binary_to_string_repr, StringType())
253+
return binary_udf(sliced)
248254

249255
# String just needs to be trimmed
250256
if isinstance(field.dataType, StringType):

0 commit comments

Comments
 (0)