Skip to content

Commit

Permalink
feat(python): Make hard-coded MetaString special characters passable parameters (#1987)
Browse files Browse the repository at this point in the history
<!--
**Thanks for contributing to Fury.**

**If this is your first time opening a PR on Fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**

Contribution Checklist

- The **Apache Fury (incubating)** community has restrictions on the
naming of PR titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).

- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->

## What does this PR do?
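Previously, `MetaStringEncoder` and `MetaStringDecoder` hard-coded the two
special characters (`.` and `_`) used by the `LOWER_UPPER_DIGIT_SPECIAL`
encoding, which was not the best choice. This PR makes them constructor
parameters (`special_char1` and `special_char2`) and stores them on
`MetaString`, so callers can choose which special characters are used.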

<!-- Describe the purpose of this PR. -->

## Related issues
Closes #1983
<!--
Is there any related issue? Please attach here.

- #xxxx0
- #xxxx1
- #xxxx2
-->

## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark

<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it does have an impact on performance, the code reviewer will explain
it), be sure to attach the benchmark data here.
-->
  • Loading branch information
pandalee99 authored Dec 23, 2024
1 parent 1515f94 commit 8d2d124
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 37 deletions.
108 changes: 92 additions & 16 deletions python/pyfury/meta/metastring.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,20 @@ class Encoding(Enum):

class MetaString:
def __init__(
self, original: str, encoding: Encoding, encoded_data: bytes, length: int
self,
original: str,
encoding: Encoding,
encoded_data: bytes,
length: int,
special_char1: str = ".",
special_char2: str = "|",
):
self.original = original
self.encoding = encoding
self.encoded_data = encoded_data
self.length = length
self.special_char1 = special_char1
self.special_char2 = special_char2
if self.encoding != Encoding.UTF_8:
self.strip_last_char = (encoded_data[0] & 0x80) != 0
else:
Expand All @@ -65,6 +73,17 @@ class MetaStringDecoder:
Decodes MetaString objects back into their original plain text form.
"""

def __init__(self, special_char1: str, special_char2: str):
"""
Creates a MetaStringDecoder with specified special characters used for decoding.
Args:
special_char1 (str): The first special character used for encoding.
special_char2 (str): The second special character used for encoding.
"""
self.special_char1 = special_char1
self.special_char2 = special_char2

def decode(self, encoded_data: bytes, encoding: Encoding) -> str:
"""
Decodes the encoded data using the specified encoding.
Expand Down Expand Up @@ -203,9 +222,9 @@ def _decode_lower_upper_digit_special_char(self, char_value: int) -> str:
elif 52 <= char_value <= 61:
return chr(ord("0") + (char_value - 52))
elif char_value == 62:
return "."
return self.special_char1 # Use special_char1 for the encoding
elif char_value == 63:
return "_"
return self.special_char2 # Use special_char2 for the encoding
else:
raise ValueError(
f"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL: {char_value}"
Expand Down Expand Up @@ -250,9 +269,16 @@ def _decode_rep_all_to_lower_special(self, data: bytes) -> str:


class MetaStringEncoder:
"""
Encodes plain text strings into MetaString objects with specified encoding mechanisms.
"""
def __init__(self, special_char1: str, special_char2: str):
"""
Creates a MetaStringEncoder with specified special characters used for encoding.
Args:
special_char1 (str): The first special character used in custom encoding.
special_char2 (str): The second special character used in custom encoding.
"""
self.special_char1 = special_char1
self.special_char2 = special_char2

def encode(self, input_string: str) -> MetaString:
"""
Expand All @@ -270,7 +296,14 @@ def encode(self, input_string: str) -> MetaString:
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."

if not input_string:
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
return MetaString(
input_string,
Encoding.UTF_8,
bytes(),
0,
self.special_char1,
self.special_char2,
)

encoding = self.compute_encoding(input_string)
return self.encode_with_encoding(input_string, encoding)
Expand All @@ -292,29 +325,67 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."

if not input_string:
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
return MetaString(
input_string,
Encoding.UTF_8,
bytes(),
0,
self.special_char1,
self.special_char2,
)

length = len(input_string)
if encoding == Encoding.LOWER_SPECIAL:
encoded_data = self._encode_lower_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 5)
return MetaString(
input_string,
encoding,
encoded_data,
length * 5,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL:
encoded_data = self._encode_lower_upper_digit_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 6)
return MetaString(
input_string,
encoding,
encoded_data,
length * 6,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.FIRST_TO_LOWER_SPECIAL:
encoded_data = self._encode_first_to_lower_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 5)
return MetaString(
input_string,
encoding,
encoded_data,
length * 5,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.ALL_TO_LOWER_SPECIAL:
chars = list(input_string)
upper_count = sum(1 for c in chars if c.isupper())
encoded_data = self._encode_all_to_lower_special(chars)
return MetaString(
input_string, encoding, encoded_data, (upper_count + length) * 5
input_string,
encoding,
encoded_data,
(upper_count + length) * 5,
self.special_char1,
self.special_char2,
)
else:
encoded_data = bytes(input_string, "utf-8")
return MetaString(
input_string, Encoding.UTF_8, encoded_data, len(encoded_data) * 8
input_string,
Encoding.UTF_8,
encoded_data,
len(encoded_data) * 8,
self.special_char1,
self.special_char2,
)

def compute_encoding(self, input_string: str) -> Encoding:
Expand Down Expand Up @@ -363,7 +434,12 @@ def _compute_statistics(self, chars: List[str]) -> Statistics:
upper_count = 0
for c in chars:
if can_lower_upper_digit_special_encoded:
if not (c.islower() or c.isupper() or c.isdigit() or c in {".", "_"}):
if not (
c.islower()
or c.isupper()
or c.isdigit()
or c in {self.special_char1, self.special_char2}
):
can_lower_upper_digit_special_encoded = False
if can_lower_special_encoded:
if not (c.islower() or c in {".", "_", "$", "|"}):
Expand Down Expand Up @@ -500,9 +576,9 @@ def _char_to_value(self, c: str, bits_per_char: int) -> int:
return 26 + (ord(c) - ord("A"))
elif "0" <= c <= "9":
return 52 + (ord(c) - ord("0"))
elif c == ".":
elif c == self.special_char1:
return 62
elif c == "_":
elif c == self.special_char2:
return 63
else:
raise ValueError(
Expand Down
51 changes: 30 additions & 21 deletions python/pyfury/tests/test_metastring.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@


def test_encode_metastring_lower_special():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

# Test for encoding and decoding
encoded = encoder._encode_lower_special("abc_def")
assert len(encoded) == 5
assert len(encoder.encode("org.apache.fury.benchmark.data").encoded_data) == 19
Expand All @@ -41,10 +43,12 @@ def test_encode_metastring_lower_special():


def test_encode_metastring_lower_upper_digit_special():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

# Test for encoding and decoding
encoded = encoder._encode_lower_upper_digit_special("ExampleInput123")
assert len(encoded) == 12
decoder = MetaStringDecoder()
decoded = decoder.decode(encoded, Encoding.LOWER_UPPER_DIGIT_SPECIAL)
assert decoded == "ExampleInput123"

Expand Down Expand Up @@ -73,25 +77,26 @@ def create_string(length):


def test_metastring():
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

encoder = MetaStringEncoder()
for i in range(1, 128):
try:
string = create_string(i)
metastring = encoder.encode(string)
assert metastring.encoding != Encoding.UTF_8
assert metastring.original == string

decoder = MetaStringDecoder()
new_string = decoder.decode(metastring.encoded_data, metastring.encoding)
assert new_string == string
except Exception as e:
pytest.fail(f"Failed at {i} with exception: {str(e)}")


def test_encode_empty_string():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

for encoding in [
Encoding.LOWER_SPECIAL,
Encoding.LOWER_UPPER_DIGIT_SPECIAL,
Expand All @@ -106,16 +111,17 @@ def test_encode_empty_string():


def test_encode_characters_outside_of_lower_special():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "abcdefABCDEF1234!@#"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.UTF_8


def test_all_to_upper_special_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "ABC_DEF"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL
Expand All @@ -124,8 +130,9 @@ def test_all_to_upper_special_encoding():


def test_first_to_lower_special_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "Aabcdef"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.FIRST_TO_LOWER_SPECIAL
Expand All @@ -134,8 +141,9 @@ def test_first_to_lower_special_encoding():


def test_utf8_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "你好,世界" # Non-Latin characters
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.UTF_8
Expand All @@ -144,7 +152,7 @@ def test_utf8_encoding():


def test_strip_last_char():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "abc" # encoded as 1|00000|00, 001|00010, exactly two bytes
encoded_metastring = encoder.encode(test_string)
Expand All @@ -156,8 +164,9 @@ def test_strip_last_char():


def test_empty_string():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

metastring = encoder.encode("")
assert metastring.encoded_data == bytes()

Expand All @@ -166,7 +175,7 @@ def test_empty_string():


def test_ascii_encoding():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "asciiOnly"
encoded_metastring = encoder.encode(test_string)
Expand All @@ -175,15 +184,15 @@ def test_ascii_encoding():


def test_non_ascii_encoding():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "こんにちは" # Non-ASCII string
encoded_metastring = encoder.encode(test_string)
assert encoded_metastring.encoding == Encoding.UTF_8


def test_non_ascii_encoding_and_non_utf8():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

non_ascii_string = "こんにちは" # Non-ASCII string

Expand Down

0 comments on commit 8d2d124

Please sign in to comment.