Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Hardcoding metastring into passable parameters #1987

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 92 additions & 16 deletions python/pyfury/meta/metastring.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,20 @@ class Encoding(Enum):

class MetaString:
def __init__(
self, original: str, encoding: Encoding, encoded_data: bytes, length: int
self,
original: str,
encoding: Encoding,
encoded_data: bytes,
length: int,
special_char1: str = ".",
special_char2: str = "|",
):
self.original = original
self.encoding = encoding
self.encoded_data = encoded_data
self.length = length
self.special_char1 = special_char1
self.special_char2 = special_char2
if self.encoding != Encoding.UTF_8:
self.strip_last_char = (encoded_data[0] & 0x80) != 0
else:
Expand All @@ -65,6 +73,17 @@ class MetaStringDecoder:
Decodes MetaString objects back into their original plain text form.
"""

def __init__(self, special_char1: str, special_char2: str):
"""
Creates a MetaStringDecoder with the two special characters it emits while decoding.

The decoder only stores the characters here; they are read later when
character values are turned back into text. They should match the
characters the encoder was constructed with, otherwise values 62 and 63
decode to different characters than the ones that were encoded.

Args:
special_char1 (str): Character returned for value 62 by
`_decode_lower_upper_digit_special_char`.
special_char2 (str): Character returned for value 63 by
`_decode_lower_upper_digit_special_char`.
"""
self.special_char1 = special_char1
self.special_char2 = special_char2

def decode(self, encoded_data: bytes, encoding: Encoding) -> str:
"""
Decodes the encoded data using the specified encoding.
Expand Down Expand Up @@ -203,9 +222,9 @@ def _decode_lower_upper_digit_special_char(self, char_value: int) -> str:
elif 52 <= char_value <= 61:
return chr(ord("0") + (char_value - 52))
elif char_value == 62:
return "."
return self.special_char1 # Use special_char1 for the encoding
elif char_value == 63:
return "_"
return self.special_char2 # Use special_char2 for the encoding
else:
raise ValueError(
f"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL: {char_value}"
Expand Down Expand Up @@ -250,9 +269,16 @@ def _decode_rep_all_to_lower_special(self, data: bytes) -> str:


class MetaStringEncoder:
"""
Encodes plain text strings into MetaString objects with specified encoding mechanisms.
"""
def __init__(self, special_char1: str, special_char2: str):
"""
Creates a MetaStringEncoder with the two special characters it accepts while encoding.

The characters are stored on the instance and are passed on to every
MetaString this encoder builds. `_compute_statistics` treats them as
allowed characters alongside letters and digits.

Args:
special_char1 (str): Character that `_char_to_value` maps to value 62.
special_char2 (str): Character that `_char_to_value` maps to value 63.
"""
self.special_char1 = special_char1
self.special_char2 = special_char2

def encode(self, input_string: str) -> MetaString:
"""
Expand All @@ -270,7 +296,14 @@ def encode(self, input_string: str) -> MetaString:
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."

if not input_string:
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
return MetaString(
input_string,
Encoding.UTF_8,
bytes(),
0,
self.special_char1,
self.special_char2,
)

encoding = self.compute_encoding(input_string)
return self.encode_with_encoding(input_string, encoding)
Expand All @@ -292,29 +325,67 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."

if not input_string:
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
return MetaString(
input_string,
Encoding.UTF_8,
bytes(),
0,
self.special_char1,
self.special_char2,
)

length = len(input_string)
if encoding == Encoding.LOWER_SPECIAL:
encoded_data = self._encode_lower_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 5)
return MetaString(
input_string,
encoding,
encoded_data,
length * 5,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL:
encoded_data = self._encode_lower_upper_digit_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 6)
return MetaString(
input_string,
encoding,
encoded_data,
length * 6,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.FIRST_TO_LOWER_SPECIAL:
encoded_data = self._encode_first_to_lower_special(input_string)
return MetaString(input_string, encoding, encoded_data, length * 5)
return MetaString(
input_string,
encoding,
encoded_data,
length * 5,
self.special_char1,
self.special_char2,
)
elif encoding == Encoding.ALL_TO_LOWER_SPECIAL:
chars = list(input_string)
upper_count = sum(1 for c in chars if c.isupper())
encoded_data = self._encode_all_to_lower_special(chars)
return MetaString(
input_string, encoding, encoded_data, (upper_count + length) * 5
input_string,
encoding,
encoded_data,
(upper_count + length) * 5,
self.special_char1,
self.special_char2,
)
else:
encoded_data = bytes(input_string, "utf-8")
return MetaString(
input_string, Encoding.UTF_8, encoded_data, len(encoded_data) * 8
input_string,
Encoding.UTF_8,
encoded_data,
len(encoded_data) * 8,
self.special_char1,
self.special_char2,
)

def compute_encoding(self, input_string: str) -> Encoding:
Expand Down Expand Up @@ -363,7 +434,12 @@ def _compute_statistics(self, chars: List[str]) -> Statistics:
upper_count = 0
for c in chars:
if can_lower_upper_digit_special_encoded:
if not (c.islower() or c.isupper() or c.isdigit() or c in {".", "_"}):
if not (
c.islower()
or c.isupper()
or c.isdigit()
or c in {self.special_char1, self.special_char2}
):
can_lower_upper_digit_special_encoded = False
if can_lower_special_encoded:
if not (c.islower() or c in {".", "_", "$", "|"}):
Expand Down Expand Up @@ -500,9 +576,9 @@ def _char_to_value(self, c: str, bits_per_char: int) -> int:
return 26 + (ord(c) - ord("A"))
elif "0" <= c <= "9":
return 52 + (ord(c) - ord("0"))
elif c == ".":
elif c == self.special_char1:
return 62
elif c == "_":
elif c == self.special_char2:
return 63
else:
raise ValueError(
Expand Down
51 changes: 30 additions & 21 deletions python/pyfury/tests/test_metastring.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@


def test_encode_metastring_lower_special():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

# Test for encoding and decoding
encoded = encoder._encode_lower_special("abc_def")
assert len(encoded) == 5
assert len(encoder.encode("org.apache.fury.benchmark.data").encoded_data) == 19
Expand All @@ -41,10 +43,12 @@ def test_encode_metastring_lower_special():


def test_encode_metastring_lower_upper_digit_special():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

# Test for encoding and decoding
encoded = encoder._encode_lower_upper_digit_special("ExampleInput123")
assert len(encoded) == 12
decoder = MetaStringDecoder()
decoded = decoder.decode(encoded, Encoding.LOWER_UPPER_DIGIT_SPECIAL)
assert decoded == "ExampleInput123"

Expand Down Expand Up @@ -73,25 +77,26 @@ def create_string(length):


def test_metastring():
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

encoder = MetaStringEncoder()
for i in range(1, 128):
try:
string = create_string(i)
metastring = encoder.encode(string)
assert metastring.encoding != Encoding.UTF_8
assert metastring.original == string

decoder = MetaStringDecoder()
new_string = decoder.decode(metastring.encoded_data, metastring.encoding)
assert new_string == string
except Exception as e:
pytest.fail(f"Failed at {i} with exception: {str(e)}")


def test_encode_empty_string():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

for encoding in [
Encoding.LOWER_SPECIAL,
Encoding.LOWER_UPPER_DIGIT_SPECIAL,
Expand All @@ -106,16 +111,17 @@ def test_encode_empty_string():


def test_encode_characters_outside_of_lower_special():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "abcdefABCDEF1234!@#"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.UTF_8


def test_all_to_upper_special_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "ABC_DEF"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL
Expand All @@ -124,8 +130,9 @@ def test_all_to_upper_special_encoding():


def test_first_to_lower_special_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "Aabcdef"
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.FIRST_TO_LOWER_SPECIAL
Expand All @@ -134,8 +141,9 @@ def test_first_to_lower_special_encoding():


def test_utf8_encoding():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

test_string = "你好,世界" # Non-Latin characters
metastring = encoder.encode(test_string)
assert metastring.encoding == Encoding.UTF_8
Expand All @@ -144,7 +152,7 @@ def test_utf8_encoding():


def test_strip_last_char():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "abc" # encoded as 1|00000|00, 001|00010, exactly two bytes
encoded_metastring = encoder.encode(test_string)
Expand All @@ -156,8 +164,9 @@ def test_strip_last_char():


def test_empty_string():
encoder = MetaStringEncoder()
decoder = MetaStringDecoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
decoder = MetaStringDecoder(special_char1=".", special_char2="_")

metastring = encoder.encode("")
assert metastring.encoded_data == bytes()

Expand All @@ -166,7 +175,7 @@ def test_empty_string():


def test_ascii_encoding():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "asciiOnly"
encoded_metastring = encoder.encode(test_string)
Expand All @@ -175,15 +184,15 @@ def test_ascii_encoding():


def test_non_ascii_encoding():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

test_string = "こんにちは" # Non-ASCII string
encoded_metastring = encoder.encode(test_string)
assert encoded_metastring.encoding == Encoding.UTF_8


def test_non_ascii_encoding_and_non_utf8():
encoder = MetaStringEncoder()
encoder = MetaStringEncoder(special_char1=".", special_char2="_")

non_ascii_string = "こんにちは" # Non-ASCII string

Expand Down
Loading