Skip to content

[mypyc] Add a str.format specializer which only supports empty brackets #10697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 100 additions & 6 deletions mypyc/irbuild/specialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from mypy.nodes import (
CallExpr, RefExpr, MemberExpr, NameExpr, TupleExpr, GeneratorExpr,
ListExpr, DictExpr, ARG_POS
ListExpr, DictExpr, StrExpr, ARG_POS
)
from mypy.types import AnyType, TypeOfAny

Expand All @@ -25,20 +25,20 @@
)
from mypyc.ir.rtypes import (
RType, RTuple, str_rprimitive, list_rprimitive, dict_rprimitive, set_rprimitive,
bool_rprimitive, is_dict_rprimitive, c_int_rprimitive
bool_rprimitive, is_dict_rprimitive, c_int_rprimitive, is_str_rprimitive
)
from mypyc.primitives.dict_ops import (
dict_keys_op, dict_values_op, dict_items_op, dict_setdefault_spec_init_op
)
from mypyc.primitives.list_ops import new_list_set_item_op
from mypyc.primitives.tuple_ops import new_tuple_set_item_op
from mypyc.primitives.str_ops import str_op, str_build_op
from mypyc.irbuild.builder import IRBuilder
from mypyc.irbuild.for_helpers import (
translate_list_comprehension, translate_set_comprehension,
comprehension_helper, sequence_from_generator_preallocate_helper
)


# Specializers are attempted before compiling the arguments to the
# function. Specializers can return None to indicate that they failed
# and the call should be compiled normally. Otherwise they should emit
Expand All @@ -62,9 +62,11 @@ def specialize_function(
There may exist multiple specializers for one function. When translating method
calls, the earlier appended specializer has higher priority.
"""

def wrapper(f: Specializer) -> Specializer:
specializers.setdefault((name, typ), []).append(f)
return f

return wrapper


Expand Down Expand Up @@ -189,13 +191,13 @@ def translate_safe_generator_call(
return builder.gen_method_call(
builder.accept(callee.expr), callee.name,
([translate_list_comprehension(builder, expr.args[0])]
+ [builder.accept(arg) for arg in expr.args[1:]]),
+ [builder.accept(arg) for arg in expr.args[1:]]),
builder.node_type(expr), expr.line, expr.arg_kinds, expr.arg_names)
else:
return builder.call_refexpr_with_args(
expr, callee,
([translate_list_comprehension(builder, expr.args[0])]
+ [builder.accept(arg) for arg in expr.args[1:]]))
+ [builder.accept(arg) for arg in expr.args[1:]]))
return None


Expand Down Expand Up @@ -343,7 +345,7 @@ def translate_dict_setdefault(
return None
data_type = Integer(2, c_int_rprimitive, expr.line)
elif (isinstance(arg, CallExpr) and isinstance(arg.callee, NameExpr)
and arg.callee.fullname == 'builtins.set'):
and arg.callee.fullname == 'builtins.set'):
if len(arg.args):
return None
data_type = Integer(3, c_int_rprimitive, expr.line)
Expand All @@ -356,3 +358,95 @@ def translate_dict_setdefault(
[callee_dict, key_val, data_type],
expr.line)
return None


@specialize_function('format', str_rprimitive)
def translate_str_format(
        builder: IRBuilder, expr: CallExpr, callee: RefExpr) -> Optional[Value]:
    """Specialize ``<str literal>.format(...)`` when every field is an empty ``{}``.

    The call is lowered to one ``str_build_op`` call whose arguments are the
    literal pieces of the format string interleaved with the call arguments.
    Arguments that are not already ``str`` are converted with ``str_op`` first.

    Returns None, so that the call is compiled normally, when:
      * the receiver is not a string literal,
      * any argument is not a plain positional argument,
      * the format string is rejected by can_optimize_format(), or
      * the number of arguments differs from the number of ``{}`` fields.
    """
    if not (isinstance(callee, MemberExpr) and isinstance(callee.expr, StrExpr)):
        return None
    # Star args and keyword args are left to the generic call path.
    if expr.arg_kinds.count(ARG_POS) != len(expr.arg_kinds):
        return None

    format_str = callee.expr.value
    if not can_optimize_format(format_str):
        return None

    # split_braces() always yields one more literal than there are '{}' fields.
    literals = split_braces(format_str)

    # str.format() raises IndexError when there are too few arguments and
    # ignores surplus ones. Pairing the lists with zip() would silently
    # truncate instead, so decline here, before any IR has been emitted,
    # and let the normal call path reproduce the runtime behaviour.
    if len(expr.args) != len(literals) - 1:
        return None

    # No fields and no arguments: the result is just the unescaped literal
    # (possibly the empty string).
    if not expr.args:
        return builder.load_str(literals[0])

    variables = [builder.accept(x) if is_str_rprimitive(builder.node_type(x))
                 else builder.call_c(str_op, [builder.accept(x)], expr.line)
                 for x in expr.args]

    # Interleave literals and arguments, skipping empty literals.
    pieces: List[Value] = []
    for literal, variable in zip(literals, variables):
        if literal:
            pieces.append(builder.load_str(literal))
        pieces.append(variable)
    if literals[-1]:
        pieces.append(builder.load_str(literals[-1]))

    # The first parameter is the number of str objects that follow it.
    build_args: List[Value] = [Integer(len(pieces), c_int_rprimitive)]
    build_args.extend(pieces)
    return builder.call_c(str_build_op, build_args, expr.line)


def can_optimize_format(format_str: str) -> bool:
    """Return True if format_str is well formed and uses only empty ``{}`` fields.

    The string is scanned left to right. Every brace must start one of the
    two-character tokens ``{{``, ``}}`` (escaped braces) or ``{}`` (an empty
    replacement field). Anything else makes the string ineligible:
      * a field with a name, index, conversion or format spec
        (``{0}``, ``{name}``, ``{!r}``, ``{:d}``), or
      * an unbalanced brace (``'abc{'``, ``'}abc'``, ``'{{}'``, ``'{}}'``),
        which str.format() rejects with ValueError.
    """
    # NOTE(review): the reviewer asked for unit tests covering the error and
    # edge cases of this function.
    allowed_tokens = ('{{', '}}', '{}')
    pos = 0
    length = len(format_str)
    while pos < length:
        if format_str[pos] in '{}':
            # A brace is only valid as the first half of an allowed token.
            # Consuming both characters keeps the halves of one token from
            # being paired with their neighbours.
            if format_str[pos:pos + 2] not in allowed_tokens:
                return False
            pos += 2
        else:
            pos += 1
    return True


def split_braces(format_str: str) -> List[str]:
    """Split a format string into the literal pieces around its ``{}`` fields.

    Must only be called on strings accepted by can_optimize_format().
    Escaped braces (``{{`` and ``}}``) are collapsed to a single brace in the
    literal. The result always has one more element than there are ``{}``
    fields; pieces may be empty strings.
    """
    pieces: List[str] = []
    current: List[str] = []
    pos = 0
    length = len(format_str)
    while pos < length:
        token = format_str[pos:pos + 2]
        if token == '{}':
            # Empty replacement field: close the literal collected so far.
            pieces.append(''.join(current))
            current = []
            pos += 2
        elif token == '{{' or token == '}}':
            # Escaped brace: keep one brace character in the literal.
            current.append(token[0])
            pos += 2
        elif format_str[pos] in '{}':
            # A brace that starts no token contributes nothing to the literal.
            pos += 1
        else:
            current.append(format_str[pos])
            pos += 1
    pieces.append(''.join(current))
    return pieces
1 change: 1 addition & 0 deletions mypyc/lib-rt/CPy.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ static inline char CPyDict_CheckSize(PyObject *dict, CPyTagged size) {
// Str operations


PyObject *CPyStr_Build(int len, ...);
PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index);
PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split);
PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr, PyObject *new_substr, CPyTagged max_replace);
Expand Down
15 changes: 15 additions & 0 deletions mypyc/lib-rt/str_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@ PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
}
}

PyObject *CPyStr_Build(int len, ...) {
int i;
va_list args;
va_start(args, len);

PyObject *res = PyUnicode_FromObject(va_arg(args, PyObject *));
for (i = 1; i < len; i++) {
PyObject *str = va_arg(args, PyObject *);
PyUnicode_Append(&res, str);
}

va_end(args);
return res;
}

PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split)
{
Py_ssize_t temp_max_split = CPyTagged_AsSsize_t(max_split);
Expand Down
10 changes: 9 additions & 1 deletion mypyc/primitives/str_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
src='PyUnicode_Type')

# str(obj)
function_op(
str_op = function_op(
name='builtins.str',
arg_types=[object_rprimitive],
return_type=str_rprimitive,
Expand All @@ -44,6 +44,14 @@
error_kind=ERR_MAGIC
)

# Build a str by calling the C helper CPyStr_Build: one fixed c_int argument
# followed by a variable number of str arguments.
str_build_op = custom_op(
arg_types=[c_int_rprimitive],
return_type=str_rprimitive,
c_function_name='CPyStr_Build',
error_kind=ERR_MAGIC,
var_arg_type=str_rprimitive
)

# str.startswith(str)
method_op(
name='startswith',
Expand Down
43 changes: 43 additions & 0 deletions mypyc/test-data/irbuild-str.test
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,46 @@ L2:
return 0
L3:
unreachable

[case testStringFormatMethod]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a few more IR build test cases, such as these:

  1. an empty literal
  2. {{ and }} in the format string

def f(s: str, num: int) -> None:
s1 = "Hi! I'm {}, and I'm {} years old.".format(s, num)
s2 = ''.format()
s3 = 'abc'.format()
s3 = '}}{}{{{}}}{{{}'.format(num, num, num)
[out]
def f(s, num):
s :: str
num :: int
r0 :: object
r1, r2, r3, r4, r5, s1, r6, s2, r7, s3 :: str
r8 :: object
r9 :: str
r10 :: object
r11 :: str
r12 :: object
r13, r14, r15, r16, r17 :: str
L0:
r0 = box(int, num)
r1 = PyObject_Str(r0)
r2 = "Hi! I'm "
r3 = ", and I'm "
r4 = ' years old.'
r5 = CPyStr_Build(5, r2, s, r3, r1, r4)
s1 = r5
r6 = ''
s2 = r6
r7 = 'abc'
s3 = r7
r8 = box(int, num)
r9 = PyObject_Str(r8)
r10 = box(int, num)
r11 = PyObject_Str(r10)
r12 = box(int, num)
r13 = PyObject_Str(r12)
r14 = '}'
r15 = '{'
r16 = '}{'
r17 = CPyStr_Build(6, r14, r9, r15, r11, r16, r13)
s3 = r17
return 1
72 changes: 59 additions & 13 deletions mypyc/test-data/run-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ class A:
self.age = age

def __repr__(self):
return f"{self.name} is {self.age} years old."
return f'{self.name} is {self.age} years old.'

def test_fstring_datatype() -> None:
u = A('John Doe', 14)
Expand Down Expand Up @@ -236,8 +236,8 @@ def test_fstring_conversion() -> None:
assert f'{s}' == 'test: āĀēĒčČ..šŠūŪžŽ'
assert f'{s!a}' == "'test: \\u0101\\u0100\\u0113\\u0112\\u010d\\u010c..\\u0161\\u0160\\u016b\\u016a\\u017e\\u017d'"

assert f'Hello {var!s}' == "Hello mypyc"
assert f'Hello {num!s}' == "Hello 20"
assert f'Hello {var!s}' == 'Hello mypyc'
assert f'Hello {num!s}' == 'Hello 20'

def test_fstring_align() -> None:
assert f'Hello {var:>20}' == "Hello mypyc"
Expand All @@ -252,34 +252,57 @@ def test_fstring_multi() -> None:
assert s == 'mypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypymypy'

def test_fstring_python_doc() -> None:
name = "Fred"
name = 'Fred'
assert f"He said his name is {name!r}." == "He said his name is 'Fred'."
assert f"He said his name is {repr(name)}." == "He said his name is 'Fred'."

width = 10
precision = 4
value = decimal.Decimal("12.34567")
assert f"result: {value:{width}.{precision}}" == 'result: 12.35' # nested field
value = decimal.Decimal('12.34567')
assert f'result: {value:{width}.{precision}}' == 'result: 12.35' # nested field

today = datetime(year=2017, month=1, day=27)
assert f"{today:%B %d, %Y}" == 'January 27, 2017' # using date format specifier
assert f'{today:%B %d, %Y}' == 'January 27, 2017' # using date format specifier

number = 1024
assert f"{number:#0x}" == '0x400' # using integer format specifier
assert f'{number:#0x}' == '0x400' # using integer format specifier

[case testStringFormatMethod]
from typing import Tuple

def test_format_method_basics() -> None:
assert "".format() == ""
assert "abc".format() == "abc"
assert ''.format() == ''
assert 'abc'.format() == 'abc'
assert '{}{}'.format(1, 2) == '12'

name = "Eric"
name = 'Eric'
age = 14
assert "My name is {name}, I'm {age}.".format(name=name, age=age) == "My name is Eric, I'm 14."
assert "My name is {A}, I'm {B}.".format(A=name, B=age) == "My name is Eric, I'm 14."
assert "My name is {}, I'm {B}.".format(name, B=age) == "My name is Eric, I'm 14."

bool_var1 = True
bool_var2 = False
assert 'bool: {}, {}'.format(bool_var1, bool_var2) == 'bool: True, False'

def test_format_method_empty_braces() -> None:
name = 'Eric'
age = 14

assert 'Hello, {}!'.format(name) == 'Hello, Eric!'
assert '{}'.format(name) == 'Eric'
assert '{}! Hi!'.format(name) == 'Eric! Hi!'
assert '{}, Hi, {}'.format(name, name) == 'Eric, Hi, Eric'
assert 'Hi! {}'.format(name) == 'Hi! Eric'
assert "Hi, I'm {}. I'm {}.".format(name, age) == "Hi, I'm Eric. I'm 14."

assert '{{}}'.format() == '{}'
assert '{{{{}}}}'.format() == '{{}}'
assert '{{}}{}'.format(name) == '{}Eric'
assert 'Hi! {{{}}}'.format(name) == 'Hi! {Eric}'
assert 'Hi! {{ {}'.format(name) == 'Hi! { Eric'
assert 'Hi! {{ {} }}}}'.format(name) == 'Hi! { Eric }}'

def test_format_method_numbers() -> None:
s = 'int: {0:d}; hex: {0:x}; oct: {0:o}; bin: {0:b}'.format(-233)
assert s == 'int: -233; hex: -e9; oct: -351; bin: -11101001'
Expand All @@ -295,6 +318,29 @@ def test_format_method_numbers() -> None:
assert 'negative integer: {}'.format(neg_num) == 'negative integer: -3'
assert 'negative integer: {}'.format(-large_num) == 'negative integer: -36893488147419103232'

large_float = 1.23e30
large_float2 = 1234123412341234123400000000000000000
small_float = 1.23e-20
assert '{}, {}, {}'.format(small_float, large_float, large_float2) == '1.23e-20, 1.23e+30, 1234123412341234123400000000000000000'
nan_num = float('nan')
inf_num = float('inf')
assert '{}, {}'.format(nan_num, inf_num) == 'nan, inf'

def format_args(*args: int) -> str:
return 'x{}y{}'.format(*args)
def format_kwargs(**kwargs: int) -> str:
return 'c{x}d{y}'.format(**kwargs)
def format_args_self(*args: int) -> str:
return '{}'.format(args)
def format_kwargs_self(**kwargs: int) -> str:
return '{}'.format(kwargs)

def test_format_method_args() -> None:
assert format_args(10, 2) == 'x10y2'
assert format_args_self(10, 2) == '(10, 2)'
assert format_kwargs(x=10, y=2) == 'c10d2'
assert format_kwargs(x=10, y=2, z=1) == 'c10d2'
assert format_kwargs_self(x=10, y=2, z=1) == "{'x': 10, 'y': 2, 'z': 1}"

class Point:
def __init__(self, x, y):
Expand All @@ -319,7 +365,7 @@ def test_format_method_python_doc() -> None:
assert 'Coordinates: {latitude}, {longitude}'.format(**coord) == 'Coordinates: 37.24N, -115.81W'

# Accessing arguments’ attributes:
assert str(Point(4, 2)) == "Point(4, 2)"
assert str(Point(4, 2)) == 'Point(4, 2)'

# Accessing arguments’ items:
coord2 = (3, 5)
Expand Down Expand Up @@ -371,7 +417,7 @@ def test_format_method_python_doc() -> None:
width = 5
tmp_strs = []
for num in range(5,12):
tmp_str = ""
tmp_str = ''
for base in 'dXob':
tmp_str += ('{0:{width}{base}}'.format(num, base=base, width=width))
tmp_strs.append(tmp_str)
Expand Down