Description
I noticed a slowdown when parsing bodies that contain many consecutive '\r\n' sequences. The parser appears to invoke the on_part_data callback far more often than necessary, which adds a lot of avoidable overhead (see the example and output below).
Python version: 3.9.2
python-multipart version: 0.0.6
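Before the full benchmark, here is a minimal, self-contained sketch of the effect; the boundary, field name, and payload size are arbitrary choices for illustration:
# minimal_repro.py -- illustrative only; boundary and payload are arbitrary
from multipart.multipart import MultipartParser

boundary = b'boundary'
body = (
    b'--boundary\r\n'
    b'Content-Disposition: form-data; name="sample"\r\n'
    b'\r\n'
    + b'\r\n' * 100000   # part data: 100000 consecutive CRLF pairs
    + b'\r\n--boundary--\r\n'
)

calls = 0

def on_part_data(data, start, end):
    global calls
    calls += 1

parser = MultipartParser(boundary, {'on_part_data': on_part_data})
parser.write(body)
parser.finalize()
# With the behaviour measured in the benchmark below, this count comes out on
# the order of the number of CRLF pairs in the part data, not a handful of calls.
print(f'on_part_data was called {calls} times')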
Here's the full demonstration:
$ cat form.py
import timeit

import requests
from multipart.multipart import parse_options_header, MultipartParser


class MultiPartException(Exception):
    # Not part of python-multipart; defined here so the snippet runs standalone.
    pass


class TestFormDataParser:
    def __init__(self, headers):
        # Parse the Content-Type header to get the multipart boundary.
        self.headers = headers
        _, params = parse_options_header(self.headers['Content-Type'])
        self.charset = params.get(b'charset', 'utf-8')
        if isinstance(self.charset, bytes):
            self.charset = self.charset.decode('latin-1')
        try:
            self.boundary = params[b'boundary']
        except KeyError:
            raise MultiPartException('Missing boundary in multipart.')
        self.callbacks = {
            'on_part_begin': lambda: None,
            'on_part_data': self.on_part_data,
            'on_part_end': lambda: None,
            'on_header_field': lambda *_: None,
            'on_header_value': lambda *_: None,
            'on_header_end': lambda: None,
            'on_headers_finished': lambda: None,
            'on_end': lambda: None,
        }
        self.f = MultipartParser(self.boundary, self.callbacks)
        self.on_data_calls = 0

    def on_part_data(self, *_):
        self.on_data_calls += 1

    def parse(self, data):
        self.f.write(data)
        self.f.finalize()
        print(f'on_part_data was called {self.on_data_calls} times')


def test():
    test_data = [
        (b'fo' * 300000 + b'\r\n' + b'foobar', "b'fo' * 300000 + b'\\r\\n' + b'foobar'"),
        (b'fo' * 290000 + b'\r\n' * 10000 + b'foobar', "b'fo' * 290000 + b'\\r\\n' * 10000 + b'foobar'"),
        (b'fo' * 150000 + b'\r\n' * 150000 + b'foobar', "b'fo' * 150000 + b'\\r\\n' * 150000 + b'foobar'"),
        ((b'fo' + b'\r\n') * 150000 + b'foobar', "(b'fo' + b'\\r\\n') * 150000 + b'foobar'"),
        ((b'fo' + b'\r\n' * 5) * 50000 + b'foobar', "(b'fo' + b'\\r\\n' * 5) * 50000 + b'foobar'"),
        ((b'fo' + b'\r\n' * 50) * 5000 + b'foobar', "(b'fo' + b'\\r\\n' * 50) * 5000 + b'foobar'"),
        (b'fo' + b'\r\n' * 300000 + b'foobar', "b'fo' + b'\\r\\n' * 300000 + b'foobar'"),
    ]

    def run_it(sample):
        prep = requests.Request('POST', 'http://example.com', files={'sample': sample, 'other': b'baz'}).prepare()
        parser = TestFormDataParser(prep.headers)
        parser.parse(prep.body)

    _min = 100000
    _max = 0
    iters = 1
    for sample, data_format in test_data:
        sample_size = len(sample)
        crlf_count = sample.count(b'\r\n')
        measurement_time = timeit.timeit(lambda: run_it(sample), setup="pass", number=iters)
        print(f'data format: {data_format}')
        print(f'It took {measurement_time:.3f}s to run {iters} iter with sample size: {sample_size}.')
        print(f'Sample contains {crlf_count} \\r\\n characters.\n\n')
        if measurement_time < _min:
            _min = measurement_time
        if measurement_time > _max:
            _max = measurement_time
    print(f'Biggest difference is {_max - _min:.2f}s ({_max / _min * 100:.2f}%) for samples of the same size')


if __name__ == '__main__':
    test()
and here's the output that I get:
on_part_data was called 4 times
data format: b'fo' * 300000 + b'\r\n' + b'foobar'
It took 0.005s to run 1 iter with sample size: 600008.
Sample contains 1 \r\n characters.
on_part_data was called 9986 times
data format: b'fo' * 290000 + b'\r\n' * 10000 + b'foobar'
It took 0.057s to run 1 iter with sample size: 600006.
Sample contains 10000 \r\n characters.
on_part_data was called 149986 times
data format: b'fo' * 150000 + b'\r\n' * 150000 + b'foobar'
It took 1.337s to run 1 iter with sample size: 600006.
Sample contains 150000 \r\n characters.
on_part_data was called 14 times
data format: (b'fo' + b'\r\n') * 150000 + b'foobar'
It took 0.005s to run 1 iter with sample size: 600006.
Sample contains 150000 \r\n characters.
on_part_data was called 14 times
data format: (b'fo' + b'\r\n' * 5) * 50000 + b'foobar'
It took 0.004s to run 1 iter with sample size: 600006.
Sample contains 250000 \r\n characters.
on_part_data was called 170002 times
data format: (b'fo' + b'\r\n' * 50) * 5000 + b'foobar'
It took 1.463s to run 1 iter with sample size: 510006.
Sample contains 250000 \r\n characters.
on_part_data was called 299986 times
data format: b'fo' + b'\r\n' * 300000 + b'foobar'
It took 2.096s to run 1 iter with sample size: 600008.
Sample contains 300000 \r\n characters.
Biggest difference is 2.09s (56888.98%) for samples of the same size
In our use case this can lead to very slow uploads. For example, a 9 MB file filled with CR LF can take 2000s+ to parse, and in applications built on FastAPI or Starlette (web frameworks) this can block the main thread for seconds, or even up to a minute at a time, depending on the size of the chunks in the request stream.
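As an illustration of where this bites (the route and field names below are hypothetical, not from our application): Starlette's multipart parsing, which FastAPI relies on for UploadFile, is built on python-multipart, so an ordinary upload endpoint like the following exercises the same callback path while the request body is being read.
# Illustrative endpoint only; names are hypothetical.
from fastapi import FastAPI, File, UploadFile

app = FastAPI()

@app.post('/upload')
async def upload(sample: UploadFile = File(...)):
    # The multipart body is parsed on the event loop while the form is read,
    # before this handler even sees the file, so a CRLF-heavy body keeps the
    # worker busy for the kinds of durations measured above.
    contents = await sample.read()
    return {'size': len(contents)}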
This is admittedly an edge case, but because of it we are forced to use a different form-data parser.
(If any Python web server uses this library to parse incoming data, the service can quite easily be DoSed.)