Skip to content

Commit 5a12b73

Browse files
authored
Merge pull request #596 from christian-monch/ssh-shell
Add `PersistentSubShell`-feature
2 parents 401af7b + ce445a8 commit 5a12b73

17 files changed

+2010
-43
lines changed

.coveragerc

+2
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,5 @@ source =
1414
[report]
1515
# show lines missing coverage in output
1616
show_missing = True
17+
exclude_also =
18+
raise NotImplementedError

datalad_next/iterable_subprocess/iterable_subprocess.py

+25-16
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,30 @@
88
from datalad_next.exceptions import CommandError
99

1010

11+
class OutputFrom(Generator):
12+
def __init__(self, stdout, stderr_deque, chunk_size=65536):
13+
self.stdout = stdout
14+
self.stderr_deque = stderr_deque
15+
self.chunk_size = chunk_size
16+
self.returncode = None
17+
18+
def send(self, _):
19+
chunk = self.stdout.read(self.chunk_size)
20+
if not chunk:
21+
raise StopIteration
22+
return chunk
23+
24+
def throw(self, typ, value=None, traceback=None):
25+
return super().throw(typ, value, traceback)
26+
27+
1128
@contextmanager
1229
def iterable_subprocess(
1330
program,
1431
input_chunks,
1532
chunk_size=65536,
1633
cwd=None,
34+
bufsize=-1,
1735
):
1836
# This context starts a thread that populates the subprocess's standard input. It
1937
# also starts a threads that reads the process's standard error. Otherwise we risk
@@ -105,20 +123,6 @@ def input_to(stdin):
105123
if e.errno != 22:
106124
raise
107125

108-
class OutputFrom(Generator):
109-
def __init__(self, stdout):
110-
self.stdout = stdout
111-
self.returncode = None
112-
113-
def send(self, _):
114-
chunk = self.stdout.read(chunk_size)
115-
if not chunk:
116-
raise StopIteration
117-
return chunk
118-
119-
def throw(self, typ, value=None, traceback=None):
120-
return super().throw(typ, value, traceback)
121-
122126
def keep_only_most_recent(stderr, stderr_deque):
123127
total_length = 0
124128
while True:
@@ -144,12 +148,13 @@ def raise_if_not_none(exception):
144148
try:
145149

146150
with \
147-
Popen(
151+
Popen( # nosec - all arguments are controlled by the caller
148152
program,
149153
stdin=PIPE,
150154
stdout=PIPE,
151155
stderr=PIPE,
152156
cwd=cwd,
157+
bufsize=bufsize,
153158
) as proc, \
154159
thread(
155160
keep_only_most_recent,
@@ -164,7 +169,11 @@ def raise_if_not_none(exception):
164169
try:
165170
start_t_stderr()
166171
start_t_stdin()
167-
chunk_generator = OutputFrom(proc.stdout)
172+
chunk_generator = OutputFrom(
173+
proc.stdout,
174+
stderr_deque,
175+
chunk_size
176+
)
168177
yield chunk_generator
169178
except BaseException:
170179
proc.terminate()

datalad_next/itertools/align_pattern.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
import re
67
from typing import (
78
Generator,
89
Iterable,
@@ -74,15 +75,19 @@ def align_pattern(iterable: Iterable[str | bytes | bytearray],
7475
pattern multiple times.
7576
"""
7677

77-
def ends_with_pattern_prefix(data: str | bytes | bytearray,
78-
pattern: str | bytes | bytearray,
79-
) -> bool:
80-
""" Check whether the chunk ends with a prefix of the pattern """
81-
for index in range(len(pattern) - 1, 0, -1):
82-
if data[-index:] == pattern[:index]:
83-
return True
84-
return False
85-
78+
# Create pattern matcher for all
79+
if isinstance(pattern, str):
80+
regex: str | bytes | bytearray = '(' + '|'.join(
81+
'.' * (len(pattern) - index - 1) + re.escape(pattern[:index]) + '$'
82+
for index in range(1, len(pattern))
83+
) + ')'
84+
else:
85+
regex = b'(' + b'|'.join(
86+
b'.' * (len(pattern) - index - 1) + re.escape(pattern[:index]) + b'$'
87+
for index in range(1, len(pattern))
88+
) + b')'
89+
pattern_matcher = re.compile(regex, re.DOTALL)
90+
pattern_sub = len(pattern) - 1
8691
# Join data chunks until they are sufficiently long to contain the pattern,
8792
# i.e. have at least size: `len(pattern)`. Continue joining, if the chunk
8893
# ends with a prefix of the pattern.
@@ -94,7 +99,9 @@ def ends_with_pattern_prefix(data: str | bytes | bytearray,
9499
else:
95100
current_chunk += data_chunk
96101
if len(current_chunk) >= len(pattern) \
97-
and not ends_with_pattern_prefix(current_chunk, pattern):
102+
and not (
103+
current_chunk[-1] in pattern
104+
and pattern_matcher.match(current_chunk, len(current_chunk) - pattern_sub)):
98105
yield current_chunk
99106
current_chunk = None
100107

datalad_next/itertools/tests/test_align_pattern.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
import timeit
4+
35
import pytest
46

57
from ..align_pattern import align_pattern
@@ -22,3 +24,31 @@
2224
])
2325
def test_pattern_processor(data_chunks, pattern, expected):
2426
assert expected == list(align_pattern(data_chunks, pattern=pattern))
27+
28+
29+
def test_performance():
30+
# Ensure that the performance of align_pattern is acceptable for large
31+
# data chunks and patterns.
32+
number = 10
33+
pattern = b'01234'
34+
data_chunks = [b'a' * 1000 for _ in range(100 * 1000)] + [pattern]
35+
36+
result_base = timeit.timeit(
37+
lambda: tuple(data_chunks),
38+
number=number,
39+
)
40+
result_iter = timeit.timeit(
41+
lambda: tuple(align_pattern(data_chunks, pattern=pattern)),
42+
number=number,
43+
)
44+
45+
print(result_base, result_iter, result_iter / result_base)
46+
47+
48+
def test_newline_matches():
49+
pattern = b'----datalad-end-marker-3654137433-rekram-dne-dalatad----\n'
50+
chunk1 = b'Have a lot of fun...\n----datalad-end-marker-3654137433-r'
51+
chunk2 = b'e'
52+
chunk3 = b'kram-dne-dalatad----\n'
53+
result = list(align_pattern([chunk1, chunk2, chunk3], pattern))
54+
assert result == [chunk1 + chunk2 + chunk3]

datalad_next/runners/iter_subproc.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
List,
77
)
88

9-
from datalad_next.iterable_subprocess import iterable_subprocess
9+
from datalad_next.iterable_subprocess.iterable_subprocess import (
10+
iterable_subprocess,
11+
OutputFrom,
12+
)
1013
from datalad_next.exceptions import CommandError
1114
from datalad_next.consts import COPY_BUFSIZE
1215

@@ -19,6 +22,7 @@ def iter_subproc(
1922
input: Iterable[bytes] | None = None,
2023
chunk_size: int = COPY_BUFSIZE,
2124
cwd: Path | None = None,
25+
bufsize: int = -1,
2226
):
2327
"""Context manager to communicate with a subprocess using iterables
2428
@@ -88,6 +92,9 @@ def iter_subproc(
8892
Size of chunks to read from the subprocess's stdout/stderr in bytes.
8993
cwd: Path
9094
Working directory for the subprocess, passed to ``subprocess.Popen``.
95+
bufsize: int, optional
96+
Buffer size to use for the subprocess's ``stdin``, ``stdout``, and
97+
``stderr``. See ``subprocess.Popen`` for details.
9198
9299
Returns
93100
-------
@@ -98,4 +105,5 @@ def iter_subproc(
98105
tuple() if input is None else input,
99106
chunk_size=chunk_size,
100107
cwd=cwd,
108+
bufsize=bufsize,
101109
)

datalad_next/shell/__init__.py

+171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""A persistent shell connection
2+
3+
This module provides a context manager that establishes a connection to a shell
4+
and can be used to execute multiple commands in that shell. Shells are usually
5+
remote shells, e.g. connected via an ``ssh``-client, but local shells like
6+
``zsh``, ``bash`` or ``PowerShell`` can also be used.
7+
8+
The context manager returns an instance of :class:`ShellCommandExecutor` that
9+
can be used to execute commands in the shell via the method
10+
:meth:`ShellCommandExecutor.__call__`. The method will return an instance of
11+
a subclass of :class:`ShellCommandResponseGenerator` that can be used to
12+
retrieve the output of the command, the result code of the command, and the
13+
stderr-output of the command.
14+
15+
Every response generator expects a certain output structure. It is responsible
16+
for ensuring that the output structure is generated. To this end every
17+
response generator provides a method
18+
:meth:`ShellCommandResponseGenerator.get_command_list`. The method
19+
:class:`ShellCommandExecutor.__call__` will pass the user-provided command to
20+
:meth:`ShellCommandResponseGenerator.get_command_list` and receive a list of
21+
final commands that should be executed in the connected shell and that will
22+
generate the expected output structure. Instances of
23+
:class:`ShellCommandResponseGenerator` have therefore four tasks:
24+
25+
1. Create a final command list that is used to execute the user provided
26+
command. This could, for example, execute the command, print an
27+
end marker, and print the return code of the command.
28+
29+
2. Parse the output of the command, yield it to the user.
30+
31+
3. Read the return code and provide it to the user.
32+
33+
4. Provide stderr-output to the user.
34+
35+
A very versatile example of a response generator is the class
36+
:class:`VariableLengthResponseGenerator`. It can be used to execute a command
37+
that will result in an output of unknown length, e.g. ``ls``, and will yield
38+
the output of the command to the user. It does that by using a random
39+
*end marker* to detect the end of the output and read the trailing return code.
40+
This is suitable for almost all commands.
41+
42+
If :class:`VariableLengthResponseGenerator` is so versatile, why not just
43+
implement its functionality in :class:`ShellCommandExecutor`? There are two
44+
major reasons for that:
45+
46+
1. Although the :class:`VariableLengthResponseGenerator` is very versatile,
47+
it is not the most efficient implementation for commands that produce large
48+
amounts of output. In addition, there is also a minimal risk that the end
49+
marker is part of the output of the command, which would trip up the response
50+
generator. Putting response generation into a separate class allows to
51+
implement specific operations more efficiently and more safely.
52+
For example,
53+
:class:`DownloadResponseGenerator` implements the download of files. It
54+
takes a remote file name as user "command" and creates a final command list
55+
that emits the length of the file, a newline, the file content, a return
56+
code, and a newline. This allows :class:`DownloadResponseGenerator`
57+
to parse the output without relying on an end marker, thus increasing
58+
efficiency and safety
59+
60+
2. Factoring out the response generation creates an interface that can be used
61+
to support the syntax of different shells and the difference in command
62+
names and options in different operating systems. For example, the response
63+
generator class :class:`VariableLengthResponseGeneratorPowerShell` supports
64+
the invocation of commands with variable length output in a ``PowerShell``.
65+
66+
In short, parser generator classes encapsulate details of shell-syntax and
67+
operation implementation. That allows support of different shell syntax, and
68+
the efficient implementation of specific higher level operations, e.g.
69+
``download``. It also allows users to extend the functionality of
70+
:class:`ShellCommandExecutor` by providing their own response generator
71+
classes.
72+
73+
The module :mod:`datalad_next.shell.response_generators` provides two generally
74+
applicable abstract response generator classes:
75+
76+
- :class:`VariableLengthResponseGenerator`
77+
78+
- :class:`FixedLengthResponseGenerator`
79+
80+
The functionality of the former is described above. The latter can be used to
81+
execute a command that will result in output of known
82+
length, e.g. ``echo -n 012345``. It reads the specified number of bytes and a
83+
trailing return code. This is more performant than the variable length response
84+
generator (because it does not have to search for the end marker). In addition,
85+
it does not rely on the uniqueness of the end marker. It is most useful for
86+
operation like ``download``, where the length of the output can be known in
87+
advance.
88+
89+
As mentioned above, the classes :class:`VariableLengthResponseGenerator` and
90+
:class:`FixedLengthResponseGenerator` are abstract. The module
91+
:mod:`datalad_next.shell.response_generators` provides the following concrete
92+
implementations for them:
93+
94+
- :class:`VariableLengthResponseGeneratorPosix`
95+
96+
- :class:`VariableLengthResponseGeneratorPowerShell`
97+
98+
- :class:`FixedLengthResponseGeneratorPosix`
99+
100+
- :class:`FixedLengthResponseGeneratorPowerShell`
101+
102+
When :func:`shell` is executed it will use a
103+
:class:`VariableLengthResponseClass` to skip the login message of the shell.
104+
This is done by executing a *zero command* (a command that will possibly
105+
generate some output, and successfully return) in the shell. The zero command is
106+
provided by the concrete implementation of class
107+
:class:`VariableLengthResponseGenerator`. For example, the zero command for
108+
POSIX shells is ``test 0 -eq 0``, for PowerShell it is ``Write-Host hello``.
109+
110+
Because there is no way for func:`shell` to determine the kind of shell it
111+
connects to, the user can provide an alternative response generator class, in
112+
the ``zero_command_rg_class``-parameter. Instance of that class
113+
will then be used to execute the zero command. Currently, the following two
114+
response generator classes are available:
115+
116+
- :class:`VariableLengthResponseGeneratorPosix`: works with POSIX-compliant
117+
shells, e.g. ``sh`` or ``bash``. This is the default.
118+
- :class:`VariableLengthResponseGeneratorPowerShell`: works with PowerShell.
119+
120+
Whenever a command is executed via :meth:`ShellCommandExecutor.__call__`, the
121+
class identified by ``zero_command_rg_class`` will be used by default to create
122+
the final command list and to parse the result. Users can override this on a
123+
per-call basis by providing a different response generator class in the
124+
``response_generator``-parameter of :meth:`ShellCommandExecutor.__call__`.
125+
126+
.. currentmodule:: datalad_next.shell
127+
128+
.. autosummary::
129+
:toctree: generated
130+
:recursive:
131+
132+
ShellCommandExecutor
133+
ShellCommandResponseGenerator
134+
VariableLengthResponseGenerator
135+
VariableLengthResponseGeneratorPosix
136+
VariableLengthResponseGeneratorPowerShell
137+
FixedLengthResponseGenerator
138+
FixedLengthResponseGeneratorPosix
139+
FixedLengthResponseGeneratorPowerShell
140+
DownloadResponseGenerator
141+
DownloadResponseGeneratorPosix
142+
operations.posix.upload
143+
operations.posix.download
144+
operations.posix.delete
145+
"""
146+
147+
148+
__all__ = [
149+
'shell',
150+
'posix',
151+
]
152+
153+
from .shell import (
154+
shell,
155+
ShellCommandExecutor,
156+
)
157+
158+
from .operations import posix
159+
from .operations.posix import (
160+
DownloadResponseGenerator,
161+
DownloadResponseGeneratorPosix,
162+
)
163+
from .response_generators import (
164+
FixedLengthResponseGenerator,
165+
FixedLengthResponseGeneratorPosix,
166+
FixedLengthResponseGeneratorPowerShell,
167+
ShellCommandResponseGenerator,
168+
VariableLengthResponseGenerator,
169+
VariableLengthResponseGeneratorPosix,
170+
VariableLengthResponseGeneratorPowerShell,
171+
)

datalad_next/shell/operations/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)