Skip to content

Fix RsT edge cases to better support polars #44

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits (source and target branch names were lost in the page capture)
May 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docstring_to_markdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
if TYPE_CHECKING:
from importlib_metadata import EntryPoint

__version__ = "0.16"
__version__ = "0.17"


class UnknownFormatError(Exception):
Expand Down
39 changes: 34 additions & 5 deletions docstring_to_markdown/rst.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
from enum import IntEnum, auto
from textwrap import dedent
from types import SimpleNamespace
from typing import Callable, Match, Union, List, Dict
import re
Expand Down Expand Up @@ -299,8 +300,8 @@ def inline_markdown(self):
SECTION_DIRECTIVES: Dict[str, List[Directive]] = {
'Parameters': [
Directive(
pattern=r'^(?P<other_args>\*\*kwargs|\*args)$',
replacement=r'- `\g<other_args>`'
pattern=r'^(?P<other_args>(\w[\w\d_\.]*)|\*\*kwargs|\*args)$',
replacement=r'- `\g<other_args>`:'
),
Directive(
pattern=r'^(?P<arg1>[^:\s]+\d), (?P<arg2>[^:\s]+\d), \.\.\. : (?P<type>.+)$',
Expand Down Expand Up @@ -336,6 +337,7 @@ def _find_directive_pattern(name: str):


def looks_like_rst(value: str) -> bool:
value = dedent(value)
# check if any of the characteristic sections (and the properly formatted underline) is there
for section in _RST_SECTIONS:
if (section + '\n' + '-' * len(section) + '\n') in value:
Expand Down Expand Up @@ -542,10 +544,20 @@ class BlockParser(IParser):
follower: Union['IParser', None] = None
_buffer: List[str]
_block_started: bool
_indent: Union[int, None]
should_measure_indent = True

def __init__(self):
self._buffer = []
self._block_started = False
self._indent = None

def measure_indent(self, line: str):
line_indent = len(line) - len(line.lstrip())
if self._indent is None:
self._indent = line_indent
else:
self._indent = min(line_indent, self._indent)

@abstractmethod
def can_parse(self, line: str) -> bool:
Expand All @@ -558,24 +570,33 @@ def _start_block(self, language: str):
def consume(self, line: str):
if not self._block_started:
raise ValueError('Block has not started') # pragma: no cover
if self.should_measure_indent:
self.measure_indent(line)
self._buffer.append(line)

def finish_consumption(self, final: bool) -> str:
# if the last line is empty (e.g. a separator of intended block), discard it
if self._buffer[len(self._buffer) - 1].strip() == '':
self._buffer.pop()
self._buffer.append(self.enclosure + '\n')
result = '\n'.join(self._buffer)
indent = " " * (self._indent or 0)
intermediate = '\n'.join(self._buffer)
result = '\n'.join([
(indent + line) if line else line
for line in intermediate.splitlines()
]) if indent else intermediate
if not final:
result += '\n'
self._buffer = []
self._block_started = False
self._indent = None
return result


class IndentedBlockParser(BlockParser, ABC):
_is_block_beginning: bool
_block_indent_size: Union[int, None]
should_measure_indent = False

def __init__(self):
super(IndentedBlockParser, self).__init__()
Expand All @@ -599,6 +620,7 @@ def consume(self, line: str):
return
if self._block_indent_size is None:
self._block_indent_size = len(line) - len(line.lstrip())
self.measure_indent(line)
super().consume(line[self._block_indent_size:])

def finish_consumption(self, final: bool) -> str:
Expand Down Expand Up @@ -684,6 +706,7 @@ def can_parse(self, line: str):
return line.strip() in self.directives

def initiate_parsing(self, line: str, current_language: str):
self.measure_indent(line)
admonition = self.directives[line.strip()]
self._start_block(f'\n{admonition.block_markdown}\n')
return IBlockBeginning(remainder='')
Expand All @@ -694,6 +717,7 @@ def can_parse(self, line: str) -> bool:
return re.match(CODE_BLOCK_PATTERN, line) is not None

def initiate_parsing(self, line: str, current_language: str) -> IBlockBeginning:
self.measure_indent(line)
match = re.match(CODE_BLOCK_PATTERN, line)
# already checked in can_parse
assert match
Expand Down Expand Up @@ -753,6 +777,8 @@ def rst_to_markdown(text: str, extract_signature: bool = True) -> str:
most_recent_section: Union[str, None] = None
is_first_line = True

text = dedent(text)

def flush_buffer():
nonlocal lines_buffer
lines = '\n'.join(lines_buffer)
Expand All @@ -766,7 +792,8 @@ def flush_buffer():
lines_buffer = []
return lines

for line in text.split('\n'):
lines = text.split('\n')
for i, line in enumerate(lines):
if is_first_line:
if extract_signature:
signature_match = re.match(r'^(?P<name>\S+)\((?P<params>.*)\)$', line)
Expand Down Expand Up @@ -809,7 +836,9 @@ def flush_buffer():
else:
if most_recent_section in SECTION_DIRECTIVES:
for section_directive in SECTION_DIRECTIVES[most_recent_section]:
if re.match(section_directive.pattern, trimmed_line):
next_line = lines[i + 1] if i + 1 < len(lines) else ""
is_next_line_section = set(next_line.strip()) == {"-"}
if re.match(section_directive.pattern, line) and not is_next_line_section:
line = re.sub(section_directive.pattern, section_directive.replacement, trimmed_line)
break
if trimmed_line.rstrip() in RST_SECTIONS:
Expand Down
127 changes: 126 additions & 1 deletion tests/test_rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def func(): pass

- `x`: array_like
Input array.
- `**kwargs`
- `**kwargs`:
For other keyword-only arguments, see the ufunc docs.
"""

Expand Down Expand Up @@ -638,6 +638,119 @@ def func(): pass
"""


# this format is often used by polars
PARAMETERS_WITHOUT_TYPE = """
Parameters
----------
source
Path(s) to a file or directory
When needing to authenticate for scanning cloud locations, see the
`storage_options` parameter.
columns
Columns to select. Accepts a list of column indices (starting at zero) or a list
of column names.
n_rows
Stop reading from parquet file after reading `n_rows`.
Only valid when `use_pyarrow=False`.

Returns
-------
DataFrame
"""

PARAMETERS_WITHOUT_TYPE_MARKDOWN = """
#### Parameters

- `source`:
Path(s) to a file or directory
When needing to authenticate for scanning cloud locations, see the
`storage_options` parameter.
- `columns`:
Columns to select. Accepts a list of column indices (starting at zero) or a list
of column names.
- `n_rows`:
Stop reading from parquet file after reading `n_rows`.
Only valid when `use_pyarrow=False`.

#### Returns

DataFrame
"""

INDENTED_DOCSTRING = """
Parameters
----------
glob
Expand path given via globbing rules.
"""

INDENTED_DOCSTRING_MARKDOWN = """
#### Parameters

- `glob`:
Expand path given via globbing rules.
"""


WARNINGS_IN_PARAMETERS = """
Parameters
----------
glob
Expand path given via globbing rules.
schema
Specify the datatypes of the columns. The datatypes must match the
datatypes in the file(s). If there are extra columns that are not in the
file(s), consider also enabling `allow_missing_columns`.

.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
hive_schema
The column names and data types of the columns by which the data is partitioned.
If set to `None` (default), the schema of the Hive partitions is inferred.

.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
try_parse_hive_dates
Whether to try parsing hive values as date/datetime types.
"""


WARNINGS_IN_PARAMETERS_MARKDOWN = """
#### Parameters

- `glob`:
Expand path given via globbing rules.
- `schema`:
Specify the datatypes of the columns. The datatypes must match the
datatypes in the file(s). If there are extra columns that are not in the
file(s), consider also enabling `allow_missing_columns`.


---
⚠️ **Warning**

This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.

---
- `hive_schema`:
The column names and data types of the columns by which the data is partitioned.
If set to `None` (default), the schema of the Hive partitions is inferred.


---
⚠️ **Warning**

This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.

---
- `try_parse_hive_dates`:
Whether to try parsing hive values as date/datetime types.
"""

NESTED_PARAMETERS = """
Parameters
----------
Expand Down Expand Up @@ -887,6 +1000,18 @@ def foo():
'rst': NESTED_PARAMETERS,
'md': NESTED_PARAMETERS_MARKDOWN
},
'converts parameter without type': {
'rst': PARAMETERS_WITHOUT_TYPE,
'md': PARAMETERS_WITHOUT_TYPE_MARKDOWN
},
'converts indented parameters lists': {
'rst': INDENTED_DOCSTRING,
'md': INDENTED_DOCSTRING_MARKDOWN
},
'converts warnings in parameters lists': {
'rst': WARNINGS_IN_PARAMETERS,
'md': WARNINGS_IN_PARAMETERS_MARKDOWN
},
'converts sphinx signatures': {
'rst': SPHINX_SIGNATURE,
'md': SPHINX_SIGNATURE_MARKDOWN
Expand Down
Loading