Merge pull request #300 from zmwangx/textwrap-zero-width

Improve textwrap in presence of zero-width sequences
jarun · Nov 14, 2019 · 0db68d6 · 0db68d6
2 parents 05d5187 + ae4fc2c
commit 0db68d6
Show file tree

Hide file tree

Showing 2 changed files with 132 additions and 58 deletions.
diff --git a/googler b/googler
@@ -55,6 +55,20 @@ try:
 except (ImportError, Exception):
     pass
 
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Match,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+
 # Basic setup
 
 logging.basicConfig(format='[%(levelname)s] %(message)s')
@@ -152,6 +166,98 @@ def monkeypatch_textwrap_for_cjk():
 monkeypatch_textwrap_for_cjk()
 
 
+CoordinateType = Tuple[int, int]
+
+
+class TrackedTextwrap:
+    """
+    Implements a text wrapper that tracks the position of each source
+    character, and can correctly insert zero-width sequences at given
+    offsets of the source text.
+
+    Wrapping result should be the same as that from the standard
+    library's textwrap.wrap with default settings except expand_tabs=False.
+    """
+
+    def __init__(self, text: str, width: int):
+        self._original = text
+
+        # Do the job of replace_whitespace first so that we can easily
+        # match text to wrapped lines later. Note that this operation
+        # does not change text length or offsets.
+        whitespace = "\t\n\v\f\r "
+        whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
+        text = text.translate(whitespace_trans)
+
+        self._lines = textwrap.wrap(
+            text, width, expand_tabs=False, replace_whitespace=False
+        )
+
+        # self._coords tracks the (row, column) coordinate of each source
+        # character in the result text. It is indexed by offset in
+        # the source text.
+        self._coords = []  # type: List[CoordinateType]
+        offset = 0
+        try:
+            if not self._lines:
+                # Source text contains only whitespace. We add an empty line
+                # in order to produce meaningful coordinates.
+                self._lines = [""]
+            for row, line in enumerate(self._lines):
+                assert text[offset : offset + len(line)] == line
+                col = 0
+                for _ in line:
+                    self._coords.append((row, col))
+                    offset += 1
+                    col += 1
+                # All subsequent dropped whitespace characters map to the last,
+                # imaginary column (the EOL character if you wish) of the current line.
+                while offset < len(text) and text[offset] == " ":
+                    self._coords.append((row, col))
+                    offset += 1
+            # One past the final character (think of it as EOF) should
+            # be treated as a valid offset.
+            self._coords.append((row, col))
+        except AssertionError:
+            raise RuntimeError(
+                "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
+                    offset, self._original
+                )
+            )
+
+    # seq should be a zero-width sequence, e.g., an ANSI escape sequence.
+    # May raise IndexError if offset is out of bounds.
+    def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
+        row, col = self._coords[offset]
+        line = self._lines[row]
+        self._lines[row] = line[:col] + seq + line[col:]
+
+        # Shift coordinates of all characters after the given character
+        # on the same line.
+        shift = len(seq)
+        offset += 1
+        while offset < len(self._coords) and self._coords[offset][0] == row:
+            _, col = self._coords[offset]
+            self._coords[offset] = (row, col + shift)
+            offset += 1
+
+    @property
+    def original(self) -> str:
+        return self._original
+
+    @property
+    def lines(self) -> List[str]:
+        return self._lines
+
+    @property
+    def wrapped(self) -> str:
+        return "\n".join(self._lines)
+
+    # May raise IndexError if offset is out of bounds.
+    def get_coordinate(self, offset: int) -> CoordinateType:
+        return self._coords[offset]
+
+
 ### begin dim (DOM implementation with CSS support) ###
 ### https://github.com/zmwangx/dim/blob/master/dim.py ###
 
@@ -162,34 +268,6 @@ from collections import OrderedDict
 from enum import Enum
 from html.parser import HTMLParser
 
-try:
-    from typing import (
-        Any,
-        Dict,
-        Generator,
-        Iterable,
-        Iterator,
-        List,
-        Match,
-        Optional,
-        Tuple,
-        Union,
-        cast,
-    )
-except ImportError:  # pragma: no cover
-    # Python 3.4 without external typing module
-
-    class _TypeStub:
-        def __getitem__(self, _):  # type: ignore
-            return None
-
-    Any = None
-    Dict = Generator = Iterable = Iterator = List = Match = _TypeStub()  # type: ignore
-    Optional = Tuple = Union = _TypeStub()  # type: ignore
-
-    def cast(typ, val):  # type: ignore
-        return val
-
 
 SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]
 
@@ -2125,7 +2203,7 @@ class GoogleParser(object):
                 abstract = ''
                 for childnode in div_g.select('.st').children:
                     if childnode.tag == 'b' and childnode.text != '...':
-                            matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
+                        matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
                     abstract = abstract + childnode.text.replace('\n', '')
                 try:
                     metadata = div_g.select('.slp').text
@@ -2264,28 +2342,28 @@ class Result(object):
             self._urltable[fullindex] = sitelink.url
             subindex = chr(ord(subindex) + 1)
 
-    def _print_title_and_url(self, index, title, url, indent=3, pre=0):
+    def _print_title_and_url(self, index, title, url, indent=0):
         colors = self.colors
 
         if not self.urlexpand:
             url = '[' + urllib.parse.urlparse(url).netloc + ']'
 
         if colors:
             # Adjust index to print result index clearly
-            print(" %s%s%-*s%s" % (' ' * pre, colors.index, indent, index + '.', colors.reset), end='')
+            print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
             if not self.urlexpand:
                 print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
             else:
                 print(' ' + colors.title + title + colors.reset)
-                print(' ' * (indent + 2 + pre) + colors.url + url + colors.reset)
+                print(' ' * (indent + 5) + colors.url + url + colors.reset)
         else:
             if self.urlexpand:
-                print(' %s%-*s %s' % (' ' * pre, indent, index + '.', title))
-                print(' %s%s' % (' ' * (indent + 1 + pre), url))
+                print(' %s%-3s %s' % (' ' * indent, index + '.', title))
+                print(' %s%s' % (' ' * (indent + 4), url))
             else:
-                print(' %s%-*s %s %s' % (' ' * pre, indent, index + '.', title, url))
+                print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))
 
-    def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=5, pre=0):
+    def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
         colors = self.colors
         try:
             columns, _ = os.get_terminal_size()
@@ -2294,40 +2372,36 @@ class Result(object):
 
         if metadata:
             if colors:
-                print(' ' * (indent + pre) + colors.metadata + metadata + colors.reset)
+                print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
             else:
-                print(' ' * (indent + pre) + metadata)
+                print(' ' * (indent + 5) + metadata)
 
+        fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
+        wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
         if colors and not self.nohl:
-            # Start from the last match, as inserting the bold characters changes the offsets.
-            for match in reversed(matches or []):
-                abstract = (
-                    abstract[: match['offset']]
-                    + '\033[1m'
-                    + match['phrase']
-                    + '\033[0m'
-                    + abstract[match['offset'] + len(match['phrase']) :]
-                )
+            # Highlight matches.
+            for match in matches or []:
+                offset = match['offset']
+                span = len(match['phrase'])
+                wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
+                wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)
+
+        if colors:
             print(colors.abstract, end='')
-        if columns > indent + 1 + pre:
-            # Try to fill to columns
-            fillwidth = columns - indent - 1
-            for line in textwrap.wrap(abstract.replace('\n', ''), width=fillwidth):
-                print('%s%s' % (' ' * (indent + pre), line))
-            print('')
-        else:
-            print('%s%s\n' % (' ' * pre, abstract.replace('\n', ' ')))
+        for line in wrapped_abstract.lines:
+            print('%s%s' % (' ' * (indent + 5), line))
         if colors:
             print(colors.reset, end='')
+        print('')
 
     def print(self):
         """Print the result entry."""
         self._print_title_and_url(self.index, self.title, self.url)
         self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
 
         for sitelink in self.sitelinks:
-            self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, pre=4)
-            self._print_metadata_and_abstract(sitelink.abstract, pre=4)
+            self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
+            self._print_metadata_and_abstract(sitelink.abstract, indent=4)
 
     def jsonizable_object(self):
         """Return a JSON-serializable dict representing the result entry."""

diff --git a/tests/test b/tests/test
@@ -105,7 +105,7 @@ test_googler () {
         printf 'failed with status %d.\033[0m\n' $last_status >&2
         exitcode=1
 
-        (( rerun )) && { googler --noprompt -d "$@"; printf '\n\033[33m[Exit status] %d\033[0m\n' $?; } || :
+        (( rerun )) && { $googler --noprompt -d "$@"; printf '\n\033[33m[Exit status] %d\033[0m\n' $?; } || :
     }
 
     declare -g quiet