Skip to content
This repository has been archived by the owner on Mar 5, 2022. It is now read-only.

Commit

Permalink
Merge pull request #300 from zmwangx/textwrap-zero-width
Browse files Browse the repository at this point in the history
Improve textwrap in presence of zero-width sequences
  • Loading branch information
jarun authored Nov 14, 2019
2 parents 05d5187 + ae4fc2c commit 0db68d6
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 58 deletions.
188 changes: 131 additions & 57 deletions googler
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,20 @@ try:
except (ImportError, Exception):
pass

from typing import (
Any,
Dict,
Generator,
Iterable,
Iterator,
List,
Match,
Optional,
Tuple,
Union,
cast,
)

# Basic setup

logging.basicConfig(format='[%(levelname)s] %(message)s')
Expand Down Expand Up @@ -152,6 +166,98 @@ def monkeypatch_textwrap_for_cjk():
monkeypatch_textwrap_for_cjk()


# (row, column) position of a source character inside the wrapped lines.
CoordinateType = Tuple[int, int]


class TrackedTextwrap:
    """
    Implements a text wrapper that tracks the position of each source
    character, and can correctly insert zero-width sequences at given
    offsets of the source text.

    Wrapping result should be the same as that from PSL textwrap.wrap
    with default settings except expand_tabs=False.
    """

    def __init__(self, text: str, width: int):
        """Wrap ``text`` to ``width`` columns and index every source offset.

        Exceptions raised by ``textwrap.wrap`` itself (for instance for an
        invalid ``width``) propagate unchanged.

        Raises:
            RuntimeError: if a wrapped line cannot be located in the source
                text, i.e. the wrapper did something this class does not
                anticipate.
        """
        self._original = text

        # Do the job of replace_whitespace first so that we can easily
        # match text to wrapped lines later. Note that this operation
        # does not change text length or offsets.
        whitespace = "\t\n\v\f\r "
        whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
        text = text.translate(whitespace_trans)

        self._lines = textwrap.wrap(
            text, width, expand_tabs=False, replace_whitespace=False
        )
        if not self._lines:
            # Source text only has whitespaces. We add an empty line
            # in order to produce meaningful coordinates.
            self._lines = [""]

        # self._coords track the (row, column) coordinate of each source
        # character in the result text. It is indexed by offset in
        # source text.
        self._coords = []  # type: List[CoordinateType]
        offset = 0
        row = col = 0
        for row, line in enumerate(self._lines):
            # textwrap drops leading whitespace when the first word does
            # not fit next to it (the would-be whitespace-only line is
            # discarded). Map such dropped characters to the start of the
            # line. This is a no-op whenever the line matches right away.
            while (
                not text.startswith(line, offset)
                and offset < len(text)
                and text[offset] == " "
            ):
                self._coords.append((row, 0))
                offset += 1

            # Explicit check instead of `assert`, so that it still runs
            # when Python is started with -O.
            if not text.startswith(line, offset):
                raise RuntimeError(
                    "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
                        offset, self._original
                    )
                )

            self._coords.extend((row, column) for column in range(len(line)))
            offset += len(line)
            col = len(line)

            # All subsequent dropped whitespaces map to the last, imaginary
            # column (the EOL character if you wish) of the current line.
            while offset < len(text) and text[offset] == " ":
                self._coords.append((row, col))
                offset += 1

        # One past the final character (think of it as EOF) should
        # be treated as a valid offset.
        self._coords.append((row, col))

    def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
        """Insert ``seq`` in front of the source character at ``offset``.

        ``seq`` should be a zero-width sequence, e.g., an ANSI escape
        sequence. The coordinate stored for ``offset`` itself is left
        untouched, so a later insertion at the same offset lands before
        the earlier one.

        Raises:
            IndexError: if ``offset`` is out of bounds (negative offsets
                included; they would otherwise wrap around and corrupt
                the coordinates of unrelated characters).
        """
        if offset < 0:
            raise IndexError("offset out of range: {}".format(offset))
        row, col = self._coords[offset]
        line = self._lines[row]
        self._lines[row] = line[:col] + seq + line[col:]

        # Shift coordinates of all characters after the given character
        # on the same line.
        shift = len(seq)
        offset += 1
        while offset < len(self._coords) and self._coords[offset][0] == row:
            _, col = self._coords[offset]
            self._coords[offset] = (row, col + shift)
            offset += 1

    @property
    def original(self) -> str:
        """The source text exactly as passed to the constructor."""
        return self._original

    @property
    def lines(self) -> List[str]:
        """The wrapped lines, including any sequences inserted so far."""
        return self._lines

    @property
    def wrapped(self) -> str:
        """The wrapped lines joined with newlines."""
        return "\n".join(self._lines)

    def get_coordinate(self, offset: int) -> CoordinateType:
        """Return the (row, column) recorded for source ``offset``.

        May raise IndexError if offset is out of bounds.
        """
        return self._coords[offset]


### begin dim (DOM implementation with CSS support) ###
### https://github.com/zmwangx/dim/blob/master/dim.py ###

Expand All @@ -162,34 +268,6 @@ from collections import OrderedDict
from enum import Enum
from html.parser import HTMLParser

try:
from typing import (
Any,
Dict,
Generator,
Iterable,
Iterator,
List,
Match,
Optional,
Tuple,
Union,
cast,
)
except ImportError: # pragma: no cover
# Python 3.4 without external typing module

class _TypeStub:
def __getitem__(self, _): # type: ignore
return None

Any = None
Dict = Generator = Iterable = Iterator = List = Match = _TypeStub() # type: ignore
Optional = Tuple = Union = _TypeStub() # type: ignore

def cast(typ, val): # type: ignore
return val


SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]

Expand Down Expand Up @@ -2125,7 +2203,7 @@ class GoogleParser(object):
abstract = ''
for childnode in div_g.select('.st').children:
if childnode.tag == 'b' and childnode.text != '...':
matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
abstract = abstract + childnode.text.replace('\n', '')
try:
metadata = div_g.select('.slp').text
Expand Down Expand Up @@ -2264,28 +2342,28 @@ class Result(object):
self._urltable[fullindex] = sitelink.url
subindex = chr(ord(subindex) + 1)

def _print_title_and_url(self, index, title, url, indent=3, pre=0):
def _print_title_and_url(self, index, title, url, indent=0):
colors = self.colors

if not self.urlexpand:
url = '[' + urllib.parse.urlparse(url).netloc + ']'

if colors:
# Adjust index to print result index clearly
print(" %s%s%-*s%s" % (' ' * pre, colors.index, indent, index + '.', colors.reset), end='')
print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
if not self.urlexpand:
print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
else:
print(' ' + colors.title + title + colors.reset)
print(' ' * (indent + 2 + pre) + colors.url + url + colors.reset)
print(' ' * (indent + 5) + colors.url + url + colors.reset)
else:
if self.urlexpand:
print(' %s%-*s %s' % (' ' * pre, indent, index + '.', title))
print(' %s%s' % (' ' * (indent + 1 + pre), url))
print(' %s%-3s %s' % (' ' * indent, index + '.', title))
print(' %s%s' % (' ' * (indent + 4), url))
else:
print(' %s%-*s %s %s' % (' ' * pre, indent, index + '.', title, url))
print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))

def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=5, pre=0):
def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
colors = self.colors
try:
columns, _ = os.get_terminal_size()
Expand All @@ -2294,40 +2372,36 @@ class Result(object):

if metadata:
if colors:
print(' ' * (indent + pre) + colors.metadata + metadata + colors.reset)
print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
else:
print(' ' * (indent + pre) + metadata)
print(' ' * (indent + 5) + metadata)

fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
if colors and not self.nohl:
# Start from the last match, as inserting the bold characters changes the offsets.
for match in reversed(matches or []):
abstract = (
abstract[: match['offset']]
+ '\033[1m'
+ match['phrase']
+ '\033[0m'
+ abstract[match['offset'] + len(match['phrase']) :]
)
# Highlight matches.
for match in matches or []:
offset = match['offset']
span = len(match['phrase'])
wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)

if colors:
print(colors.abstract, end='')
if columns > indent + 1 + pre:
# Try to fill to columns
fillwidth = columns - indent - 1
for line in textwrap.wrap(abstract.replace('\n', ''), width=fillwidth):
print('%s%s' % (' ' * (indent + pre), line))
print('')
else:
print('%s%s\n' % (' ' * pre, abstract.replace('\n', ' ')))
for line in wrapped_abstract.lines:
print('%s%s' % (' ' * (indent + 5), line))
if colors:
print(colors.reset, end='')
print('')

def print(self):
"""Print the result entry."""
self._print_title_and_url(self.index, self.title, self.url)
self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)

for sitelink in self.sitelinks:
self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, pre=4)
self._print_metadata_and_abstract(sitelink.abstract, pre=4)
self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
self._print_metadata_and_abstract(sitelink.abstract, indent=4)

def jsonizable_object(self):
"""Return a JSON-serializable dict representing the result entry."""
Expand Down
2 changes: 1 addition & 1 deletion tests/test
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ test_googler () {
printf 'failed with status %d.\033[0m\n' $last_status >&2
exitcode=1

(( rerun )) && { googler --noprompt -d "$@"; printf '\n\033[33m[Exit status] %d\033[0m\n' $?; } || :
(( rerun )) && { $googler --noprompt -d "$@"; printf '\n\033[33m[Exit status] %d\033[0m\n' $?; } || :
}

declare -g quiet
Expand Down

0 comments on commit 0db68d6

Please sign in to comment.