-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
98 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
""" | ||
Misc util functions for converting HTML/XML entities to unicode. | ||
""" | ||
import html | ||
import functools | ||
import re | ||
|
||
from ebook_converter.ebooks.html_entities import html5_entities | ||
|
||
|
||
# Matches one entity reference: '&', then the shortest run of non-whitespace
# characters (captured as group 1, the entity name), then ';'.
ENT_PAT = re.compile(r'&(\S+?);') | ||
|
||
|
||
def entity_to_unicode(match, exceptions=(), encoding='cp1252',
                      result_exceptions=None):
    """
    Convert one matched HTML/XML entity reference to the text it stands for.

    :param match: A match object such that '&' + match.group(1) + ';' is the
                  entity.
    :param exceptions: A collection of entities to not convert (each entry is
                       the name of the entity, e.g. 'apos' or '#1234').
    :param encoding: The encoding used to decode numeric entities whose value
                     is in the range 0-255. If None, the number is always
                     treated as a Unicode code point. A common encoding is
                     cp1252.
    :param result_exceptions: A mapping of characters to entities. If the
                              result is in result_exceptions,
                              result_exceptions[result] is returned instead.
                              Convenient way to specify exceptions for things
                              like < or > that can be specified by various
                              actual entities. None means no exceptions.
    :return: The replacement string. Entities listed in ``exceptions``,
             malformed numeric entities and unknown names are returned
             unchanged as '&' + name + ';'. Numeric values that are not valid
             code points yield '?'.
    """
    if result_exceptions is None:
        result_exceptions = {}

    def my_unichr(num):
        # chr() rejects negative numbers and values above 0x10FFFF.
        try:
            return chr(num)
        except (ValueError, OverflowError):
            return '?'

    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&' + ent + ';'
    # squot is generated by some broken CMS software
    if ent in {'apos', 'squot'}:
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # IndexError: bare '#'; ValueError: the digits do not parse.
            return '&' + ent + ';'
        # Only values that fit in a single byte can be decoded with
        # ``encoding``. Everything else (including negative numbers, which
        # would otherwise make the byte construction raise ValueError) is
        # treated as a code point.
        if encoding is None or not 0 <= num <= 255:
            return check(my_unichr(num))
        try:
            return check(bytes((num,)).decode(encoding))
        except UnicodeDecodeError:
            # The byte is undefined in ``encoding``; fall back to the code
            # point with the same number.
            return check(my_unichr(num))
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    try:
        return check(my_unichr(html.entities.name2codepoint[ent]))
    except KeyError:
        return '&' + ent + ';'
|
||
|
||
# Variant of entity_to_unicode for XML output: the five characters that are
# special in XML are never emitted raw; whenever an entity resolves to one of
# them, the corresponding XML predefined entity is returned instead.
xml_entity_to_unicode = functools.partial(entity_to_unicode,
                                          result_exceptions={'"': '&quot;',
                                                             "'": '&apos;',
                                                             '<': '&lt;',
                                                             '>': '&gt;',
                                                             '&': '&amp;'})
|
||
|
||
def replace_entities(raw, encoding='cp1252'):
    """
    Replace every entity reference found in ``raw`` by ENT_PAT.

    :param raw: The text to process.
    :param encoding: Passed through to entity_to_unicode as its ``encoding``
                     argument.
    :return: ``raw`` with each match substituted by the result of
             entity_to_unicode.
    """
    converter = functools.partial(entity_to_unicode, encoding=encoding)
    return ENT_PAT.sub(converter, raw)
|
||
|
||
def xml_replace_entities(raw, encoding='cp1252'):
    """
    Replace every entity reference found in ``raw`` by ENT_PAT, using
    xml_entity_to_unicode as the converter.

    :param raw: The text to process.
    :param encoding: Passed through to xml_entity_to_unicode as its
                     ``encoding`` argument.
    :return: ``raw`` with each match substituted by the result of
             xml_entity_to_unicode.
    """
    converter = functools.partial(xml_entity_to_unicode, encoding=encoding)
    return ENT_PAT.sub(converter, raw)
|
||
|
||
def prepare_string_for_xml(raw, attribute=False):
    """
    Escape ``raw`` so it can be embedded in XML.

    Entity references already present in ``raw`` are first converted to
    characters with entity_to_unicode, so they are not escaped a second time.
    Then the markup-significant characters are replaced by entity references.

    :param raw: The text to escape.
    :param attribute: If true, also escape double and single quotes so the
                      result can be placed inside a quoted attribute value.
    :return: The escaped text.
    """
    raw = ENT_PAT.sub(entity_to_unicode, raw)
    # '&' must be handled first; otherwise the ampersands introduced by the
    # later replacements would themselves get escaped.
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        # The numeric form is used for the apostrophe because it is valid in
        # both XML and HTML.
        raw = raw.replace('"', '&quot;').replace("'", '&#39;')
    return raw