"""
Misc util functions for converting HTML/XML entities to unicode.
"""
import functools
import html.entities
import re

from ebook_converter.ebooks.html_entities import html5_entities


# An entity reference: '&', one or more non-whitespace characters (matched
# lazily, so the name ends at the first ';'), then ';'. Group 1 is the name.
ENT_PAT = re.compile(r'&(\S+?);')


def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                      result_exceptions={}):
    """
    Convert a single matched entity reference to the text it stands for.

    Intended for use as the replacement callable of ``ENT_PAT.sub()``.
    The two mutable defaults are only ever read here, never modified.

    :param match: A match object such that '&'+match.group(1)+';' is the
                  entity.

    :param exceptions: A collection of entities to not convert (each entry is
                       the name of the entity, e.g. 'apos' or '#1234').

    :param encoding: The encoding used to decode numeric entities in the
                     range 0-255. If None, or if the byte cannot be decoded,
                     the number is treated as a Unicode code point instead.
                     A common encoding is cp1252.

    :param result_exceptions: A mapping of characters to entities. If the
                              result is in result_exceptions,
                              result_exceptions[result] is returned instead.
                              Convenient way to specify exceptions for things
                              like &lt; or &gt; that can be specified by
                              various actual entities.

    :return: The replacement string. Entities that are unknown, malformed or
             listed in ``exceptions`` are returned unchanged; numeric
             entities that are not valid code points become '?'.
    """

    def my_unichr(num):
        # chr() raises ValueError outside 0..0x10FFFF (including negative
        # numbers) and OverflowError for values that do not fit a C int.
        try:
            return chr(num)
        except (ValueError, OverflowError):
            return '?'

    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&' + ent + ';'
    # squot is generated by some broken CMS software
    if ent in {'apos', 'squot'}:
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # Not a number (ValueError) or a bare '#' (IndexError).
            return '&' + ent + ';'
        # Only values that fit in one byte can be decoded with `encoding`.
        # Everything else, including negative numbers (which would make
        # bytes() raise ValueError), is treated as a code point.
        if encoding is None or not 0 <= num <= 255:
            return check(my_unichr(num))
        try:
            return check(bytes((num,)).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    try:
        return check(my_unichr(html.entities.name2codepoint[ent]))
    except KeyError:
        return '&' + ent + ';'


# Variant that keeps the characters which are special in XML escaped, no
# matter which entity was used to write them in the input.
xml_entity_to_unicode = functools.partial(entity_to_unicode,
                                          result_exceptions={'"': '&quot;',
                                                             "'": '&apos;',
                                                             '<': '&lt;',
                                                             '>': '&gt;',
                                                             '&': '&amp;'})


def replace_entities(raw, encoding='cp1252'):
    """
    Return ``raw`` with every entity reference replaced by its character.

    ``encoding`` is passed through to :func:`entity_to_unicode`.
    """
    return ENT_PAT.sub(functools.partial(entity_to_unicode, encoding=encoding),
                       raw)


def xml_replace_entities(raw, encoding='cp1252'):
    """
    Like :func:`replace_entities`, but entities that resolve to one of
    ``"``, ``'``, ``<``, ``>`` or ``&`` are emitted as the corresponding XML
    entity, so the result stays well-formed.
    """
    return ENT_PAT.sub(functools.partial(xml_entity_to_unicode,
                                         encoding=encoding), raw)


def prepare_string_for_xml(raw, attribute=False):
    """
    Escape ``raw`` for inclusion in XML.

    Entity references already present in ``raw`` are resolved first, so that
    they are not escaped a second time.

    :param raw: The text to escape.
    :param attribute: If True, quote characters are escaped as well.
    :return: The escaped text.
    """
    raw = ENT_PAT.sub(entity_to_unicode, raw)
    # '&' has to be replaced first, otherwise the ampersands introduced by
    # the other replacements would be escaped again.
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        # A numeric reference is used for the apostrophe because '&apos;' is
        # not defined in HTML 4.
        raw = raw.replace('"', '&quot;').replace("'", '&#39;')
    return raw