Merge pull request #124 from openlawlibrary/merge-all-hyperlink

tiberlas · web-flow · commit dc45681c3cab · 2024-02-28T13:06:14.000+01:00
Merge all hyperlink
diff --git a/docx/__init__.py b/docx/__init__.py
@@ -2,7 +2,7 @@
 
 from docx.api import Document  # noqa
 
-__version__ = '0.8.10.29'
+__version__ = '0.8.10.30'
 
 
 # register custom Part classes with opc package reader
diff --git a/docx/oxml/__init__.py b/docx/oxml/__init__.py
@@ -287,3 +287,6 @@ def OxmlElement(nsptag_str, attrs=None, nsdecls=None):
 
 from .text.symbol import CT_Sym
 register_element_cls('w:sym', CT_Sym)
+
+from .text.hyperlink import CT_Hyperlink
+register_element_cls('w:hyperlink', CT_Hyperlink)
diff --git a/docx/oxml/text/hyperlink.py b/docx/oxml/text/hyperlink.py
@@ -0,0 +1,19 @@
+# encoding: utf-8
+
+"""
+Custom element classes related to hyperlinks (CT_Hyperlink).
+"""
+
+
+from ..simpletypes import ST_String
+from ..xmlchemy import (
+    BaseOxmlElement, OptionalAttribute, ZeroOrMore
+)
+
+class CT_Hyperlink(BaseOxmlElement):
+    """
+    ``<w:hyperlink>`` element, containing the runs and target of a hyperlink.
+    """
+    anchor = OptionalAttribute('w:anchor', ST_String)
+    relationship_id = OptionalAttribute('r:id', ST_String)
+    r = ZeroOrMore('w:r')
diff --git a/docx/oxml/text/paragraph.py b/docx/oxml/text/paragraph.py
@@ -17,11 +17,36 @@ class CT_P(BaseOxmlElement):
     pPr = ZeroOrOne('w:pPr', successors=('w:bookmarkEnd',))
     r = ZeroOrMore('w:r', successors=('w:bookmarkEnd',))
     bookmarkEnd = ZeroOrMore('w:bookmarkEnd')
+    hyperlink = ZeroOrMore('w:hyperlink')
 
     def _insert_pPr(self, pPr):
         self.insert(0, pPr)
         return pPr
 
+    def add_hyperlink(self, text, reference):
+        """
+        Return a new ``<w:hyperlink>`` element inserted at the end of this
+        paragraph. The `reference` is a relationship id or a bookmark name.
+
+        If the `reference` starts with ``rId`` it is taken to be the id of an
+        existing relationship (none is created here) and is stored in the
+        `relationship_id` attribute of the ``<w:hyperlink>``.
+
+        Otherwise the `reference` is taken to be a bookmark name and is stored
+        in the `anchor` attribute of the ``<w:hyperlink>``.
+        """
+        new_h = self._add_hyperlink()
+        r = new_h._add_r()
+        r.text = text
+        r.style = 'Hyperlink'
+        if reference.startswith('rId'):
+            # reference is a relationship id of a URL,
+            # so it's stored in `relationship_id`
+            new_h.relationship_id = reference
+        else:
+            new_h.anchor = reference
+        return new_h
+
     def add_p_before(self):
         """
         Return a new ``<w:p>`` element inserted directly prior to this one.
@@ -121,13 +146,15 @@ def style(self, style):
         pPr = self.get_or_add_pPr()
         pPr.style = style
 
-    def iter_r_lst_recursive(self):
+    def iter_r_and_hyperlinks(self, return_hyperlinks=False):
         """
         Override xmlchemy generated list of runs to include runs from
         hyperlinks and content controls.
+        If the argument `return_hyperlinks` is `True` then each hyperlink is
+        yielded as a single `CT_Hyperlink` instead of the runs it contains.
         """
 
-        def get_runs(elem):
+        def get_el(elem):
             # Two flags used to remove hidden parts of complex field characters.
             ignoreRun = 0 # used to count nesting of ``<w:fldChar>``, if it's 0 then the run property is not inside a hidden part of ``<w:fldChar>``
             hasSeparate = False
@@ -161,6 +188,11 @@ def get_runs(elem):
                     # yields runs that have at least one visible element
                     if len(child) > 0:
                         yield child
-                elif child.tag in (qn('w:hyperlink'), qn('w:sdt'), qn('w:sdtContent'), qn('w:smartTag'),):
-                    yield from get_runs(child)
-        yield from get_runs(self)
+                elif child.tag == qn('w:hyperlink'):
+                    if return_hyperlinks:
+                        yield child
+                    else:
+                        yield from get_el(child)
+                elif child.tag in (qn('w:sdt'), qn('w:sdtContent'), qn('w:smartTag'),):
+                    yield from get_el(child)
+        yield from get_el(self)
diff --git a/docx/shared.py b/docx/shared.py
@@ -7,6 +7,8 @@
 from __future__ import absolute_import, print_function, unicode_literals
 from functools import wraps
 
+from docx.exceptions import PythonDocxError
+
 
 class Length(int):
     """
@@ -275,3 +277,35 @@ def wrapper(self, *args, **kwargs):
             return out
         return wrapper
     return decorator
+
+
+def find_containing_document(element):
+    """
+    Walk up the `_parent` chain of `element` until a Document is found and
+    return it. Raise PythonDocxError if an object in the chain has no `_parent`.
+    """
+    from .document import Document
+    while True:
+        if not hasattr(element, '_parent'):
+            raise PythonDocxError(f'{type(element)} has no `_parent` property.')
+        if isinstance(element._parent, Document):
+            return element._parent
+        else:
+            element = element._parent
+
+
+def is_valid_url(url):
+    """
+    Return `True` if `url` is a string that looks like an http(s) URL.
+    """
+    if not isinstance(url, str):
+        return False
+    import re
+    regex = re.compile(
+        r'^https?://'  # http:// or https://
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+    return bool(regex.search(url))
diff --git a/docx/text/hyperlink.py b/docx/text/hyperlink.py
@@ -0,0 +1,30 @@
+from ..shared import Parented, find_containing_document
+
+class Hyperlink(Parented):
+
+    def __init__(self, h, parent):
+        super(Hyperlink, self).__init__(parent)
+        self._h = h
+        self._parent = parent
+
+    @property
+    def text(self):
+        """
+        String formed by concatenating the text of each run in the hyperlink.
+        """
+        if self._h.r_lst is None:
+            return ''
+        text = ''
+        for r in self._h.r_lst:
+            text += r.text
+        return text
+
+    @property
+    def link(self):
+        """
+        String that is either a URL or a |Bookmark| name.
+        """
+        if self._h.relationship_id:
+            return find_containing_document(self).part.rels[self._h.relationship_id].target_ref
+        else:
+            return self._h.anchor
diff --git a/docx/text/paragraph.py b/docx/text/paragraph.py
@@ -15,11 +15,14 @@
 from ..enum.style import WD_STYLE_TYPE
 from .parfmt import ParagraphFormat
 from .run import Run
-from ..shared import Parented, Length, lazyproperty, Inches, cache, bust_cache
+from .hyperlink import Hyperlink
+from ..shared import Parented, Length, find_containing_document, is_valid_url, lazyproperty, Inches, cache, bust_cache
 from ..oxml.ns import nsmap
+from ..oxml.text.hyperlink import CT_Hyperlink
 from docx.bookmark import BookmarkParent
 from docx.parts.image import ImagePart
 
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
 
 # Decorator for all text changing functions used to invalidate text cache.
 text_changing = bust_cache(('text', 'run_text'))
@@ -51,6 +54,19 @@ def add_footnote(self):
         footnote = document._add_footnote(new_fr_id)
         return footnote
 
+    def add_hyperlink(self, text, reference):
+        """
+        Append a ``<w:hyperlink>`` element.
+        The passed `reference` can be a valid URL address or
+        a bookmark name.
+        """
+        if is_valid_url(reference):
+            # Store URL as relationship rId
+            rId = find_containing_document(self).part.relate_to(
+                reference, RT.HYPERLINK, True)
+            reference = rId
+        return Hyperlink(self._p.add_hyperlink(text, reference), self)
+
     @text_changing
     def add_run(self, text=None, style=None):
         """
@@ -110,7 +126,6 @@ def set_std_placeholder_text(r, text=None):
             active_placeholder = sdtPr._add_active_placeholder()
             active_placeholder.set('{%s}val' % nsmap['w'], 'true')
 
-
         sdt = self._p._new_sdt()
 
         sdtPr = sdt._add_sdtPr()
@@ -164,6 +179,14 @@ def clear(self):
         self._p.clear_content()
         return self
 
+    @property
+    def runs_and_hyperlinks(self):
+        """
+        Sequence of |Run| and |Hyperlink| instances corresponding to the
+        ``<w:r>`` and ``<w:hyperlink>`` elements in this paragraph.
+        """
+        return [Hyperlink(e, self) if isinstance(e, CT_Hyperlink) else Run(e, self) for e in self._p.iter_r_and_hyperlinks(return_hyperlinks=True)]
+
     @property
     def footnotes(self):
         """
@@ -179,6 +202,14 @@ def footnotes(self):
             footnote_list.append(footnotes[ref_id])
         return footnote_list
 
+    @property
+    def hyperlinks(self):
+        """
+        Sequence of |Hyperlink| instances corresponding to the <w:hyperlink>
+        elements in this paragraph.
+        """
+        return [Hyperlink(h, self) for h in self._p.hyperlink_lst]
+
     def insert_paragraph_before(self, text=None, style=None, ilvl=None):
         """
         Return a newly created paragraph, inserted directly before this
@@ -269,7 +300,7 @@ def remove_text(self, start=0, end=-1):
             runend = runstart + len(run.text)
             if runstart <= start and end <= runend:
                 run.text = run.text[:(start-runstart)] \
-                           + run.text[(end-runstart):]
+                    + run.text[(end-runstart):]
                 if not run.text:
                     run._r.getparent().remove(run._r)
                 return self
@@ -334,7 +365,8 @@ def lvl_from_para_props(self):
         """
         if self._lvl_from_para_props is None:
             try:
-                self._lvl_from_para_props = self._p.lvl_from_para_props(self.part.numbering_part._element)
+                self._lvl_from_para_props = self._p.lvl_from_para_props(
+                    self.part.numbering_part._element)
             except (AttributeError, NotImplementedError):
                 return None
         return self._lvl_from_para_props
@@ -384,7 +416,7 @@ def runs(self):
         Sequence of |Run| instances corresponding to the <w:r> elements in
         this paragraph.
         """
-        return [Run(r, self) for r in self._p.iter_r_lst_recursive()]
+        return [Run(r, self) for r in self._p.iter_r_and_hyperlinks()]
 
     @property
     def bookmark_starts(self):
@@ -445,7 +477,7 @@ def set_li_lvl(self, styles, prev, ilvl):
         prev_el = prev._element if prev else None
         _ilvl = 0 if ilvl is None else ilvl
         self._p.set_li_lvl(self.part.numbering_part._element,
-                              self.part.cached_styles, prev_el, _ilvl)
+                           self.part.cached_styles, prev_el, _ilvl)
 
     @property
     @cache
@@ -506,7 +538,7 @@ def insert_text(self, position, new_text):
             runend += len(run.text)
             if runend >= position:
                 run.text = run.text[:(position-runstart)] \
-                           + new_text + run.text[(position-runstart):]
+                    + new_text + run.text[(position-runstart):]
                 break
         return self
 
@@ -618,7 +650,8 @@ def _inner_get_tabstops(obj):
 
                 obj = obj.paragraph_format
 
-                tabstops.extend([round(ts.position.inches, 2) for ts in obj.tab_stops])
+                tabstops.extend([round(ts.position.inches, 2)
+                                for ts in obj.tab_stops])
                 clear_t_stops = [round(ts.position.inches, 2)
                                  for ts in obj.tab_stops
                                  if ts._element.attrib['{%s}val' % nsmap['w']] == 'clear']
@@ -637,16 +670,21 @@ def apply_formatting(source, first_line_indent=None, left_indent=None):
 
         # Apply paragraph styles by priority (from lowest to highest).
         # Formatting from the base style has the lowest priority.
-        first_line_indent = get_base_style_attr(self.style, 'first_line_indent')
+        first_line_indent = get_base_style_attr(
+            self.style, 'first_line_indent')
         left_indent = get_base_style_attr(self.style, 'left_indent')
         # Next, we apply formatting from numbering properties defined in paragraph style.
-        first_line_indent, left_indent = apply_formatting(self.style_numbering_format, first_line_indent, left_indent)
+        first_line_indent, left_indent = apply_formatting(
+            self.style_numbering_format, first_line_indent, left_indent)
         # Then formatting from paragraph style.
-        first_line_indent, left_indent = apply_formatting(self.style.paragraph_format, first_line_indent, left_indent)
+        first_line_indent, left_indent = apply_formatting(
+            self.style.paragraph_format, first_line_indent, left_indent)
         # Next, formatting from numbering properties defined in direct paragraph properties is applied.
-        first_line_indent, left_indent = apply_formatting(self.para_numbering_format, first_line_indent, left_indent)
+        first_line_indent, left_indent = apply_formatting(
+            self.para_numbering_format, first_line_indent, left_indent)
         # Finally, we apply formatting from direct paragraph formatting.
-        first_line_indent, left_indent = apply_formatting(self.paragraph_format, first_line_indent, left_indent)
+        first_line_indent, left_indent = apply_formatting(
+            self.paragraph_format, first_line_indent, left_indent)
 
         # Get explicitly set indentation
         if first_line_indent is not None:
@@ -664,13 +702,15 @@ def apply_formatting(source, first_line_indent=None, left_indent=None):
 
         # Find out the number of tabs at the beginning of the paragraph.
         # Ignore regular spaces.
-        tab_count = self.text[:len(self.text) - len(self.text.lstrip())].count('\t')
+        tab_count = self.text[:len(self.text) -
+                              len(self.text.lstrip())].count('\t')
 
         if tab_count:
 
             # Get tab stops but only those to the right of first line indent as the previous
             # don't affect the indentation.
-            tab_stops = [ts for ts in get_tabstops(self) if ts > first_line_indent]
+            tab_stops = [ts for ts in get_tabstops(
+                self) if ts > first_line_indent]
 
             # If the first line indent is left of the paragraph indent, first tab will tab to
             # the paragraph indent.
@@ -695,7 +735,8 @@ def apply_formatting(source, first_line_indent=None, left_indent=None):
 
                 # Let's round up to the first tab char indent. If already rounded add one.
                 tab_count -= 1
-                indent = math.ceil(indent) if not indent.is_integer() else indent + 1
+                indent = math.ceil(
+                    indent) if not indent.is_integer() else indent + 1
 
                 # The remaining tab chars just adds whole indents.
                 indent += tab_count