Skip to content

Commit 416a2b0

Browse files
authored
[Confluence] New method 'scrap_regex_from_page' + docs + example (#1327)
* fixing minor issue in scrap_regex_from_issue method * new Confluence method scrap_regex_from_page + docs + examples --------- Co-authored-by: gkowalc <>
1 parent 7c3dcee commit 416a2b0

File tree

4 files changed

+54
-11
lines changed

4 files changed

+54
-11
lines changed

atlassian/confluence.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
import time
55
import json
6-
6+
import re
77
from requests import HTTPError
88
import requests
99
from deprecated import deprecated
@@ -397,6 +397,32 @@ def get_tables_from_page(self, page_id):
397397
except Exception as e:
398398
log.error("Error occured", e)
399399

400+
def scrap_regex_from_page(self, page_id, regex):
    """
    Scrape all matches of a regex pattern from the storage-format body of a Confluence page.

    :param page_id: The ID of the Confluence page.
    :param regex: The regex pattern to scrape.
    :return: A list of regex matches (empty list if the body is empty or nothing matches).
    :raises ApiNotFoundError: if the page does not exist or the calling user
        may not view it (translated from a 404 HTTPError).
    """
    regex_output = []
    try:
        # The REST call must live INSIDE the try block: it is the only
        # statement here that can raise HTTPError, so placing it outside
        # (as before) made the except branch unreachable dead code.
        page_output = self.get_page_by_id(page_id, expand="body.storage")["body"]["storage"]["value"]
        if page_output is not None:
            regex_output.extend(x.group(0) for x in re.finditer(regex, page_output))
        return regex_output
    except HTTPError as e:
        if e.response.status_code == 404:
            # Raise ApiError as the documented reason is ambiguous.
            # Use lazy %-formatting so the page id actually appears in the log
            # (the old call passed it as an unused format argument).
            log.error("couldn't find page_id : %s", page_id)
            raise ApiNotFoundError(
                "There is no content with the given page id,"
                "or the calling user does not have permission to view the page",
                reason=e,
            )
        # Propagate unexpected HTTP errors instead of silently returning None.
        raise
425+
400426
def get_page_labels(self, page_id, prefix=None, start=None, limit=None):
401427
"""
402428
Returns the list of labels on a piece of Content.

atlassian/jira.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,15 +1553,16 @@ def scrap_regex_from_issue(self, issue, regex):
15531553
comments = issue_output["fields"]["comment"]["comments"]
15541554

15551555
try:
1556-
description_matches = [x.group(0) for x in re.finditer(regex, description)]
1557-
if description_matches:
1558-
regex_output.extend(description_matches)
1559-
1560-
for comment in comments:
1561-
comment_html = comment["body"]
1562-
comment_matches = [x.group(0) for x in re.finditer(regex, comment_html)]
1563-
if comment_matches:
1564-
regex_output.extend(comment_matches)
1556+
if description is not None:
1557+
description_matches = [x.group(0) for x in re.finditer(regex, description)]
1558+
if description_matches:
1559+
regex_output.extend(description_matches)
1560+
1561+
for comment in comments:
1562+
comment_html = comment["body"]
1563+
comment_matches = [x.group(0) for x in re.finditer(regex, comment_html)]
1564+
if comment_matches:
1565+
regex_output.extend(comment_matches)
15651566

15661567
return regex_output
15671568
except HTTPError as e:

docs/confluence.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,10 @@ Page actions
156156
confluence.add_comment(page_id, text)
157157
158158
# Fetch tables from Confluence page
159-
confluence.get_page_tables(page_id)
159+
confluence.get_tables_from_page(page_id)
160+
161+
# Get regex matches from Confluence page
162+
confluence.scrap_regex_from_page(page_id, regex)
160163
161164
Template actions
162165
----------------
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from atlassian import Confluence
2+
3+
4+
confluence = Confluence(
5+
url="<instance_url>",
6+
username="<user_enamil>",
7+
password="api_key",
8+
)
9+
page_id = 393464
10+
ipv4_regex = r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
11+
confluence.scrap_regex_from_page(
12+
page_id, ipv4_regex
13+
) # method returns list of matches of ipv4 addresses from page content.

0 commit comments

Comments
 (0)