atlassian-api · gonchik · Dec 12, 2023 · Dec 12, 2023
diff --git a/atlassian/confluence.py b/atlassian/confluence.py
@@ -7,6 +7,7 @@
 from requests import HTTPError
 import requests
 from deprecated import deprecated
+from bs4 import BeautifulSoup
 from atlassian import utils
 from .errors import ApiError, ApiNotFoundError, ApiPermissionError, ApiValueError, ApiConflictError, ApiNotAcceptable
 from .rest_client import AtlassianRestAPI
@@ -356,6 +357,46 @@ def get_page_by_id(self, page_id, expand=None, status=None, version=None):
 
         return response
 
+    def get_tables_from_page(self, page_id):
+        """
+        Fetch the HTML tables of a Confluence page and return their cell texts.
+        :param page_id: integer confluence page_id
+        :return: JSON string with page_id, number_of_tables_in_page and tables_content
+                 (a list of tables, each a list of rows, each a list of cell texts);
+                 an empty page or a page without tables gives zero tables.
+                 None when an unexpected non-HTTP error was caught and logged.
+        :raises ApiError: when the page request fails with HTTP 404
+        """
+        try:
+            page_content = self.get_page_by_id(page_id, expand="body.storage")["body"]["storage"]["value"]
+            tables_raw = []
+            if page_content:
+                # A tag-name list keeps header and data cells in document order.
+                tables_raw = [
+                    [[cell.text for cell in row(["th", "td"])] for row in table("tr")]
+                    for table in BeautifulSoup(page_content, features="lxml")("table")
+                ]
+            # Always answer with the same JSON shape so callers can parse every result.
+            return json.dumps(
+                {
+                    "page_id": page_id,
+                    "number_of_tables_in_page": len(tables_raw),
+                    "tables_content": tables_raw,
+                }
+            )
+        except HTTPError as e:
+            if e.response.status_code == 404:
+                # Raise ApiError as the documented reason is ambiguous
+                log.error("Couldn't retrieve tables from page %s", page_id)
+                raise ApiError(
+                    "There is no content with the given pageid, pageid params is not an integer "
+                    "or the calling user does not have permission to view the page",
+                    reason=e,
+                )
+            # Other HTTP failures are re-raised rather than silently returning None.
+            raise
+        except Exception as e:
+            log.error("Error occurred while reading tables of page %s: %s", page_id, e)
     def get_page_labels(self, page_id, prefix=None, start=None, limit=None):
         """
         Returns the list of labels on a piece of Content.

diff --git a/docs/confluence.rst b/docs/confluence.rst
@@ -152,6 +152,9 @@ Page actions
     # Add comment into page
     confluence.add_comment(page_id, text)
 
+    # Fetch tables from Confluence page
+    confluence.get_tables_from_page(page_id)
+
 Template actions
 ----------------
 

diff --git a/examples/confluence/confluence_get_tables_from_page.py b/examples/confluence/confluence_get_tables_from_page.py
@@ -0,0 +1,17 @@
+from atlassian import Confluence
+import logging
+
+# Configure logging before creating the client so its log output is shown.
+logging.basicConfig(level=logging.INFO)
+confluence = Confluence(
+    url="<instance_url>",
+    username="<user_email>",
+    password="api_key",
+)
+# page_id is the id of the page you want to get the tables from.
+page_id = 393464
+result = confluence.get_tables_from_page(page_id)
+print(result)
+# Let's say the page has two tables, each with 3 columns, a header row and 2 data rows.
+# Method should return following output: {"page_id": 393464, "number_of_tables_in_page": 2, "tables_content": [[["header1", "header2", "header3"], ["h1r1", "h2r1", "h3r1"], ["h1r2", "h2r2", "h3r2"]], [["table2 header1", "table2 header2", "table2 header3"], ["h1r1", "h2r1", "h3r1"], ["h1r2", "h2r2", "h3r2"]]]}
+# tables_content is a list of lists of lists. Each nested list represents a table. Each nested list inside a table represents a row.
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,5 @@ six
 oauthlib
 requests_oauthlib
 requests-kerberos==0.14.0
+bs4
+lxml
diff --git a/tox.ini b/tox.ini
@@ -11,6 +11,7 @@ deps =
     pytest-cov
     coverage
     requests
+    bs4
 commands =
     coverage erase
     pytest -v --cov=atlassian --cov-branch --cov-report=xml