Improved download_attachments_from_page: added filename filter for si… (#1476)

AImenes · web-flow · commit 1b49840798ae · 2025-01-07T01:00:37.000+02:00
* Improved download_attachments_from_page: added filename filter for single-file-download and in-memory BytesIO option (for pandas loading) in addition to writing to disc.

* Hotfix: Changed save_path back to path for fully backwards compatibility

---------

Co-authored-by: Anders Imenes &lt;anders.imenes@telenor.no&gt;
diff --git a/atlassian/confluence.py b/atlassian/confluence.py
@@ -1,15 +1,25 @@
 # coding=utf-8
+import io
+import json
 import logging
 import os
-import time
-import json
 import re
-from requests import HTTPError
-import requests
-from deprecated import deprecated
+import time
+
 from bs4 import BeautifulSoup
+from deprecated import deprecated
+import requests
+from requests import HTTPError
+
 from atlassian import utils
-from .errors import ApiError, ApiNotFoundError, ApiPermissionError, ApiValueError, ApiConflictError, ApiNotAcceptable
+from .errors import (
+    ApiConflictError,
+    ApiError,
+    ApiNotAcceptable,
+    ApiNotFoundError,
+    ApiPermissionError,
+    ApiValueError,
+)
 from .rest_client import AtlassianRestAPI
 
 log = logging.getLogger(__name__)
@@ -1390,40 +1400,85 @@ def attach_file(
             comment=comment,
         )
 
-    def download_attachments_from_page(self, page_id, path=None, start=0, limit=50):
-        """
-        Downloads all attachments from a page
-        :param page_id:
-        :param path: OPTIONAL: path to directory where attachments will be saved. If None, current working directory will be used.
-        :param start: OPTIONAL: The start point of the collection to return. Default: None (0).
-        :param limit: OPTIONAL: The limit of the number of attachments to return, this may be restricted by
-                                fixed system limits. Default: 50
-        :return info message: number of saved attachments + path to directory where attachments were saved:
-        """
-        if path is None:
+    def download_attachments_from_page(self, page_id, path=None, start=0, limit=50, filename=None, to_memory=False):
+        """
+        Downloads attachments from a Confluence page. Supports downloading all files or a specific file. 
+        Files can either be saved to disk or returned as BytesIO objects for in-memory handling.
+
+        :param page_id: str
+            The ID of the Confluence page to fetch attachments from.
+        :param path: str, optional
+            Directory where attachments will be saved. If None, defaults to the current working directory.
+            Ignored if `to_memory` is True.
+        :param start: int, optional
+            The start point for paginated attachment fetching. Default is 0. Ignored if `filename` is specified.
+        :param limit: int, optional
+            The maximum number of attachments to fetch per request. Default is 50. Ignored if `filename` is specified.
+        :param filename: str, optional
+            The name of a specific file to download. If provided, only this file will be fetched.
+        :param to_memory: bool, optional
+            If True, attachments are returned as a dictionary of {filename: BytesIO object}.
+            If False, files are written to the specified directory on disk.
+        :return:
+            - If `to_memory` is True, returns a dictionary {filename: BytesIO object}.
+            - If `to_memory` is False, returns a summary dict: {"attachments_downloaded": int, "path": str}.
+        :raises:
+            - FileNotFoundError: If the specified path does not exist.
+            - PermissionError: If there are permission issues with the specified path.
+            - requests.HTTPError: If the HTTP request to fetch an attachment fails.
+            - Exception: For any unexpected errors.
+        """
+        # Default path to current working directory if not provided
+        if not to_memory and path is None:
             path = os.getcwd()
+
         try:
-            attachments = self.get_attachments_from_content(page_id=page_id, start=start, limit=limit)["results"]
-            if not attachments:
-                return "No attachments found"
+            # Fetch attachments based on the specified parameters
+            if filename:
+                # Fetch specific file by filename
+                attachments = self.get_attachments_from_content(page_id=page_id, filename=filename)["results"]
+                if not attachments:
+                    return f"No attachment with filename '{filename}' found on the page."
+            else:
+                # Fetch all attachments with pagination
+                attachments = self.get_attachments_from_content(page_id=page_id, start=start, limit=limit)["results"]
+                if not attachments:
+                    return "No attachments found on the page."
+
+            # Prepare to handle downloads
+            downloaded_files = {}
             for attachment in attachments:
-                file_name = attachment["title"]
-                if not file_name:
-                    file_name = attachment["id"]  # if the attachment has no title, use attachment_id as a filename
+                file_name = attachment["title"] or attachment["id"]  # Use attachment ID if title is unavailable
                 download_link = self.url + attachment["_links"]["download"]
-                r = self._session.get(download_link)
-                file_path = os.path.join(path, file_name)
-                with open(file_path, "wb") as f:
-                    f.write(r.content)
+
+                # Fetch the file content
+                response = self._session.get(download_link)
+                response.raise_for_status()  # Raise error if request fails
+
+                if to_memory:
+                    # Store in BytesIO object
+                    file_obj = io.BytesIO(response.content)
+                    downloaded_files[file_name] = file_obj
+                else:
+                    # Save file to disk
+                    file_path = os.path.join(path, file_name)
+                    with open(file_path, "wb") as file:
+                        file.write(response.content)
+
+            # Return results based on storage mode
+            if to_memory:
+                return downloaded_files
+            else:
+                return {"attachments_downloaded": len(attachments), "path": path}
+
         except NotADirectoryError:
-            raise NotADirectoryError("Verify if directory path is correct and/or if directory exists")
+            raise FileNotFoundError(f"The directory '{path}' does not exist.")
         except PermissionError:
-            raise PermissionError(
-                "Directory found, but there is a problem with saving file to this directory. Check directory permissions"
-            )
-        except Exception as e:
-            raise e
-        return {"attachments downloaded": len(attachments), " to path ": path}
+            raise PermissionError(f"Permission denied when trying to save files to '{path}'.")
+        except requests.HTTPError as http_err:
+            raise Exception(f"HTTP error occurred while downloading attachments: {http_err}")
+        except Exception as err:
+            raise Exception(f"An unexpected error occurred: {err}")
 
     def delete_attachment(self, page_id, filename, version=None):
         """