ral-facilities · joelvdavies · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/README.md b/README.md
@@ -130,12 +130,15 @@ To populate the database and object storage with mock data for testing IMS first
 and `object-storage-api` are running in docker and then run.
 
 ```bash
-python ./scripts/dev_cli.py generate
+python ./scripts/dev_cli.py generate -c
 ```
 
 This will clear the database and MinIO storage, fetch existing entities from the `inventory-management-system-api` and
 generate and add mock data for them.
 
+You can also generate mock attachments and images just for specific entities by listing their IDs after a single `-e`,
+e.g. `-e ENTITY_ID_1 ENTITY_ID_2`. This will ensure each of those entities has at least one attachment and image.
+
 ## Notes
 
 ### Application Configuration

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,7 @@ test = [
 
 scripts = [
     "faker==30.3.0",
+    "faker-file[pdf,docx,images]==0.17.12",
 ]
 
 dev = [

diff --git a/scripts/dev_cli.py b/scripts/dev_cli.py
@@ -150,43 +150,74 @@ def __init__(self):
     def setup(self, parser: argparse.ArgumentParser):
         add_mongodb_auth_args(parser)
         add_minio_alias_args(parser)
+        parser.add_argument(
+            "-c",
+            "--clear",
+            action=argparse.BooleanOptionalAction,
+            help="Whether existing data should be cleared before generating new data.",
+        )
+        parser.add_argument(
+            "-e",
+            "--entities",
+            nargs="+",
+            default=None,
+            help="One or more entity IDs to generate attachments and images for.",
+        )
+        parser.add_argument(
+            "-na",
+            "--num-attachments",
+            type=int,
+            default=None,
+            help="Specific number of attachments to generate for each entity.",
+        )
+        parser.add_argument(
+            "-ni",
+            "--num-images",
+            type=int,
+            default=None,
+            help="Specific number of images to generate for each entity.",
+        )
 
     def run(self, args: argparse.Namespace):
         if args.ci:
             sys.exit("Cannot use --ci with generate (currently has interactive input)")
 
-        # Firstly confirm ok with deleting
-        answer = input("This operation will replace all existing data, are you sure? ")
-        if answer in ("y", "yes"):
-            # Delete the existing data
-            logging.info("Deleting database contents...")
-            run_mongodb_command(
-                ["mongosh", "object-storage"]
-                + get_mongodb_auth_args(args)
-                + [
-                    "--eval",
-                    "db.dropDatabase()",
-                ]
+        if args.clear:
+            # Firstly confirm ok with deleting
+            answer = input("This operation will replace all existing data, are you sure? ")
+            if answer in ("y", "yes"):
+                # Delete the existing data
+                logging.info("Deleting database contents...")
+                run_mongodb_command(
+                    ["mongosh", "object-storage"]
+                    + get_mongodb_auth_args(args)
+                    + [
+                        "--eval",
+                        "db.dropDatabase()",
+                    ]
+                )
+                logging.info("Deleting MinIO bucket contents...")
+
+                # Not ideal that this runs here - would either have to setup once as part of some sort of init (e.g.
+                # could have an init for creating the buckets instead of using the minio/mc image) or would have to
+                # somehow detect if it has already been done. Doesn't seem to be any harm in setting it again here
+                # though.
+                set_minio_alias(args)
+
+                run_minio_command(["mc", "rm", "--recursive", "--force", "object-storage/object-storage"])
+
+        # Generate new data
+        logging.info("Generating new mock data...")
+        try:
+            # Import here only because CI won't install necessary packages to import it directly
+            # pylint:disable=import-outside-toplevel
+            from generate_mock_data import generate_mock_data
+
+            generate_mock_data(
+                entity_ids=args.entities, num_attachments=args.num_attachments, num_images=args.num_images
             )
-            logging.info("Deleting MinIO bucket contents...")
-
-            # Not ideal that this runs here - would either have to setup once as part of some sort of init (e.g. could
-            # have an init for creating the buckets instead of using the minio/mc image) or would have to somehow detect if it
-            # has already been done. Doesn't seem to be any harm in setting it again here though.
-            set_minio_alias(args)
-
-            run_minio_command(["mc", "rm", "--recursive", "--force", "object-storage/object-storage"])
-
-            # Generate new data
-            logging.info("Generating new mock data...")
-            try:
-                # Import here only because CI wont install necessary packages to import it directly
-                # pylint:disable=import-outside-toplevel
-                from generate_mock_data import generate_mock_data
-
-                generate_mock_data()
-            except ImportError:
-                logging.error("Failed to find generate_mock_data.py")
+        except ImportError:
+            logging.error("Failed to find generate_mock_data.py")
 
 
 # List of subcommands

diff --git a/scripts/generate_mock_data.py b/scripts/generate_mock_data.py
@@ -1,19 +1,39 @@
 """Module defining a script for populating the database and object store with randomised data."""
 
 import logging
-from typing import Any
+from typing import Any, Optional
 
 import requests
 from faker import Faker
+from faker_file.providers.docx_file import DocxFileProvider
+from faker_file.providers.image.pil_generator import PilImageGenerator
+from faker_file.providers.jpeg_file import GraphicJpegFileProvider
+from faker_file.providers.pdf_file import PdfFileProvider
+from faker_file.providers.pdf_file.generators.reportlab_generator import ReportlabPdfGenerator
+from faker_file.providers.png_file import GraphicPngFileProvider
+from faker_file.providers.txt_file import TxtFileProvider
 
 fake = Faker("en_GB")
+fake.add_provider(TxtFileProvider)
+fake.add_provider(PdfFileProvider)
+fake.add_provider(DocxFileProvider)
+fake.add_provider(GraphicJpegFileProvider)
+fake.add_provider(GraphicPngFileProvider)
 
 # Various constants determining the result of the script
 API_URL = "http://localhost:8002"
 IMS_API_URL = "http://localhost:8000"
-MAX_NUMBER_ATTACHMENTS_PER_ENTITY = 3
-PROBABILITY_ENTITY_HAS_ATTACHMENTS = 0.2
+MIN_NUMBER_ATTACHMENTS_PER_ENTITY = 1
+MIN_NUMBER_IMAGES_PER_ENTITY = 1
+MAX_NUMBER_ATTACHMENTS_PER_ENTITY = 1
+MAX_NUMBER_IMAGES_PER_ENTITY = 5
+PROBABILITY_ENTITY_HAS_ATTACHMENTS = 0.3
+PROBABILITY_ENTITY_HAS_IMAGES = 0.3
 PROBABILITY_ATTACHMENT_HAS_OPTIONAL_FIELD = 0.5
+ATTACHMENT_MIN_CHARS = 100
+ATTACHMENT_MAX_CHARS = 1000
+IMAGE_MIN_SIZE = 200
+IMAGE_MAX_SIZE = 600
 SEED = 0
 
 logging.basicConfig(level=logging.INFO)
@@ -27,12 +47,22 @@ def optional_attachment_field(function):
     return function() if fake.random.random() < PROBABILITY_ATTACHMENT_HAS_OPTIONAL_FIELD else None
 
 
-def generate_random_attachment(entity_id: str):
-    """Generates randomised data for an attachment with a given entity ID."""
+def generate_random_attachment_metadata(entity_id: str):
+    """Generates randomised metadata for an attachment with a given entity ID (purposefully excludes the filename as it
+    will be determined later with the file data)."""
+
+    return {
+        "entity_id": entity_id,
+        "title": optional_attachment_field(lambda: fake.paragraph(nb_sentences=1)),
+        "description": optional_attachment_field(lambda: fake.paragraph(nb_sentences=2)),
+    }
+
+
+def generate_random_image_metadata(entity_id: str):
+    """Generates randomised data for an image with a given entity ID."""
 
     return {
         "entity_id": entity_id,
-        "file_name": fake.file_name(),
         "title": optional_attachment_field(lambda: fake.paragraph(nb_sentences=1)),
         "description": optional_attachment_field(lambda: fake.paragraph(nb_sentences=2)),
     }
@@ -46,29 +76,92 @@ def post(endpoint: str, json: dict) -> dict[str, Any]:
     return requests.post(f"{API_URL}{endpoint}", json=json, timeout=10).json()
 
 
-def create_attachment(attachment_data: dict) -> dict[str, Any]:
-    """Creates an attachment given its metadata and uploads some file data to it."""
+def create_attachment(attachment_metadata: dict) -> dict[str, Any]:
+    """Creates an attachment given its metadata and uploads some randomly generated file data to it."""
 
-    attachment = post("/attachments", attachment_data)
+    file = None
+    extension = fake.random.choice(["txt", "pdf", "docx"])
+
+    params = {"raw": True, "max_nb_chars": fake.random.randint(ATTACHMENT_MIN_CHARS, ATTACHMENT_MAX_CHARS)}
+
+    if extension == "txt":
+        file = fake.txt_file(**params)
+    elif extension == "pdf":
+        # Use the Reportlab generator as the default one requires wkhtmltopdf to be installed on the system separately
+        # see https://faker-file.readthedocs.io/en/0.15.5/faker_file.providers.pdf_file.html
+        file = fake.pdf_file(**params, pdf_generator_cls=ReportlabPdfGenerator)
+    elif extension == "docx":
+        file = fake.docx_file(**params)
+
+    file_name = fake.file_name(extension=extension)
+
+    attachment = post("/attachments", {**attachment_metadata, "file_name": file_name})
     upload_info = attachment["upload_info"]
     requests.post(
         upload_info["url"],
-        files={"file": fake.paragraph(nb_sentences=2)},
+        files={"file": file},
         data=upload_info["fields"],
         timeout=5,
     )
 
     return attachment
 
 
-def populate_random_attachments(existing_entity_ids: list[str]):
+def create_image(image_metadata: dict) -> dict[str, Any]:
+    """Creates an image given its metadata and uploads some file data to it."""
+
+    file = None
+    extension = fake.random.choice(["jpeg", "png"])
+
+    params = {
+        "image_generator_cls": PilImageGenerator,
+        "raw": True,
+        "size": (
+            fake.random.randint(IMAGE_MIN_SIZE, IMAGE_MAX_SIZE),
+            fake.random.randint(IMAGE_MIN_SIZE, IMAGE_MAX_SIZE),
+        ),
+    }
+
+    # Use the PIL generator as the default one requires wkhtmltopdf to be installed on the system separately
+    # see https://faker-file.readthedocs.io/en/latest/creating_images.html
+    # Also avoid having text in the image as Rocky 8 cannot load fonts, presumably because none are installed
+    if extension == "jpeg":
+        file = fake.graphic_jpeg_file(**params)
+    elif extension == "png":
+        file = fake.graphic_png_file(**params)
+
+    file_name = fake.file_name(extension=extension)
+
+    image = requests.post(
+        f"{API_URL}/images",
+        data=image_metadata,
+        files={"upload_file": (file_name, file, f"image/{extension}")},
+        timeout=5,
+    ).json()
+
+    return image
+
+
+def populate_random_attachments(existing_entity_ids: list[str], exclude_existence_check=False):
     """Randomly populates attachments for the given list of entity IDs."""
 
     for entity_id in existing_entity_ids:
-        if fake.random.random() < PROBABILITY_ENTITY_HAS_ATTACHMENTS:
-            for _ in range(0, fake.random.randint(0, MAX_NUMBER_ATTACHMENTS_PER_ENTITY)):
-                attachment = generate_random_attachment(entity_id)
-                create_attachment(attachment)
+        if exclude_existence_check or fake.random.random() < PROBABILITY_ENTITY_HAS_ATTACHMENTS:
+            for _ in range(
+                0, fake.random.randint(MIN_NUMBER_ATTACHMENTS_PER_ENTITY, MAX_NUMBER_ATTACHMENTS_PER_ENTITY)
+            ):
+                attachment_metadata = generate_random_attachment_metadata(entity_id)
+                create_attachment(attachment_metadata)
+
+
+def populate_random_images(existing_entity_ids: list[str], exclude_existence_check=False):
+    """Randomly populates images for the given list of entity IDs."""
+
+    for entity_id in existing_entity_ids:
+        if exclude_existence_check or fake.random.random() < PROBABILITY_ENTITY_HAS_IMAGES:
+            for _ in range(0, fake.random.randint(MIN_NUMBER_IMAGES_PER_ENTITY, MAX_NUMBER_IMAGES_PER_ENTITY)):
+                image_metadata = generate_random_image_metadata(entity_id)
+                create_image(image_metadata)
 
 
 def obtain_existing_ims_entities() -> list[str]:
@@ -87,14 +180,38 @@ def obtain_existing_ims_entities() -> list[str]:
     return existing_entity_ids
 
 
-def generate_mock_data():
+def generate_mock_data(
+    entity_ids: list[str] = None, num_attachments: Optional[int] = None, num_images: Optional[int] = None
+):
     """Generates mock data for all the entities."""
 
-    logger.info("Obtaining a list of existing IMS entities...")
-    existing_entity_ids = obtain_existing_ims_entities()
+    existing_entity_ids = entity_ids
+    exclude_existence_check = False
+
+    if not entity_ids:
+        logger.info("Obtaining a list of existing IMS entities...")
+        existing_entity_ids = obtain_existing_ims_entities()
+    else:
+        exclude_existence_check = True
+
+    # pylint:disable=global-statement
+    if num_attachments is not None:
+        global MIN_NUMBER_ATTACHMENTS_PER_ENTITY
+        global MAX_NUMBER_ATTACHMENTS_PER_ENTITY
+        MIN_NUMBER_ATTACHMENTS_PER_ENTITY = num_attachments
+        MAX_NUMBER_ATTACHMENTS_PER_ENTITY = num_attachments
+
+    if num_images is not None:
+        global MIN_NUMBER_IMAGES_PER_ENTITY
+        global MAX_NUMBER_IMAGES_PER_ENTITY
+        MIN_NUMBER_IMAGES_PER_ENTITY = num_images
+        MAX_NUMBER_IMAGES_PER_ENTITY = num_images
 
     logger.info("Populating attachments...")
-    populate_random_attachments(existing_entity_ids)
+    populate_random_attachments(existing_entity_ids, exclude_existence_check)
+
+    logger.info("Populating images...")
+    populate_random_images(existing_entity_ids, exclude_existence_check)
 
 
 if __name__ == "__main__":