Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add images to mock data generation script #45 #47

Open
wants to merge 9 commits into
base: add-attachment-generation-script-#14
Choose a base branch
from
Open
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,15 @@ To populate the database and object storage with mock data for testing IMS first
and `object-storage-api` are running in docker and then run.

```bash
python ./scripts/dev_cli.py generate
python ./scripts/dev_cli.py generate -c
```

This will clear the database and MinIO storage, fetch existing entities from the `inventory-management-system-api` and
generate and add mock data for them.

You can also generate mock attachments just for specific entities by repeatedly using `-e ENTITY_ID`. This will ensure
the entity has at least one attachment and image.

## Notes

### Application Configuration
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ test = [

scripts = [
"faker==30.3.0",
"faker-file[pdf,docx,images]==0.17.12",
]

dev = [
Expand Down
93 changes: 62 additions & 31 deletions scripts/dev_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,43 +150,74 @@ def __init__(self):
def setup(self, parser: argparse.ArgumentParser):
    """Registers the command line arguments accepted by this subcommand.

    :param parser: Argument parser to add the arguments to.
    """

    add_mongodb_auth_args(parser)
    add_minio_alias_args(parser)
    parser.add_argument(
        "-c",
        "--clear",
        action=argparse.BooleanOptionalAction,
        help="Whether existing data should be cleared before generating new data.",
    )
    parser.add_argument(
        "-e",
        "--entities",
        # `extend` accumulates the IDs across repeated uses of the flag (`-e A -e B`) as well as several IDs given
        # after a single flag (`-e A B`). The default `store` action would silently keep only the IDs from the last
        # `-e` on the command line.
        action="extend",
        nargs="+",
        default=None,
        help="One or more entity IDs to generate attachments and images for.",
    )
    parser.add_argument(
        "-na",
        "--num-attachments",
        type=int,
        default=None,
        help="Specific number of attachments to generate for each entity.",
    )
    parser.add_argument(
        "-ni",
        "--num-images",
        type=int,
        default=None,
        help="Specific number of images to generate for each entity.",
    )

def run(self, args: argparse.Namespace):
# Runs the generate subcommand: optionally clears the existing data, then generates new mock data using the
# options parsed from the command line.
#
# NOTE(review): this view shows the diff with indentation stripped and without +/- markers, so the
# confirm/delete sequence and the generation sequence each appear twice below. Presumably the copies that follow
# `if args.clear:` are the current ones - confirm against the file. It is also not visible from here whether
# generation still runs when the user declines the clear prompt - confirm.

# The clear confirmation reads from stdin, so refuse to run when --ci is given
if args.ci:
sys.exit("Cannot use --ci with generate (currently has interactive input)")

# Firstly confirm ok with deleting
answer = input("This operation will replace all existing data, are you sure? ")
if answer in ("y", "yes"):
# Delete the existing data
logging.info("Deleting database contents...")
run_mongodb_command(
["mongosh", "object-storage"]
+ get_mongodb_auth_args(args)
+ [
"--eval",
"db.dropDatabase()",
]
if args.clear:
# Firstly confirm ok with deleting
answer = input("This operation will replace all existing data, are you sure? ")
# Only the exact answers "y" and "yes" are accepted, anything else skips the deletion
if answer in ("y", "yes"):
# Delete the existing data
logging.info("Deleting database contents...")
# Drops the whole `object-storage` database through mongosh
run_mongodb_command(
["mongosh", "object-storage"]
+ get_mongodb_auth_args(args)
+ [
"--eval",
"db.dropDatabase()",
]
)
logging.info("Deleting MinIO bucket contents...")

# Not ideal that this runs here - would either have to setup once as part of some sort of init (e.g.
# could have an init for creating the buckets instead of using the minio/mc image) or would have to
# somehow detect if it has already been done. Doesn't seem to be any harm in setting it again here
# though.
set_minio_alias(args)

# Recursively and forcibly removes everything under object-storage/object-storage
run_minio_command(["mc", "rm", "--recursive", "--force", "object-storage/object-storage"])

# Generate new data
logging.info("Generating new mock data...")
try:
# Import here only because CI won't install necessary packages to import it directly
# pylint:disable=import-outside-toplevel
from generate_mock_data import generate_mock_data

# Forward the entity IDs and the fixed attachment/image counts (each may be None) from the command line
generate_mock_data(
entity_ids=args.entities, num_attachments=args.num_attachments, num_images=args.num_images
)
logging.info("Deleting MinIO bucket contents...")

# Not ideal that this runs here - would either have to setup once as part of some sort of init (e.g. could
# have an init for creating the buckets instead of using the minio/mc image) or would have to somehow detect if it
# has already been done. Doesn't seem to be any harm in setting it again here though.
set_minio_alias(args)

run_minio_command(["mc", "rm", "--recursive", "--force", "object-storage/object-storage"])

# Generate new data
logging.info("Generating new mock data...")
try:
# Import here only because CI won't install necessary packages to import it directly
# pylint:disable=import-outside-toplevel
from generate_mock_data import generate_mock_data

generate_mock_data()
except ImportError:
logging.error("Failed to find generate_mock_data.py")
# The failure is only logged, it is not re-raised.
# NOTE(review): presumably an ImportError can also come from a missing dependency of generate_mock_data (the comment
# above says CI does not install them), in which case this message would be misleading - confirm.
except ImportError:
logging.error("Failed to find generate_mock_data.py")


# List of subcommands
Expand Down
155 changes: 136 additions & 19 deletions scripts/generate_mock_data.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,39 @@
"""Module defining a script for populating the database and object store with randomised data."""

import logging
from typing import Any
from typing import Any, Optional

import requests
from faker import Faker
from faker_file.providers.docx_file import DocxFileProvider
from faker_file.providers.image.pil_generator import PilImageGenerator
from faker_file.providers.jpeg_file import GraphicJpegFileProvider
from faker_file.providers.pdf_file import PdfFileProvider
from faker_file.providers.pdf_file.generators.reportlab_generator import ReportlabPdfGenerator
from faker_file.providers.png_file import GraphicPngFileProvider
from faker_file.providers.txt_file import TxtFileProvider

# Shared Faker instance (en_GB locale) with the file providers used for generating attachment and image content
fake = Faker("en_GB")
fake.add_provider(TxtFileProvider)
fake.add_provider(PdfFileProvider)
fake.add_provider(DocxFileProvider)
fake.add_provider(GraphicJpegFileProvider)
fake.add_provider(GraphicPngFileProvider)

# Various constants determining the result of the script
# NOTE(review): this view interleaves pre-change and post-change diff lines without +/- markers, so
# MAX_NUMBER_ATTACHMENTS_PER_ENTITY and PROBABILITY_ENTITY_HAS_ATTACHMENTS each appear twice below. Presumably the
# later assignment of each is the current value - confirm against the file.
# Base URL that the attachment and image requests are sent to
API_URL = "http://localhost:8002"
# NOTE(review): not referenced in the visible part of the file, presumably used when fetching the IMS entities - confirm
IMS_API_URL = "http://localhost:8000"
MAX_NUMBER_ATTACHMENTS_PER_ENTITY = 3
PROBABILITY_ENTITY_HAS_ATTACHMENTS = 0.2
# Inclusive bounds passed to randint when choosing how many attachments/images a selected entity gets
MIN_NUMBER_ATTACHMENTS_PER_ENTITY = 1
MIN_NUMBER_IMAGES_PER_ENTITY = 1
MAX_NUMBER_ATTACHMENTS_PER_ENTITY = 1
MAX_NUMBER_IMAGES_PER_ENTITY = 5
# An entity is given attachments/images when a random draw in [0, 1) falls below these thresholds
PROBABILITY_ENTITY_HAS_ATTACHMENTS = 0.3
PROBABILITY_ENTITY_HAS_IMAGES = 0.3
# Threshold below which an optional field (title, description) is populated rather than left as None
PROBABILITY_ATTACHMENT_HAS_OPTIONAL_FIELD = 0.5
# Inclusive bounds for the `max_nb_chars` value used when generating attachment file content
ATTACHMENT_MIN_CHARS = 100
ATTACHMENT_MAX_CHARS = 1000
# Inclusive bounds used independently for each of the two dimensions of a generated image
IMAGE_MIN_SIZE = 200
IMAGE_MAX_SIZE = 600
# NOTE(review): usage is not visible here, presumably used to seed the random generator - confirm
SEED = 0

logging.basicConfig(level=logging.INFO)
Expand All @@ -27,12 +47,22 @@ def optional_attachment_field(function):
return function() if fake.random.random() < PROBABILITY_ATTACHMENT_HAS_OPTIONAL_FIELD else None


# NOTE(review): this view interleaves pre-change and post-change diff lines without +/- markers, so two `def` lines
# and two docstrings appear below. Presumably `generate_random_attachment_metadata` is the current name - confirm.
def generate_random_attachment(entity_id: str):
"""Generates randomised data for an attachment with a given entity ID."""
def generate_random_attachment_metadata(entity_id: str):
"""Generates randomised metadata for an attachment with a given entity ID (purposefully excludes the filename as it
will be determined later with the file data)."""

# "title" and "description" are each passed through optional_attachment_field, so either may be None
return {
"entity_id": entity_id,
"title": optional_attachment_field(lambda: fake.paragraph(nb_sentences=1)),
"description": optional_attachment_field(lambda: fake.paragraph(nb_sentences=2)),
}


def generate_random_image_metadata(entity_id: str):
    """Builds a randomised metadata dictionary for an image belonging to the entity with the given ID."""

    metadata = {"entity_id": entity_id}
    metadata["file_name"] = fake.file_name()
    # The title and description are optional so either of them may end up as None
    metadata["title"] = optional_attachment_field(lambda: fake.paragraph(nb_sentences=1))
    metadata["description"] = optional_attachment_field(lambda: fake.paragraph(nb_sentences=2))
    return metadata
Expand All @@ -46,29 +76,92 @@ def post(endpoint: str, json: dict) -> dict[str, Any]:
return requests.post(f"{API_URL}{endpoint}", json=json, timeout=10).json()


# NOTE(review): this view interleaves pre-change and post-change diff lines without +/- markers. The first `def` and
# docstring, the first `post("/attachments", ...)` call and the first `files=` argument look like the removed lines -
# confirm against the file.
def create_attachment(attachment_data: dict) -> dict[str, Any]:
"""Creates an attachment given its metadata and uploads some file data to it."""
def create_attachment(attachment_metadata: dict) -> dict[str, Any]:
"""Creates an attachment given its metadata and uploads some randomly generated file data to it."""

attachment = post("/attachments", attachment_data)
file = None
# Pick one of the supported file types at random, the same extension is later used for the file name
extension = fake.random.choice(["txt", "pdf", "docx"])

# NOTE(review): presumably `raw=True` makes the providers return the file content instead of writing it to disk -
# confirm against the faker-file documentation
params = {"raw": True, "max_nb_chars": fake.random.randint(ATTACHMENT_MIN_CHARS, ATTACHMENT_MAX_CHARS)}

if extension == "txt":
file = fake.txt_file(**params)
elif extension == "pdf":
# Use this generator as the default one requires wkhtmltopdf to be installed on the system separately
# see https://faker-file.readthedocs.io/en/0.15.5/faker_file.providers.pdf_file.html
file = fake.pdf_file(**params, pdf_generator_cls=ReportlabPdfGenerator)
elif extension == "docx":
file = fake.docx_file(**params)

file_name = fake.file_name(extension=extension)

# Create the attachment record first, its response supplies the URL and form fields to upload the file with
attachment = post("/attachments", {**attachment_metadata, "file_name": file_name})
upload_info = attachment["upload_info"]
# NOTE(review): the status of this upload response is not checked, so a failed upload goes unnoticed
requests.post(
upload_info["url"],
files={"file": fake.paragraph(nb_sentences=2)},
files={"file": file},
data=upload_info["fields"],
timeout=5,
)

return attachment


def populate_random_attachments(existing_entity_ids: list[str]):
def create_image(image_metadata: dict) -> dict[str, Any]:
    """Generates a random JPEG or PNG and posts it, along with the given metadata, to the images endpoint.

    :param image_metadata: Metadata sent as the form data of the request.
    :return: Decoded JSON body of the response.
    """

    extension = fake.random.choice(["jpeg", "png"])

    # Width is drawn before height
    width = fake.random.randint(IMAGE_MIN_SIZE, IMAGE_MAX_SIZE)
    height = fake.random.randint(IMAGE_MIN_SIZE, IMAGE_MAX_SIZE)

    # The PIL based generator is used because the default one requires wkhtmltopdf to be installed on the system
    # separately, see https://faker-file.readthedocs.io/en/latest/creating_images.html
    # The graphic providers are used to avoid having text in the image, as Rocky 8 cannot load fonts presumably due
    # to lacking any being installed
    make_file = fake.graphic_jpeg_file if extension == "jpeg" else fake.graphic_png_file
    content = make_file(image_generator_cls=PilImageGenerator, raw=True, size=(width, height))

    upload_name = fake.file_name(extension=extension)

    response = requests.post(
        f"{API_URL}/images",
        data=image_metadata,
        files={"upload_file": (upload_name, content, f"image/{extension}")},
        timeout=5,
    )
    return response.json()


# NOTE(review): this view interleaves pre-change and post-change diff lines without +/- markers, so two versions of
# the loop body appear below. Presumably the one that checks `exclude_existence_check` is current - confirm.
def populate_random_attachments(existing_entity_ids: list[str], exclude_existence_check=False):
"""Randomly populates attachments for the given list of entity IDs."""

for entity_id in existing_entity_ids:
if fake.random.random() < PROBABILITY_ENTITY_HAS_ATTACHMENTS:
for _ in range(0, fake.random.randint(0, MAX_NUMBER_ATTACHMENTS_PER_ENTITY)):
attachment = generate_random_attachment(entity_id)
create_attachment(attachment)
# When exclude_existence_check is set the probability roll is skipped, so every entity gets attachments
if exclude_existence_check or fake.random.random() < PROBABILITY_ENTITY_HAS_ATTACHMENTS:
# The number of attachments is drawn from the inclusive range given by the module level MIN/MAX constants
for _ in range(
0, fake.random.randint(MIN_NUMBER_ATTACHMENTS_PER_ENTITY, MAX_NUMBER_ATTACHMENTS_PER_ENTITY)
):
attachment_metadata = generate_random_attachment_metadata(entity_id)
create_attachment(attachment_metadata)


def populate_random_images(existing_entity_ids: list[str], exclude_existence_check=False):
    """Creates a random number of images for the given entity IDs.

    Each entity is only given images with probability `PROBABILITY_ENTITY_HAS_IMAGES`, unless
    `exclude_existence_check` is set, in which case every entity is given images.
    """

    for entity_id in existing_entity_ids:
        # The random draw only happens when the existence check has not been excluded
        if not exclude_existence_check and fake.random.random() >= PROBABILITY_ENTITY_HAS_IMAGES:
            continue

        number_of_images = fake.random.randint(MIN_NUMBER_IMAGES_PER_ENTITY, MAX_NUMBER_IMAGES_PER_ENTITY)
        for _ in range(number_of_images):
            create_image(generate_random_image_metadata(entity_id))


def obtain_existing_ims_entities() -> list[str]:
Expand All @@ -87,14 +180,38 @@ def obtain_existing_ims_entities() -> list[str]:
return existing_entity_ids


# NOTE(review): this view interleaves pre-change and post-change diff lines without +/- markers. The parameterless
# `def`, the unconditional `obtain_existing_ims_entities()` call and the single-argument
# `populate_random_attachments(...)` call look like the removed lines - confirm against the file.
def generate_mock_data():
def generate_mock_data(
entity_ids: list[str] = None, num_attachments: Optional[int] = None, num_images: Optional[int] = None
):
"""Generates mock data for all the entities."""
# entity_ids: when given (non-empty) only these entities are populated and each is guaranteed to be populated,
# otherwise the existing IMS entities are fetched and populated at random.
# num_attachments / num_images: when not None, fixes the number generated per populated entity.

logger.info("Obtaining a list of existing IMS entities...")
existing_entity_ids = obtain_existing_ims_entities()
existing_entity_ids = entity_ids
exclude_existence_check = False

if not entity_ids:
logger.info("Obtaining a list of existing IMS entities...")
existing_entity_ids = obtain_existing_ims_entities()
else:
# Explicitly requested entities skip the probability roll in the populate functions
exclude_existence_check = True

# Setting both the minimum and the maximum to the requested number makes the randomly drawn count fixed. This
# overwrites the module level constants, so it persists for any later calls in the same process.
# pylint:disable=global-statement
if num_attachments is not None:
global MIN_NUMBER_ATTACHMENTS_PER_ENTITY
global MAX_NUMBER_ATTACHMENTS_PER_ENTITY
MIN_NUMBER_ATTACHMENTS_PER_ENTITY = num_attachments
MAX_NUMBER_ATTACHMENTS_PER_ENTITY = num_attachments

if num_images is not None:
global MIN_NUMBER_IMAGES_PER_ENTITY
global MAX_NUMBER_IMAGES_PER_ENTITY
MIN_NUMBER_IMAGES_PER_ENTITY = num_images
MAX_NUMBER_IMAGES_PER_ENTITY = num_images

logger.info("Populating attachments...")
populate_random_attachments(existing_entity_ids)
populate_random_attachments(existing_entity_ids, exclude_existence_check)

logger.info("Populating images...")
populate_random_images(existing_entity_ids, exclude_existence_check)


if __name__ == "__main__":
Expand Down