4 changes: 4 additions & 0 deletions .gitignore
@@ -18,3 +18,7 @@ __pycache__/
# normalized control features
3.normalize_data/normalized_data/negative_control_data*
3.normalize_data/normalized_data/positive_control_data*

# data packaging ignores
5.data_packaging/packaged
5.data_packaging/images
43 changes: 43 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,43 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
default_language_version:
  python: python3.11
files: >
  (?x)^(
    5.data_packaging/.* |
    pyproject.toml |
    poetry.lock |
    README.md
  )$
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-added-large-files
      - id: check-toml
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.6
    hooks:
      - id: codespell
        exclude: >
          (?x)^(
            .*\.lock|.*\.csv
          )$
  - repo: https://github.com/psf/black
    rev: 24.3.0
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/isort
    rev: 5.13.2
    hooks:
      - id: isort
  - repo: https://github.com/python-poetry/poetry
    rev: "1.8.0"
    hooks:
      - id: poetry-check
  - repo: https://github.com/hadolint/hadolint
    rev: v2.12.0
    hooks:
      - id: hadolint-docker
511 changes: 511 additions & 0 deletions 1.idr_streams/stream_files/idr0013-screenA-plates-w-colnames.tsv

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions 5.data_packaging/Dockerfile.bfconvert
@@ -0,0 +1,33 @@
# This Dockerfile is for running bfconvert for this project.
# See here for more:
# https://bio-formats.readthedocs.io/en/v7.3.0/users/comlinetools/conversion.html

# base image java
FROM openjdk:22-slim

# provide a version argument
ARG version=7.2.0

# set the workdir to /app
WORKDIR /app

# create a directory for the application files
RUN mkdir -p /opt/bftools

# install required packages
# hadolint ignore=DL3008
RUN apt-get update \
&& apt-get install --no-install-recommends -y \
wget \
unzip \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# download and unzip bftools
RUN wget --progress=dot:giga \
https://downloads.openmicroscopy.org/bio-formats/$version/artifacts/bftools.zip \
-O /opt/bftools/bftools.zip \
&& unzip /opt/bftools/bftools.zip -d /opt

# Set the entrypoint for bfconvert
ENTRYPOINT ["/opt/bftools/bfconvert"]
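For orientation, here is a hedged sketch of how this container might be built and invoked from Python; the image tag, mount path, and file names are illustrative assumptions rather than commands taken from this PR.

```python
# minimal sketch: build the bfconvert image above and run a single conversion
import pathlib
import subprocess

# build the image (tag name and Dockerfile path are assumptions)
subprocess.run(
    [
        "docker",
        "build",
        "-f",
        "5.data_packaging/Dockerfile.bfconvert",
        "-t",
        "bfconvert",
        ".",
    ],
    check=True,
)

# convert a hypothetical CH5 file to OME-TIFF by mounting a local data directory
data_dir = pathlib.Path("5.data_packaging/images").resolve()
subprocess.run(
    [
        "docker",
        "run",
        "--rm",
        "-v",
        f"{data_dir}:/data",
        "bfconvert",
        "/data/example_plate.ch5",
        "/data/example_plate.ome.tiff",
    ],
    check=True,
)
```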
73 changes: 73 additions & 0 deletions 5.data_packaging/README.md
This README perfectly clarifies the goal, wonderful job!

I fully understand the point is not related to IDR_stream but to make the output of IDR_stream better accessible.

@@ -0,0 +1,73 @@
# 5. Data Packaging

In this module, we collect and package data created by this project.
Packaging is the process of making the data easier both to store and for others to use.
This portion of the project strives to be additive-only, following a "first, do no harm" approach with a "yes, and" focus.

A story to help describe goals:

_"As a research data participant I need a way to analyze (understand, contextualize, and explore) and implement (engineer solutions which efficiently scale for time and computing resources) the data found here in order to effectively reproduce findings, make new discoveries, and avoid challenging (or perhaps incorrect) translations individually."_

We tried to think about the "research data participant" here with empathy; we can reduce barriers for other people by readying the data for use outside of this project.
If the barriers are high, a person may use the data incorrectly or opt not to use it at all.
We can't know all the reasons why or how someone might use the findings here, but we can empathize with them by considering the time cost they may face in using the data.
A side effect of thinking this way is that we can also benefit one another (we all face similar challenges).

Proposed solutions:

- At a bare minimum, use data tables with named columns to make the data more understandable to the audience.
- Use typed in-memory and file-based formats to ensure consistent handling of data once it is read from (potentially untyped) sources.
- Store data in high-performance file formats for distribution and scalable use by others (see the sketch after this list).
- Share data schemas upfront so readers can understand how the data translates without relying on in-process observation.
- Containerize OS-level dependencies to ensure reproducibility.
- Rewrite IDR data extraction to use FTP, a simple and currently documented procedure.
- Create avenues for reuse where possible to increase the chances of benefit beyond just this PR.
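
As a small illustration of the typed, high-performance formats mentioned above, the sketch below reads a delimited file into a typed Arrow table via DuckDB and writes it to Parquet; the file names are assumptions, and the actual packaging steps live in the `5.data_packaging` modules.

```python
# minimal sketch: read a delimited file with inferred column types and store it as Parquet
import duckdb
from pyarrow import parquet

# hypothetical input; the real packaging modules operate on the files listed in constants.py
with duckdb.connect() as ddb:
    table = ddb.execute(
        "SELECT * FROM read_csv('0.locate_data/locations/training_locations.tsv')"
    ).arrow()

# a Parquet copy preserves the inferred column types for downstream readers
parquet.write_table(table, "training_locations.parquet")
```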

## Development

This module leverages system-available Python, [Poetry](https://github.com/python-poetry/poetry), and [Poe the Poet](https://poethepoet.natn.io/index.html) (among other dependencies found in the `pyproject.toml` file) to complete tasks.
This module also uses Docker to reproducibly run additional tooling beyond the Python dependencies.
We recommend installing Docker (suggested through [Docker Desktop](https://www.docker.com/products/docker-desktop/)), Python (suggested through [pyenv](https://github.com/pyenv/pyenv)), and Poetry (suggested through `pip install poetry`), then using the following to run the processes related to this step.

```sh
# note: run these from the project root (one directory up).
# after installing poetry, create the environment
poetry install

# run the poe the poet task related to this step
# (triggers multiple Python modules)
poetry run poe package_data
```

## Data Assets

The following data assets are included as part of the data package.

- mitocheck_metadata/features.samples-w-colnames.txt
- mitocheck_metadata/idr0013-screenA-annotation.csv.gz
- 0.locate_data/locations/negative_control_locations.tsv
- 0.locate_data/locations/positive_control_locations.tsv
- 0.locate_data/locations/training_locations.tsv
- 1.idr_streams/stream_files/idr0013-screenA-plates-w-colnames.tsv
- 2.format_training_data/results/training_data__ic.csv.gz
- 2.format_training_data/results/training_data__no_ic.csv.gz
- 3.normalize_data/normalized_data/training_data__ic.csv.gz
- 3.normalize_data/normalized_data/training_data__no_ic.csv.gz
- 4.analyze_data/results/compiled_2D_umap_embeddings.csv
- 4.analyze_data/results/single_cell_class_counts.csv

## Schema

The schemas for the data assets mentioned above are stored as references to help readers understand how the data will translate into various databases or collections of data.
This information may be found within the `5.data_packaging/schema` directory as text versions of PyArrow Table schemas.
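
As a rough sketch of how such text references can be produced with PyArrow (the exact procedure used by this module may differ), a table's schema can be rendered to text and written into the schema directory; the table and file name below are hypothetical.

```python
# minimal sketch: persist a PyArrow Table schema as a text reference
import pathlib

import pyarrow as pa

schema_dir = pathlib.Path("5.data_packaging/schema")
schema_dir.mkdir(parents=True, exist_ok=True)

# hypothetical table standing in for one of the packaged data assets
table = pa.table({"example_column": ["example_value"], "example_count": [1]})

# str(schema) renders one "name: type" line per column
(schema_dir / "example_asset.schema.txt").write_text(str(table.schema))
```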

## Data Asset Column Labeling

Some data assets are duplicated in order to label their columns where the originals do not include column names.
These column names provide context on what the row values contain and make the data easier to work with in Arrow, Lance, and the other technologies used here.
These are presented below as pairs (with the `-w-colnames` suffix indicating the copy that uses updated column names), followed by a brief sketch of the labeling approach.

- mitocheck_metadata/features.samples.txt
- mitocheck_metadata/features.samples-w-colnames.txt
- 1.idr_streams/stream_files/idr0013-screenA-plates.tsv
- 1.idr_streams/stream_files/idr0013-screenA-plates-w-colnames.tsv
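
A minimal sketch of the labeling idea, assuming a headerless, tab-delimited original and hypothetical column names (the real names come from the corresponding IDR documentation rather than from this sketch):

```python
# minimal sketch: write a copy of a headerless, tab-delimited file with column names
import pandas as pd

# hypothetical column names for illustration only
column_names = ["Plate", "Screen", "Study", "Plate_Path"]

df = pd.read_csv(
    "1.idr_streams/stream_files/idr0013-screenA-plates.tsv",
    sep="\t",
    header=None,
    names=column_names,
)

# the copy carries the column names in its first row
df.to_csv(
    "1.idr_streams/stream_files/idr0013-screenA-plates-w-colnames.tsv",
    sep="\t",
    index=False,
)
```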
45 changes: 45 additions & 0 deletions 5.data_packaging/constants.py
@@ -0,0 +1,45 @@
"""
Create various constants for use during data packaging.
"""

# list of files to be processed
DATA_FILES = [
    "mitocheck_metadata/features.samples.txt",
    "mitocheck_metadata/features.samples-w-colnames.txt",
    "mitocheck_metadata/idr0013-screenA-annotation.csv.gz",
    "0.locate_data/locations/negative_control_locations.tsv",
    "0.locate_data/locations/positive_control_locations.tsv",
    "0.locate_data/locations/training_locations.tsv",
    "1.idr_streams/stream_files/idr0013-screenA-plates.tsv",
    "1.idr_streams/stream_files/idr0013-screenA-plates-w-colnames.tsv",
    "2.format_training_data/results/training_data__ic.csv.gz",
    "2.format_training_data/results/training_data__no_ic.csv.gz",
    "3.normalize_data/normalized_data/training_data__ic.csv.gz",
    "3.normalize_data/normalized_data/training_data__no_ic.csv.gz",
    "4.analyze_data/results/compiled_2D_umap_embeddings.csv",
    "4.analyze_data/results/single_cell_class_counts.csv",
]

# create a copy of the data files, removing any which don't include column names
DATA_FILES_W_COLNAMES = [
    file
    for file in DATA_FILES
    if file
    not in [
        "mitocheck_metadata/features.samples.txt",
        "1.idr_streams/stream_files/idr0013-screenA-plates.tsv",
    ]
]

PACKAGING_FILES = ["5.data_packaging/location_and_ch5_frame_image_data.parquet"]

# FTP resources for accessing IDR
# See here for more:
# https://idr.openmicroscopy.org/about/download.html
FTP_IDR_URL = "ftp.ebi.ac.uk"
FTP_IDR_USER = "anonymous"
FTP_IDR_MITOCHECK_CH5_DIR = (
    "/pub/databases/IDR/idr0013-neumann-mitocheck/20150916-mitocheck-analysis/mitocheck"
)

DOCKER_PLATFORM = "linux/amd64"
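
A hedged sketch of how these FTP constants might be used with Python's standard `ftplib` to browse the IDR CH5 directory (the module's actual download step may differ):

```python
# minimal sketch: list entries in the IDR mitocheck CH5 directory using the constants above
from ftplib import FTP

from constants import FTP_IDR_MITOCHECK_CH5_DIR, FTP_IDR_URL, FTP_IDR_USER

with FTP(FTP_IDR_URL) as ftp:
    # anonymous login (no password required)
    ftp.login(user=FTP_IDR_USER)
    ftp.cwd(FTP_IDR_MITOCHECK_CH5_DIR)
    # show the first few directory entries
    print(ftp.nlst()[:5])
```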
69 changes: 69 additions & 0 deletions 5.data_packaging/create_lancedb.py
@@ -0,0 +1,69 @@
"""
Python module for packaging data in a LanceDB database.
"""

import pathlib

import duckdb
import lancedb
import pandas as pd
import pyarrow as pa
from constants import DATA_FILES_W_COLNAMES, PACKAGING_FILES
from pyarrow import parquet

# specify a dir where the lancedb database may go and create lancedb client
lancedb_dir = pathlib.Path("5.data_packaging/packaged/lancedb/mitocheck_data")
ldb = lancedb.connect(lancedb_dir)


def get_arrow_tbl_from_csv(filename_read: str) -> pa.Table:
    """
    Get an Arrow table from a CSV file through DuckDB.

    Args:
        filename_read (str):
            The path to the CSV file to be read.

    Returns:
        pa.Table:
            An Arrow table obtained from the CSV file.
    """

    # try to read a typed arrow table,
    # falling back to a high-memory (string-focused) pandas
    # dataframe read converted to arrow
    try:
        with duckdb.connect() as ddb:
            return ddb.execute(
                f"""
                SELECT *
                FROM read_csv('{filename_read}');
                """
            ).arrow()
    except duckdb.duckdb.ConversionException:
        # fall back to pandas when DuckDB cannot infer consistent column types
        return pa.Table.from_pandas(
            df=pd.read_csv(filepath_or_buffer=filename_read, low_memory=False),
        )


# send csv file data as arrow tables to a lancedb database
for filename in DATA_FILES_W_COLNAMES:
    table = get_arrow_tbl_from_csv(filename_read=filename)
    ldb.create_table(
        name=f"{filename.replace('/','.')}",
        data=table,
        schema=table.schema,
        mode="overwrite",
    )

# send parquet-based packaging data to the same lancedb database
for filename in PACKAGING_FILES:
    table = parquet.read_table(source=filename)
    ldb.create_table(
        name=f"{filename.replace('/','.')}",
        data=table,
        schema=table.schema,
        mode="overwrite",
    )
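
A hedged sketch of reading packaged tables back out of the LanceDB database; the table name below is assumed from the `/` to `.` naming convention used above.

```python
# minimal sketch: open the packaged LanceDB database and read one table back
import lancedb

ldb = lancedb.connect("5.data_packaging/packaged/lancedb/mitocheck_data")

# list the packaged tables
print(ldb.table_names())

# open one table (name assumed from the naming convention used during packaging)
tbl = ldb.open_table("4.analyze_data.results.single_cell_class_counts.csv")

# materialize as an Arrow table or pandas dataframe for analysis
arrow_table = tbl.to_arrow()
df = tbl.to_pandas()
print(df.head())
```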