Add old expedition scripts back

rafelafrance · Feb 14, 2024 · b207739 · b207739
1 parent f1fb84b
commit b207739
Show file tree

Hide file tree

Showing 9 changed files with 748 additions and 0 deletions.
diff --git a/finder/__finder_label_babel_reconcile.py b/finder/__finder_label_babel_reconcile.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Reconcile data from a "Label Babel" expedition.
+
+We need training data for the label finder model. We use volunteers to build the
+initial batch of training data. That is, we use a Zooniverse "Notes from Nature"
+expedition to have volunteers (often 3 or more) draw the label bounding boxes. Every
+bounding box will be slightly different, so we use this script to reconcile the
+differences into a single best label. There are many wrinkles to this process, some of
+which are:
+    - Sometimes a person will draw a box around many labels.
+    - Sometimes a box gets drawn around nothing.
+    - Sometimes the drawn boxes are really large or small (outliers).
+    - Etc.
+So we cannot just take a simple average of the box coordinates.
+"""
+import argparse
+import textwrap
+from pathlib import Path
+
+from util.pylib import log
+
+from finder.pylib.rise_of_machines import reconcile_expedition
+
+
+def main():
+    """Parse the command line and reconcile the expedition between the log calls."""
+    log.started()
+    # Arguments are parsed after log.started(), exactly as before.
+    reconcile_expedition.reconcile(parse_args())
+    log.finished()
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line arguments for the Label Babel reconcile script.
+
+    Arguments may also be read from a file by prefixing the file name with "@"
+    (argparse's ``fromfile_prefix_chars``).
+
+    Returns:
+        The parsed arguments. ``database`` and ``unreconciled_csv`` are
+        ``pathlib.Path`` objects; ``reconciled_set`` and ``notes`` are strings.
+    """
+    description = """Reconcile data from a "Label Babel" expedition.
+
+    We need training data for the label finder model and we use volunteers to build
+    the initial batch of training data. That is, we use a "Notes from Nature" Zooniverse
+    expedition to have volunteers (often 3 or more) draw all label bounding boxes around
+    every label. Every volunteer draws a slightly different bounding box, so we use this
+    script to reconcile the differences into a single "best" label."""
+
+    arg_parser = argparse.ArgumentParser(
+        description=textwrap.dedent(description),
+        fromfile_prefix_chars="@",
+    )
+
+    arg_parser.add_argument(
+        "--database",
+        required=True,
+        type=Path,
+        metavar="PATH",
+        help="""Path to a digi-leap database.""",
+    )
+
+    # Parsed as a Path, like --database, so that a consumer can call Path methods
+    # such as .open() on it instead of receiving a bare string.
+    arg_parser.add_argument(
+        "--unreconciled-csv",
+        required=True,
+        type=Path,
+        metavar="PATH",
+        help="""Get volunteer drawn labels from this CSV file.""",
+    )
+
+    arg_parser.add_argument(
+        "--reconciled-set",
+        required=True,
+        metavar="NAME",
+        help="""Write reconciled labels to this set.""",
+    )
+
+    arg_parser.add_argument(
+        "--notes",
+        default="",
+        metavar="TEXT",
+        help="""Notes about this run. Enclose them in quotes.""",
+    )
+
+    return arg_parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/finder/__finder_rise_of_machines_build.py b/finder/__finder_rise_of_machines_build.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Build an expedition to determine the quality of label finder output."""
+import argparse
+import textwrap
+from pathlib import Path
+
+from traiter.pylib import log
+
+from finder.pylib.finder.rise_of_machines import build_expedition
+
+
+def main():
+    """Parse the command line and build the expedition between the log calls."""
+    log.started()
+    # Arguments are parsed after log.started(), exactly as before.
+    build_expedition.build(parse_args())
+    log.finished()
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line arguments for the "Rise of Machines" build script.
+
+    Arguments may also be read from a file by prefixing the file name with "@"
+    (argparse's ``fromfile_prefix_chars``).
+
+    Returns:
+        The parsed arguments. ``database`` and ``expedition_dir`` are
+        ``pathlib.Path`` objects, ``label_conf`` is a float, ``limit`` and
+        ``reduce_by`` are ints, and ``label_set`` and ``notes`` are strings.
+    """
+    description = """Build an expedition to determine the quality of the
+        label builder.
+
+        The "Rise of Machines" expedition.
+        """
+
+    arg_parser = argparse.ArgumentParser(
+        description=textwrap.dedent(description), fromfile_prefix_chars="@"
+    )
+
+    arg_parser.add_argument(
+        "--database",
+        required=True,
+        type=Path,
+        metavar="PATH",
+        help="""Path to a digi-leap database.""",
+    )
+
+    arg_parser.add_argument(
+        "--expedition-dir",
+        required=True,
+        type=Path,
+        metavar="PATH",
+        help="""Place expedition files in this directory.""",
+    )
+
+    arg_parser.add_argument(
+        "--label-set",
+        required=True,
+        metavar="NAME",
+        help="""Get labels from this label set.""",
+    )
+
+    arg_parser.add_argument(
+        "--label-conf",
+        type=float,
+        default=0.25,
+        help="""Use labels that have a confidence >= to this. (default: %(default)s)""",
+    )
+
+    # A number of sheets is a count, so parse it as an int. With type=float a
+    # user-supplied value became e.g. 25.0 while the default stayed the int 3000.
+    arg_parser.add_argument(
+        "--limit",
+        type=int,
+        default=3000,
+        help="""Sample this many sheets. (default: %(default)s)""",
+    )
+
+    arg_parser.add_argument(
+        "--reduce-by",
+        type=int,
+        default=1,
+        metavar="N",
+        help="""Shrink images by this factor. (default: %(default)s)""",
+    )
+
+    arg_parser.add_argument(
+        "--notes",
+        default="",
+        metavar="TEXT",
+        help="""Notes about this run. Enclose them in quotes.""",
+    )
+
+    return arg_parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/finder/__finder_rise_of_machines_reconcile.py b/finder/__finder_rise_of_machines_reconcile.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+import argparse
+import textwrap
+from pathlib import Path
+
+from traiter.pylib import log
+
+from finder.pylib.finder.rise_of_machines import reconcile_expedition
+
+
+def main():
+    """Parse the command line, then reconcile the expedition between the log calls."""
+    # Unlike the sibling scripts, arguments are parsed before log.started().
+    cli_args = parse_args()
+    log.started()
+    reconcile_expedition.reconcile(cli_args)
+    log.finished()
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line arguments for the "Rise of the Machines" reconciler.
+
+    Arguments may also be read from a file by prefixing the file name with "@".
+    ``database`` is converted to a ``pathlib.Path``, ``label_conf`` to a float and
+    ``increase_by`` to an int; every other argument is left as a string.
+    """
+    description = """Reconcile data from a "Rise of the Machines" expedition.
+
+        This expedition is a quality control check on the label finder's output. It
+        presents volunteers with herbarium sheets with outlines of the labels. The
+        type of labels is indicated by the color of the outline of the label. The
+        volunteers judge the correctness of found labels by clicking inside of the
+        label (a point) with a correct/incorrect indicator. If the label finder
+        completely missed a label a volunteer draws a bounding box around the missing
+        label."""
+
+    parser = argparse.ArgumentParser(
+        fromfile_prefix_chars="@",
+        description=textwrap.dedent(description),
+    )
+
+    # Required inputs and outputs.
+    parser.add_argument(
+        "--database",
+        metavar="PATH",
+        type=Path,
+        required=True,
+        help="""Path to a digi-leap database.""",
+    )
+    parser.add_argument(
+        "--unreconciled-csv",
+        metavar="PATH",
+        required=True,
+        help="""Get volunteer input from this CSV file.""",
+    )
+    parser.add_argument(
+        "--label-set",
+        metavar="NAME",
+        required=True,
+        help="""Get labels from this set.""",
+    )
+    parser.add_argument(
+        "--sheet-set",
+        metavar="NAME",
+        required=True,
+        help="""Write reconciled sheets to this set.""",
+    )
+    parser.add_argument(
+        "--train-set",
+        metavar="NAME",
+        required=True,
+        help="""Write new reconciled labels to this set.""",
+    )
+
+    # Optional tuning knobs.
+    parser.add_argument(
+        "--label-conf",
+        default=0.25,
+        type=float,
+        help="""Only include labels that have a confidence >= to this. Set it to 0.0 to
+            get all of the labels. (default: %(default)s)""",
+    )
+    parser.add_argument(
+        "--increase-by",
+        metavar="N",
+        default=1,
+        type=int,
+        help="""Increase image size by this factor. This must match the --reduce-by N
+            argument when you built the expedition with rise_of_machines_build.py.
+            (default: %(default)s)""",
+    )
+    parser.add_argument(
+        "--notes",
+        metavar="TEXT",
+        default="",
+        help="""Notes about this run. Enclose them in quotes.""",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/finder/__finder_yolo_inference_ingest.py b/finder/__finder_yolo_inference_ingest.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+import argparse
+import textwrap
+from pathlib import Path
+
+from util.pylib import log
+
+from old import inference_ingest_yolo
+
+
+def main():
+    """Parse the command line and ingest the YOLO results between the log calls."""
+    log.started()
+    # Arguments are parsed after log.started(), exactly as before.
+    inference_ingest_yolo.ingest(parse_args())
+    log.finished()
+
+
+def parse_args():
+    """Parse the command line arguments for the YOLO inference ingest script.
+
+    The single required argument, --yolo-dir, is converted to a ``pathlib.Path``.
+    Arguments may also be read from a file by prefixing the file name with "@".
+    """
+    description = """Read in YOLO inference results."""
+
+    parser = argparse.ArgumentParser(
+        fromfile_prefix_chars="@",
+        description=textwrap.dedent(description),
+    )
+    parser.add_argument(
+        "--yolo-dir",
+        required=True,
+        metavar="PATH",
+        type=Path,
+        help="""Read YOLO results from this directory.""",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/finder/pylib/label_babel/__init__.py b/finder/pylib/label_babel/__init__.py
diff --git a/finder/pylib/label_babel/reconcile_expedition.py b/finder/pylib/label_babel/reconcile_expedition.py
@@ -0,0 +1,86 @@
+from argparse import Namespace
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from finder.pylib.old_subject import Subject
+
+
+def reconcile(args: Namespace) -> None:
+    """Reconcile a Label Babel expedition -- currently a stub that does nothing.
+
+    The only executable statement is ``...``. The commented-out code below sketches
+    the intended flow: read the classifications from ``args.unreconciled_csv``, group
+    them into subjects, merge each subject's box groups, and append the reconciled
+    boxes to the "label_train" table of ``args.database``.
+
+    NOTE(review): the sketch uses ``db`` and ``csv``, neither of which is among this
+    module's visible imports, and it calls ``.open()`` on ``args.unreconciled_csv``,
+    so that argument would need to be a ``pathlib.Path`` -- confirm before reviving.
+    """
+    ...
+    # with db.connect(args.database) as cxn:
+    #     run_id = db.insert_run(cxn, args)
+    #
+    #     with args.unreconciled_csv.open() as csv_file:
+    #         reader = csv.DictReader(csv_file)
+    #         classifications = list(reader)
+    #
+    #     subjects = get_subjects(classifications)
+    #
+    #     for subject in tqdm(subjects):
+    #         subject.merge_box_groups()
+    #
+    #     df = get_reconciled_boxes(subjects, args.reconciled_set)
+    #     df.to_sql("label_train", cxn, if_exists="append", index=False)
+    #
+    #     db.update_run_finished(cxn, run_id)
+
+
+def get_reconciled_boxes(subjects, reconciled_set):
+    """Flatten every subject's merged boxes into a data frame of training rows.
+
+    Subjects whose ``merged_boxes`` is falsy are skipped. For the others, each box is
+    paired with the entry of ``merged_types`` at the same position and becomes one
+    row holding the subject's ``sheet_id``, ``reconciled_set``, the type, and the
+    first four box values as left, top, right, and bottom.
+
+    Raises:
+        ValueError: If a subject has a different number of boxes and types.
+    """
+    rows = []
+
+    for sub in subjects:
+        merged = sub.merged_boxes
+        if not merged:
+            continue
+
+        kinds = sub.merged_types
+        if len(merged) != len(kinds):
+            raise ValueError(f"Malformed subject {sub.subject_id}")
+
+        # The lengths were just checked, so the lenient zip never truncates.
+        rows.extend(
+            {
+                "sheet_id": sub.sheet_id,
+                "train_set": reconciled_set,
+                "train_class": kind,
+                "train_left": coords[0],
+                "train_top": coords[1],
+                "train_right": coords[2],
+                "train_bottom": coords[3],
+            }
+            for coords, kind in zip(merged, kinds, strict=False)
+        )
+
+    return pd.DataFrame(rows)
+
+
+def get_subjects(classifications):
+    """Group volunteer classifications into one Subject per subject_id.
+
+    Each classification is a mapping with "subject_id" and "sheet_id" entries plus
+    columns whose names start with "Box(es): box" (parsed by
+    ``Subject.bbox_from_json``) and "Box(es): select". Non-empty boxes are stacked
+    onto the subject's ``boxes`` array, and the first ``len(boxes)`` selections
+    (blank ones become "") are appended to its ``types`` array.
+
+    NOTE(review): this relies on a freshly constructed Subject already having
+    ``boxes`` and ``types`` arrays that np.vstack/np.hstack can extend -- confirm
+    against the Subject class.
+
+    Returns:
+        The subjects as a list, in the order their ids were first seen.
+    """
+    by_id: dict[str, Subject] = defaultdict(Subject)
+
+    for row in tqdm(classifications):
+        key = row["subject_id"]
+        subject = by_id[key]
+        subject.subject_id = key
+        subject.sheet_id = row["sheet_id"]
+
+        raw_boxes = [
+            val for col, val in row.items() if col.startswith("Box(es): box") and val
+        ]
+        boxes = np.array([Subject.bbox_from_json(raw) for raw in raw_boxes])
+        if len(boxes):
+            subject.boxes = np.vstack((subject.boxes, boxes))
+
+        picked = [
+            val or "" for col, val in row.items() if col.startswith("Box(es): select")
+        ]
+        types = np.array(picked[: len(boxes)], dtype=str)
+        if len(types):
+            subject.types = np.hstack((subject.types, types))
+
+    return list(by_id.values())
diff --git a/finder/pylib/rise_of_machines/__init__.py b/finder/pylib/rise_of_machines/__init__.py