tskit-dev · petrelharp · Jan 3, 2025 · Jan 4, 2025
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,15 @@
 UPCOMING
 ***************************
 
+**Bugfixes**:
+
+- Recapitation on tree sequences with null genomes would attempt to simulate
+    the history of those null genomes; this would in all but exceptional cases
+    fail with an error ("not all roots are at the time expected"). Now,
+    `recapitate` removes the "sample" flag from all such null genomes in
+    sampled individuals. (:user: `petrelharp`, :pr:`358`)
+
+
 
 ***************************
 [1.0.4] - 2023-08-01

diff --git a/pyslim/methods.py b/pyslim/methods.py
@@ -39,13 +39,37 @@ def recapitate(ts,
     that if neither ``recombination_rate`` or a ``recombination_map`` are
     provided, there will be *no* recombination.
 
+    .. warning:
+
+        If the tree sequence contains (SLiM-specified) "null" genomes, as for
+        instance in a haploid or haplodiploid simulation, then these genomes
+        will remain in the tree sequence but no longer be marked as samples,
+        so loading back into SLiM will not work. If this is a problem, please
+        open an issue on github.
+
     :param tskit.TreeSequence ts: The tree sequence to transform.
     :param float ancestral_Ne: If specified, then will simulate from a single
         ancestral population of this size. It is an error to specify this
         as well as ``demography``.
     :param dict kwargs: Any other arguments to :func:`msprime.sim_ancestry`.
     '''
     is_current_version(ts, _warn=True)
+    has_null = False
+    for nid in ts.samples():
+        n = ts.node(nid)
+        if n.metadata['is_null']:
+            has_null = True
+            break
+    if has_null:
+        # we need to mark NULL genomes to be *not* samples
+        # so we don't simulate their ancestry
+        tables = ts.tables
+        tables.nodes.clear()
+        for n in ts.nodes():
+            if n.metadata['is_null']:
+                n = n.replace(flags=n.flags & ~tskit.NODE_IS_SAMPLE)
+            tables.nodes.append(n)
+        ts = tables.tree_sequence()
     if ancestral_Ne is not None:
         if "demography" in kwargs:
             raise ValueError("You cannot specify both `demography` and `ancestral_Ne`.")
@@ -66,7 +90,7 @@ def recapitate(ts,
                 "Not all roots of the provided tree sequence are at the time expected "
                 "by recapitate(). This could happen if you've simplified in "
                 "python before recapitating (fix: don't simplify first). "
-                "If could also happen in other situations, e.g., "
+                "It could also happen in other situations, e.g., "
                 "you added new individuals without parents in SLiM "
                 "during the course of the simulation with sim.addSubPop(), "
                 "in which case you will probably need to recapitate with "

diff --git a/tests/test_annotation.py b/tests/test_annotation.py
@@ -6,6 +6,7 @@
 import pytest
 import random
 import warnings
+import contextlib
 
 import msprime
 import tskit
@@ -14,6 +15,13 @@
 import tests
 from .recipe_specs import restarted_recipe_eq
 
+def mutcontext(ts):
+    if ts.num_mutations > 0:
+        handler = pytest.warns(Warning, match="already has.*metadata")
+    else:
+        handler = contextlib.nullcontext()
+    return handler
+
 class TestAnnotate(tests.PyslimTestCase):
     '''
     Tests for tools to annotate existing msprime-derived tree sequences.
@@ -270,13 +278,18 @@ def test_basic_annotation(self, helper_functions, tmp_path):
                 tick = 4
                 cycle = 1
                 stage = "late"
-                slim_ts = pyslim.annotate(
-                        ts, model_type="WF",
-                        tick=tick,
-                        cycle=cycle,
-                        stage=stage,
-                        annotate_mutations=do_mutations,
-                )
+                if do_mutations:
+                    handler = mutcontext(ts)
+                else:
+                    handler = contextlib.nullcontext()
+                with handler:
+                    slim_ts = pyslim.annotate(
+                            ts, model_type="WF",
+                            tick=tick,
+                            cycle=cycle,
+                            stage=stage,
+                            annotate_mutations=do_mutations,
+                    )
                 assert slim_ts.metadata['SLiM']['model_type'] == 'WF'
                 assert slim_ts.metadata['SLiM']['tick'] == tick
                 assert slim_ts.metadata['SLiM']['cycle'] == cycle
@@ -305,7 +318,8 @@ def test_annotate_refseq(self):
     def test_annotate_individuals(self, helper_functions, tmp_path):
         # test the workflow of annotating defaults, then assigning sexes randomly
         for ts in helper_functions.get_msprime_examples():
-            slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1, stage="early")
+            with mutcontext(ts):
+                slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1, stage="early")
             tables = slim_ts.dump_tables()
             top_md = tables.metadata
             top_md['SLiM']['separate_sexes'] = True
@@ -345,7 +359,8 @@ def test_annotate_XY(self, helper_functions, tmp_path):
         random.seed(8)
         for ts in helper_functions.get_msprime_examples():
             for genome_type in ["X", "Y"]:
-                slim_ts = pyslim.annotate_defaults(ts, model_type="nonWF", slim_generation=1)
+                with mutcontext(ts):
+                    slim_ts = pyslim.annotate_defaults(ts, model_type="nonWF", slim_generation=1)
                 tables = slim_ts.dump_tables()
                 top_md = tables.metadata
                 top_md['SLiM']['separate_sexes'] = True
@@ -399,7 +414,8 @@ def test_annotate_XY(self, helper_functions, tmp_path):
     def test_annotate_nodes(self, helper_functions):
         # test workflow of annotating defaults and then editing node metadata
         for ts in helper_functions.get_msprime_examples():
-            slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1)
+            with mutcontext(ts):
+                slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1)
             tables = slim_ts.dump_tables()
             metadata = [n.metadata for n in tables.nodes]
             gtypes = [
@@ -421,7 +437,8 @@ def test_annotate_nodes(self, helper_functions):
 
     def test_annotate_mutations(self, helper_functions):
         for ts in helper_functions.get_msprime_examples():
-            slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1)
+            with mutcontext(ts):
+                slim_ts = pyslim.annotate(ts, model_type="nonWF", tick=1)
             tables = slim_ts.dump_tables()
             metadata = [m.metadata for m in tables.mutations]
             selcoefs = [random.uniform(0, 1) for _ in metadata]

diff --git a/tests/test_tree_sequence.py b/tests/test_tree_sequence.py
@@ -637,7 +637,7 @@ def test_mutation_at(self, recipe):
         ts = recipe["ts"]
         for _ in range(100):
             node = random.randint(0, ts.num_nodes - 1)
-            pos = random.randint(0, ts.sequence_length - 1)
+            pos = random.randint(0, int(ts.sequence_length - 1))
             tree = ts.at(pos)
             parent = tree.parent(node)
             a = pyslim.mutation_at(ts, node, pos)
@@ -664,7 +664,7 @@ def test_nucleotide_at(self, recipe):
                 assert len(ts.reference_sequence.data) == ts.sequence_length
                 for _ in range(100):
                     node = random.randint(0, ts.num_nodes - 1)
-                    pos = random.randint(0, ts.sequence_length - 1)
+                    pos = random.randint(0, int(ts.sequence_length - 1))
                     tree = ts.at(pos)
                     parent = tree.parent(node)
                     a = pyslim.nucleotide_at(ts, node, pos)