Merge pull request #176 from hyanwong/document-path-compression-mc

jeromekelleher · web-flow · commit 2008cb37c8de · 2019-09-02T13:28:36.000+01:00
Set keep_unary on simplify, and document (including path compression docs)
diff --git a/docs/api.rst b/docs/api.rst
@@ -42,8 +42,7 @@ Running inference
 
 .. autofunction:: tsinfer.match_samples
 
-.. todo::
-    1. Add documentation for path compression here.
+.. autofunction:: tsinfer.augment_ancestors
 
 ++++++++++
 Exceptions
diff --git a/docs/inference.rst b/docs/inference.rst
@@ -161,6 +161,19 @@ given time, and an ancestor can copy from any older ancestor. For each
 ancestor, we find a path through older ancestors that minimises the
 number of recombination events.
 
+As well as minimising recombination events by finding the best path, we can also
+minimise events by looking for *shared recombination breakpoints*. A shared
+breakpoint exists if a set of children share a breakpoint in the same position,
+and they also have identical parents to the left of the breakpoint and identical
+parents to the right of the breakpoint. Rather than supposing that these
+children experienced multiple identical recombination events in parallel, we can
+reduce the number of ancestral recombination events by postulating a "synthetic
+ancestor" with this breakpoint, existing at a slightly older point
+in time, from whom all the children are descended at this genomic position. We
+call the algorithm used to implement this addition to the ancestral copying
+paths, "path compression".
+
+
 .. todo:: Schematic of the ancestors copying process.
 
 The copying path for each ancestor then describes its ancestry at every
@@ -183,7 +196,9 @@ The final phase of a ``tsinfer`` inference consists of a number steps:
 1. The first (and usually most time-consuming) is to find copying paths
    for our sample haplotypes through the ancestors. Each copying path
    corresponds to a set of tree sequence edges in precisely the same
-   way as for ancestors.
+   way as for ancestors, and the path compression algorithm can be equally
+   applied here.
+
 
 2. As we only use a subset of the available sites for inference
    (excluding by default any sites that are fixed or singletons)
@@ -198,13 +213,18 @@ The final phase of a ``tsinfer`` inference consists of a number steps:
    may be clades of ancestral state samples which would allow us
    to encode the data with fewer back mutations.**
 
-3. Reduce the resulting tree sequence to a canonical form by
-   `simplifying it
-   <https://tskit.readthedocs.io/en/latest/python-api.html#tskit.TreeSequence.simplify>`_.
+3. Remove ancestral paths that do not lead to any of the samples by
+   `simplifying
+   <https://tskit.readthedocs.io/en/latest/python-api.html#tskit.TreeSequence.simplify>`_
+   the final tree sequence. When simplifying, we keep non-branching ("unary")
+   nodes, as they indicate ancestors which we have actively inferred, and
+   for technical reasons keeping unary ancestors can also lead to better
+   compression. Note that this means that not every internal node in the
+   inferred tree sequence will correspond to a coalescent event.
 
 .. todo::
     1. Describe path compression here and above in the ancestors
        section
-    2. Describe the structure of the outpt tree sequences; how the
+    2. Describe the structure of the output tree sequences; how the
        nodes are mapped, what the time values mean, etc.
 
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -4,8 +4,8 @@
 Installation
 ############
 
-Python 3.3 or newer is required for ``tsinfer``. Any Unix-like platform should work
-(``tsinfer`` is tested on Linux and OSX).
+Python 3.5 or newer is required for ``tsinfer``. Any Unix-like platform should
+work (``tsinfer`` is tested on Linux, OS X, and Windows).
 
 Please use ``pip`` to install,
 e.g.::
@@ -33,8 +33,6 @@ first using `venv <https://docs.python.org/3/library/venv.html>`_::
     (tsinfer-venv) $ pip install tsinfer
     (tsinfer-venv) $ tsinfer --help
 
-.. _sec_installation_installation_problems:
-
 ****************
 Potential issues
 ****************
diff --git a/evaluation.py b/evaluation.py
@@ -373,9 +373,13 @@ def edge_plot(ts, filename):
     for tree in ts.trees():
         left, right = tree.interval
         for u in tree.nodes():
-            for c in tree.children(u):
-                lines.append([(left, c), (right, c)])
-                colours.append(pallete[unrank(tree.samples(c), n)])
+            children = tree.children(u)
+            # Don't bother plotting unary nodes, which will all have the same
+            # samples under them as their next non-unary descendant
+            if len(children) > 1:
+                for c in children:
+                    lines.append([(left, c), (right, c)])
+                    colours.append(pallete[unrank(tree.samples(c), n)])
 
     lc = mc.LineCollection(lines, linewidths=2, colors=colours)
     fig, ax = plt.subplots()
diff --git a/setup.py b/setup.py
@@ -49,7 +49,7 @@ def finalize_options(self):
         "tqdm",
         "humanize",
         "daiquiri",
-        "tskit",
+        "tskit>=0.2.1",
         "numcodecs>=0.6",
         "zarr>=2.2",
         "lmdb",
diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -610,13 +610,13 @@ def verify_inserted_ancestors(self, ts):
         with tsinfer.SampleData(sequence_length=ts.sequence_length) as sample_data:
             for v in ts.variants():
                 sample_data.add_site(v.position, v.genotypes, v.alleles)
-
         ancestor_data = tsinfer.AncestorData(sample_data)
         tsinfer.build_simulated_ancestors(sample_data, ancestor_data, ts)
         ancestor_data.finalise()
 
-        A = np.zeros(
-            (ancestor_data.num_sites, ancestor_data.num_ancestors), dtype=np.uint8)
+        A = np.full(
+            (ancestor_data.num_sites, ancestor_data.num_ancestors),
+            tskit.MISSING_DATA, dtype=np.int8)
         start = ancestor_data.ancestors_start[:]
         end = ancestor_data.ancestors_end[:]
         ancestors = ancestor_data.ancestors_haplotype[:]
@@ -628,8 +628,7 @@ def verify_inserted_ancestors(self, ts):
             tsinfer.check_ancestors_ts(ancestors_ts)
             self.assertEqual(ancestor_data.num_sites, ancestors_ts.num_sites)
             self.assertEqual(ancestor_data.num_ancestors, ancestors_ts.num_samples)
-            self.assertTrue(np.array_equal(
-                ancestors_ts.genotype_matrix(impute_missing_data=True), A))
+            self.assertTrue(np.array_equal(ancestors_ts.genotype_matrix(), A))
             inferred_ts = tsinfer.match_samples(
                 sample_data, ancestors_ts, engine=engine)
             self.assertTrue(np.array_equal(
@@ -1230,6 +1229,14 @@ def verify(self, ts):
             list(ts2.samples()),
             list(range(ts2.num_nodes - n, ts2.num_nodes)))
 
+        # Check that we're calling simplify with the correct arguments.
+        ts2 = tsinfer.infer(sd, simplify=False).simplify(keep_unary=True)
+        t1 = ts1.dump_tables()
+        t2 = ts2.dump_tables()
+        t1.provenances.clear()
+        t2.provenances.clear()
+        self.assertEqual(t1, t2)
+
     def test_single_tree(self):
         ts = msprime.simulate(5, random_seed=1, mutation_rate=2)
         self.verify(ts)
diff --git a/tsinfer/eval_util.py b/tsinfer/eval_util.py
@@ -705,6 +705,8 @@ def run_perfect_inference(
         num_threads=num_threads, extended_checks=extended_checks,
         progress_monitor=progress_monitor,
         stabilise_node_ordering=time_chunking and not path_compression)
+    # to compare against the original, we need to remove unary nodes from the inferred TS
+    inferred_ts = inferred_ts.simplify(keep_unary=False, filter_sites=False)
     return ts, inferred_ts
 
 
diff --git a/tsinfer/inference.py b/tsinfer/inference.py
@@ -139,10 +139,10 @@ def verify(samples, tree_sequence, progress_monitor=None):
 
 
 def infer(
-        sample_data, progress_monitor=None, num_threads=0, path_compression=True,
-        simplify=True, engine=constants.C_ENGINE):
+        sample_data, num_threads=0, path_compression=True, simplify=True,
+        engine=constants.C_ENGINE, progress_monitor=None):
     """
-    infer(sample_data, num_threads=0)
+    infer(sample_data, num_threads=0, path_compression=True, simplify=True)
 
     Runs the full :ref:`inference pipeline <sec_inference>` on the specified
     :class:`SampleData` instance and returns the inferred
@@ -153,13 +153,19 @@ def infer(
     :param int num_threads: The number of worker threads to use in parallelised
         sections of the algorithm. If <= 0, do not spawn any threads and
         use simpler sequential algorithms (default).
+    :param bool path_compression: Whether to merge edges that share identical
+        paths (essentially taking advantage of shared recombination breakpoints).
+    :param bool simplify: Whether to remove extra tree nodes and edges that are not
+        on a path between the root and any of the samples. To do so, the final tree
+        sequence is simplified by appling the :meth:`tskit.TreeSequence.simplify` method
+        with ``keep_unary`` set to True (default = ``True``).
     :returns: The :class:`tskit.TreeSequence` object inferred from the
         input sample data.
     :rtype: tskit.TreeSequence
     """
     ancestor_data = generate_ancestors(
-        sample_data, engine=engine, progress_monitor=progress_monitor,
-        num_threads=num_threads)
+        sample_data, num_threads=num_threads,
+        engine=engine, progress_monitor=progress_monitor,)
     ancestors_ts = match_ancestors(
         sample_data, ancestor_data, engine=engine, num_threads=num_threads,
         path_compression=path_compression, progress_monitor=progress_monitor)
@@ -171,8 +177,8 @@ def infer(
 
 
 def generate_ancestors(
-        sample_data, num_threads=0, progress_monitor=None, engine=constants.C_ENGINE,
-        **kwargs):
+        sample_data, num_threads=0, path=None,
+        engine=constants.C_ENGINE, progress_monitor=None, **kwargs):
     """
     generate_ancestors(sample_data, num_threads=0, path=None, **kwargs)
 
@@ -186,30 +192,32 @@ def generate_ancestors(
 
         ancestor_data = tsinfer.generate_ancestors(sample_data, path="mydata.ancestors")
 
-    All other keyword arguments are passed to the :class:`AncestorData` constructor,
+    Other keyword arguments are passed to the :class:`AncestorData` constructor,
     which may be used to control the storage properties of the generated file.
 
     :param SampleData sample_data: The :class:`SampleData` instance that we are
         genering putative ancestors from.
     :param int num_threads: The number of worker threads to use. If < 1, use a
         simpler synchronous algorithm.
+    :param str path: The path of the file to store the sample data. If None,
+        the information is stored in memory and not persistent.
     :rtype: AncestorData
     :returns: The inferred ancestors stored in an :class:`AncestorData` instance.
     """
     progress_monitor = _get_progress_monitor(progress_monitor)
-    with formats.AncestorData(sample_data, **kwargs) as ancestor_data:
+    with formats.AncestorData(sample_data, path=path, **kwargs) as ancestor_data:
         generator = AncestorsGenerator(
-            sample_data, ancestor_data, progress_monitor, engine=engine,
-            num_threads=num_threads)
+            sample_data, ancestor_data, num_threads=num_threads,
+            engine=engine, progress_monitor=progress_monitor)
         generator.add_sites()
         generator.run()
         ancestor_data.record_provenance("generate-ancestors")
     return ancestor_data
 
 
 def match_ancestors(
-        sample_data, ancestor_data, progress_monitor=None, num_threads=0,
-        path_compression=True, extended_checks=False, engine=constants.C_ENGINE):
+        sample_data, ancestor_data, num_threads=0, path_compression=True,
+        extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None):
     """
     match_ancestors(sample_data, ancestor_data, num_threads=0, path_compression=True)
 
@@ -225,24 +233,25 @@ def match_ancestors(
         a history for.
     :param int num_threads: The number of match worker threads to use. If
         this is <= 0 then a simpler sequential algorithm is used (default).
-    :param bool path_compression: Should we try to merge edges that share identical
-        paths (essentially taking advantage of shared recombination breakpoints)
+    :param bool path_compression: Whether to merge edges that share identical
+        paths (essentially taking advantage of shared recombination breakpoints).
     :return: The ancestors tree sequence representing the inferred history
         of the set of ancestors.
     :rtype: tskit.TreeSequence
     """
     matcher = AncestorMatcher(
-        sample_data, ancestor_data, engine=engine,
-        progress_monitor=progress_monitor, path_compression=path_compression,
-        num_threads=num_threads, extended_checks=extended_checks)
+        sample_data, ancestor_data, num_threads=num_threads,
+        path_compression=path_compression, extended_checks=extended_checks,
+        engine=engine, progress_monitor=progress_monitor)
     return matcher.match_ancestors()
 
 
 def augment_ancestors(
-        sample_data, ancestors_ts, indexes, progress_monitor=None, num_threads=0,
-        path_compression=True, extended_checks=False, engine=constants.C_ENGINE):
+        sample_data, ancestors_ts, indexes, num_threads=0, path_compression=True,
+        extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None):
     """
-    augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0, simplify=True)
+    augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0,\
+        path_compression=True)
 
     Runs the sample matching :ref:`algorithm <sec_inference_match_samples>`
     on the specified :class:`SampleData` instance and ancestors tree sequence,
@@ -260,32 +269,34 @@ def augment_ancestors(
         tree sequence.
     :param int num_threads: The number of match worker threads to use. If
         this is <= 0 then a simpler sequential algorithm is used (default).
+    :param bool path_compression: Whether to merge edges that share identical
+        paths (essentially taking advantage of shared recombination breakpoints).
     :return: The specified ancestors tree sequence augmented with copying
         paths for the specified sample.
     :rtype: tskit.TreeSequence
     """
     manager = SampleMatcher(
-        sample_data, ancestors_ts, path_compression=path_compression,
-        engine=engine, progress_monitor=progress_monitor, num_threads=num_threads,
-        extended_checks=extended_checks)
+        sample_data, ancestors_ts, num_threads=num_threads,
+        path_compression=path_compression, extended_checks=extended_checks,
+        engine=engine, progress_monitor=progress_monitor
+        )
     manager.match_samples(indexes)
     ts = manager.get_augmented_ancestors_tree_sequence(indexes)
     return ts
 
 
 def match_samples(
-        sample_data, ancestors_ts, progress_monitor=None, num_threads=0,
-        path_compression=True, simplify=True, extended_checks=False,
-        stabilise_node_ordering=False, engine=constants.C_ENGINE):
+        sample_data, ancestors_ts, num_threads=0, path_compression=True, simplify=True,
+        extended_checks=False, stabilise_node_ordering=False, engine=constants.C_ENGINE,
+        progress_monitor=None):
     """
-    match_samples(sample_data, ancestors_ts, num_threads=0, simplify=True)
+    match_samples(sample_data, ancestors_ts, num_threads=0, path_compression=True,\
+        simplify=True)
 
     Runs the sample matching :ref:`algorithm <sec_inference_match_samples>`
     on the specified :class:`SampleData` instance and ancestors tree sequence,
     returning the final :class:`tskit.TreeSequence` instance containing
-    the full inferred history for all samples and sites. If ``simplify`` is
-    True (the default) run :meth:`tskit.TreeSequence.simplify` on the
-    inferred tree sequence to ensure that it is in canonical form.
+    the full inferred history for all samples and sites.
 
     :param SampleData sample_data: The :class:`SampleData` instance
         representing the input data.
@@ -294,14 +305,20 @@ def match_samples(
         history among ancestral ancestral haplotypes.
     :param int num_threads: The number of match worker threads to use. If
         this is <= 0 then a simpler sequential algorithm is used (default).
+    :param bool path_compression: Whether to merge edges that share identical
+        paths (essentially taking advantage of shared recombination breakpoints).
+    :param bool simplify: Whether to remove extra tree nodes and edges that are not
+        on a path between the root and any of the samples. To do so, the final tree
+        sequence is simplified by appling the :meth:`tskit.TreeSequence.simplify` method
+        with ``keep_unary`` set to True (default = ``True``).
     :return: The tree sequence representing the inferred history
         of the sample.
     :rtype: tskit.TreeSequence
     """
     manager = SampleMatcher(
-        sample_data, ancestors_ts, path_compression=path_compression,
-        engine=engine, progress_monitor=progress_monitor, num_threads=num_threads,
-        extended_checks=extended_checks)
+        sample_data, ancestors_ts, num_threads=num_threads,
+        path_compression=path_compression, extended_checks=extended_checks,
+        engine=engine, progress_monitor=progress_monitor)
     manager.match_samples()
     ts = manager.finalise(
         simplify=simplify, stabilise_node_ordering=stabilise_node_ordering)
@@ -313,7 +330,8 @@ class AncestorsGenerator(object):
     Manages the process of building ancestors.
     """
     def __init__(
-            self, sample_data, ancestor_data, progress_monitor, engine, num_threads=0):
+            self, sample_data, ancestor_data, num_threads=0,
+            engine=constants.C_ENGINE, progress_monitor=None):
         self.sample_data = sample_data
         self.ancestor_data = ancestor_data
         self.progress_monitor = progress_monitor
@@ -453,8 +471,8 @@ def run(self):
 class Matcher(object):
 
     def __init__(
-            self, sample_data, num_threads=1, engine=constants.C_ENGINE,
-            path_compression=True, progress_monitor=None, extended_checks=False):
+            self, sample_data, num_threads=1, path_compression=True,
+            extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None):
         self.sample_data = sample_data
         self.num_threads = num_threads
         self.path_compression = path_compression
@@ -1106,8 +1124,9 @@ def finalise(self, simplify=True, stabilise_node_ordering=False):
         logger.info("Finalising tree sequence")
         ts = self.get_samples_tree_sequence()
         if simplify:
-            logger.info("Running simplify on {} nodes and {} edges".format(
-                ts.num_nodes, ts.num_edges))
+            logger.info(
+                "Running simplify(keep_unary=True) on {} nodes and {} edges".format(
+                    ts.num_nodes, ts.num_edges))
             if stabilise_node_ordering:
                 # Ensure all the node times are distinct so that they will have
                 # stable IDs after simplifying. This could possibly also be done
@@ -1122,7 +1141,8 @@ def finalise(self, simplify=True, stabilise_node_ordering=False):
                 tables.nodes.set_columns(flags=tables.nodes.flags, time=time)
                 tables.sort()
                 ts = tables.tree_sequence()
-            ts = ts.simplify(samples=self.sample_ids, filter_sites=False)
+            ts = ts.simplify(
+                samples=self.sample_ids, filter_sites=False, keep_unary=True)
             logger.info("Finished simplify; now have {} nodes and {} edges".format(
                 ts.num_nodes, ts.num_edges))
         return ts