25 changes: 11 additions & 14 deletions docs/inference.rst
@@ -23,10 +23,13 @@ Data requirements
Input haplotype data for tsinfer must satisfy the following requirements:

- Data must be *phased*.
- For each site used for inference we must know the *ancestral state*. Note that this is
not necessarily the same as the REF column from VCF, and ancestral
- For each site used for inference we must know the *ancestral state*. Note that
this is not necessarily the same as the REF column from VCF, and ancestral
states must be computed using some other method.
- Only biallelic sites can be used for inference.
- Missing data can be included in the haplotypes using the value
``tskit.MISSING_DATA`` (-1). The inferred tree sequence will have values
imputed for the haplotypes at these sites.
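
For example, a site with one missing call could be encoded as follows (a
minimal sketch; the position, alleles and genotypes are invented purely for
illustration)::

    import tskit
    import tsinfer

    with tsinfer.SampleData(sequence_length=10) as sample_data:
        # The third haplotype has no call at this site; tskit.MISSING_DATA is -1.
        sample_data.add_site(
            position=5, genotypes=[0, 1, tskit.MISSING_DATA], alleles=["A", "T"])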


.. _sec_inference_data_model:
@@ -36,7 +39,7 @@ Data model
**********

The data model for ``tsinfer`` is tightly integrated with
``tskit``'s `data model <https://tskit.readthedocs.io/en/latest/data-model.html>`_
``tskit``'s :ref:`data model <sec_data_model>`
and uses the same concepts throughout. The intermediate file formats and APIs
described here provide a bridge between this model and existing data sources. For
convenience, we provide a brief description of concepts needed for importing
@@ -183,10 +186,8 @@ paths, "path compression".

The copying path for each ancestor then describes its ancestry at every
point in the sequence: from a genealogical perspective, we know its
parent node. This information is encoded precisely as an `edge
<https://tskit.readthedocs.io/en/latest/data-model.html#edge-table>`_ in a
`tree sequence
<https://tskit.readthedocs.io/en/latest/data-model.html>`_.
parent node. This information is encoded precisely as an :ref:`edge
<sec_edge_table_definition>` in a :ref:`tree sequence <sec_data_model>`.
Thus, we refer to the output of this step as the "ancestors tree sequence",
which is conventionally stored in a file ending with ``.ancestors.trees``.
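
As a rough illustration (the file name here is hypothetical), the copying
paths can be inspected directly through the edges of the stored tree
sequence::

    import tskit

    ancestors_ts = tskit.load("example.ancestors.trees")
    for edge in ancestors_ts.edges():
        # Each edge records the parent ancestor that a child ancestor copies
        # from over the half-open genomic interval [left, right).
        print(edge.child, "copies from", edge.parent, "over", (edge.left, edge.right))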

@@ -204,18 +205,14 @@ The final phase of a ``tsinfer`` inference consists of a number of steps:
way as for ancestors, and the path compression algorithm can be equally
applied here.


2. As we only use a subset of the available sites for inference
(excluding by default any sites that are fixed or singletons)
we then place mutations on the inferred trees in order to
represent the information at these sites. This is done using the tskit
`map_mutations <https://tskit.readthedocs.io/en/latest/python-api.html#tskit.Tree.map_mutations>`_.
method.

represent the information at these sites. This is done using
:meth:`tskit.Tree.map_mutations`.

3. Remove ancestral paths that do not lead to any of the samples by
`simplifying
<https://tskit.readthedocs.io/en/latest/python-api.html#tskit.TreeSequence.simplify>`_
:meth:`simplifying <tskit.TreeSequence.simplify>`
the final tree sequence. When simplifying, we keep non-branching ("unary")
nodes, as they indicate ancestors which we have actively inferred, and
for technical reasons keeping unary ancestors can also lead to better
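
As a rough sketch of steps 2 and 3 (illustrative only; the tree sequence and
the site used here are simulated stand-ins rather than tsinfer output)::

    import msprime

    ts = msprime.simulate(sample_size=5, length=10, random_seed=42)
    # A site that was excluded from inference: genotypes index into `alleles`.
    genotypes = [0, 1, 1, 0, 0]
    alleles = ["A", "G"]

    # Step 2: tskit's parsimony routine returns an ancestral state and a
    # minimal set of mutations explaining the genotypes on the local tree.
    tree = ts.at(2.5)
    ancestral_state, mutations = tree.map_mutations(genotypes, alleles)

    # Step 3: simplify with respect to the samples, keeping unary nodes so
    # that actively inferred ancestors are retained.
    simplified = ts.simplify(samples=ts.samples(), keep_unary=True)
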
108 changes: 69 additions & 39 deletions lib/ancestor_builder.c
@@ -238,45 +238,57 @@ ancestor_builder_compute_ancestral_states(ancestor_builder_t *self,

genotypes = self->sites[l].genotypes;
ones = 0;
zeros = 0;
for (j = 0; j < sample_set_size; j++) {
ones += (size_t) genotypes[sample_set[j]];
switch (genotypes[sample_set[j]]) {
case 0:
zeros++;
break;
case 1:
ones++;
break;
}
}
zeros = sample_set_size - ones;
consensus = 0;
if (ones >= zeros) {
consensus = 1;
}
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
/* fflush(stdout); */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
if (disagree[u] && genotypes[u] != consensus) {
/* This sample has disagreed with consensus twice in a row,
* so remove it */
/* printf("\t\tremoving %d\n", sample_set[j]); */
sample_set[j] = -1;
if (ones + zeros > 0) {
/* We have valid (non-missing) data in the tracked samples at site l */
if (ones >= zeros) {
consensus = 1;
}
}
/* Repack the sample set */
tmp_size = 0;
for (j = 0; j < sample_set_size; j++) {
if (sample_set[j] != -1) {
sample_set[tmp_size] = sample_set[j];
tmp_size++;
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
/* fflush(stdout); */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
if (disagree[u] && (genotypes[u] != consensus) &&
(genotypes[u] != TSK_MISSING_DATA)) {
/* This sample has disagreed with consensus twice in a row,
* so remove it */
/* printf("\t\tremoving %d\n", sample_set[j]); */
sample_set[j] = -1;
}
}
/* Repack the sample set */
tmp_size = 0;
for (j = 0; j < sample_set_size; j++) {
if (sample_set[j] != -1) {
sample_set[tmp_size] = sample_set[j];
tmp_size++;
}
}
sample_set_size = tmp_size;
if (sample_set_size <= min_sample_set_size) {
/* printf("BREAK\n"); */
break;
}
/* For the remaining sample set, set the disagree flags based
* on whether they agree with the consensus for this site. */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
disagree[u] = ((genotypes[u] != consensus) &&
(genotypes[u] != TSK_MISSING_DATA));
}
}
sample_set_size = tmp_size;
if (sample_set_size <= min_sample_set_size) {
/* printf("BREAK\n"); */
break;
}
ancestor[l] = consensus;
/* For the remaining sample set, set the disagree flags based
* on whether they agree with the consensus for this site. */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
disagree[u] = genotypes[u] != consensus;
}
}
}
*last_site_ret = last_site;
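
A rough Python rendering of the updated consensus rule (illustrative only, not
part of the library): missing genotypes are ignored when counting, and the
consensus stays 0 when every tracked sample is missing at the site.

    MISSING = -1  # mirrors the TSK_MISSING_DATA sentinel

    def site_consensus(genotypes, sample_set):
        ones = sum(1 for u in sample_set if genotypes[u] == 1)
        zeros = sum(1 for u in sample_set if genotypes[u] == 0)
        if ones + zeros > 0 and ones >= zeros:
            return 1
        return 0
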
@@ -312,12 +324,22 @@ ancestor_builder_compute_between_focal_sites(ancestor_builder_t *self,
/* } */
genotypes = self->sites[l].genotypes;
ones = 0;
zeros = 0;
for (k = 0; k < sample_set_size; k++) {
ones += (size_t) genotypes[sample_set[k]];
switch (genotypes[sample_set[k]]) {
case 0:
zeros++;
break;
case 1:
ones++;
break;
}
}
zeros = sample_set_size - ones;
if (ones >= zeros) {
ancestor[l] = 1;
if (ones + zeros > 0) {
/* We have valid (non-missing) data at this site */
if (ones >= zeros) {
ancestor[l] = 1;
}
}
}
}
@@ -445,15 +467,23 @@ ancestor_builder_break_ancestor(ancestor_builder_t *self, tsk_id_t a,
{
bool ret = false;
tsk_id_t j, k;
size_t ones;
size_t ones, missing;

for (j = a + 1; j < b && !ret; j++) {
if (self->sites[j].time > self->sites[a].time) {
ones = 0;
missing = 0;
for (k = 0; k < (tsk_id_t) num_samples; k++) {
ones += (size_t) self->sites[j].genotypes[samples[k]];
switch (self->sites[j].genotypes[samples[k]]) {
case TSK_MISSING_DATA:
missing++;
break;
case 1:
ones++;
break;
}
}
if (ones != num_samples && ones != 0) {
if (ones != (num_samples - missing) && ones != 0) {
ret = true;
}
}
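
Stated as a small Python sketch (illustrative only, not part of the library):
an older site lying between the focal sites breaks the ancestor only if it is
polymorphic among the non-missing genotypes of the tracked samples.

    MISSING = -1  # mirrors the TSK_MISSING_DATA sentinel

    def splits_ancestor(genotypes, samples):
        calls = [genotypes[u] for u in samples if genotypes[u] != MISSING]
        ones = sum(calls)
        # Polymorphic among the non-missing calls: some, but not all, are 1.
        return 0 < ones < len(calls)
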
5 changes: 4 additions & 1 deletion lib/tsinfer.h
@@ -1,5 +1,5 @@
/*
** Copyright (C) 2018 University of Oxford
** Copyright (C) 2018-2020 University of Oxford
**
** This file is part of tsinfer.
**
@@ -27,6 +27,9 @@
#include "object_heap.h"
#include "avl.h"

/* TODO remove this when we update tskit version. */
#define TSK_MISSING_DATA (-1)

/* NULL_LIKELIHOOD represents a compressed path and NONZERO_ROOT_LIKELIHOOD
* marks a node that is not in the current tree. */
#define NULL_LIKELIHOOD (-1)
2 changes: 1 addition & 1 deletion requirements/conda-minimal.txt
@@ -3,7 +3,7 @@ numpy
six
tqdm
humanize
tskit
tskit>=0.2.3
msprime
zarr
lmdb
58 changes: 47 additions & 11 deletions tests/test_formats.py
@@ -514,17 +514,16 @@ def test_provenance(self):

def test_variant_errors(self):
input_file = formats.SampleData(sequence_length=10)
genotypes = np.zeros(2, np.int8)
genotypes = [0, 0]
input_file.add_site(0, alleles=["0", "1"], genotypes=genotypes)
for bad_position in [-1, 10, 100]:
self.assertRaises(
ValueError, input_file.add_site, position=bad_position,
alleles=["0", "1"], genotypes=genotypes)
for bad_genotypes in [[0, 2], [-1, 0], [], [0], [0, 0, 0]]:
genotypes = np.array(bad_genotypes, dtype=np.int8)
for bad_genotypes in [[0, 2], [-2, 0], [], [0], [0, 0, 0]]:
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "1"], genotypes=genotypes)
alleles=["0", "1"], genotypes=bad_genotypes)
self.assertRaises(
ValueError, input_file.add_site, position=1, inference=True,
alleles=["0", "1", "2"], genotypes=[0, 1])
@@ -533,28 +532,41 @@ def test_variant_errors(self):
alleles=["0"], genotypes=[0, 1])
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "1"], genotypes=np.array([0, 2], dtype=np.int8))
alleles=["0", "1"], genotypes=[0, 2])
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "0"], genotypes=np.array([0, 2], dtype=np.int8))
alleles=["0", "0"], genotypes=[0, 2])

def test_invalid_inference_sites(self):
# Trying to add singletons or fixed sites as inference sites
# raises an error
input_file = formats.SampleData()
# Make sure this is OK
input_file.add_site(0, [0, 1, 1], inference=True)
input_file.add_site(0, [0, 1, 1, tskit.MISSING_DATA], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[0, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[0, 0, 0], inference=True)
position=1, genotypes=[1, 1, 1, 1], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 0, 0], inference=True)
position=1, genotypes=[tskit.MISSING_DATA, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 1, 1], inference=True)
position=1, genotypes=[tskit.MISSING_DATA, 1, 1, 1], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[tskit.MISSING_DATA, 0, 1, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[tskit.MISSING_DATA] * 4, inference=True)
# Check we can still add at pos 1
input_file.add_site(
position=1, genotypes=[1, 0, 1], inference=True)
position=1, genotypes=[1, 0, 1, tskit.MISSING_DATA], inference=True)

def test_duplicate_sites(self):
# Duplicate sites are not accepted.
@@ -1146,6 +1158,30 @@ def test_too_many_alleles(self):
sample_data.add_site(
0, [0, 0], alleles=[str(x) for x in range(num_alleles)])

def test_missing_data(self):
u = tskit.MISSING_DATA
sites_by_samples = np.array([
[u, u, u, 1, 1, 0, 1, 1, 1],
[u, u, u, 1, 1, 0, 1, 1, 0],
[u, u, u, 1, 0, 1, 1, 0, 1],
[u, 0, 0, 1, 1, 1, 1, u, u],
[u, 0, 1, 1, 1, 0, 1, u, u],
[u, 1, 1, 0, 0, 0, 0, u, u]
], dtype=np.int8)
with tsinfer.SampleData() as data:
for col in range(sites_by_samples.shape[1]):
data.add_site(col, sites_by_samples[:, col])

self.assertEqual(data.sequence_length, 9.0)
self.assertEqual(data.num_sites, 9)
# First site is entirely missing, the second is a singleton with missing data =>
# neither should be marked for inference
inference_sites = data.sites_inference[:]
self.assertEqual(inference_sites[0], 0) # Entirely missing data
self.assertEqual(inference_sites[1], 0) # Singleton with missing data
for i in inference_sites[2:]:
self.assertEqual(i, 1)


class TestAncestorData(unittest.TestCase, DataContainerMixin):
"""