Allow sample missing data

hyanwong · hyanwong · commit 9c26fc0abc29 · 2019-08-09T17:27:07.000+01:00
diff --git a/tests/test_formats.py b/tests/test_formats.py
@@ -241,47 +241,59 @@ def test_provenance(self):
 
     def test_variant_errors(self):
         input_file = formats.SampleData(sequence_length=10)
-        genotypes = np.zeros(2, np.int8)
+        genotypes = [0, 0]
         input_file.add_site(0, alleles=["0", "1"], genotypes=genotypes)
+        self.assertRaises(
+            ValueError, input_file.add_site, position=1,
+            alleles=["0", "1", "2"], genotypes=genotypes)
         for bad_position in [-1, 10, 100]:
             self.assertRaises(
                 ValueError, input_file.add_site, position=bad_position,
                 alleles=["0", "1"], genotypes=genotypes)
-        for bad_genotypes in [[0, 2], [-1, 0], [], [0], [0, 0, 0]]:
-            genotypes = np.array(bad_genotypes, dtype=np.int8)
+        for bad_genotypes in [[0, 2], [-2, 0], [], [0], [0, 0, 0]]:
             self.assertRaises(
                 ValueError, input_file.add_site, position=1,
-                alleles=["0", "1"], genotypes=genotypes)
+                alleles=["0", "1"], genotypes=bad_genotypes)
         self.assertRaises(
             ValueError, input_file.add_site, position=1,
-            alleles=["0", "1", "2"], genotypes=np.zeros(2, dtype=np.int8))
+            alleles=["0"], genotypes=[0, 1])
         self.assertRaises(
             ValueError, input_file.add_site, position=1,
-            alleles=["0"], genotypes=np.array([0, 1], dtype=np.int8))
+            alleles=["0", "1"], genotypes=[0, 2])
         self.assertRaises(
             ValueError, input_file.add_site, position=1,
-            alleles=["0", "1"], genotypes=np.array([0, 2], dtype=np.int8))
-        self.assertRaises(
-            ValueError, input_file.add_site, position=1,
-            alleles=["0", "0"], genotypes=np.array([0, 2], dtype=np.int8))
+            alleles=["0", "0"], genotypes=[0, 2])
 
     def test_invalid_inference_sites(self):
         # Trying to add singletons or fixed sites as inference sites
         # raise and error
         input_file = formats.SampleData()
         # Make sure this is OK
-        input_file.add_site(0, [0, 1, 1], inference=True)
+        input_file.add_site(0, [0, 1, 1, tskit.MISSING_DATA], inference=True)
+        self.assertRaises(
+            ValueError, input_file.add_site,
+            position=1, genotypes=[0, 0, 0, 0], inference=True)
         self.assertRaises(
             ValueError, input_file.add_site,
-            position=1, genotypes=[0, 0, 0], inference=True)
+            position=1, genotypes=[1, 0, 0, 0], inference=True)
         self.assertRaises(
             ValueError, input_file.add_site,
-            position=1, genotypes=[1, 0, 0], inference=True)
+            position=1, genotypes=[1, 1, 1, 1], inference=True)
         self.assertRaises(
             ValueError, input_file.add_site,
-            position=1, genotypes=[1, 1, 1], inference=True)
+            position=1, genotypes=[tskit.MISSING_DATA, 0, 0, 0], inference=True)
+        self.assertRaises(
+            ValueError, input_file.add_site,
+            position=1, genotypes=[tskit.MISSING_DATA, 1, 1, 1], inference=True)
+        self.assertRaises(
+            ValueError, input_file.add_site,
+            position=1, genotypes=[tskit.MISSING_DATA, 0, 1, 0], inference=True)
+        self.assertRaises(
+            ValueError, input_file.add_site,
+            position=1, genotypes=[tskit.MISSING_DATA] * 4, inference=True)
+        # Check we can still add at pos 1
         input_file.add_site(
-            position=1, genotypes=[1, 0, 1], inference=True)
+            position=1, genotypes=[1, 0, 1, tskit.MISSING_DATA], inference=True)
 
     def test_duplicate_sites(self):
         # Duplicate sites are not accepted.
@@ -770,6 +782,30 @@ def test_sequence_length(self):
         data.finalise()
         self.assertEqual(data.sequence_length, 1)
 
+    def test_missing_data(self):
+        u = tskit.MISSING_DATA
+        sites_by_samples = np.array([
+            [u, u, u, 1, 1, 0, 1, 1, 1],
+            [u, u, u, 1, 1, 0, 1, 1, 0],
+            [u, u, u, 1, 0, 1, 1, 0, 1],
+            [u, 0, 0, 1, 1, 1, 1, u, u],
+            [u, 0, 1, 1, 1, 0, 1, u, u],
+            [u, 1, 1, 0, 0, 0, 0, u, u]
+            ], dtype=np.int8)
+        with tsinfer.SampleData() as data:
+            for col in range(sites_by_samples.shape[1]):
+                data.add_site(col, sites_by_samples[:, col])
+
+        self.assertEqual(data.sequence_length, 9.0)
+        self.assertEqual(data.num_sites, 9)
+        # First site is entirely missing, second is a singleton with missing data =>
+        # neither should be marked for inference
+        inference_sites = data.sites_inference[:]
+        self.assertEqual(inference_sites[0], 0)  # Entirely missing data
+        self.assertEqual(inference_sites[1], 0)  # Singleton with missing data
+        for i in inference_sites[2:]:
+            self.assertEqual(i, 1)
+
 
 class TestAncestorData(unittest.TestCase, DataContainerMixin):
     """
diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -615,8 +615,9 @@ def verify_inserted_ancestors(self, ts):
         tsinfer.build_simulated_ancestors(sample_data, ancestor_data, ts)
         ancestor_data.finalise()
 
-        A = np.zeros(
-            (ancestor_data.num_sites, ancestor_data.num_ancestors), dtype=np.uint8)
+        A = np.full(
+            (ancestor_data.num_sites, ancestor_data.num_ancestors), tskit.MISSING_DATA,
+            dtype=np.int8)
         start = ancestor_data.ancestors_start[:]
         end = ancestor_data.ancestors_end[:]
         ancestors = ancestor_data.ancestors_haplotype[:]
@@ -1922,3 +1923,114 @@ def verify_example(self, full_subset, samples, ancestors, path_compression):
             self.assertEqual(expected_sample_ancestors, num_sample_ancestors)
             tsinfer.verify(samples, final_ts.simplify())
             ancestors_ts = augmented_ancestors
+
+
+class TestMissingSampleDataInference(unittest.TestCase):
+    """
+    Test that we can infer sites with tskit.MISSING_DATA, using both the PY and C engines
+    """
+    def test_missing_haplotypes(self):
+        u = tskit.MISSING_DATA
+        sites_by_samples = np.array([
+            [u, u, u, u],
+            [u, 1, 0, u],
+            [u, 0, 1, u],
+            [u, 1, 1, u]
+            ], dtype=np.int8)
+        with tsinfer.SampleData() as sample_data:
+            for col in range(sites_by_samples.shape[1]):
+                sample_data.add_site(col, sites_by_samples[:, col])
+        ts = tsinfer.infer(sample_data)
+        self.assertTrue(np.all(sites_by_samples == ts.genotype_matrix().T))
+
+    def test_small_truncated_fragments(self):
+        u = tskit.MISSING_DATA
+        sites_by_samples = np.array([
+            [u, u, u, 1, 1, 0, 1, 1, 1, u],
+            [u, u, u, 1, 0, 0, 1, 1, 0, u],
+            [u, u, u, 1, 0, 1, 1, 0, 1, u],
+            [u, 0, 0, 1, 0, 1, 1, u, u, u],
+            [u, 0, 1, 1, 0, 0, 1, u, u, u],
+            [u, 1, 1, 0, 0, 0, 0, u, u, u]
+            ], dtype=np.int8)
+        with tsinfer.SampleData() as sample_data:
+            for col in range(sites_by_samples.shape[1]):
+                sample_data.add_site(col, sites_by_samples[:, col])
+        for e in [tsinfer.PY_ENGINE, tsinfer.C_ENGINE]:
+            ancestors = tsinfer.generate_ancestors(sample_data, engine=e)
+            ancestors_ts = tsinfer.match_ancestors(
+                sample_data, ancestors, engine=e, extended_checks=True)
+            ts = tsinfer.match_samples(
+                sample_data, ancestors_ts, engine=e, extended_checks=True)
+            self.assertTrue(1.0 in list(ts.breakpoints()))  # End of lft unknown region
+            self.assertTrue(3.0 in list(ts.breakpoints()))  # End of 1st unknown batch
+            self.assertTrue(7.0 in list(ts.breakpoints()))  # Start of 2nd unknown batch
+            self.assertTrue(9.0 in list(ts.breakpoints()))  # Start of rgt unknown region
+            for tree in ts.trees():
+                for s in ts.samples():
+                    if tree.interval[1] <= 1:
+                        self.assertTrue(tree.parent(s) == tskit.NULL)
+                    elif tree.interval[1] <= 3:
+                        if s in [0, 1, 2]:
+                            self.assertTrue(tree.parent(s) == tskit.NULL)
+                        else:
+                            self.assertTrue(tree.parent(s) != tskit.NULL)
+                    elif tree.interval[0] >= 9:
+                        self.assertTrue(tree.parent(s) == tskit.NULL)
+                    elif tree.interval[0] >= 7:
+                        if s in [3, 4, 5]:
+                            self.assertTrue(tree.parent(s) == tskit.NULL)
+                        else:
+                            self.assertTrue(tree.parent(s) != tskit.NULL)
+            # Verify genotypes for each engine, not only the last one run
+            self.assertTrue(np.all(sites_by_samples == ts.genotype_matrix().T))
+
+    def test_large_truncated_fragments(self):
+        def truncate_ts_samples(ts, average_span, random_seed, min_span=5):
+            """
+            Create a tree sequence that has sample nodes which have been truncated
+            so that they span only a small region of the genome. The length of the
+            truncated spans is given by a poisson distribution whose mean is average_span
+            but which cannot go below a fixed min_span, or above the sequence_length
+
+            Samples are truncated by removing the edges that connect them to the rest
+            of the tree.
+            """
+            np.random.seed(random_seed)
+            # Make a list of (left,right) tuples giving the new limits of each sample
+            # Keyed by sample ID.
+            keep = {}
+            # For simplicity, pick span lengths from a Poisson with mean average_span
+            for sample_id, span in zip(
+                    ts.samples(), np.random.poisson(average_span, ts.num_samples)):
+                span = max(span, min_span)
+                span = min(span, ts.sequence_length)
+                start = np.random.uniform(0, ts.sequence_length-span)
+                keep[sample_id] = (start, start+span)
+
+            tables = ts.dump_tables()
+            tables.edges.clear()
+            for e in ts.tables.edges:
+                if e.child not in keep:
+                    left, right = e.left, e.right
+                else:
+                    if e.right <= keep[e.child][0] or e.left >= keep[e.child][1]:
+                        continue  # this edge is outside the focal region
+                    else:
+                        left = max(e.left, keep[e.child][0])
+                        right = min(e.right, keep[e.child][1])
+                tables.edges.add_row(left, right, e.parent, e.child)
+            return tables.tree_sequence()
+
+        ts = msprime.simulate(
+            100, Ne=1e2, length=400, recombination_rate=1e-4, mutation_rate=2e-4,
+            random_seed=1)
+        truncated_ts = truncate_ts_samples(ts, average_span=200, random_seed=123)
+        sd = tsinfer.SampleData.from_tree_sequence(truncated_ts, use_times=False)
+        # Cannot use the normal `simplify` as this removes parts of the TS where only
+        # one sample is connected to the root (& the other samples have missing data)
+        ts_inferred = tsinfer.infer(sd, simplify=False)
+        # Instead we run simplify explicitly, with `keep_unary=True`
+        ts_inferred = ts_inferred.simplify(filter_sites=False, keep_unary=True)
+        self.assertTrue(
+            np.all(ts_inferred.genotype_matrix() == truncated_ts.genotype_matrix()))
diff --git a/tsinfer/algorithm.py b/tsinfer/algorithm.py
@@ -140,8 +140,14 @@ def ancestor_descriptors(self):
 
     def compute_ancestral_states(self, a, focal_site, sites):
         """
-        Together with make_ancestor, this is the main algorithm as implemented in Fig S2
-        of the preprint, with the buffer.
+        For a given focal site, and set of sites to fill in (usually all the ones
+        leftwards or rightwards), augment the haplotype array a with the inferred sites.
+        Together with `make_ancestor`, which calls this function, this describes the main
+        algorithm as implemented in Fig S2 of the preprint, with the buffer.
+
+        TODO - account for tskit.MISSING_DATA in samples (e.g. when encountered in
+        the remove_buffer we should keep the sample in the buffer until we know that
+        there is a conflict, rather than clear the remove buffer on every iteration)
         """
         focal_age = self.sites[focal_site].age
         S = set(np.where(self.sites[focal_site].genotypes == 1)[0])
@@ -154,8 +160,8 @@ def compute_ancestral_states(self, a, focal_site, sites):
             last_site = l
             if self.sites[l].age > focal_age:
                 g_l = self.sites[l].genotypes
-                ones = sum(g_l[u] for u in S)
-                zeros = len(S) - ones
+                ones = sum(g_l[u] == 1 for u in S)
+                zeros = sum(g_l[u] == 0 for u in S)
                 # print("\tsite", l, ones, zeros, sep="\t")
                 consensus = 0
                 if ones >= zeros:
@@ -174,6 +180,7 @@ def compute_ancestral_states(self, a, focal_site, sites):
                     if g_l[u] != consensus:
                         remove_buffer.append(u)
                 a[l] = consensus
+        assert a[last_site] != tskit.MISSING_DATA
         return last_site
 
     def make_ancestor(self, focal_sites, a):
@@ -189,27 +196,34 @@ def make_ancestor(self, focal_sites, a):
         for focal_site in focal_sites:
             a[focal_site] = 1
         S = set(np.where(self.sites[focal_sites[0]].genotypes == 1)[0])
+        # Interpolate ancestral haplotype within focal region (i.e. region
+        #  spanning from leftmost to rightmost focal site)
         for j in range(len(focal_sites) - 1):
+            # Interpolate region between focal site j and focal site j+1
             for l in range(focal_sites[j] + 1, focal_sites[j + 1]):
                 a[l] = 0
                 if self.sites[l].age > focal_age:
                     g_l = self.sites[l].genotypes
-                    ones = sum(g_l[u] for u in S)
-                    zeros = len(S) - ones
+                    ones = sum(g_l[u] == 1 for u in S)
+                    zeros = sum(g_l[u] == 0 for u in S)
                     # print("\t", l, ones, zeros, sep="\t")
-                    if ones >= zeros:
+                    if ones >= zeros:  # Should probably be "ones > zeros" (see below)
+                        # Since this site should be older, this is a conflict
+                        # We just take the majority rule. If equal, we assume that
+                        # the derived variant is more likely (this is probably wrong)
+                        # (we could possibly do something more sophisticated for ancient
+                        #  samples by taking into account the sample age)
                         a[l] = 1
-        # Go rightwards
+        # Extend ancestral haplotype rightwards from rightmost focal site
         focal_site = focal_sites[-1]
         last_site = self.compute_ancestral_states(
                 a, focal_site, range(focal_site + 1, self.num_sites))
         assert a[last_site] != tskit.MISSING_DATA
         end = last_site + 1
-        # Go leftwards
+        # Extend ancestral haplotype leftwards from leftmost focal site
         focal_site = focal_sites[0]
         last_site = self.compute_ancestral_states(
                 a, focal_site, range(focal_site - 1, -1, -1))
-        assert a[last_site] != tskit.MISSING_DATA
         start = last_site
         return start, end
 
@@ -386,7 +400,7 @@ def update_node_time(self, child_id, pc_parent_id):
             edge = edge.next
         assert min_parent_time >= 0
         assert min_parent_time <= self.time[0]
-        # For the asserttion to be violated we would need to have 64K pc
+        # For the assertion to be violated we would need to have 64K pc
         # ancestors sequentially copying from each other.
         self.time[pc_parent_id] = min_parent_time - (1 / 2**16)
         assert self.time[pc_parent_id] > self.time[child_id]
@@ -553,6 +567,10 @@ def dump_nodes(self):
         return flags, time
 
     def dump_edges(self):
+        """
+        Return all the edges, in path order (such that all edges for a child are gathered
+        together, and the edges for this child are always listed from left to right)
+        """
         left = np.zeros(self.num_edges, dtype=np.int32)
         right = np.zeros(self.num_edges, dtype=np.int32)
         parent = np.zeros(self.num_edges, dtype=np.int32)
diff --git a/tsinfer/formats.py b/tsinfer/formats.py
@@ -1121,12 +1121,17 @@ def add_site(
 
         if alleles is None:
             alleles = ["0", "1"]
-        if len(alleles) > 2:
-            raise ValueError("Only biallelic sites supported")
+        if len(set(alleles) - set([None])) > 2:
+            raise ValueError("Only biallelic sites supported: {}".format(alleles))
         if len(set(alleles)) != len(alleles):
             raise ValueError("Alleles must be distinct")
-        if np.any(genotypes >= len(alleles)) or np.any(genotypes < 0):
-            raise ValueError("Genotypes values must be between 0 and len(alleles) - 1")
+        # Check we can never confuse a real allele with the value for MISSING_DATA
+        assert not (0 <= tskit.MISSING_DATA <= len(alleles))
+        if np.any(np.logical_and(genotypes < 0, genotypes != tskit.MISSING_DATA)):
+            raise ValueError("Non-missing values for genotypes cannot be negative")
+        if np.any(np.logical_and(
+                genotypes >= len(alleles), genotypes != tskit.MISSING_DATA)):
+            raise ValueError("Non-missing values for genotypes must be < len(alleles)")
         if genotypes.shape != (self.num_samples,):
             raise ValueError("Must have num_samples genotypes.")
         if position < 0:
@@ -1136,8 +1141,12 @@ def add_site(
         if position <= self._last_position:
             raise ValueError(
                 "Sites positions must be unique and added in increasing order")
-        count = np.sum(genotypes)
-        if count > 1 and count < self.num_samples:
+
+        n_known = np.sum(genotypes != tskit.MISSING_DATA)
+        n_unknown = self.num_samples - n_known
+        n_ancestral = np.sum(genotypes == 0)
+        n_derived = n_known - n_ancestral
+        if n_derived > 1 and n_derived < n_known:
             if inference is None:
                 inference = True
         else:
@@ -1147,7 +1156,8 @@ def add_site(
                 raise ValueError(
                     "Cannot specify singletons or fixed sites for inference")
         if age is None:
-            age = count
+            age = n_derived
+            age += n_unknown/2.0  # Slight hack: unknown alleles create intermediate age
         site_id = self._sites_writer.add(
             position=position, genotypes=genotypes,
             metadata=self._check_metadata(metadata),
diff --git a/tsinfer/inference.py b/tsinfer/inference.py