25 changes: 11 additions & 14 deletions docs/inference.rst
@@ -23,10 +23,13 @@ Data requirements
Input haplotype data for tsinfer must satisfy the following requirements:

- Data must be *phased*.
- For each site used for inference we must know the *ancestral state*. Note that this is
not necessarily the same as the REF column from VCF, and ancestral
- For each site used for inference we must know the *ancestral state*. Note that
this is not necessarily the same as the REF column from VCF, and ancestral
states must be computed using some other method.
- Only biallelic sites can be used for inference.
- Missing data can be included in the haplotypes using the value
``tskit.MISSING_DATA`` (-1). The inferred tree sequence will have values
imputed for the haplotypes at these sites.
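
For example, a site with one missing call could be encoded as follows (a
minimal sketch; the position, alleles and genotypes are invented purely for
illustration)::

    import tskit
    import tsinfer

    with tsinfer.SampleData(sequence_length=10) as sample_data:
        # The third haplotype has no call at this site; tskit.MISSING_DATA is -1.
        sample_data.add_site(
            position=5, genotypes=[0, 1, tskit.MISSING_DATA], alleles=["A", "T"])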


.. _sec_inference_data_model:
@@ -36,7 +39,7 @@ Data model
**********

The data model for ``tsinfer`` is tightly integrated with
``tskit``'s `data model <https://tskit.readthedocs.io/en/latest/data-model.html>`_
``tskit``'s :ref:`data model <sec_data_model>`
and uses the same concepts throughout. The intermediate file formats and APIs
described here provide a bridge between this model and existing data sources. For
convenience, we provide a brief description of concepts needed for importing
@@ -183,10 +186,8 @@ paths, "path compression".

The copying path for each ancestor then describes its ancestry at every
point in the sequence: from a genealogical perspective, we know its
parent node. This information is encoded precisely as an `edge
<https://tskit.readthedocs.io/en/latest/data-model.html#edge-table>`_ in a
`tree sequence
<https://tskit.readthedocs.io/en/latest/data-model.html>`_.
parent node. This information is encoded precisely as an :ref:`edge
<sec_edge_table_definition>` in a :ref:`tree sequence <sec_data_model>`.
Thus, we refer to the output of this step as the "ancestors tree sequence",
which is conventionally stored in a file ending with ``.ancestors.trees``.
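
As a rough illustration (the file name here is hypothetical), the copying
paths can be inspected directly through the edges of the stored tree
sequence::

    import tskit

    ancestors_ts = tskit.load("example.ancestors.trees")
    for edge in ancestors_ts.edges():
        # Each edge records the parent ancestor that a child ancestor copies
        # from over the half-open genomic interval [left, right).
        print(edge.child, "copies from", edge.parent, "over", (edge.left, edge.right))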

@@ -204,18 +205,14 @@ The final phase of a ``tsinfer`` inference consists of a number of steps:
way as for ancestors, and the path compression algorithm can be equally
applied here.


2. As we only use a subset of the available sites for inference
(excluding by default any sites that are fixed or singletons)
we then place mutations on the inferred trees in order to
represent the information at these sites. This is done using the tskit
`map_mutations <https://tskit.readthedocs.io/en/latest/python-api.html#tskit.Tree.map_mutations>`_.
method.

represent the information at these sites. This is done using
:meth:`tskit.Tree.map_mutations`.

3. Remove ancestral paths that do not lead to any of the samples by
`simplifying
<https://tskit.readthedocs.io/en/latest/python-api.html#tskit.TreeSequence.simplify>`_
:meth:`simplifying <tskit.TreeSequence.simplify>`
the final tree sequence. When simplifying, we keep non-branching ("unary")
nodes, as they indicate ancestors which we have actively inferred, and
for technical reasons keeping unary ancestors can also lead to better
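
As a rough sketch of steps 2 and 3 (illustrative only; the tree sequence and
the site used here are simulated stand-ins rather than tsinfer output)::

    import msprime

    ts = msprime.simulate(sample_size=5, length=10, random_seed=42)
    # A site that was excluded from inference: genotypes index into `alleles`.
    genotypes = [0, 1, 1, 0, 0]
    alleles = ["A", "G"]

    # Step 2: tskit's parsimony routine returns an ancestral state and a
    # minimal set of mutations explaining the genotypes on the local tree.
    tree = ts.at(2.5)
    ancestral_state, mutations = tree.map_mutations(genotypes, alleles)

    # Step 3: simplify with respect to the samples, keeping unary nodes so
    # that actively inferred ancestors are retained.
    simplified = ts.simplify(samples=ts.samples(), keep_unary=True)
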
108 changes: 69 additions & 39 deletions lib/ancestor_builder.c
@@ -238,45 +238,57 @@ ancestor_builder_compute_ancestral_states(ancestor_builder_t *self,

genotypes = self->sites[l].genotypes;
ones = 0;
zeros = 0;
for (j = 0; j < sample_set_size; j++) {
ones += (size_t) genotypes[sample_set[j]];
switch (genotypes[sample_set[j]]) {
case 0:
zeros++;
break;
case 1:
ones++;
break;
}
}
zeros = sample_set_size - ones;
consensus = 0;
if (ones >= zeros) {
consensus = 1;
}
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
/* fflush(stdout); */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
if (disagree[u] && genotypes[u] != consensus) {
/* This sample has disagreed with consensus twice in a row,
* so remove it */
/* printf("\t\tremoving %d\n", sample_set[j]); */
sample_set[j] = -1;
if (ones + zeros > 0) {
/* We have valid (non-missing) data in the tracked samples at site l */
if (ones >= zeros) {
consensus = 1;
}
}
/* Repack the sample set */
tmp_size = 0;
for (j = 0; j < sample_set_size; j++) {
if (sample_set[j] != -1) {
sample_set[tmp_size] = sample_set[j];
tmp_size++;
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
/* fflush(stdout); */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
if (disagree[u] && (genotypes[u] != consensus) &&
(genotypes[u] != TSK_MISSING_DATA)) {
/* This sample has disagreed with consensus twice in a row,
* so remove it */
/* printf("\t\tremoving %d\n", sample_set[j]); */
sample_set[j] = -1;
}
}
/* Repack the sample set */
tmp_size = 0;
for (j = 0; j < sample_set_size; j++) {
if (sample_set[j] != -1) {
sample_set[tmp_size] = sample_set[j];
tmp_size++;
}
}
sample_set_size = tmp_size;
if (sample_set_size <= min_sample_set_size) {
/* printf("BREAK\n"); */
break;
}
/* For the remaining sample set, set the disagree flags based
* on whether they agree with the consensus for this site. */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
disagree[u] = ((genotypes[u] != consensus) &&
(genotypes[u] != TSK_MISSING_DATA));
}
}
sample_set_size = tmp_size;
if (sample_set_size <= min_sample_set_size) {
/* printf("BREAK\n"); */
break;
}
ancestor[l] = consensus;
/* For the remaining sample set, set the disagree flags based
* on whether they agree with the consensus for this site. */
for (j = 0; j < sample_set_size; j++) {
u = sample_set[j];
disagree[u] = genotypes[u] != consensus;
}
}
}
*last_site_ret = last_site;
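
A rough Python rendering of the updated consensus rule (illustrative only, not
part of the library): missing genotypes are ignored when counting, and the
consensus stays 0 when every tracked sample is missing at the site.

    MISSING = -1  # mirrors the TSK_MISSING_DATA sentinel

    def site_consensus(genotypes, sample_set):
        ones = sum(1 for u in sample_set if genotypes[u] == 1)
        zeros = sum(1 for u in sample_set if genotypes[u] == 0)
        if ones + zeros > 0 and ones >= zeros:
            return 1
        return 0
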
@@ -312,12 +324,22 @@ ancestor_builder_compute_between_focal_sites(ancestor_builder_t *self,
/* } */
genotypes = self->sites[l].genotypes;
ones = 0;
zeros = 0;
for (k = 0; k < sample_set_size; k++) {
ones += (size_t) genotypes[sample_set[k]];
switch (genotypes[sample_set[k]]) {
case 0:
zeros++;
break;
case 1:
ones++;
break;
}
}
zeros = sample_set_size - ones;
if (ones >= zeros) {
ancestor[l] = 1;
if (ones + zeros > 0) {
/* We have valid (non-missing) data at this site */
if (ones >= zeros) {
ancestor[l] = 1;
}
}
}
}
@@ -445,15 +467,23 @@ ancestor_builder_break_ancestor(ancestor_builder_t *self, tsk_id_t a,
{
bool ret = false;
tsk_id_t j, k;
size_t ones;
size_t ones, missing;

for (j = a + 1; j < b && !ret; j++) {
if (self->sites[j].time > self->sites[a].time) {
ones = 0;
missing = 0;
for (k = 0; k < (tsk_id_t) num_samples; k++) {
ones += (size_t) self->sites[j].genotypes[samples[k]];
switch (self->sites[j].genotypes[samples[k]]) {
case TSK_MISSING_DATA:
missing++;
break;
case 1:
ones++;
break;
}
}
if (ones != num_samples && ones != 0) {
if (ones != (num_samples - missing) && ones != 0) {
ret = true;
}
}
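
Stated as a small Python sketch (illustrative only, not part of the library):
an older site lying between the focal sites breaks the ancestor only if it is
polymorphic among the non-missing genotypes of the tracked samples.

    MISSING = -1  # mirrors the TSK_MISSING_DATA sentinel

    def splits_ancestor(genotypes, samples):
        calls = [genotypes[u] for u in samples if genotypes[u] != MISSING]
        ones = sum(calls)
        # Polymorphic among the non-missing calls: some, but not all, are 1.
        return 0 < ones < len(calls)
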
5 changes: 4 additions & 1 deletion lib/tsinfer.h
@@ -1,5 +1,5 @@
/*
** Copyright (C) 2018 University of Oxford
** Copyright (C) 2018-2020 University of Oxford
**
** This file is part of tsinfer.
**
@@ -27,6 +27,9 @@
#include "object_heap.h"
#include "avl.h"

/* TODO remove this when we update tskit version. */
#define TSK_MISSING_DATA (-1)

/* NULL_LIKELIHOOD represents a compressed path and NONZERO_ROOT_LIKELIHOOD
* marks a node that is not in the current tree. */
#define NULL_LIKELIHOOD (-1)
2 changes: 1 addition & 1 deletion requirements/conda-minimal.txt
@@ -3,7 +3,7 @@ numpy
six
tqdm
humanize
tskit
tskit>=0.2.3
msprime
zarr
lmdb
58 changes: 47 additions & 11 deletions tests/test_formats.py
@@ -514,17 +514,16 @@ def test_provenance(self):

def test_variant_errors(self):
input_file = formats.SampleData(sequence_length=10)
genotypes = np.zeros(2, np.int8)
genotypes = [0, 0]
input_file.add_site(0, alleles=["0", "1"], genotypes=genotypes)
for bad_position in [-1, 10, 100]:
self.assertRaises(
ValueError, input_file.add_site, position=bad_position,
alleles=["0", "1"], genotypes=genotypes)
for bad_genotypes in [[0, 2], [-1, 0], [], [0], [0, 0, 0]]:
genotypes = np.array(bad_genotypes, dtype=np.int8)
for bad_genotypes in [[0, 2], [-2, 0], [], [0], [0, 0, 0]]:
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "1"], genotypes=genotypes)
alleles=["0", "1"], genotypes=bad_genotypes)
self.assertRaises(
ValueError, input_file.add_site, position=1, inference=True,
alleles=["0", "1", "2"], genotypes=[0, 1])
@@ -533,28 +532,41 @@ def test_variant_errors(self):
alleles=["0"], genotypes=[0, 1])
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "1"], genotypes=np.array([0, 2], dtype=np.int8))
alleles=["0", "1"], genotypes=[0, 2])
self.assertRaises(
ValueError, input_file.add_site, position=1,
alleles=["0", "0"], genotypes=np.array([0, 2], dtype=np.int8))
alleles=["0", "0"], genotypes=[0, 2])

def test_invalid_inference_sites(self):
# Trying to add singletons or fixed sites as inference sites
# raises an error
input_file = formats.SampleData()
# Make sure this is OK
input_file.add_site(0, [0, 1, 1], inference=True)
input_file.add_site(0, [0, 1, 1, tskit.MISSING_DATA], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[0, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[0, 0, 0], inference=True)
position=1, genotypes=[1, 1, 1, 1], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 0, 0], inference=True)
position=1, genotypes=[tskit.MISSING_DATA, 0, 0, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[1, 1, 1], inference=True)
position=1, genotypes=[tskit.MISSING_DATA, 1, 1, 1], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[tskit.MISSING_DATA, 0, 1, 0], inference=True)
self.assertRaises(
ValueError, input_file.add_site,
position=1, genotypes=[tskit.MISSING_DATA] * 4, inference=True)
# Check we can still add at pos 1
input_file.add_site(
position=1, genotypes=[1, 0, 1], inference=True)
position=1, genotypes=[1, 0, 1, tskit.MISSING_DATA], inference=True)

def test_duplicate_sites(self):
# Duplicate sites are not accepted.
@@ -1146,6 +1158,30 @@ def test_too_many_alleles(self):
sample_data.add_site(
0, [0, 0], alleles=[str(x) for x in range(num_alleles)])

def test_missing_data(self):
u = tskit.MISSING_DATA
sites_by_samples = np.array([
[u, u, u, 1, 1, 0, 1, 1, 1],
[u, u, u, 1, 1, 0, 1, 1, 0],
[u, u, u, 1, 0, 1, 1, 0, 1],
[u, 0, 0, 1, 1, 1, 1, u, u],
[u, 0, 1, 1, 1, 0, 1, u, u],
[u, 1, 1, 0, 0, 0, 0, u, u]
], dtype=np.int8)
with tsinfer.SampleData() as data:
for col in range(sites_by_samples.shape[1]):
data.add_site(col, sites_by_samples[:, col])

self.assertEqual(data.sequence_length, 9.0)
self.assertEqual(data.num_sites, 9)
# First site is entirely missing, the second is a singleton with missing data =>
# neither should be marked for inference
inference_sites = data.sites_inference[:]
self.assertEqual(inference_sites[0], 0) # Entirely missing data
self.assertEqual(inference_sites[1], 0) # Singleton with missing data
for i in inference_sites[2:]:
self.assertEqual(i, 1)


class TestAncestorData(unittest.TestCase, DataContainerMixin):
"""