Skip to content

Don't update sample flags in simplify #2663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions c/tests/test_trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -3323,6 +3323,36 @@ test_simplest_no_node_filter(void)
tsk_treeseq_free(&ts);
}

static void
test_simplest_no_update_flags(void)
{
const char *nodes = "0 0 0\n"
"1 0 0\n"
"0 1 0\n";
const char *edges = "0 1 2 0,1\n";
tsk_treeseq_t ts, simplified;
tsk_id_t sample_ids[] = { 0, 1 };
int ret;

tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

/* We have a mixture of sample and non-samples in the input tables */
ret = tsk_treeseq_simplify(
&ts, sample_ids, 2, TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS, &simplified, NULL);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
tsk_treeseq_free(&simplified);

ret = tsk_treeseq_simplify(&ts, sample_ids, 2,
TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS | TSK_SIMPLIFY_NO_FILTER_NODES, &simplified,
NULL);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
tsk_treeseq_free(&simplified);

tsk_treeseq_free(&ts);
}

static void
test_simplest_map_mutations(void)
{
Expand Down Expand Up @@ -8093,6 +8123,7 @@ main(int argc, char **argv)
{ "test_simplest_population_filter", test_simplest_population_filter },
{ "test_simplest_individual_filter", test_simplest_individual_filter },
{ "test_simplest_no_node_filter", test_simplest_no_node_filter },
{ "test_simplest_no_update_flags", test_simplest_no_update_flags },
{ "test_simplest_map_mutations", test_simplest_map_mutations },
{ "test_simplest_nonbinary_map_mutations",
test_simplest_nonbinary_map_mutations },
Expand Down
31 changes: 19 additions & 12 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -8808,16 +8808,18 @@ static tsk_id_t TSK_WARN_UNUSED
simplifier_record_node(simplifier_t *self, tsk_id_t input_id)
{
tsk_node_t node;
tsk_flags_t flags;
bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS);

tsk_node_table_get_row_unsafe(&self->input_tables.nodes, (tsk_id_t) input_id, &node);
/* Zero out the sample bit */
flags = node.flags & (tsk_flags_t) ~TSK_NODE_IS_SAMPLE;
if (self->is_sample[input_id]) {
flags |= TSK_NODE_IS_SAMPLE;
if (update_flags) {
/* Zero out the sample bit */
node.flags &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE;
if (self->is_sample[input_id]) {
node.flags |= TSK_NODE_IS_SAMPLE;
}
}
self->node_id_map[input_id] = (tsk_id_t) self->tables->nodes.num_rows;
return tsk_node_table_add_row(&self->tables->nodes, flags, node.time,
return tsk_node_table_add_row(&self->tables->nodes, node.flags, node.time,
node.population, node.individual, node.metadata, node.metadata_length);
}

Expand Down Expand Up @@ -9108,6 +9110,7 @@ simplifier_init_nodes(simplifier_t *self, const tsk_id_t *samples)
tsk_size_t j;
const tsk_size_t num_nodes = self->input_tables.nodes.num_rows;
bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES);
bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS);
tsk_flags_t *node_flags = self->tables->nodes.flags;
tsk_id_t *node_id_map = self->node_id_map;

Expand All @@ -9123,13 +9126,17 @@ simplifier_init_nodes(simplifier_t *self, const tsk_id_t *samples)
}
} else {
tsk_bug_assert(self->tables->nodes.num_rows == num_nodes);
/* The node table has not been changed */
for (j = 0; j < num_nodes; j++) {
/* Reset the sample flags */
node_flags[j] &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE;
if (self->is_sample[j]) {
node_flags[j] |= TSK_NODE_IS_SAMPLE;
if (update_flags) {
for (j = 0; j < num_nodes; j++) {
/* Reset the sample flags */
node_flags[j] &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE;
if (self->is_sample[j]) {
node_flags[j] |= TSK_NODE_IS_SAMPLE;
}
}
}

for (j = 0; j < num_nodes; j++) {
node_id_map[j] = (tsk_id_t) j;
}
}
Expand Down
23 changes: 20 additions & 3 deletions c/tskit/tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,10 @@ first.
*/
#define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7)
/**
Do not update the sample status of nodes as a result of simplification.
*/
#define TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS (1 << 8)
/**
Reduce the topological information in the tables to the minimum necessary to
represent the trees that contain sites. If there are zero sites this will
result in an zero output edges. When the number of sites is greater than zero,
Expand Down Expand Up @@ -3919,9 +3923,10 @@ or :c:macro:`TSK_NULL` if the node has been removed. Thus, ``node_map`` must be
of at least ``self->nodes.num_rows`` :c:type:`tsk_id_t` values.

If the `TSK_SIMPLIFY_NO_FILTER_NODES` option is specified, the node table will be
unaltered except for changing the sample status of nodes that were samples in the
input tables, but not in the specified list of sample IDs (if provided). The
``node_map`` (if specified) will always be the identity mapping, such that
unaltered except for changing the sample status of nodes (but see the
`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option below) and to update references
to other tables that may have changed as a result of filtering (see below).
The ``node_map`` (if specified) will always be the identity mapping, such that
``node_map[u] == u`` for all nodes. Note also that the order of the list of
samples is not important in this case.

Expand All @@ -3941,6 +3946,17 @@ sample status flag of nodes.
may be entirely unreferenced entities in the input tables, which
are not affected by whether we filter nodes or not.

By default, the node sample flags are updated by unsetting the
:c:macro:`TSK_NODE_IS_SAMPLE` flag for all nodes and subsequently setting it
for the nodes provided as input to this function. The
`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option will prevent this from occuring,
making it the responsibility of calling code to keep track of the ultimate
sample status of nodes. Using this option in conjunction with
`TSK_SIMPLIFY_NO_FILTER_NODES` (and without the
`TSK_SIMPLIFY_FILTER_POPULATIONS` and `TSK_SIMPLIFY_FILTER_INDIVIDUALS`
options) guarantees that the node table will not be written to during the
lifetime of this function.

The table collection will always be unindexed after simplify successfully completes.

.. note:: Migrations are currently not supported by simplify, and an error will
Expand All @@ -3956,6 +3972,7 @@ Options can be specified by providing one or more of the following bitwise
- :c:macro:`TSK_SIMPLIFY_FILTER_POPULATIONS`
- :c:macro:`TSK_SIMPLIFY_FILTER_INDIVIDUALS`
- :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES`
- :c:macro:`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS`
- :c:macro:`TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY`
- :c:macro:`TSK_SIMPLIFY_KEEP_UNARY`
- :c:macro:`TSK_SIMPLIFY_KEEP_INPUT_ROOTS`
Expand Down
17 changes: 11 additions & 6 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -6589,21 +6589,23 @@ TableCollection_simplify(TableCollection *self, PyObject *args, PyObject *kwds)
int filter_individuals = false;
int filter_populations = false;
int filter_nodes = true;
int update_sample_flags = true;
int keep_unary = false;
int keep_unary_in_individuals = false;
int keep_input_roots = false;
int reduce_to_site_topology = false;
static char *kwlist[] = { "samples", "filter_sites", "filter_populations",
"filter_individuals", "filter_nodes", "reduce_to_site_topology", "keep_unary",
"keep_unary_in_individuals", "keep_input_roots", NULL };
static char *kwlist[]
= { "samples", "filter_sites", "filter_populations", "filter_individuals",
"filter_nodes", "update_sample_flags", "reduce_to_site_topology",
"keep_unary", "keep_unary_in_individuals", "keep_input_roots", NULL };

if (TableCollection_check_state(self) != 0) {
goto out;
}
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiiiiiii", kwlist, &samples,
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiiiiiiii", kwlist, &samples,
&filter_sites, &filter_populations, &filter_individuals, &filter_nodes,
&reduce_to_site_topology, &keep_unary, &keep_unary_in_individuals,
&keep_input_roots)) {
&update_sample_flags, &reduce_to_site_topology, &keep_unary,
&keep_unary_in_individuals, &keep_input_roots)) {
goto out;
}
samples_array = (PyArrayObject *) PyArray_FROMANY(
Expand All @@ -6625,6 +6627,9 @@ TableCollection_simplify(TableCollection *self, PyObject *args, PyObject *kwds)
if (!filter_nodes) {
options |= TSK_SIMPLIFY_NO_FILTER_NODES;
}
if (!update_sample_flags) {
options |= TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS;
}
if (reduce_to_site_topology) {
options |= TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY;
}
Expand Down
27 changes: 15 additions & 12 deletions python/tests/simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ def __init__(
keep_unary=False,
keep_unary_in_individuals=False,
keep_input_roots=False,
filter_nodes=True, # If this is False, the order in `sample` is ignored
filter_nodes=True,
update_sample_flags=True,
):
self.ts = ts
self.n = len(sample)
Expand All @@ -121,6 +122,7 @@ def __init__(
self.filter_populations = filter_populations
self.filter_individuals = filter_individuals
self.filter_nodes = filter_nodes
self.update_sample_flags = update_sample_flags
self.keep_unary = keep_unary
self.keep_unary_in_individuals = keep_unary_in_individuals
self.keep_input_roots = keep_input_roots
Expand Down Expand Up @@ -152,14 +154,14 @@ def __init__(
# NOTE In the C implementation we would really just not touch the
# original tables.
self.tables.nodes.replace_with(self.ts.tables.nodes)
# TODO make this optional somehow
flags = self.tables.nodes.flags
# Zero out other sample flags
flags = np.bitwise_and(flags, ~tskit.NODE_IS_SAMPLE)
flags[sample] |= tskit.NODE_IS_SAMPLE
self.tables.nodes.flags = flags.astype(np.uint32)
self.node_id_map[:] = np.arange(ts.num_nodes)
if self.update_sample_flags:
flags = self.tables.nodes.flags
# Zero out other sample flags
flags = np.bitwise_and(flags, ~tskit.NODE_IS_SAMPLE)
flags[sample] |= tskit.NODE_IS_SAMPLE
self.tables.nodes.flags = flags.astype(np.uint32)

self.node_id_map[:] = np.arange(ts.num_nodes)
for sample_id in sample:
self.add_ancestry(sample_id, 0, self.sequence_length, sample_id)
else:
Expand All @@ -178,10 +180,11 @@ def record_node(self, input_id):
"""
node = self.ts.node(input_id)
flags = node.flags
# Need to zero out the sample flag
flags &= ~tskit.NODE_IS_SAMPLE
if self.is_sample[input_id]:
flags |= tskit.NODE_IS_SAMPLE
if self.update_sample_flags:
# Need to zero out the sample flag
flags &= ~tskit.NODE_IS_SAMPLE
if self.is_sample[input_id]:
flags |= tskit.NODE_IS_SAMPLE
output_id = self.tables.nodes.append(node.replace(flags=flags))
self.node_id_map[input_id] = output_id
return output_id
Expand Down
41 changes: 28 additions & 13 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,19 +254,19 @@ def get_internal_samples_examples():
# Set all nodes to be samples.
flags[:] = tskit.NODE_IS_SAMPLE
nodes.flags = flags
ret.append(("all nodes samples", tables.tree_sequence()))
ret.append(("all_nodes_samples", tables.tree_sequence()))

# Set just internal nodes to be samples.
flags[:] = 0
flags[n:] = tskit.NODE_IS_SAMPLE
nodes.flags = flags
ret.append(("internal nodes samples", tables.tree_sequence()))
ret.append(("internal_nodes_samples", tables.tree_sequence()))

# Set a mixture of internal and leaf samples.
flags[:] = 0
flags[n // 2 : n + n // 2] = tskit.NODE_IS_SAMPLE
nodes.flags = flags
ret.append(("mixture of internal and leaf samples", tables.tree_sequence()))
ret.append(("mixed_internal_leaf_samples", tables.tree_sequence()))
return ret


Expand All @@ -281,7 +281,7 @@ def get_decapitated_examples():

ts = msprime.simulate(20, recombination_rate=1, random_seed=1234)
assert ts.num_trees > 2
ret.append(("decapitate recomb", ts.decapitate(ts.tables.nodes.time[-1] / 4)))
ret.append(("decapitate_recomb", ts.decapitate(ts.tables.nodes.time[-1] / 4)))
return ret


Expand All @@ -302,7 +302,7 @@ def get_bottleneck_examples():
demographic_events=bottlenecks,
random_seed=n,
)
yield (f"bottleneck n={n}", ts)
yield (f"bottleneck_n={n}", ts)


def get_back_mutation_examples():
Expand Down Expand Up @@ -337,13 +337,13 @@ def make_example_tree_sequences():
)
ts = tsutil.insert_random_ploidy_individuals(ts, 4, seed=seed)
yield (
f"n={n} m={m} rho={rho}",
f"n={n}_m={m}_rho={rho}",
tsutil.add_random_metadata(ts, seed=seed),
)
seed += 1
for name, ts in get_bottleneck_examples():
yield (
f"{name} mutated",
f"{name}_mutated",
msprime.mutate(
ts,
rate=0.1,
Expand All @@ -352,7 +352,7 @@ def make_example_tree_sequences():
),
)
ts = tskit.Tree.generate_balanced(8).tree_sequence
yield ("rev node order", ts.subset(np.arange(ts.num_nodes - 1, -1, -1)))
yield ("rev_node_order", ts.subset(np.arange(ts.num_nodes - 1, -1, -1)))
ts = msprime.sim_ancestry(
8, sequence_length=40, recombination_rate=0.1, random_seed=seed
)
Expand All @@ -361,20 +361,20 @@ def make_example_tree_sequences():
ts = tables.tree_sequence()
assert ts.num_trees > 1
yield (
"back mutations",
"back_mutations",
tsutil.insert_branch_mutations(ts, mutations_per_branch=2),
)
ts = tsutil.insert_multichar_mutations(ts)
yield ("multichar", ts)
yield ("multichar w/ metadata", tsutil.add_random_metadata(ts))
yield ("multichar_no_metadata", tsutil.add_random_metadata(ts))
tables = ts.dump_tables()
tables.nodes.flags = np.zeros_like(tables.nodes.flags)
yield ("no samples", tables.tree_sequence()) # no samples
yield ("no_samples", tables.tree_sequence()) # no samples
tables = ts.dump_tables()
tables.edges.clear()
yield ("empty tree", tables.tree_sequence()) # empty tree
yield ("empty_tree", tables.tree_sequence()) # empty tree
yield (
"empty ts",
"empty_ts",
tskit.TableCollection(sequence_length=1).tree_sequence(),
) # empty tree seq
yield ("all_fields", tsutil.all_fields_ts())
Expand All @@ -384,6 +384,8 @@ def make_example_tree_sequences():


def get_example_tree_sequences(pytest_params=True):
# NOTE: pytest names should not contain spaces and be shell safe so
# that they can be easily specified on the command line.
if pytest_params:
return [pytest.param(ts, id=name) for name, ts in _examples]
else:
Expand Down Expand Up @@ -2785,6 +2787,19 @@ def test_simplify_migrations_fails(self):
with pytest.raises(_tskit.LibraryError):
ts.simplify()

@pytest.mark.parametrize("ts", get_example_tree_sequences())
def test_no_update_sample_flags_no_filter_nodes(self, ts):
# Can't simplify edges with metadata
if ts.tables.edges.metadata_schema == tskit.MetadataSchema(schema=None):
k = min(ts.num_samples, 3)
subset = ts.samples()[:k]
ts1 = ts.simplify(subset)
ts2 = ts.simplify(subset, update_sample_flags=False, filter_nodes=False)
assert ts1.num_samples == len(subset)
assert ts2.num_samples == ts.num_samples
assert ts1.num_edges == ts2.num_edges
assert ts2.tables.nodes == ts.tables.nodes


class TestMinMaxTime:
def get_example_tree_sequence(self, use_unknown_time):
Expand Down
3 changes: 3 additions & 0 deletions python/tests/test_lowlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ def test_simplify_bad_args(self):
tc.simplify([0, 1], filter_populations="x")
with pytest.raises(TypeError):
tc.simplify([0, 1], filter_nodes="x")
with pytest.raises(TypeError):
tc.simplify([0, 1], update_sample_flags="x")
with pytest.raises(_tskit.LibraryError):
tc.simplify([0, -1])

Expand All @@ -360,6 +362,7 @@ def test_simplify_bad_args(self):
"filter_populations",
"filter_individuals",
"filter_nodes",
"update_sample_flags",
"reduce_to_site_topology",
"keep_unary",
"keep_unary_in_individuals",
Expand Down
Loading