
Adding unit tests for diploid #66


Merged · 22 commits · Feb 3, 2020

Commits:
7af1330
Merge pull request #8 from hiclib/master
gesinecauer Jan 21, 2020
b6e82da
Merge pull request #9 from gesinecauer/master
gesinecauer Jan 21, 2020
a8eebfd
edited returns & saving behavior to facilitate unit testing
gesinecauer Jan 22, 2020
a91f75e
added full test of algorithm with unambig
gesinecauer Jan 22, 2020
b16e6a4
bug fix: renamed utils to utils_poisson in import statements
gesinecauer Jan 22, 2020
424afc5
bug fix: imports
gesinecauer Jan 22, 2020
e32a915
install autograd for travis
gesinecauer Jan 22, 2020
5fb552d
actually installing autograd for travis, hopefully
gesinecauer Jan 22, 2020
224063d
bug fix: finished renaming "stepwise" to "piecewise"
gesinecauer Jan 22, 2020
f5bbf9d
bug fix: _get_counts()
gesinecauer Jan 22, 2020
a69d9d2
bug fix: subset_chrom()
gesinecauer Jan 22, 2020
0f22f26
bug fix: subset_chrom(), again
gesinecauer Jan 22, 2020
03fcdc9
bug fix: _check_counts_matrix() now only warns if counts are float, r…
gesinecauer Jan 22, 2020
8fe09bc
bug fix: _initialize_struct_mds()
gesinecauer Jan 22, 2020
6dec024
added full tests of run with ambig & pa
gesinecauer Jan 22, 2020
6c715a1
bug fix: prevent unnecessary, erroneous non-integer warnings in …
gesinecauer Jan 23, 2020
e616caf
bug fix: type(lengths) in PastisPM()
gesinecauer Jan 23, 2020
eba1f84
bug fix: format of output files - history & inference_variables
gesinecauer Jan 23, 2020
d85eb99
don't run python3-only unit tests with python2
gesinecauer Jan 23, 2020
61129ae
don't run python3-only unit tests with python2... trying again
gesinecauer Jan 23, 2020
c8a12c4
don't run python3-only unit tests with python2... trying again
gesinecauer Jan 23, 2020
23dac77
don't run python3-only unit tests with python2... try #3
gesinecauer Jan 23, 2020
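
The heart of this PR is the pair of end-to-end tests added in commits a91f75e and 6dec024, which run the full inference pipeline on small inputs. A rough sketch of that style of test follows; the fixture values, test name, and assertions are illustrative assumptions rather than the PR's actual test code, but `infer`'s signature, the `outdir=None` path, and the `converged` key all match the diff below:

```python
import numpy as np
from scipy import sparse

from pastis.optimization.pastis_algorithms import infer


def test_infer_diploid_unambig():
    # Small random dataset: one chromosome, diploid.
    lengths = np.array([20])
    ploidy = 2
    n = lengths.sum() * ploidy
    rng = np.random.RandomState(42)

    # Random integer contact counts; the lower triangle is left empty,
    # matching what _check_counts_matrix() enforces.
    counts = rng.poisson(5., size=(n, n)).astype(float)
    counts[np.tril_indices(n)] = 0

    # outdir=None exercises the new "no output files" path added below,
    # which is what makes this kind of unit test practical.
    struct_, infer_var = infer(
        counts_raw=[sparse.coo_matrix(counts)], lengths=lengths,
        ploidy=ploidy, outdir=None, seed=42)

    assert infer_var['converged']
    assert struct_.shape == (n, 3)
```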
6 changes: 3 additions & 3 deletions .travis.yml
@@ -12,13 +12,13 @@ matrix:
# Ubuntu 14.04 versions
- name: "Python 2.7"
python: 2.7
env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20
env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20 AUTOGRAD_VERSION=*
- name: "Python 3.6"
python: 3.6
env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 COVERAGE=true
env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 AUTOGRAD_VERSION=1.2 COVERAGE=true
- name: "Python 3.6 + recent sklearn"
python: 3.6
env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 COVERAGE=true BUILD_DOC=true
env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 AUTOGRAD_VERSION=1.2 COVERAGE=true BUILD_DOC=true
# - name: "Python 3.7"
# python: 3.7
# env: TEST=TRUE COVERAGE=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3 PANDAS_VERSION="*" SKLEARN_VERSION="*"
7 changes: 7 additions & 0 deletions build_tools/travis/install.sh
@@ -42,6 +42,13 @@ else
fi


if [[ $AUTOGRAD_VERSION != "*" ]]; then
pip install --upgrade autograd==$AUTOGRAD_VERSION
else
pip install --upgrade autograd
fi


pip install cython


27 changes: 15 additions & 12 deletions pastis/optimization/counts.py
@@ -2,13 +2,14 @@

import numpy as np
from scipy import sparse
from warnings import warn

from iced.filter import filter_low_counts
from iced.normalization import ICE_normalization

from iced.io import write_counts

from .utils_poisson import _constraint_dis_indices
from .constraints import _constraint_dis_indices
from .utils_poisson import find_beads_to_remove

from .multiscale_optimization import decrease_lengths_res
@@ -153,15 +154,16 @@ def subset_chrom(ploidy, lengths_full, chrom_full, chrom_subset=None,
the specified chromosomes. Otherwise, return None.
"""

if isinstance(chrom_subset, str):
chrom_subset = np.array([chrom_subset])
missing_chrom = [x for x in chrom_subset if x not in chrom_full]
if len(missing_chrom) > 0:
raise ValueError("Chromosomes to be subsetted (%s) are not in full"
"list of chromosomes (%s)" %
(','.join(missing_chrom), ','.join(chrom_full)))
# Make sure chrom_subset is sorted properly
chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
if chrom_subset is not None:
if isinstance(chrom_subset, str):
chrom_subset = np.array([chrom_subset])
missing_chrom = [x for x in chrom_subset if x not in chrom_full]
if len(missing_chrom) > 0:
raise ValueError("Chromosomes to be subsetted (%s) are not in full"
"list of chromosomes (%s)" %
(','.join(missing_chrom), ','.join(chrom_full)))
# Make sure chrom_subset is sorted properly
chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]

if chrom_subset is None or np.array_equal(chrom_subset, chrom_full):
chrom_subset = chrom_full.copy()
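
The `if chrom_subset is not None` guard above is the fix from commits a69d9d2 and 0f22f26: previously the validation ran unconditionally, so calling `subset_chrom()` without requesting a subset crashed before the `chrom_subset is None` branch was ever reached. A toy illustration of the failure mode (variable values assumed):

```python
chrom_full = ['chr1', 'chr2']
chrom_subset = None  # caller wants all chromosomes

# Old behavior: this ran unconditionally and raised
# "TypeError: 'NoneType' object is not iterable".
# missing_chrom = [x for x in chrom_subset if x not in chrom_full]

# New behavior: validate only when a subset was actually requested.
if chrom_subset is not None:
    missing_chrom = [x for x in chrom_subset if x not in chrom_full]
```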
@@ -221,8 +223,9 @@ def _check_counts_matrix(counts, lengths, ploidy, exclude_zeros=True,
if not isinstance(counts, np.ndarray):
counts = np.array(counts)

if not np.array_equal(counts, counts.round()):
raise ValueError("Counts matrix must only contain integers or NaN")
if not np.array_equal(counts[~np.isnan(counts)],
counts[~np.isnan(counts)].round()):
warn("Counts matrix must only contain integers or NaN")

if counts.shape[0] == counts.shape[1]:
counts[np.tril_indices(counts.shape[0])] = empty_val
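
Two fixes are folded into the `_check_counts_matrix()` hunk above (commits 03fcdc9 and 6c715a1): NaN entries are masked out before the integer check, and the check was downgraded from a ValueError to a warning. The masking matters because NaN never compares equal to its own rounded value, so any counts matrix with missing entries used to be rejected as "non-integer". A minimal illustration:

```python
import numpy as np

counts = np.array([[np.nan, 2.], [0., 3.]])

# Old check: False even though every finite entry is an integer,
# because NaN != NaN under array_equal.
np.array_equal(counts, counts.round())        # -> False

# New check: compare only the finite entries.
finite = counts[~np.isnan(counts)]
np.array_equal(finite, finite.round())        # -> True
```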
6 changes: 3 additions & 3 deletions pastis/optimization/initialization.py
@@ -9,7 +9,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
"""Initialize structure via multi-dimensional scaling of unambig counts.
"""

from .utils import find_beads_to_remove
from .utils_poisson import find_beads_to_remove
from .mds import estimate_X

if verbose:
@@ -31,7 +31,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
ua_beta *= multiscale_factor ** 2

struct = estimate_X(
ua_counts.counts.tocoo().astype(float),
ua_counts._counts.astype(float),
alpha=-3. if alpha is None else alpha, beta=ua_beta, verbose=False,
use_zero_entries=False, precompute_distances='auto',
bias=(np.tile(bias, ploidy) if bias is not None else bias),
@@ -51,7 +51,7 @@ def _initialize_struct(counts, lengths, ploidy, alpha, bias, random_state,
"""Initialize structure, randomly or via MDS of unambig counts.
"""

from .utils import _struct_replace_nan, _format_structures
from .utils_poisson import _struct_replace_nan, _format_structures

random_state = check_random_state(random_state)

8 changes: 5 additions & 3 deletions pastis/optimization/load_data.py
@@ -47,16 +47,18 @@ def _get_counts(counts, lengths):
lengths = _get_lengths(lengths)
output = []
for f in counts:
if isinstance(f, np.ndarray) or isinstance(f, sparse.coo_matrix):
if isinstance(f, np.ndarray) or sparse.issparse(f):
counts_maps = f
if f.endswith(".npy"):
elif f.endswith(".npy"):
counts_maps = np.load(f)
counts_maps[np.isnan(counts_maps)] = 0
elif f.endswith(".matrix"):
counts_maps = load_counts(f, lengths=lengths)
else:
raise ValueError("Counts file must end with .npy (for numpy array)"
" or .matrix (for hiclib / iced format)")
if sparse.issparse(counts_maps):
counts_maps = counts_maps.toarray()
counts_maps[np.isnan(counts_maps)] = 0
output.append(sparse.coo_matrix(counts_maps))
return output

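
With this change `_get_counts()` accepts any scipy sparse type rather than only `coo_matrix`, fixes the `if`/`elif` dispatch bug that sent in-memory arrays on to the string `.endswith` checks, and densifies before zeroing NaNs so the same cleanup applies to every input form. A condensed sketch of the resulting flow for a single input; the helper name `_load_one_counts` is hypothetical, but the logic mirrors the diff:

```python
import numpy as np
from scipy import sparse
from iced.io import load_counts


def _load_one_counts(f, lengths):
    if isinstance(f, np.ndarray) or sparse.issparse(f):
        counts_maps = f
    elif f.endswith(".npy"):
        counts_maps = np.load(f)
    elif f.endswith(".matrix"):
        counts_maps = load_counts(f, lengths=lengths)
    else:
        raise ValueError("Counts file must end with .npy (for numpy array)"
                         " or .matrix (for hiclib / iced format)")
    # Densify so the NaN cleanup below works regardless of input type.
    if sparse.issparse(counts_maps):
        counts_maps = counts_maps.toarray()
    counts_maps[np.isnan(counts_maps)] = 0
    return sparse.coo_matrix(counts_maps)
```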
129 changes: 84 additions & 45 deletions pastis/optimization/pastis_algorithms.py
@@ -1,8 +1,16 @@
from __future__ import print_function

import sys

if sys.version_info[0] < 3:
raise Exception("Must be using Python 3")

import os
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
from .utils import _print_code_header
from .utils_poisson import _print_code_header
from distutils.util import strtobool


def _test_objective(struct, counts, lengths, ploidy, alpha, bias,
@@ -37,7 +45,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
normalize=True, filter_threshold=0.04, alpha_init=-3.,
max_alpha_loop=20, beta=None, multiscale_factor=1,
multiscale_rounds=1, use_multiscale_variance=True,
final_multiscale_round=False, init='msd', max_iter=10000000000,
final_multiscale_round=False, init='mds', max_iter=10000000000,
factr=10000000., pgtol=1e-05, alpha_factr=1000000000000.,
bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
fullres_torm=None, struct_draft_fullres=None, draft=False,
@@ -138,7 +146,6 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
infer_var : dict
A few of the variables used in inference or generated by inference.
Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.

"""

from .counts import preprocess_counts, ambiguate_counts, _update_betas_in_counts_matrices
@@ -148,34 +155,40 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
from .poisson import PastisPM
from .estimate_alpha_beta import _estimate_beta
from .multiscale_optimization import get_multiscale_variances_from_struct, _choose_max_multiscale_factor, decrease_lengths_res
from .utils import find_beads_to_remove

try:
os.makedirs(outdir)
except OSError:
pass
if seed is None:
seed_str = ''
else:
seed_str = '.%03d' % seed
out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
orient_file = os.path.join(outdir, 'orient_inferred%s.coords' % seed_str)
history_file = os.path.join(outdir, 'history%s' % seed_str)
infer_var_file = os.path.join(
outdir, 'inference_variables%s' % seed_str)
out_fail = os.path.join(outdir, 'struct_nonconverged%s.coords' % seed_str)

if os.path.exists(out_file):
print('CONVERGED', flush=True)
infer_var = dict(pd.read_csv(
infer_var_file, sep='\t', header=None, squeeze=True, index_col=0))
infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
infer_var['alpha'] = float(infer_var['alpha'])
struct_ = np.loadtxt(out_file)
return struct_, infer_var
elif os.path.exists(out_fail):
print('OPTIMIZATION DID NOT CONVERGE', flush=True)
exit(1)
from .utils_poisson import find_beads_to_remove

if outdir is not None:
try:
os.makedirs(outdir)
except OSError:
pass
if seed is None:
seed_str = ''
else:
seed_str = '.%03d' % seed
out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
orient_file = os.path.join(
outdir, 'orient_inferred%s.coords' % seed_str)
history_file = os.path.join(outdir, 'history%s' % seed_str)
infer_var_file = os.path.join(
outdir, 'inference_variables%s' % seed_str)
out_fail = os.path.join(
outdir, 'struct_nonconverged%s.coords' % seed_str)

if os.path.exists(out_file) or os.path.exists(out_fail):
if os.path.exists(out_file):
print('CONVERGED', flush=True)
elif os.path.exists(out_fail):
print('OPTIMIZATION DID NOT CONVERGE', flush=True)
infer_var = dict(pd.read_csv(
infer_var_file, sep='\t', header=None, squeeze=True,
index_col=0))
infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
infer_var['hsc_r'] = [float(r) for r in infer_var['hsc_r'].split()]
infer_var['alpha'] = float(infer_var['alpha'])
infer_var['converged'] = strtobool(infer_var['converged'])
struct_ = np.loadtxt(out_file)
return struct_, infer_var

random_state = np.random.RandomState(seed)
random_state = check_random_state(random_state)
@@ -256,6 +269,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
alpha_true=alpha_true, struct_true=struct_true,
input_weight=input_weight, exclude_zeros=exclude_zeros,
null=null, mixture_coefs=mixture_coefs, verbose=verbose)
if not infer_var_fullres['converged']:
return struct_draft_fullres, infer_var_fullres
alpha_ = infer_var_fullres['alpha']
beta_ = infer_var_fullres['beta']
counts = _update_betas_in_counts_matrices(
@@ -290,7 +305,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
simple_diploid_for_lowres = True
multiscale_factor_for_lowres = _choose_max_multiscale_factor(
lengths=lengths, min_beads=hsc_min_beads)
struct_draft_lowres, _ = infer(
struct_draft_lowres, infer_var_lowres = infer(
counts_raw=counts_for_lowres,
outdir=os.path.join(outdir, 'struct_draft_lowres'),
lengths=lengths, ploidy=ploidy, alpha=alpha_,
@@ -309,6 +324,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
struct_true=struct_true, input_weight=input_weight,
exclude_zeros=exclude_zeros, null=null,
mixture_coefs=mixture_coefs, verbose=verbose)
if not infer_var_lowres['converged']:
return struct_draft_lowres, infer_var_lowres
hsc_r = distance_between_homologs(
structures=struct_draft_lowres,
lengths=decrease_lengths_res(
@@ -359,20 +376,30 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
struct_[torm] = np.nan

infer_var = {'alpha': pm.alpha_, 'beta': pm.beta_, 'hsc_r': hsc_r,
'obj': pm.obj_, 'seed': seed}
'obj': pm.obj_, 'seed': seed, 'converged': pm.converged_}

if outdir is not None:
with open(infer_var_file, 'w') as f:
for k, v in infer_var.items():
if isinstance(v, np.ndarray) or isinstance(v, list):
f.write(
'%s\t%s\n' % (k, ' '.join(['%g' % x for x in v])))
else:
f.write('%s\t%g\n' % (k, v))
if reorienter is not None and reorienter.reorient:
np.savetxt(orient_file, pm.orientation_)
if pm.converged_:
np.savetxt(out_file, struct_)
if pm.history_ is not None:
pd.DataFrame(
pm.history_).to_csv(history_file, sep='\t', index=False)
else:
np.savetxt(out_fail, struct_)

if reorienter is not None and reorienter.reorient:
np.savetxt(orient_file, pm.orientation_)
if pm.converged_:
np.savetxt(out_file, struct_)
pd.Series(infer_var).to_csv(infer_var_file, sep='\t', header=False)
if pm.history_ is not None:
pd.DataFrame(
pm.history_).to_csv(history_file, sep='\t', index=False)
return struct_, infer_var
else:
np.savetxt(out_fail, struct_)
exit(1)
return None, infer_var

else:
# BEGIN MULTISCALE OPTIMIZATION
@@ -413,6 +440,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
alpha_true=alpha_true, struct_true=struct_true,
input_weight=input_weight, exclude_zeros=exclude_zeros,
null=null, mixture_coefs=mixture_coefs, verbose=verbose)
if not infer_var['converged']:
return struct_, infer_var
return struct_, infer_var


@@ -427,7 +456,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
piecewise=False, piecewise_step=None, piecewise_chrom=None,
piecewise_min_beads=5, piecewise_fix_homo=False,
piecewise_opt_orient=True, alpha_true=None, struct_true=None,
init='msd', input_weight=None, exclude_zeros=False,
init='mds', input_weight=None, exclude_zeros=False,
null=False, mixture_coefs=None, verbose=True):
"""Infer 3D structures with PASTIS via Poisson model.

@@ -492,10 +521,18 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
hsc_min_beads : int, optional
For diploid organisms: number of beads in the low-resolution
structure from which `hsc_r` is estimated.

Returns
-------
struct_ : array_like of float of shape (lengths.sum() * ploidy, 3)
3D structure resulting from the optimization.
infer_var : dict
A few of the variables used in inference or generated by inference.
Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.
"""

from .load_data import load_data
from .stepwise_whole_genome import stepwise_inference
from .piecewise_whole_genome import piecewise_inference

lengths_full = lengths
chrom_full = chromosomes
@@ -529,7 +566,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
input_weight=input_weight, exclude_zeros=exclude_zeros,
null=null, mixture_coefs=mixture_coefs, verbose=verbose)
else:
stepwise_inference(
struct_, infer_var = piecewise_inference(
counts=counts, outdir=outdir, lengths=lengths_subset, ploidy=ploidy,
chromosomes=chrom_subset, alpha=alpha, seed=seed, normalize=normalize,
filter_threshold=filter_threshold, alpha_init=alpha_init,
Expand All @@ -548,6 +585,8 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
input_weight=input_weight, exclude_zeros=exclude_zeros, null=null,
mixture_coefs=mixture_coefs, verbose=verbose)

return struct_, infer_var


def _output_subdir(outdir, chrom_full, chrom_subset=None, null=False,
piecewise=False, piecewise_step=None,
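
The reworked save/load logic above also standardizes the `inference_variables` file: it is a two-column TSV, list-valued entries ('beta', 'hsc_r') are stored space-separated, and the new 'converged' flag round-trips through `strtobool`. A minimal sketch of reading one back, assuming seed 0 so the file suffix is '.000', and using the `squeeze=True` idiom of the pandas versions pinned in .travis.yml:

```python
import pandas as pd
from distutils.util import strtobool

infer_var = dict(pd.read_csv(
    'inference_variables.000', sep='\t', header=None, squeeze=True,
    index_col=0))
infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
infer_var['hsc_r'] = [float(r) for r in infer_var['hsc_r'].split()]
infer_var['alpha'] = float(infer_var['alpha'])
infer_var['converged'] = strtobool(infer_var['converged'])  # returns 1/0
```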
22 changes: 11 additions & 11 deletions pastis/optimization/piecewise_whole_genome.py
@@ -287,24 +287,24 @@ def _assemble_highres_chrom_via_lowres_genome(outdir, outdir_lowres, outdir_orie
return X_all_chrom, trans_rot_init


def stepwise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
use_multiscale_variance=True,
max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
callback_function=None, callback_freq=None,
piecewise_step=None, piecewise_chrom=None,
piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
alpha_true=None, struct_true=None, init='msd', input_weight=None,
exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
def piecewise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
use_multiscale_variance=True,
max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
callback_function=None, callback_freq=None,
piecewise_step=None, piecewise_chrom=None,
piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
alpha_true=None, struct_true=None, init='msd', input_weight=None,
exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
"""
"""

import os
from .load_data import _load_inferred_struct, _choose_best_seed
from .counts import subset_chrom
from .pastis_algorithms import infer, _output_subdir
from .utils import _choose_max_multiscale_factor, _print_code_header
from .utils_poisson import _choose_max_multiscale_factor, _print_code_header

if piecewise_step is None:
piecewise_step = [1, 2, 3, 4]