Commit e8d0b60

Adding unit tests for diploid (#66)

* edited returns & saving behavior to facilitate unit testing
* added full test of algorithm with unambig
* bug fix: renamed utils to utils_poisson in import statements
* bug fix: imports
* install autograd for travis
* actually installing autograd for travis, hopefully
* bug fix: finished renaming "stepwise" to "piecewise"
* bug fix: _get_counts()
* bug fix: subset_chrom()
* bug fix: subset_chrom(), again
* bug fix: _check_counts_matrix() now only warns if counts are float, rather than raising ValueError
* bug fix: _initialize_struct_mds()
* added full tests of run with ambig & pa
* bug fix: prevent unnecessary erroneous warnings about non-integer counts
* bug fix: type(lengths) in PastisPM()
* bug fix: format of output files - history & inference_variables
* don't run python3-only unit tests with python2
* don't run python3-only unit tests with python2.. trying again
* don't run python3-only unit tests with python2.. trying again
* don't run python3-only unit tests with python2.. try #3

1 parent 16ee5d6 commit e8d0b60

9 files changed: +236 -85 lines changed


.travis.yml

+3-3
@@ -12,13 +12,13 @@ matrix:
     # Ubuntu 14.04 versions
     - name: "Python 2.7"
       python: 2.7
-      env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20
+      env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20 AUTOGRAD_VERSION=*
     - name: "Python 3.6"
       python: 3.6
-      env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 COVERAGE=true
+      env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 AUTOGRAD_VERSION=1.2 COVERAGE=true
     - name: "Python 3.6 + recent sklearn"
       python: 3.6
-      env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 COVERAGE=true BUILD_DOC=true
+      env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 AUTOGRAD_VERSION=1.2 COVERAGE=true BUILD_DOC=true
     # - name: "Python 3.7"
     #   python: 3.7
     #   env: TEST=TRUE COVERAGE=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3 PANDAS_VERSION="*" SKLEARN_VERSION="*"

build_tools/travis/install.sh

+7
@@ -42,6 +42,13 @@ else
 fi
 
 
+if [[ $AUTOGRAD_VERSION != "*" ]]; then
+    pip install --upgrade autograd==$AUTOGRAD_VERSION
+else
+    pip install --upgrade autograd
+fi
+
+
 pip install cython
 
 
pastis/optimization/counts.py

+15-12
@@ -2,13 +2,14 @@
 
 import numpy as np
 from scipy import sparse
+from warnings import warn
 
 from iced.filter import filter_low_counts
 from iced.normalization import ICE_normalization
 
 from iced.io import write_counts
 
-from .utils_poisson import _constraint_dis_indices
+from .constraints import _constraint_dis_indices
 from .utils_poisson import find_beads_to_remove
 
 from .multiscale_optimization import decrease_lengths_res
@@ -153,15 +154,16 @@ def subset_chrom(ploidy, lengths_full, chrom_full, chrom_subset=None,
     the specified chromosomes. Otherwise, return None.
     """
 
-    if isinstance(chrom_subset, str):
-        chrom_subset = np.array([chrom_subset])
-    missing_chrom = [x for x in chrom_subset if x not in chrom_full]
-    if len(missing_chrom) > 0:
-        raise ValueError("Chromosomes to be subsetted (%s) are not in full"
-                         "list of chromosomes (%s)" %
-                         (','.join(missing_chrom), ','.join(chrom_full)))
-    # Make sure chrom_subset is sorted properly
-    chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
+    if chrom_subset is not None:
+        if isinstance(chrom_subset, str):
+            chrom_subset = np.array([chrom_subset])
+        missing_chrom = [x for x in chrom_subset if x not in chrom_full]
+        if len(missing_chrom) > 0:
+            raise ValueError("Chromosomes to be subsetted (%s) are not in full"
+                             "list of chromosomes (%s)" %
+                             (','.join(missing_chrom), ','.join(chrom_full)))
+        # Make sure chrom_subset is sorted properly
+        chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
 
     if chrom_subset is None or np.array_equal(chrom_subset, chrom_full):
         chrom_subset = chrom_full.copy()
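
Note: as an illustrative aside (not part of the diff above), a minimal runnable sketch of the guarded subsetting logic, with hypothetical chromosome names. The point of the change is that validation and reordering only run when a subset is actually requested, and that the subset keeps the order of the full chromosome list.

    import numpy as np

    chrom_full = ['chr1', 'chr2', 'chr3']  # hypothetical full chromosome list
    chrom_subset = 'chr3'                  # a bare string gets wrapped in an array
    if chrom_subset is not None:
        if isinstance(chrom_subset, str):
            chrom_subset = np.array([chrom_subset])
        missing_chrom = [x for x in chrom_subset if x not in chrom_full]
        if len(missing_chrom) > 0:
            raise ValueError("Chromosomes to be subsetted (%s) are not in full "
                             "list of chromosomes (%s)"
                             % (','.join(missing_chrom), ','.join(chrom_full)))
        # Keep the subset in the same order as the full chromosome list
        chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
    print(chrom_subset)  # ['chr3']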
@@ -221,8 +223,9 @@ def _check_counts_matrix(counts, lengths, ploidy, exclude_zeros=True,
     if not isinstance(counts, np.ndarray):
         counts = np.array(counts)
 
-    if not np.array_equal(counts, counts.round()):
-        raise ValueError("Counts matrix must only contain integers or NaN")
+    if not np.array_equal(counts[~np.isnan(counts)],
+                          counts[~np.isnan(counts)].round()):
+        warn("Counts matrix must only contain integers or NaN")
 
     if counts.shape[0] == counts.shape[1]:
         counts[np.tril_indices(counts.shape[0])] = empty_val
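
Note: an illustrative aside (plain numpy, not the pastis API) on why the new check masks NaN entries first. np.array_equal treats NaN as unequal to itself, so comparing the whole matrix against its rounded copy flags any matrix containing NaN; comparing only the finite entries lets a float-typed but integer-valued matrix pass, and genuinely non-integer values now produce a warning instead of a ValueError.

    import numpy as np
    from warnings import warn

    counts = np.array([[0., 3., np.nan],
                       [0., 0., 2.],
                       [0., 0., 0.]])  # hypothetical counts with a missing entry

    finite = counts[~np.isnan(counts)]
    if not np.array_equal(finite, finite.round()):
        warn("Counts matrix must only contain integers or NaN")
    else:
        print("integer/NaN check passed")          # this branch runs here
    print(np.array_equal(counts, counts.round()))  # False, solely because of the NaN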

pastis/optimization/initialization.py

+3-3
@@ -9,7 +9,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
     """Initialize structure via multi-dimensional scaling of unambig counts.
     """
 
-    from .utils import find_beads_to_remove
+    from .utils_poisson import find_beads_to_remove
     from .mds import estimate_X
 
     if verbose:
@@ -31,7 +31,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
     ua_beta *= multiscale_factor ** 2
 
     struct = estimate_X(
-        ua_counts.counts.tocoo().astype(float),
+        ua_counts._counts.astype(float),
         alpha=-3. if alpha is None else alpha, beta=ua_beta, verbose=False,
         use_zero_entries=False, precompute_distances='auto',
         bias=(np.tile(bias, ploidy) if bias is not None else bias),
@@ -51,7 +51,7 @@ def _initialize_struct(counts, lengths, ploidy, alpha, bias, random_state,
     """Initialize structure, randomly or via MDS of unambig counts.
     """
 
-    from .utils import _struct_replace_nan, _format_structures
+    from .utils_poisson import _struct_replace_nan, _format_structures
 
     random_state = check_random_state(random_state)

pastis/optimization/load_data.py

+5-3
@@ -47,16 +47,18 @@ def _get_counts(counts, lengths):
     lengths = _get_lengths(lengths)
     output = []
     for f in counts:
-        if isinstance(f, np.ndarray) or isinstance(f, sparse.coo_matrix):
+        if isinstance(f, np.ndarray) or sparse.issparse(f):
             counts_maps = f
-        if f.endswith(".npy"):
+        elif f.endswith(".npy"):
             counts_maps = np.load(f)
-            counts_maps[np.isnan(counts_maps)] = 0
         elif f.endswith(".matrix"):
             counts_maps = load_counts(f, lengths=lengths)
         else:
             raise ValueError("Counts file must end with .npy (for numpy array)"
                              " or .matrix (for hiclib / iced format)")
+        if sparse.issparse(counts_maps):
+            counts_maps = counts_maps.toarray()
+        counts_maps[np.isnan(counts_maps)] = 0
         output.append(sparse.coo_matrix(counts_maps))
     return output
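
Note: an illustrative aside (assumed inputs, not the pastis API) showing the unified post-processing this change moves to the end of the loop: whatever the source (ndarray, sparse matrix, or a loaded .npy / .matrix file), the counts are densified, NaNs are zeroed, and the result is wrapped as a COO matrix.

    import numpy as np
    from scipy import sparse

    def _to_coo(counts_maps):
        # hypothetical helper mirroring the tail of the loop above
        if sparse.issparse(counts_maps):
            counts_maps = counts_maps.toarray()
        counts_maps = np.asarray(counts_maps, dtype=float)
        counts_maps[np.isnan(counts_maps)] = 0
        return sparse.coo_matrix(counts_maps)

    dense = np.array([[0., np.nan], [1., 2.]])
    print(_to_coo(dense).toarray())                     # NaN zeroed
    print(_to_coo(sparse.coo_matrix(dense)).toarray())  # sparse input handled too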

pastis/optimization/pastis_algorithms.py

+84-45
@@ -1,8 +1,16 @@
+from __future__ import print_function
+
+import sys
+
+if sys.version_info[0] < 3:
+    raise Exception("Must be using Python 3")
+
 import os
 import numpy as np
 import pandas as pd
 from sklearn.utils import check_random_state
-from .utils import _print_code_header
+from .utils_poisson import _print_code_header
+from distutils.util import strtobool
 
 
 def _test_objective(struct, counts, lengths, ploidy, alpha, bias,
@@ -37,7 +45,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
           normalize=True, filter_threshold=0.04, alpha_init=-3.,
           max_alpha_loop=20, beta=None, multiscale_factor=1,
           multiscale_rounds=1, use_multiscale_variance=True,
-          final_multiscale_round=False, init='msd', max_iter=10000000000,
+          final_multiscale_round=False, init='mds', max_iter=10000000000,
           factr=10000000., pgtol=1e-05, alpha_factr=1000000000000.,
           bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
           fullres_torm=None, struct_draft_fullres=None, draft=False,
@@ -138,7 +146,6 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
     infer_var : dict
         A few of the variables used in inference or generated by inference.
         Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.
-
     """
 
     from .counts import preprocess_counts, ambiguate_counts, _update_betas_in_counts_matrices
@@ -148,34 +155,40 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
     from .poisson import PastisPM
     from .estimate_alpha_beta import _estimate_beta
     from .multiscale_optimization import get_multiscale_variances_from_struct, _choose_max_multiscale_factor, decrease_lengths_res
-    from .utils import find_beads_to_remove
-
-    try:
-        os.makedirs(outdir)
-    except OSError:
-        pass
-    if seed is None:
-        seed_str = ''
-    else:
-        seed_str = '.%03d' % seed
-    out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
-    orient_file = os.path.join(outdir, 'orient_inferred%s.coords' % seed_str)
-    history_file = os.path.join(outdir, 'history%s' % seed_str)
-    infer_var_file = os.path.join(
-        outdir, 'inference_variables%s' % seed_str)
-    out_fail = os.path.join(outdir, 'struct_nonconverged%s.coords' % seed_str)
-
-    if os.path.exists(out_file):
-        print('CONVERGED', flush=True)
-        infer_var = dict(pd.read_csv(
-            infer_var_file, sep='\t', header=None, squeeze=True, index_col=0))
-        infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
-        infer_var['alpha'] = float(infer_var['alpha'])
-        struct_ = np.loadtxt(out_file)
-        return struct_, infer_var
-    elif os.path.exists(out_fail):
-        print('OPTIMIZATION DID NOT CONVERGE', flush=True)
-        exit(1)
+    from .utils_poisson import find_beads_to_remove
+
+    if outdir is not None:
+        try:
+            os.makedirs(outdir)
+        except OSError:
+            pass
+        if seed is None:
+            seed_str = ''
+        else:
+            seed_str = '.%03d' % seed
+        out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
+        orient_file = os.path.join(
+            outdir, 'orient_inferred%s.coords' % seed_str)
+        history_file = os.path.join(outdir, 'history%s' % seed_str)
+        infer_var_file = os.path.join(
+            outdir, 'inference_variables%s' % seed_str)
+        out_fail = os.path.join(
+            outdir, 'struct_nonconverged%s.coords' % seed_str)
+
+        if os.path.exists(out_file) or os.path.exists(out_fail):
+            if os.path.exists(out_file):
+                print('CONVERGED', flush=True)
+            elif os.path.exists(out_fail):
+                print('OPTIMIZATION DID NOT CONVERGE', flush=True)
+            infer_var = dict(pd.read_csv(
+                infer_var_file, sep='\t', header=None, squeeze=True,
+                index_col=0))
+            infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
+            infer_var['hsc_r'] = [float(r) for r in infer_var['hsc_r'].split()]
+            infer_var['alpha'] = float(infer_var['alpha'])
+            infer_var['converged'] = strtobool(infer_var['converged'])
+            struct_ = np.loadtxt(out_file)
+            return struct_, infer_var
 
     random_state = np.random.RandomState(seed)
     random_state = check_random_state(random_state)
@@ -256,6 +269,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 alpha_true=alpha_true, struct_true=struct_true,
                 input_weight=input_weight, exclude_zeros=exclude_zeros,
                 null=null, mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var_fullres['converged']:
+                return struct_draft_fullres, infer_var_fullres
             alpha_ = infer_var_fullres['alpha']
             beta_ = infer_var_fullres['beta']
             counts = _update_betas_in_counts_matrices(
@@ -290,7 +305,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
             simple_diploid_for_lowres = True
             multiscale_factor_for_lowres = _choose_max_multiscale_factor(
                 lengths=lengths, min_beads=hsc_min_beads)
-            struct_draft_lowres, _ = infer(
+            struct_draft_lowres, infer_var_lowres = infer(
                 counts_raw=counts_for_lowres,
                 outdir=os.path.join(outdir, 'struct_draft_lowres'),
                 lengths=lengths, ploidy=ploidy, alpha=alpha_,
@@ -309,6 +324,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 struct_true=struct_true, input_weight=input_weight,
                 exclude_zeros=exclude_zeros, null=null,
                 mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var_lowres['converged']:
+                return struct_draft_lowres, infer_var_lowres
             hsc_r = distance_between_homologs(
                 structures=struct_draft_lowres,
                 lengths=decrease_lengths_res(
@@ -359,20 +376,30 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
         struct_[torm] = np.nan
 
         infer_var = {'alpha': pm.alpha_, 'beta': pm.beta_, 'hsc_r': hsc_r,
-                     'obj': pm.obj_, 'seed': seed}
+                     'obj': pm.obj_, 'seed': seed, 'converged': pm.converged_}
+
+        if outdir is not None:
+            with open(infer_var_file, 'w') as f:
+                for k, v in infer_var.items():
+                    if isinstance(v, np.ndarray) or isinstance(v, list):
+                        f.write(
+                            '%s\t%s\n' % (k, ' '.join(['%g' % x for x in v])))
+                    else:
+                        f.write('%s\t%g\n' % (k, v))
+            if reorienter is not None and reorienter.reorient:
+                np.savetxt(orient_file, pm.orientation_)
+            if pm.converged_:
+                np.savetxt(out_file, struct_)
+                if pm.history_ is not None:
+                    pd.DataFrame(
+                        pm.history_).to_csv(history_file, sep='\t', index=False)
+            else:
+                np.savetxt(out_fail, struct_)
 
-        if reorienter is not None and reorienter.reorient:
-            np.savetxt(orient_file, pm.orientation_)
         if pm.converged_:
-            np.savetxt(out_file, struct_)
-            pd.Series(infer_var).to_csv(infer_var_file, sep='\t', header=False)
-            if pm.history_ is not None:
-                pd.DataFrame(
-                    pm.history_).to_csv(history_file, sep='\t', index=False)
             return struct_, infer_var
         else:
-            np.savetxt(out_fail, struct_)
-            exit(1)
+            return None, infer_var
 
     else:
         # BEGIN MULTISCALE OPTIMIZATION
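
Note: an illustrative aside (hypothetical values, standalone file) on the inference_variables format written above and parsed back near the top of infer(): one key<TAB>value line per entry, list values space-joined with '%g', and the new 'converged' flag recovered with strtobool. This sketch uses distutils.util.strtobool to match the import in this commit, so it assumes a Python version that still ships distutils (it was removed in 3.12).

    import numpy as np
    from distutils.util import strtobool

    # hypothetical values standing in for the real inference output
    infer_var = {'alpha': -3.0, 'beta': [0.5, 0.5], 'hsc_r': [1.2],
                 'obj': 123.4, 'seed': 0, 'converged': True}

    with open('inference_variables.000', 'w') as f:
        for k, v in infer_var.items():
            if isinstance(v, np.ndarray) or isinstance(v, list):
                f.write('%s\t%s\n' % (k, ' '.join(['%g' % x for x in v])))
            else:
                f.write('%s\t%g\n' % (k, v))

    with open('inference_variables.000') as f:
        reloaded = dict(line.rstrip('\n').split('\t') for line in f)
    reloaded['beta'] = [float(b) for b in reloaded['beta'].split()]
    reloaded['hsc_r'] = [float(r) for r in reloaded['hsc_r'].split()]
    reloaded['alpha'] = float(reloaded['alpha'])
    reloaded['converged'] = bool(strtobool(reloaded['converged']))
    print(reloaded)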
@@ -413,6 +440,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 alpha_true=alpha_true, struct_true=struct_true,
                 input_weight=input_weight, exclude_zeros=exclude_zeros,
                 null=null, mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var['converged']:
+                return struct_, infer_var
         return struct_, infer_var
 
 
@@ -427,7 +456,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
                    piecewise=False, piecewise_step=None, piecewise_chrom=None,
                    piecewise_min_beads=5, piecewise_fix_homo=False,
                    piecewise_opt_orient=True, alpha_true=None, struct_true=None,
-                   init='msd', input_weight=None, exclude_zeros=False,
+                   init='mds', input_weight=None, exclude_zeros=False,
                    null=False, mixture_coefs=None, verbose=True):
     """Infer 3D structures with PASTIS via Poisson model.
 
@@ -492,10 +521,18 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
     hsc_min_beads : int, optional
         For diploid organisms: number of beads in the low-resolution
         structure from which `hsc_r` is estimated.
+
+    Returns
+    -------
+    struct_ : array_like of float of shape (lengths.sum() * ploidy, 3)
+        3D structure resulting from the optimization.
+    infer_var : dict
+        A few of the variables used in inference or generated by inference.
+        Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.
     """
 
     from .load_data import load_data
-    from .stepwise_whole_genome import stepwise_inference
+    from .piecewise_whole_genome import piecewise_inference
 
     lengths_full = lengths
     chrom_full = chromosomes
@@ -529,7 +566,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
             input_weight=input_weight, exclude_zeros=exclude_zeros,
             null=null, mixture_coefs=mixture_coefs, verbose=verbose)
     else:
-        stepwise_inference(
+        struct_, infer_var = piecewise_inference(
            counts=counts, outdir=outdir, lengths=lengths_subset, ploidy=ploidy,
            chromosomes=chrom_subset, alpha=alpha, seed=seed, normalize=normalize,
            filter_threshold=filter_threshold, alpha_init=alpha_init,
@@ -548,6 +585,8 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
            input_weight=input_weight, exclude_zeros=exclude_zeros, null=null,
            mixture_coefs=mixture_coefs, verbose=verbose)
 
+    return struct_, infer_var
+
 
 def _output_subdir(outdir, chrom_full, chrom_subset=None, null=False,
                    piecewise=False, piecewise_step=None,

pastis/optimization/piecewise_whole_genome.py

+11-11
@@ -287,24 +287,24 @@ def _assemble_highres_chrom_via_lowres_genome(outdir, outdir_lowres, outdir_orie
     return X_all_chrom, trans_rot_init
 
 
-def stepwise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
-                       filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
-                       use_multiscale_variance=True,
-                       max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
-                       bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
-                       callback_function=None, callback_freq=None,
-                       piecewise_step=None, piecewise_chrom=None,
-                       piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
-                       alpha_true=None, struct_true=None, init='msd', input_weight=None,
-                       exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
+def piecewise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
+                        filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
+                        use_multiscale_variance=True,
+                        max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
+                        bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
+                        callback_function=None, callback_freq=None,
+                        piecewise_step=None, piecewise_chrom=None,
+                        piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
+                        alpha_true=None, struct_true=None, init='msd', input_weight=None,
+                        exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
     """
     """
 
     import os
     from .load_data import _load_inferred_struct, _choose_best_seed
     from .counts import subset_chrom
     from .pastis_algorithms import infer, _output_subdir
-    from .utils import _choose_max_multiscale_factor, _print_code_header
+    from .utils_poisson import _choose_max_multiscale_factor, _print_code_header
 
     if piecewise_step is None:
         piecewise_step = [1, 2, 3, 4]
