Commit e8d0b60

Adding unit tests for diploid (#66)

* edited returns & saving behavior to facilitate unit testing
* added full test of algorithm with unambig
* bug fix: renamed utils to utils_poisson in import statements
* bug fix: imports
* install autograd for travis
* actually installing autograd for travis, hopefully
* bug fix: finished renaming "stepwise" to "piecewise"
* bug fix: _get_counts()
* bug fix: subset_chrom()
* bug fix: subset_chrom(), again
* bug fix: _check_counts_matrix() now only warns if counts are float, rather than raising ValueError
* bug fix: _initialize_struct_mds()
* added full tests of run with ambig & pa
* bug fix: prevent unnecessary erroneous warnings about non-integer counts
* bug fix: type(lengths) in PastisPM()
* bug fix: format of output files - history & inference_variables
* don't run python3-only unit tests with python2
* don't run python3-only unit tests with python2.. trying again
* don't run python3-only unit tests with python2.. trying again
* don't run python3-only unit tests with python2.. try #3

1 parent 16ee5d6 commit e8d0b60

9 files changed: +236 -85 lines changed


.travis.yml

+3-3
@@ -12,13 +12,13 @@ matrix:
     # Ubuntu 14.04 versions
     - name: "Python 2.7"
       python: 2.7
-      env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20
+      env: TEST=TRUE NUMPY_VERSION=1.16 SCIPY_VERSION=1.2 PANDAS_VERSION=* SKLEARN_VERSION=0.20 AUTOGRAD_VERSION=*
     - name: "Python 3.6"
       python: 3.6
-      env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 COVERAGE=true
+      env: TEST=TRUE NUMPY_VERSION=1.14 SCIPY_VERSION=0.19 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.18 AUTOGRAD_VERSION=1.2 COVERAGE=true
     - name: "Python 3.6 + recent sklearn"
       python: 3.6
-      env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 COVERAGE=true BUILD_DOC=true
+      env: TEST=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3.1 PANDAS_VERSION=0.25 SKLEARN_VERSION=0.21 AUTOGRAD_VERSION=1.2 COVERAGE=true BUILD_DOC=true
     # - name: "Python 3.7"
     #   python: 3.7
     #   env: TEST=TRUE COVERAGE=TRUE NUMPY_VERSION=1.17 SCIPY_VERSION=1.3 PANDAS_VERSION="*" SKLEARN_VERSION="*"

build_tools/travis/install.sh

+7
@@ -42,6 +42,13 @@ else
 fi
 
 
+if [[ $AUTOGRAD_VERSION != "*" ]]; then
+    pip install --upgrade autograd==$AUTOGRAD_VERSION
+else
+    pip install --upgrade autograd
+fi
+
+
 pip install cython
 
 
pastis/optimization/counts.py

+15-12
@@ -2,13 +2,14 @@
 
 import numpy as np
 from scipy import sparse
+from warnings import warn
 
 from iced.filter import filter_low_counts
 from iced.normalization import ICE_normalization
 
 from iced.io import write_counts
 
-from .utils_poisson import _constraint_dis_indices
+from .constraints import _constraint_dis_indices
 from .utils_poisson import find_beads_to_remove
 
 from .multiscale_optimization import decrease_lengths_res
@@ -153,15 +154,16 @@ def subset_chrom(ploidy, lengths_full, chrom_full, chrom_subset=None,
     the specified chromosomes. Otherwise, return None.
     """
 
-    if isinstance(chrom_subset, str):
-        chrom_subset = np.array([chrom_subset])
-    missing_chrom = [x for x in chrom_subset if x not in chrom_full]
-    if len(missing_chrom) > 0:
-        raise ValueError("Chromosomes to be subsetted (%s) are not in full"
-                         "list of chromosomes (%s)" %
-                         (','.join(missing_chrom), ','.join(chrom_full)))
-    # Make sure chrom_subset is sorted properly
-    chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
+    if chrom_subset is not None:
+        if isinstance(chrom_subset, str):
+            chrom_subset = np.array([chrom_subset])
+        missing_chrom = [x for x in chrom_subset if x not in chrom_full]
+        if len(missing_chrom) > 0:
+            raise ValueError("Chromosomes to be subsetted (%s) are not in full"
+                             "list of chromosomes (%s)" %
+                             (','.join(missing_chrom), ','.join(chrom_full)))
+        # Make sure chrom_subset is sorted properly
+        chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
 
     if chrom_subset is None or np.array_equal(chrom_subset, chrom_full):
         chrom_subset = chrom_full.copy()
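
Note: as an illustrative aside (not part of the diff above), a minimal runnable sketch of the guarded subsetting logic, with hypothetical chromosome names. The point of the change is that validation and reordering only run when a subset is actually requested, and that the subset keeps the order of the full chromosome list.

    import numpy as np

    chrom_full = ['chr1', 'chr2', 'chr3']  # hypothetical full chromosome list
    chrom_subset = 'chr3'                  # a bare string gets wrapped in an array
    if chrom_subset is not None:
        if isinstance(chrom_subset, str):
            chrom_subset = np.array([chrom_subset])
        missing_chrom = [x for x in chrom_subset if x not in chrom_full]
        if len(missing_chrom) > 0:
            raise ValueError("Chromosomes to be subsetted (%s) are not in full "
                             "list of chromosomes (%s)"
                             % (','.join(missing_chrom), ','.join(chrom_full)))
        # Keep the subset in the same order as the full chromosome list
        chrom_subset = [chrom for chrom in chrom_full if chrom in chrom_subset]
    print(chrom_subset)  # ['chr3']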
@@ -221,8 +223,9 @@ def _check_counts_matrix(counts, lengths, ploidy, exclude_zeros=True,
     if not isinstance(counts, np.ndarray):
         counts = np.array(counts)
 
-    if not np.array_equal(counts, counts.round()):
-        raise ValueError("Counts matrix must only contain integers or NaN")
+    if not np.array_equal(counts[~np.isnan(counts)],
+                          counts[~np.isnan(counts)].round()):
+        warn("Counts matrix must only contain integers or NaN")
 
     if counts.shape[0] == counts.shape[1]:
         counts[np.tril_indices(counts.shape[0])] = empty_val
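
Note: an illustrative aside (plain numpy, not the pastis API) on why the new check masks NaN entries first. np.array_equal treats NaN as unequal to itself, so comparing the whole matrix against its rounded copy flags any matrix containing NaN; comparing only the finite entries lets a float-typed but integer-valued matrix pass, and genuinely non-integer values now produce a warning instead of a ValueError.

    import numpy as np
    from warnings import warn

    counts = np.array([[0., 3., np.nan],
                       [0., 0., 2.],
                       [0., 0., 0.]])  # hypothetical counts with a missing entry

    finite = counts[~np.isnan(counts)]
    if not np.array_equal(finite, finite.round()):
        warn("Counts matrix must only contain integers or NaN")
    else:
        print("integer/NaN check passed")          # this branch runs here
    print(np.array_equal(counts, counts.round()))  # False, solely because of the NaN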

pastis/optimization/initialization.py

+3-3
@@ -9,7 +9,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
     """Initialize structure via multi-dimensional scaling of unambig counts.
     """
 
-    from .utils import find_beads_to_remove
+    from .utils_poisson import find_beads_to_remove
     from .mds import estimate_X
 
     if verbose:
@@ -31,7 +31,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
     ua_beta *= multiscale_factor ** 2
 
     struct = estimate_X(
-        ua_counts.counts.tocoo().astype(float),
+        ua_counts._counts.astype(float),
         alpha=-3. if alpha is None else alpha, beta=ua_beta, verbose=False,
         use_zero_entries=False, precompute_distances='auto',
         bias=(np.tile(bias, ploidy) if bias is not None else bias),
@@ -51,7 +51,7 @@ def _initialize_struct(counts, lengths, ploidy, alpha, bias, random_state,
     """Initialize structure, randomly or via MDS of unambig counts.
     """
 
-    from .utils import _struct_replace_nan, _format_structures
+    from .utils_poisson import _struct_replace_nan, _format_structures
 
     random_state = check_random_state(random_state)

pastis/optimization/load_data.py

+5-3
@@ -47,16 +47,18 @@ def _get_counts(counts, lengths):
     lengths = _get_lengths(lengths)
     output = []
     for f in counts:
-        if isinstance(f, np.ndarray) or isinstance(f, sparse.coo_matrix):
+        if isinstance(f, np.ndarray) or sparse.issparse(f):
             counts_maps = f
-        if f.endswith(".npy"):
+        elif f.endswith(".npy"):
             counts_maps = np.load(f)
-            counts_maps[np.isnan(counts_maps)] = 0
         elif f.endswith(".matrix"):
             counts_maps = load_counts(f, lengths=lengths)
         else:
             raise ValueError("Counts file must end with .npy (for numpy array)"
                              " or .matrix (for hiclib / iced format)")
+        if sparse.issparse(counts_maps):
+            counts_maps = counts_maps.toarray()
+        counts_maps[np.isnan(counts_maps)] = 0
         output.append(sparse.coo_matrix(counts_maps))
     return output
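
Note: an illustrative aside (assumed inputs, not the pastis API) showing the unified post-processing this change moves to the end of the loop: whatever the source (ndarray, sparse matrix, or a loaded .npy / .matrix file), the counts are densified, NaNs are zeroed, and the result is wrapped as a COO matrix.

    import numpy as np
    from scipy import sparse

    def _to_coo(counts_maps):
        # hypothetical helper mirroring the tail of the loop above
        if sparse.issparse(counts_maps):
            counts_maps = counts_maps.toarray()
        counts_maps = np.asarray(counts_maps, dtype=float)
        counts_maps[np.isnan(counts_maps)] = 0
        return sparse.coo_matrix(counts_maps)

    dense = np.array([[0., np.nan], [1., 2.]])
    print(_to_coo(dense).toarray())                     # NaN zeroed
    print(_to_coo(sparse.coo_matrix(dense)).toarray())  # sparse input handled too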

pastis/optimization/pastis_algorithms.py

+84-45
@@ -1,8 +1,16 @@
+from __future__ import print_function
+
+import sys
+
+if sys.version_info[0] < 3:
+    raise Exception("Must be using Python 3")
+
 import os
 import numpy as np
 import pandas as pd
 from sklearn.utils import check_random_state
-from .utils import _print_code_header
+from .utils_poisson import _print_code_header
+from distutils.util import strtobool
 
 
 def _test_objective(struct, counts, lengths, ploidy, alpha, bias,
@@ -37,7 +45,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
           normalize=True, filter_threshold=0.04, alpha_init=-3.,
           max_alpha_loop=20, beta=None, multiscale_factor=1,
           multiscale_rounds=1, use_multiscale_variance=True,
-          final_multiscale_round=False, init='msd', max_iter=10000000000,
+          final_multiscale_round=False, init='mds', max_iter=10000000000,
           factr=10000000., pgtol=1e-05, alpha_factr=1000000000000.,
           bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
           fullres_torm=None, struct_draft_fullres=None, draft=False,
@@ -138,7 +146,6 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
     infer_var : dict
         A few of the variables used in inference or generated by inference.
         Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.
-
     """
 
     from .counts import preprocess_counts, ambiguate_counts, _update_betas_in_counts_matrices
@@ -148,34 +155,40 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
     from .poisson import PastisPM
     from .estimate_alpha_beta import _estimate_beta
     from .multiscale_optimization import get_multiscale_variances_from_struct, _choose_max_multiscale_factor, decrease_lengths_res
-    from .utils import find_beads_to_remove
-
-    try:
-        os.makedirs(outdir)
-    except OSError:
-        pass
-    if seed is None:
-        seed_str = ''
-    else:
-        seed_str = '.%03d' % seed
-    out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
-    orient_file = os.path.join(outdir, 'orient_inferred%s.coords' % seed_str)
-    history_file = os.path.join(outdir, 'history%s' % seed_str)
-    infer_var_file = os.path.join(
-        outdir, 'inference_variables%s' % seed_str)
-    out_fail = os.path.join(outdir, 'struct_nonconverged%s.coords' % seed_str)
-
-    if os.path.exists(out_file):
-        print('CONVERGED', flush=True)
-        infer_var = dict(pd.read_csv(
-            infer_var_file, sep='\t', header=None, squeeze=True, index_col=0))
-        infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
-        infer_var['alpha'] = float(infer_var['alpha'])
-        struct_ = np.loadtxt(out_file)
-        return struct_, infer_var
-    elif os.path.exists(out_fail):
-        print('OPTIMIZATION DID NOT CONVERGE', flush=True)
-        exit(1)
+    from .utils_poisson import find_beads_to_remove
+
+    if outdir is not None:
+        try:
+            os.makedirs(outdir)
+        except OSError:
+            pass
+        if seed is None:
+            seed_str = ''
+        else:
+            seed_str = '.%03d' % seed
+        out_file = os.path.join(outdir, 'struct_inferred%s.coords' % seed_str)
+        orient_file = os.path.join(
+            outdir, 'orient_inferred%s.coords' % seed_str)
+        history_file = os.path.join(outdir, 'history%s' % seed_str)
+        infer_var_file = os.path.join(
+            outdir, 'inference_variables%s' % seed_str)
+        out_fail = os.path.join(
+            outdir, 'struct_nonconverged%s.coords' % seed_str)
+
+        if os.path.exists(out_file) or os.path.exists(out_fail):
+            if os.path.exists(out_file):
+                print('CONVERGED', flush=True)
+            elif os.path.exists(out_fail):
+                print('OPTIMIZATION DID NOT CONVERGE', flush=True)
+            infer_var = dict(pd.read_csv(
+                infer_var_file, sep='\t', header=None, squeeze=True,
+                index_col=0))
+            infer_var['beta'] = [float(b) for b in infer_var['beta'].split()]
+            infer_var['hsc_r'] = [float(r) for r in infer_var['hsc_r'].split()]
+            infer_var['alpha'] = float(infer_var['alpha'])
+            infer_var['converged'] = strtobool(infer_var['converged'])
+            struct_ = np.loadtxt(out_file)
+            return struct_, infer_var
 
     random_state = np.random.RandomState(seed)
     random_state = check_random_state(random_state)
@@ -256,6 +269,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 alpha_true=alpha_true, struct_true=struct_true,
                 input_weight=input_weight, exclude_zeros=exclude_zeros,
                 null=null, mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var_fullres['converged']:
+                return struct_draft_fullres, infer_var_fullres
             alpha_ = infer_var_fullres['alpha']
             beta_ = infer_var_fullres['beta']
             counts = _update_betas_in_counts_matrices(
@@ -290,7 +305,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
             simple_diploid_for_lowres = True
             multiscale_factor_for_lowres = _choose_max_multiscale_factor(
                 lengths=lengths, min_beads=hsc_min_beads)
-            struct_draft_lowres, _ = infer(
+            struct_draft_lowres, infer_var_lowres = infer(
                 counts_raw=counts_for_lowres,
                 outdir=os.path.join(outdir, 'struct_draft_lowres'),
                 lengths=lengths, ploidy=ploidy, alpha=alpha_,
@@ -309,6 +324,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 struct_true=struct_true, input_weight=input_weight,
                 exclude_zeros=exclude_zeros, null=null,
                 mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var_lowres['converged']:
+                return struct_draft_lowres, infer_var_lowres
             hsc_r = distance_between_homologs(
                 structures=struct_draft_lowres,
                 lengths=decrease_lengths_res(
@@ -359,20 +376,30 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
         struct_[torm] = np.nan
 
         infer_var = {'alpha': pm.alpha_, 'beta': pm.beta_, 'hsc_r': hsc_r,
-                     'obj': pm.obj_, 'seed': seed}
+                     'obj': pm.obj_, 'seed': seed, 'converged': pm.converged_}
+
+        if outdir is not None:
+            with open(infer_var_file, 'w') as f:
+                for k, v in infer_var.items():
+                    if isinstance(v, np.ndarray) or isinstance(v, list):
+                        f.write(
+                            '%s\t%s\n' % (k, ' '.join(['%g' % x for x in v])))
+                    else:
+                        f.write('%s\t%g\n' % (k, v))
+            if reorienter is not None and reorienter.reorient:
+                np.savetxt(orient_file, pm.orientation_)
+            if pm.converged_:
+                np.savetxt(out_file, struct_)
+                if pm.history_ is not None:
+                    pd.DataFrame(
+                        pm.history_).to_csv(history_file, sep='\t', index=False)
+            else:
+                np.savetxt(out_fail, struct_)
 
-        if reorienter is not None and reorienter.reorient:
-            np.savetxt(orient_file, pm.orientation_)
         if pm.converged_:
-            np.savetxt(out_file, struct_)
-            pd.Series(infer_var).to_csv(infer_var_file, sep='\t', header=False)
-            if pm.history_ is not None:
-                pd.DataFrame(
-                    pm.history_).to_csv(history_file, sep='\t', index=False)
             return struct_, infer_var
         else:
-            np.savetxt(out_fail, struct_)
-            exit(1)
+            return None, infer_var
 
     else:
         # BEGIN MULTISCALE OPTIMIZATION
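
Note: an illustrative aside (hypothetical values, standalone file) on the inference_variables format written above and parsed back near the top of infer(): one key<TAB>value line per entry, list values space-joined with '%g', and the new 'converged' flag recovered with strtobool. This sketch uses distutils.util.strtobool to match the import in this commit, so it assumes a Python version that still ships distutils (it was removed in 3.12).

    import numpy as np
    from distutils.util import strtobool

    # hypothetical values standing in for the real inference output
    infer_var = {'alpha': -3.0, 'beta': [0.5, 0.5], 'hsc_r': [1.2],
                 'obj': 123.4, 'seed': 0, 'converged': True}

    with open('inference_variables.000', 'w') as f:
        for k, v in infer_var.items():
            if isinstance(v, np.ndarray) or isinstance(v, list):
                f.write('%s\t%s\n' % (k, ' '.join(['%g' % x for x in v])))
            else:
                f.write('%s\t%g\n' % (k, v))

    with open('inference_variables.000') as f:
        reloaded = dict(line.rstrip('\n').split('\t') for line in f)
    reloaded['beta'] = [float(b) for b in reloaded['beta'].split()]
    reloaded['hsc_r'] = [float(r) for r in reloaded['hsc_r'].split()]
    reloaded['alpha'] = float(reloaded['alpha'])
    reloaded['converged'] = bool(strtobool(reloaded['converged']))
    print(reloaded)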
@@ -413,6 +440,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
                 alpha_true=alpha_true, struct_true=struct_true,
                 input_weight=input_weight, exclude_zeros=exclude_zeros,
                 null=null, mixture_coefs=mixture_coefs, verbose=verbose)
+            if not infer_var['converged']:
+                return struct_, infer_var
         return struct_, infer_var
 
 
@@ -427,7 +456,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
                    piecewise=False, piecewise_step=None, piecewise_chrom=None,
                    piecewise_min_beads=5, piecewise_fix_homo=False,
                    piecewise_opt_orient=True, alpha_true=None, struct_true=None,
-                   init='msd', input_weight=None, exclude_zeros=False,
+                   init='mds', input_weight=None, exclude_zeros=False,
                    null=False, mixture_coefs=None, verbose=True):
     """Infer 3D structures with PASTIS via Poisson model.
 
@@ -492,10 +521,18 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
     hsc_min_beads : int, optional
         For diploid organisms: number of beads in the low-resolution
         structure from which `hsc_r` is estimated.
+
+    Returns
+    -------
+    struct_ : array_like of float of shape (lengths.sum() * ploidy, 3)
+        3D structure resulting from the optimization.
+    infer_var : dict
+        A few of the variables used in inference or generated by inference.
+        Keys: 'alpha', 'beta', 'hsc_r', 'obj', and 'seed'.
     """
 
     from .load_data import load_data
-    from .stepwise_whole_genome import stepwise_inference
+    from .piecewise_whole_genome import piecewise_inference
 
     lengths_full = lengths
     chrom_full = chromosomes
@@ -529,7 +566,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
             input_weight=input_weight, exclude_zeros=exclude_zeros,
             null=null, mixture_coefs=mixture_coefs, verbose=verbose)
     else:
-        stepwise_inference(
+        struct_, infer_var = piecewise_inference(
            counts=counts, outdir=outdir, lengths=lengths_subset, ploidy=ploidy,
            chromosomes=chrom_subset, alpha=alpha, seed=seed, normalize=normalize,
            filter_threshold=filter_threshold, alpha_init=alpha_init,
@@ -548,6 +585,8 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
            input_weight=input_weight, exclude_zeros=exclude_zeros, null=null,
            mixture_coefs=mixture_coefs, verbose=verbose)
 
+    return struct_, infer_var
+
 
 def _output_subdir(outdir, chrom_full, chrom_subset=None, null=False,
                    piecewise=False, piecewise_step=None,

pastis/optimization/piecewise_whole_genome.py

+11-11
@@ -287,24 +287,24 @@ def _assemble_highres_chrom_via_lowres_genome(outdir, outdir_lowres, outdir_orie
     return X_all_chrom, trans_rot_init
 
 
-def stepwise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
-                       filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
-                       use_multiscale_variance=True,
-                       max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
-                       bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
-                       callback_function=None, callback_freq=None,
-                       piecewise_step=None, piecewise_chrom=None,
-                       piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
-                       alpha_true=None, struct_true=None, init='msd', input_weight=None,
-                       exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
+def piecewise_inference(counts, outdir, lengths, ploidy, chromosomes, alpha, seed=0, normalize=True,
+                        filter_threshold=0.04, alpha_init=-3., max_alpha_loop=20, multiscale_rounds=1,
+                        use_multiscale_variance=True,
+                        max_iter=1e40, factr=10000000.0, pgtol=1e-05, alpha_factr=1000000000000.,
+                        bcc_lambda=0., hsc_lambda=0., hsc_r=None, hsc_min_beads=5,
+                        callback_function=None, callback_freq=None,
+                        piecewise_step=None, piecewise_chrom=None,
+                        piecewise_min_beads=5, piecewise_fix_homo=False, piecewise_opt_orient=True,
+                        alpha_true=None, struct_true=None, init='msd', input_weight=None,
+                        exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
     """
     """
 
     import os
     from .load_data import _load_inferred_struct, _choose_best_seed
     from .counts import subset_chrom
     from .pastis_algorithms import infer, _output_subdir
-    from .utils import _choose_max_multiscale_factor, _print_code_header
+    from .utils_poisson import _choose_max_multiscale_factor, _print_code_header
 
     if piecewise_step is None:
         piecewise_step = [1, 2, 3, 4]
