Merge pull request #136 from JonathanShor/adam_changes
move to poetry for dist, remove excess code, use pip installable pheno
adamgayoso authored Dec 18, 2020
2 parents b784798 + efdd775 commit 3534bd5
Showing 11 changed files with 3,006 additions and 453 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,44 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: doubletdetection

on:
push:
branches: [master]
pull_request:
branches: [master]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Cache pip
uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install dependencies
run: |
pip install --quiet .[dev]
- name: Lint with flake8
run: |
flake8
- name: Format with black
run: |
black --check .
- name: Test with pytest
run: |
pytest
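
These checks can be reproduced locally before pushing; a minimal sketch, assuming the project's `dev` extra installs `flake8`, `black`, and `pytest` as the workflow expects:

```bash
# Mirror the four CI steps above: install dev dependencies,
# lint, verify formatting, and run the test suite.
pip install --quiet .[dev]
flake8
black --check .
pytest
```
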
12 changes: 11 additions & 1 deletion README.md
@@ -2,11 +2,21 @@

[![DOI](https://zenodo.org/badge/86256007.svg)](https://zenodo.org/badge/latestdoi/86256007)
[![Documentation Status](https://readthedocs.org/projects/doubletdetection/badge/?version=latest)](https://doubletdetection.readthedocs.io/en/latest/?badge=latest)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
![Build Status](https://github.com/JonathanShor/DoubletDetection/workflows/doubletdetection/badge.svg)

DoubletDetection is a Python3 package to detect doublets (technical errors) in single-cell RNA-seq count matrices.

## Installing DoubletDetection

Install from PyPI

```bash
pip install doubletdetection
```

Install from source

```bash
git clone https://github.com/JonathanShor/DoubletDetection.git
cd DoubletDetection
@@ -41,7 +51,7 @@ The classifier works best when

In `v2.5` we have added a new experimental clustering method (`scanpy`'s Louvain clustering) that is much faster than phenograph. We are still validating results from this new clustering. Please see the notebook below for an example of using this new feature.
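
To opt into the experimental path, a minimal sketch, assuming the `use_phenograph` flag visible in the classifier diff below (other constructor arguments left at their defaults):

```python
import doubletdetection

# use_phenograph=False switches clustering from phenograph to
# scanpy's Louvain implementation (the new experimental method)
clf = doubletdetection.BoostClassifier(use_phenograph=False)
```
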

-See our [jupyter notebook](https://nbviewer.jupyter.org/github/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_8k_vignette.ipynb) for an example on 8k PBMCs from 10x.
+See our [jupyter notebook](https://nbviewer.jupyter.org/github/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb) for an example on 10k PBMCs from 10x.

## Obtaining data

14 changes: 13 additions & 1 deletion doubletdetection/__init__.py
@@ -1,2 +1,14 @@
-from .doubletdetection import BoostClassifier, load_mtx, load_10x_h5
+from .doubletdetection import BoostClassifier
from . import plot


# https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094
# https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302
try:
import importlib.metadata as importlib_metadata
except ModuleNotFoundError:
import importlib_metadata
package_name = "doubletdetection"
__version__ = importlib_metadata.version(package_name)

__all__ = ["BoostClassifier", "plot"]
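
A quick way to confirm the metadata-derived version resolves; a sketch that assumes the package is installed via pip or poetry rather than imported from a raw source tree:

```python
import doubletdetection

# __version__ is read from the installed distribution's metadata,
# so an uninstalled checkout raises PackageNotFoundError instead.
print(doubletdetection.__version__)
```
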
110 changes: 35 additions & 75 deletions doubletdetection/doubletdetection.py
@@ -1,70 +1,22 @@
"""Doublet detection in single-cell RNA-seq data."""

import collections
import io
import warnings
from contextlib import redirect_stdout

import anndata
import numpy as np
import phenograph
import scipy.sparse as sp_sparse
import tables
import scanpy as sc
from scipy.io import mmread
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from sklearn.utils import check_array
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
from tqdm.auto import tqdm


-def load_10x_h5(file, genome):
-    """Load count matrix in 10x H5 format
-    Adapted from:
-    https://support.10xgenomics.com/single-cell-gene-expression/software/
-    pipelines/latest/advanced/h5_matrices
-    Args:
-        file (str): Path to H5 file
-        genome (str): genome, top level h5 group
-    Returns:
-        ndarray: Raw count matrix.
-        ndarray: Barcodes
-        ndarray: Gene names
-    """
-
-    with tables.open_file(file, "r") as f:
-        try:
-            group = f.get_node(f.root, genome)
-        except tables.NoSuchNodeError:
-            print("That genome does not exist in this file.")
-            return None
-        # gene_ids = getattr(group, 'genes').read()
-        gene_names = getattr(group, "gene_names").read()
-        barcodes = getattr(group, "barcodes").read()
-        data = getattr(group, "data").read()
-        indices = getattr(group, "indices").read()
-        indptr = getattr(group, "indptr").read()
-        shape = getattr(group, "shape").read()
-        matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)
-
-    return matrix, barcodes, gene_names
-
-
-def load_mtx(file):
-    """Load count matrix in mtx format
-    Args:
-        file (str): Path to mtx file
-    Returns:
-        ndarray: Raw count matrix.
-    """
-    raw_counts = np.transpose(mmread(file))
-
-    return raw_counts.tocsc()
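
Both removed loaders have close scanpy equivalents, consistent with the new `scanpy` import above; a sketch of the replacement calls (file paths are placeholders):

```python
import scanpy as sc

# Replacement for load_10x_h5: returns an AnnData object
adata = sc.read_10x_h5("filtered_feature_bc_matrix.h5")

# Replacement for load_mtx: note that load_mtx transposed the matrix,
# so a .T may be needed depending on how the .mtx file was written
adata_mtx = sc.read_mtx("matrix.mtx")

raw_counts = adata.X  # sparse counts, the input BoostClassifier.fit expects
```
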


class BoostClassifier:
"""Classifier for doublets in single-cell RNA-seq data.
@@ -162,8 +114,6 @@ def __init__(
if use_phenograph is True:
if "prune" not in phenograph_parameters:
phenograph_parameters["prune"] = True
if ("verbosity" not in phenograph_parameters) and (not self.verbose):
phenograph_parameters["verbosity"] = 1
self.phenograph_parameters = phenograph_parameters
if (self.n_iters == 1) and (phenograph_parameters.get("prune") is True):
warn_msg = (
@@ -238,9 +188,7 @@ def fit(self, raw_counts):
self.all_log_p_values_ = np.zeros((self.n_iters, self._num_cells))
all_communities = np.zeros((self.n_iters, self._num_cells))
all_parents = []
-        all_synth_communities = np.zeros(
-            (self.n_iters, int(self.boost_rate * self._num_cells))
-        )
+        all_synth_communities = np.zeros((self.n_iters, int(self.boost_rate * self._num_cells)))

for i in tqdm(range(self.n_iters)):
if self.verbose:
@@ -284,9 +232,7 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
"""
log_p_thresh = np.log(p_thresh)
if self.n_iters > 1:
-            with np.errstate(
-                invalid="ignore"
-            ):  # Silence numpy warning about NaN comparison
+            with np.errstate(invalid="ignore"):  # Silence numpy warning about NaN comparison
self.voting_average_ = np.mean(
np.ma.masked_invalid(self.all_log_p_values_) <= log_p_thresh, axis=0
)
@@ -298,20 +244,34 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
# Find a cutoff score
potential_cutoffs = np.unique(self.all_scores_[~np.isnan(self.all_scores_)])
if len(potential_cutoffs) > 1:
-            max_dropoff = (
-                np.argmax(potential_cutoffs[1:] - potential_cutoffs[:-1]) + 1
-            )
+            max_dropoff = np.argmax(potential_cutoffs[1:] - potential_cutoffs[:-1]) + 1
else: # Most likely pathological dataset, only one (or no) clusters
max_dropoff = 0
self.suggested_score_cutoff_ = potential_cutoffs[max_dropoff]
-        with np.errstate(
-            invalid="ignore"
-        ):  # Silence numpy warning about NaN comparison
+        with np.errstate(invalid="ignore"):  # Silence numpy warning about NaN comparison
self.labels_ = self.all_scores_[0, :] >= self.suggested_score_cutoff_
self.labels_[np.isnan(self.all_scores_)[0, :]] = np.nan

return self.labels_

def doublet_score(self):
"""Produce doublet scores
The doublet score is the average negative log p-value of doublet enrichment
averaged over the iterations. Higher means more likely to be doublet.
Returns:
scores (ndarray, ndims=1): Average negative log p-value over iterations
"""

if self.n_iters > 1:
with np.errstate(invalid="ignore"): # Silence numpy warning about NaN comparison
avg_log_p = np.mean(np.ma.masked_invalid(self.all_log_p_values_), axis=0)
else:
avg_log_p = self.all_log_p_values_[0]

return -avg_log_p
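
Taken together with `fit` and `predict`, the new method slots in as follows; a minimal usage sketch in which the random matrix is only a stand-in for a real cells-by-genes count matrix:

```python
import numpy as np
import doubletdetection

raw_counts = np.random.poisson(0.5, size=(500, 200))  # placeholder counts

clf = doubletdetection.BoostClassifier()
clf.fit(raw_counts)
labels = clf.predict(p_thresh=1e-7, voter_thresh=0.9)  # 1.0 marks a predicted doublet
scores = clf.doublet_score()  # higher = more doublet-like
```
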

def _one_fit(self):
if self.verbose:
print("\nCreating synthetic doublets...")
@@ -347,19 +307,22 @@ def _one_fit(self):
if self.verbose:
print("Clustering augmented data set...\n")
if self.use_phenograph:
-            fullcommunities, _, _ = phenograph.cluster(
-                aug_counts.obsm["X_pca"], **self.phenograph_parameters
-            )
+            f = io.StringIO()
+            with redirect_stdout(f):
+                fullcommunities, _, _ = phenograph.cluster(
+                    aug_counts.obsm["X_pca"], **self.phenograph_parameters
+                )
+            out = f.getvalue()
+            if self.verbose:
+                print(out)
else:
sc.pp.neighbors(
aug_counts,
random_state=self.random_state,
method="umap",
n_neighbors=10,
)
-            sc.tl.louvain(
-                aug_counts, random_state=self.random_state, resolution=4, directed=False
-            )
+            sc.tl.louvain(aug_counts, random_state=self.random_state, resolution=4, directed=False)
fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
min_ID = min(fullcommunities)
self.communities_ = fullcommunities[: self._num_cells]
@@ -380,8 +343,7 @@
orig_cells_per_comm = collections.Counter(self.communities_)
community_IDs = orig_cells_per_comm.keys()
community_scores = {
-            i: float(synth_cells_per_comm[i])
-            / (synth_cells_per_comm[i] + orig_cells_per_comm[i])
+            i: float(synth_cells_per_comm[i]) / (synth_cells_per_comm[i] + orig_cells_per_comm[i])
for i in community_IDs
}
scores = np.array([community_scores[i] for i in self.communities_])
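
These community fractions feed a hypergeometric enrichment test (hence the `hypergeom` import kept at the top of the file). A sketch of one plausible formulation of that test, not necessarily the exact call used in `_one_fit`:

```python
from scipy.stats import hypergeom

def community_log_p(k: int, M: int, n_synth: int, n_comm: int) -> float:
    """log P(X >= k): observing at least k synthetic cells in a community
    of size n_comm, drawn from M augmented cells of which n_synth are
    synthetic. More negative means stronger doublet enrichment."""
    return hypergeom.logsf(k - 1, M, n_synth, n_comm)
```
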
@@ -412,9 +374,7 @@ def _createDoublets(self):
num_synths = int(self.boost_rate * self._num_cells)

# Parent indices
-        choices = np.random.choice(
-            self._num_cells, size=(num_synths, 2), replace=self.replace
-        )
+        choices = np.random.choice(self._num_cells, size=(num_synths, 2), replace=self.replace)
parents = [list(p) for p in choices]

parent0 = self._raw_counts[choices[:, 0], :]
Diffs for the remaining 7 changed files are not shown here.
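
The synthesis step that the truncated `_createDoublets` diff above begins is simple to state on its own; a standalone sketch, with the final parent-summing step assumed from context:

```python
import numpy as np
from scipy.sparse import csr_matrix

def create_doublets(raw_counts: csr_matrix, boost_rate: float, replace: bool):
    """Synthesize doublets by summing the counts of randomly paired parents."""
    num_cells = raw_counts.shape[0]
    num_synths = int(boost_rate * num_cells)
    choices = np.random.choice(num_cells, size=(num_synths, 2), replace=replace)
    # A synthetic doublet is the elementwise sum of its two parents' counts
    synthetic = raw_counts[choices[:, 0], :] + raw_counts[choices[:, 1], :]
    return synthetic, choices
```
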
