simslab · ricomnl · Aug 8, 2022 · Aug 8, 2022
diff --git a/schpf/preprocessing.py b/schpf/preprocessing.py
@@ -64,6 +64,39 @@ def load_loom(filename):
     return loom_coo,loom_genes
 
 
+def load_h5ad(filename):
+    """Load a count matrix and gene attributes from an h5ad (anndata) file
+    Parameters
+    ----------
+    filename: str
+        path of the h5ad file to load
+    Returns
+    -------
+    coo : coo_matrix
+        cell x gene sparse count matrix, built from the file's `X` and
+        converted to COO format
+    genes : Dataframe
+        Dataframe of gene attributes (the file's `var`).  Attributes are
+        ordered so Accession and Gene are the first columns, if those
+        attributes are present; the remaining columns follow in sorted order
+    """
+    # imported locally so anndata is only required when h5ad input is used
+    import anndata
+    from scipy.sparse import coo_matrix
+    adata = anndata.read_h5ad(filename)
+    adata_genes = adata.var
+    # `X` may be dense or another sparse format; convert so the documented
+    # COO return type actually holds
+    adata_coo = coo_matrix(adata.X)
+
+    # order gene attributes so Accession and Gene are the first two columns,
+    # if they are present
+    first_cols = [c for c in ['Accession', 'Gene'] if c in adata_genes.columns]
+    rest_cols = adata_genes.columns.difference(first_cols).tolist()
+    adata_genes = adata_genes[first_cols + rest_cols]
+    return adata_coo,adata_genes
+
+
 def load_txt(filename,  ngene_cols=2, verbose=True):
     """Load data from a whitespace delimited txt file
 
@@ -334,9 +367,9 @@ def load_and_filter(infile, min_cells, whitelist='', blacklist='',
     infile : str
         Input data. Currently accepts either: (1) a whitespace-delimited gene
         by cell UMI count matrix with 2 leading columns of gene attributes
-        (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file with at
-        least one of the row attributes `Accession` or `Gene`, where `Accession`
-        is an ENSEMBL id and `Gene` is the name.
+        (ENSEMBL_ID and GENE_NAME respectively), or (2) a loom file (row
+        attributes) or h5ad/anndata file (`var` columns) with at least one
+        of `Accession` (an ENSEMBL id) or `Gene` (the gene name).
     min_cells : float or int
         Minimum number of cells in which we must observe at least one transcript
         of a gene for the gene to pass filtering. If 0 <`min_cells`< 1, sets
@@ -386,6 +419,18 @@ def load_and_filter(infile, min_cells, whitelist='', blacklist='',
             msg = 'loom files must have at least one of the row '
             msg+= 'attributes: `Gene` or `Accession`.'
             raise ValueError(msg)
+    elif infile.endswith('.h5ad'):
+        umis, genes = load_h5ad(infile)
+        if 'Accession' in genes.columns:
+            candidate_names = genes['Accession']
+            genelist_col = 0
+        elif 'Gene' in genes.columns:
+            candidate_names = genes['Gene']
+            genelist_col = 1
+        else:
+            msg = 'h5ad files must have at least one of the row '
+            msg+= 'attributes: `Gene` or `Accession`.'
+            raise ValueError(msg)
     else:
         umis, genes = load_txt(infile)
         genelist_col = 1 if filter_by_gene_name else 0

diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 requires = ['scikit-learn',
             "numba >= 0.39, !=0.41, !=0.42, !=0.43; python_version<='3.7.3'",
             "numba >= 0.44; python_version=='3.7.4'",
-            "numba >= 0.45; python_version>'3.7.4'",
+            "numba >= 0.45, <0.53.0; python_version>'3.7.4'",
             'scipy >= 1.1',
             'numpy',
             'pandas',
@@ -19,6 +19,7 @@
 tests_require = ['pytest']
 extras_require = {
         'loompy' : ['loompy'],
+        'anndata' : ['anndata'],
         'docs' : ['sphinx-argparse'],
         }