make mypy and pylint pass (#11)
* pylint fix for ncbitax2lin/data_io.py

* pylint fix for ncbitax2lin/__init__.py

* pylint fix for ncbitax2lin/ncbitax2lin.py

* added autoflake poetry.lock

* added mypy.ini pylintrc

* pylint utils.py

* bump to 2.0.2

* update
zyxue authored May 3, 2020
1 parent 4cf732c commit 7858c08
Showing 10 changed files with 90 additions and 42 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
## Change Log

### v2.0.2 (2020/05/02)

- made pylint and mypy pass

### v2.0.1 (2020/05/02)

- adopted [poetry](https://python-poetry.org/) for package management
5 changes: 5 additions & 0 deletions mypy.ini
@@ -0,0 +1,5 @@
[mypy]
python_version = 3.7
disallow_untyped_defs = True
ignore_missing_imports = True
show_column_numbers = True
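
With disallow_untyped_defs = True, mypy flags any function definition that lacks type annotations, and ignore_missing_imports silences errors for third-party packages that ship no type stubs. A minimal sketch of what this configuration accepts and rejects (the function names below are hypothetical, not from this repository):

import pandas as pd


def load_table(path: str) -> pd.DataFrame:
    # Fully annotated: accepted under disallow_untyped_defs.
    return pd.read_csv(path)


def load_table_untyped(path):
    # Not annotated: mypy reports "Function is missing a type annotation".
    return pd.read_csv(path)
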
4 changes: 3 additions & 1 deletion ncbitax2lin/__init__.py
@@ -1 +1,3 @@
__version__ = "2.0.1"
"""__init__.py for this project"""

__version__ = "2.0.2"
33 changes: 18 additions & 15 deletions ncbitax2lin/data_io.py
@@ -1,5 +1,4 @@
import gzip
import io
"""utility functions related to IO"""

import pandas as pd

@@ -18,7 +17,7 @@ def load_nodes(nodes_file: str) -> pd.DataFrame:
"""
load nodes.dmp and convert it into a pandas.DataFrame
"""
df = pd.read_csv(
df_data = pd.read_csv(
nodes_file,
sep="|",
header=None,
@@ -40,35 +39,39 @@
],
)

# To get rid of flanking tab characters
df["rank"] = df["rank"].apply(strip)
df["embl_code"] = df["embl_code"].apply(strip)
df["comments"] = df["comments"].apply(strip)
return df
return df_data.assign(
rank=lambda df: df["rank"].apply(strip),
embl_code=lambda df: df["embl_code"].apply(strip),
comments=lambda df: df["comments"].apply(strip),
)


@utils.timeit
def load_names(names_file: str) -> pd.DataFrame:
"""
load names.dmp and convert it into a pandas.DataFrame
"""
df = pd.read_csv(
df_data = pd.read_csv(
names_file,
sep="|",
header=None,
index_col=False,
names=["tax_id", "name_txt", "unique_name", "name_class"],
)
df["name_txt"] = df["name_txt"].apply(strip)
df["unique_name"] = df["unique_name"].apply(strip)
df["name_class"] = df["name_class"].apply(strip)

sci_df = df[df["name_class"] == "scientific name"]
sci_df.reset_index(drop=True, inplace=True)
return sci_df
return (
df_data.assign(
name_txt=lambda df: df["name_txt"].apply(strip),
unique_name=lambda df: df["unique_name"].apply(strip),
name_class=lambda df: df["name_class"].apply(strip),
)
.loc[lambda df: df["name_class"] == "scientific name"]
.reset_index(drop=True)
)


def read_names_and_nodes(names_file: str, nodes_file: str) -> pd.DataFrame:
"""Reads in data from names and nodes files"""
# data downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/
# args = parse_args()
nodes_df = load_nodes(nodes_file)
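The data_io.py rewrite above replaces in-place column mutation with a single chained DataFrame.assign call whose values are callables, so each function builds and returns a new frame instead of modifying one. A minimal sketch of the same pattern on a toy frame (the strip helper is a stand-in for the one referenced in data_io.py, and the column values are made up):

import pandas as pd


def strip(value: str) -> str:
    # Stand-in for the helper that removes flanking tab characters.
    return value.strip()


df_raw = pd.DataFrame({"rank": ["\tphylum\t", "\tgenus\t"], "comments": ["\tok\t", "\t\t"]})

# Before: mutate columns in place.
df_mut = df_raw.copy()
df_mut["rank"] = df_mut["rank"].apply(strip)
df_mut["comments"] = df_mut["comments"].apply(strip)

# After: assign with callables builds a new frame; each lambda receives the frame.
df_new = df_raw.assign(
    rank=lambda df: df["rank"].apply(strip),
    comments=lambda df: df["comments"].apply(strip),
)

assert df_new.equals(df_mut)
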
45 changes: 24 additions & 21 deletions ncbitax2lin/ncbitax2lin.py
@@ -1,9 +1,7 @@
import argparse
import gzip
"""Converts NCBI taxonomy dump into lineages"""

import logging
import multiprocessing
import os
import re
from typing import Container, Dict, Iterable, List, NewType, Optional, Tuple, Union

import fire
@@ -15,6 +13,9 @@
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s|%(levelname)s|%(message)s")


_LOGGER = logging.getLogger(__name__)


class TaxUnit(TypedDict):
"""
Represents a basic unit in taxonomy e.g. (phylum, Proteobacteria), where
@@ -53,13 +54,13 @@ def calc_rank_key(rank: str, existing_ranks: Container[str]) -> str:
# e.g. there could be multiple 'no rank'
if rank not in existing_ranks:
return rank
else:
count = 1

count = 1
numbered_rank = f"{rank}{count}"
while numbered_rank in existing_ranks:
count += 1
numbered_rank = f"{rank}{count}"
while numbered_rank in existing_ranks:
count += 1
numbered_rank = f"{rank}{count}"
return numbered_rank
return numbered_rank


def calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]:
@@ -68,8 +69,9 @@ def calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]:


def find_lineage(tax_id: int) -> Lineage:
"""Finds lineage for a single tax id"""
if tax_id % 50000 == 0:
logging.debug("working on tax_id: {0}".format(tax_id))
_LOGGER.info("working on tax_id: %d", tax_id)

lineage = []
while True:
@@ -88,11 +90,11 @@ def find_lineage(tax_id: int) -> Lineage:


def find_all_lineages(tax_ids: Iterable) -> List[Lineage]:
"""find the lineages for all tax ids"""
"""Finds the lineages for all tax ids"""
ncpus = multiprocessing.cpu_count()
logging.info(
"found {0} cpus, and will use all of them to find lineages "
"for all tax ids".format(ncpus)
_LOGGER.info(
"found %d cpus, and will use all of them to find lineages for all tax ids",
ncpus,
)

with multiprocessing.Pool(ncpus) as pool:
@@ -146,7 +148,7 @@ def convert_lineage_to_dict(lineage: Lineage) -> Dict[str, Union[int, str]]:

def prepare_lineages_for_output(lineages: List[Lineage]) -> pd.DataFrame:
"""prepares lineages into a dataframe for writing to disk"""
logging.info("Preparings all lineages into a dataframe to be written to disk ...")
_LOGGER.info("Preparings all lineages into a dataframe to be written to disk ...")

df_out = pd.DataFrame([convert_lineage_to_dict(lineage) for lineage in lineages])

@@ -167,11 +169,11 @@ def taxonomy_to_lineages(
output_prefix: output lineages will be written to output_prefix.csv.gz
"""
df_data = data_io.read_names_and_nodes(names_file, nodes_file)
logging.info(f"# of tax ids: {df_data.shape[0]:,d}")
logging.info(f"df.info:\n{utils.collect_df_info(df_data)}")
_LOGGER.info("# of tax ids: %s", f"{df_data.shape[0]:,d}")
_LOGGER.info("df.info:\n%s", f"{utils.collect_df_info(df_data)}")

logging.info("Generating TAXONOMY_DICT ...")
global TAXONOMY_DICT
_LOGGER.info("Generating TAXONOMY_DICT ...")
global TAXONOMY_DICT # pylint: disable=global-statement
TAXONOMY_DICT = calc_taxonomy_dict(df_data)

lineages = find_all_lineages(df_data.tax_id)
@@ -181,9 +183,10 @@
if output is None:
output = f"ncbi_lineages_{pd.Timestamp.utcnow().date()}.csv.gz"
utils.maybe_backup_file(output)
logging.info(f"Writing lineages to {output} ...")
_LOGGER.info("Writing lineages to %s ...", output)
data_io.write_lineages_to_disk(df_lineages, output)


def main() -> None:
"""Main function, entry point"""
fire.Fire(taxonomy_to_lineages)
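
The ncbitax2lin.py changes above also switch from module-level logging calls that use str.format or f-strings to a named module logger with %-style arguments, which is what pylint's logging-format-interpolation and logging-fstring-interpolation checks ask for: the message is only interpolated when the record is actually emitted. A minimal sketch of the pattern (the helper name is hypothetical):

import logging

_LOGGER = logging.getLogger(__name__)


def report_progress(tax_id: int) -> None:
    # Discouraged by pylint: the f-string is formatted even if INFO is disabled.
    # _LOGGER.info(f"working on tax_id: {tax_id}")

    # Preferred: arguments are interpolated lazily by the logging framework.
    _LOGGER.info("working on tax_id: %d", tax_id)
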
6 changes: 4 additions & 2 deletions ncbitax2lin/utils.py
@@ -1,10 +1,12 @@
"""Utility functions"""

import datetime
import functools
import io
import logging
import os
import time
from typing import Any, Callable, Optional
from typing import Any, Callable

import pandas as pd

@@ -40,7 +42,7 @@ def maybe_backup_file(filepath: str) -> None:
while os.path.exists(backup):
count += 1
backup = os.path.join(dirname, f"#{basename}.{count}#")
logging.info(f"Backing up {filepath} to {backup}")
logging.info("Backing up %s to %s", filepath, backup)
os.rename(filepath, backup)


28 changes: 27 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pylintrc
@@ -0,0 +1,2 @@
[MESSAGES CONTROL]
disable=bad-continuation, fixme, duplicate-code
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ncbitax2lin"
version = "2.0.1"
version = "2.0.2"
description = "A tool that converts NCBI taxonomy dump into lineages"
authors = ["Zhuyi Xue <zhuyi@alum.utoronto.ca>"]
readme = "README.md"
@@ -18,6 +18,7 @@ black = "^19.10b0"
mypy = "^0.770"
pylint = "^2.5.0"
pytest = "^5.2"
autoflake = "^1.3.1"

[tool.poetry.scripts]
ncbitax2lin = "ncbitax2lin.ncbitax2lin:main"
2 changes: 1 addition & 1 deletion tests/test___init__.py
@@ -4,4 +4,4 @@


def test_version() -> None:
assert __version__ == "2.0"
assert __version__ == "2.0.2"
