make mypy and pylint pass (#11)
* pylint fix for ncbitax2lin/data_io.py

* pylint fix for ncbitax2lin/__init__.py

* pylint fix for ncbitax2lin/ncbitax2lin.py

* added autoflake poetry.lock

* added mypy.ini pylintrc

* pylint utils.py

* bump to 2.0.2

* update
zyxue authored May 3, 2020
1 parent 4cf732c commit 7858c08
Showing 10 changed files with 90 additions and 42 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
## Change Log

### v2.0.2 (2020/05/02)

- made pylint and mypy pass

### v2.0.1 (2020/05/02)

- adopted [poetry](https://python-poetry.org/) for package management
5 changes: 5 additions & 0 deletions mypy.ini
@@ -0,0 +1,5 @@
[mypy]
python_version = 3.7
disallow_untyped_defs = True
ignore_missing_imports = True
show_column_numbers = True
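
With disallow_untyped_defs = True, mypy flags any function definition that lacks type annotations, and ignore_missing_imports silences errors for third-party packages that ship no type stubs. A minimal sketch of what this configuration accepts and rejects (the function names below are hypothetical, not from this repository):

import pandas as pd


def load_table(path: str) -> pd.DataFrame:
    # Fully annotated: accepted under disallow_untyped_defs.
    return pd.read_csv(path)


def load_table_untyped(path):
    # Not annotated: mypy reports "Function is missing a type annotation".
    return pd.read_csv(path)
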
4 changes: 3 additions & 1 deletion ncbitax2lin/__init__.py
@@ -1 +1,3 @@
__version__ = "2.0.1"
"""__init__.py for this project"""

__version__ = "2.0.2"
33 changes: 18 additions & 15 deletions ncbitax2lin/data_io.py
@@ -1,5 +1,4 @@
import gzip
import io
"""utility functions related to IO"""

import pandas as pd

@@ -18,7 +17,7 @@ def load_nodes(nodes_file: str) -> pd.DataFrame:
"""
load nodes.dmp and convert it into a pandas.DataFrame
"""
df = pd.read_csv(
df_data = pd.read_csv(
nodes_file,
sep="|",
header=None,
@@ -40,35 +39,39 @@
],
)

# To get rid of flanking tab characters
df["rank"] = df["rank"].apply(strip)
df["embl_code"] = df["embl_code"].apply(strip)
df["comments"] = df["comments"].apply(strip)
return df
return df_data.assign(
rank=lambda df: df["rank"].apply(strip),
embl_code=lambda df: df["embl_code"].apply(strip),
comments=lambda df: df["comments"].apply(strip),
)


@utils.timeit
def load_names(names_file: str) -> pd.DataFrame:
"""
load names.dmp and convert it into a pandas.DataFrame
"""
df = pd.read_csv(
df_data = pd.read_csv(
names_file,
sep="|",
header=None,
index_col=False,
names=["tax_id", "name_txt", "unique_name", "name_class"],
)
df["name_txt"] = df["name_txt"].apply(strip)
df["unique_name"] = df["unique_name"].apply(strip)
df["name_class"] = df["name_class"].apply(strip)

sci_df = df[df["name_class"] == "scientific name"]
sci_df.reset_index(drop=True, inplace=True)
return sci_df
return (
df_data.assign(
name_txt=lambda df: df["name_txt"].apply(strip),
unique_name=lambda df: df["unique_name"].apply(strip),
name_class=lambda df: df["name_class"].apply(strip),
)
.loc[lambda df: df["name_class"] == "scientific name"]
.reset_index(drop=True)
)


def read_names_and_nodes(names_file: str, nodes_file: str) -> pd.DataFrame:
"""Reads in data from names and nodes files"""
# data downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/
# args = parse_args()
nodes_df = load_nodes(nodes_file)
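The data_io.py rewrite above replaces in-place column mutation with a single chained DataFrame.assign call whose values are callables, so each function builds and returns a new frame instead of modifying one. A minimal sketch of the same pattern on a toy frame (the strip helper is a stand-in for the one referenced in data_io.py, and the column values are made up):

import pandas as pd


def strip(value: str) -> str:
    # Stand-in for the helper that removes flanking tab characters.
    return value.strip()


df_raw = pd.DataFrame({"rank": ["\tphylum\t", "\tgenus\t"], "comments": ["\tok\t", "\t\t"]})

# Before: mutate columns in place.
df_mut = df_raw.copy()
df_mut["rank"] = df_mut["rank"].apply(strip)
df_mut["comments"] = df_mut["comments"].apply(strip)

# After: assign with callables builds a new frame; each lambda receives the frame.
df_new = df_raw.assign(
    rank=lambda df: df["rank"].apply(strip),
    comments=lambda df: df["comments"].apply(strip),
)

assert df_new.equals(df_mut)
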
45 changes: 24 additions & 21 deletions ncbitax2lin/ncbitax2lin.py
@@ -1,9 +1,7 @@
import argparse
import gzip
"""Converts NCBI taxonomy dump into lineages"""

import logging
import multiprocessing
import os
import re
from typing import Container, Dict, Iterable, List, NewType, Optional, Tuple, Union

import fire
@@ -15,6 +13,9 @@
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s|%(levelname)s|%(message)s")


_LOGGER = logging.getLogger(__name__)


class TaxUnit(TypedDict):
"""
Represents a basic unit in taxonomy e.g. (phylum, Proteobacteria), where
@@ -53,13 +54,13 @@ def calc_rank_key(rank: str, existing_ranks: Container[str]) -> str:
# e.g. there could be multiple 'no rank'
if rank not in existing_ranks:
return rank
else:
count = 1

count = 1
numbered_rank = f"{rank}{count}"
while numbered_rank in existing_ranks:
count += 1
numbered_rank = f"{rank}{count}"
while numbered_rank in existing_ranks:
count += 1
numbered_rank = f"{rank}{count}"
return numbered_rank
return numbered_rank


def calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]:
@@ -68,8 +69,9 @@ def calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]:


def find_lineage(tax_id: int) -> Lineage:
"""Finds lineage for a single tax id"""
if tax_id % 50000 == 0:
logging.debug("working on tax_id: {0}".format(tax_id))
_LOGGER.info("working on tax_id: %d", tax_id)

lineage = []
while True:
@@ -88,11 +90,11 @@ def find_lineage(tax_id: int) -> Lineage:


def find_all_lineages(tax_ids: Iterable) -> List[Lineage]:
"""find the lineages for all tax ids"""
"""Finds the lineages for all tax ids"""
ncpus = multiprocessing.cpu_count()
logging.info(
"found {0} cpus, and will use all of them to find lineages "
"for all tax ids".format(ncpus)
_LOGGER.info(
"found %d cpus, and will use all of them to find lineages for all tax ids",
ncpus,
)

with multiprocessing.Pool(ncpus) as pool:
@@ -146,7 +148,7 @@ def convert_lineage_to_dict(lineage: Lineage) -> Dict[str, Union[int, str]]:

def prepare_lineages_for_output(lineages: List[Lineage]) -> pd.DataFrame:
"""prepares lineages into a dataframe for writing to disk"""
logging.info("Preparings all lineages into a dataframe to be written to disk ...")
_LOGGER.info("Preparings all lineages into a dataframe to be written to disk ...")

df_out = pd.DataFrame([convert_lineage_to_dict(lineage) for lineage in lineages])

@@ -167,11 +169,11 @@ def taxonomy_to_lineages(
output_prefix: output lineages will be written to output_prefix.csv.gz
"""
df_data = data_io.read_names_and_nodes(names_file, nodes_file)
logging.info(f"# of tax ids: {df_data.shape[0]:,d}")
logging.info(f"df.info:\n{utils.collect_df_info(df_data)}")
_LOGGER.info("# of tax ids: %s", f"{df_data.shape[0]:,d}")
_LOGGER.info("df.info:\n%s", f"{utils.collect_df_info(df_data)}")

logging.info("Generating TAXONOMY_DICT ...")
global TAXONOMY_DICT
_LOGGER.info("Generating TAXONOMY_DICT ...")
global TAXONOMY_DICT # pylint: disable=global-statement
TAXONOMY_DICT = calc_taxonomy_dict(df_data)

lineages = find_all_lineages(df_data.tax_id)
@@ -181,9 +183,10 @@
if output is None:
output = f"ncbi_lineages_{pd.Timestamp.utcnow().date()}.csv.gz"
utils.maybe_backup_file(output)
logging.info(f"Writing lineages to {output} ...")
_LOGGER.info("Writing lineages to %s ...", output)
data_io.write_lineages_to_disk(df_lineages, output)


def main() -> None:
"""Main function, entry point"""
fire.Fire(taxonomy_to_lineages)
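
The ncbitax2lin.py changes above also switch from module-level logging calls that use str.format or f-strings to a named module logger with %-style arguments, which is what pylint's logging-format-interpolation and logging-fstring-interpolation checks ask for: the message is only interpolated when the record is actually emitted. A minimal sketch of the pattern (the helper name is hypothetical):

import logging

_LOGGER = logging.getLogger(__name__)


def report_progress(tax_id: int) -> None:
    # Discouraged by pylint: the f-string is formatted even if INFO is disabled.
    # _LOGGER.info(f"working on tax_id: {tax_id}")

    # Preferred: arguments are interpolated lazily by the logging framework.
    _LOGGER.info("working on tax_id: %d", tax_id)
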
6 changes: 4 additions & 2 deletions ncbitax2lin/utils.py
@@ -1,10 +1,12 @@
"""Utility functions"""

import datetime
import functools
import io
import logging
import os
import time
from typing import Any, Callable, Optional
from typing import Any, Callable

import pandas as pd

@@ -40,7 +42,7 @@ def maybe_backup_file(filepath: str) -> None:
while os.path.exists(backup):
count += 1
backup = os.path.join(dirname, f"#{basename}.{count}#")
logging.info(f"Backing up {filepath} to {backup}")
logging.info("Backing up %s to %s", filepath, backup)
os.rename(filepath, backup)


28 changes: 27 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pylintrc
@@ -0,0 +1,2 @@
[MESSAGES CONTROL]
disable=bad-continuation, fixme, duplicate-code
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ncbitax2lin"
version = "2.0.1"
version = "2.0.2"
description = "A tool that converts NCBI taxonomy dump into lineages"
authors = ["Zhuyi Xue <zhuyi@alum.utoronto.ca>"]
readme = "README.md"
@@ -18,6 +18,7 @@ black = "^19.10b0"
mypy = "^0.770"
pylint = "^2.5.0"
pytest = "^5.2"
autoflake = "^1.3.1"

[tool.poetry.scripts]
ncbitax2lin = "ncbitax2lin.ncbitax2lin:main"
2 changes: 1 addition & 1 deletion tests/test___init__.py
@@ -4,4 +4,4 @@


def test_version() -> None:
assert __version__ == "2.0"
assert __version__ == "2.0.2"
