Skip to content

Commit

Permalink
Convert only the columns used in the query
Browse files Browse the repository at this point in the history
In the end, it's only worth calling to_numeric on the columns used for
numerical comparison.

This gets us halfway there, since in most cases, only a small subset of
metadata columns are used in the query.

This is a hacky approach, but it is more computationally efficient.
  • Loading branch information
victorlin committed Jul 28, 2023
1 parent b325b97 commit 2ead5b3
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 2 deletions.
36 changes: 34 additions & 2 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
from augur.utils import read_strains
from . import constants

try:
# python ≥3.8 only
from typing import Literal # type: ignore
except ImportError:
from typing_extensions import Literal # type: ignore


# Type alias for the return value of a filter function: the set of strains
# (as strings) to keep as a result of applying that filter.
FilterFunctionReturn = Set[str]
Expand Down Expand Up @@ -178,8 +184,8 @@ def filter_by_query(metadata, query) -> FilterFunctionReturn:
set()
"""
# Try converting all columns to numeric.
for column in metadata.columns:
# Try converting all queried columns to numeric.
for column in extract_variables(query).intersection(metadata.columns):
metadata[column] = pd.to_numeric(metadata[column], errors='ignore')

return set(metadata.query(query).index.values)
Expand Down Expand Up @@ -803,3 +809,29 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
kwarg_list.append((key, value))

return json.dumps(kwarg_list)


# From https://stackoverflow.com/a/76536356
def extract_variables(pandas_query: str) -> Set[str]:
    """Extract variable names used in a pandas query string.

    The query is parsed repeatedly with pandas' own expression parser. Each
    attempt that fails with an ``UndefinedVariableError`` reveals one more
    unresolved name, which is then defined with a dummy value before trying
    again. Parsing succeeds once every name is defined.

    Parameters
    ----------
    pandas_query
        Query string as accepted by ``pandas.DataFrame.query``.

    Returns
    -------
    set of str
        Names that pandas could not resolve on its own. Names that pandas
        resolves itself (presumably its built-in query globals) are not
        reported.

    Raises
    ------
    pandas.errors.UndefinedVariableError
        If an undefined name cannot be recovered from the error message
        (e.g. a ``@local`` reference, which uses a different message format),
        or if pandas keeps reporting a name that was already defined.

    Other errors raised by pandas while parsing the query are propagated
    unchanged.
    """
    # Track variables in a dictionary to be used as a dictionary of globals.
    variables: Dict[str, Literal[None]] = {}

    while True:
        try:
            # Try creating an Expr object with the query string and the
            # dictionary of globals. This raises as long as the dictionary of
            # globals is incomplete.
            #
            # An explicit empty local_dict is required: when local_dict is
            # None, pandas falls back to the locals of the calling frame, so
            # query names that collide with this function's own local
            # variables would silently resolve and never be reported.
            env = pd.core.computation.scope.ensure_scope(
                level=0,
                global_dict=variables,
                local_dict={},
            )
            pd.core.computation.expr.Expr(pandas_query, env=env)
        except pd.errors.UndefinedVariableError as error:
            # This relies on the format defined here: https://github.com/pandas-dev/pandas/blob/965ceca9fd796940050d6fc817707bba1c4f9bff/pandas/errors/__init__.py#L401
            match = re.search(r"name '(.+?)' is not defined", str(error))

            # Re-raise the original error instead of failing with an opaque
            # IndexError (unrecognized message) or looping forever (pandas
            # reports a name that has already been defined).
            if match is None or match.group(1) in variables:
                raise

            # Add the name to the globals dictionary with a dummy value.
            variables[match.group(1)] = None
        else:
            # Exit the loop when parsing is successful.
            break

    return set(variables)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
"sphinx-autodoc-typehints >=1.21.4",
"types-jsonschema >=3.0.0, ==3.*",
"types-setuptools",
"typing_extensions; python_version <'3.8'",
"wheel >=0.32.3",
"ipdb >=0.10.1"
]
Expand Down

0 comments on commit 2ead5b3

Please sign in to comment.