Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add usecols and flatten to to_dataframe feature. #330

Merged
merged 12 commits into from
May 28, 2020
63 changes: 50 additions & 13 deletions signac/contrib/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .hashing import calc_id
from .indexing import SignacProjectCrawler
from .indexing import MasterCrawler
from .utility import _mkdir_p, split_and_print_progress
from .utility import _mkdir_p, split_and_print_progress, _nested_dicts_to_dotted_keys
from .schema import ProjectSchema
from .errors import WorkspaceError
from .errors import DestinationExistsError
Expand Down Expand Up @@ -1878,32 +1878,69 @@ def export_to(self, target, path=None, copytree=None):
return dict(export_jobs(jobs=list(self), target=target,
path=path, copytree=copytree))

def to_dataframe(self, sp_prefix='sp.', doc_prefix='doc.'):
def to_dataframe(self, sp_prefix='sp.', doc_prefix='doc.', usecols=None,
flatten=False):
"""Convert the selection of jobs to a pandas dataframe.

This function exports the job metadata to a :py:class:`pandas.DataFrame`.
All state point and document keys are prefixed by default to be able to distinguish them.
This function exports the job metadata to a
:py:class:`pandas.DataFrame`. All state point and document keys are
prefixed by default to be able to distinguish them.

:param sp_prefix:
Prefix state point keys with the given string. Defaults to "sp.".
Prefix state point keys with the given string. Defaults to
``'sp.'``.
:type sp_prefix:
str
str, optional
:param doc_prefix:
Prefix document keys with the given string. Defaults to "doc.".
Prefix document keys with the given string. Defaults to ``'doc.'``.
:type doc_prefix:
str
str, optional
:param usecols:
Used to select a subset of columns. If list-like, must contain
strings corresponding to the column names that should be included.
For example, ``['sp.a', 'doc.notes']``. If callable, the column
will be included if the function called on the column name returns
True. For example, ``lambda x: 'sp.' in x``. Defaults to ``None``,
which uses all columns from the state point and document. Note
that this filter is applied *after* the doc and sp prefixes are
added to the column names.
:type usecols:
list-like or callable, optional
:param flatten:
Whether nested state points or document keys should be flattened.
If True, ``{'a': {'b': 'c'}}`` becomes a column named ``a.b`` with
value ``c``. If False, it becomes a column named ``a`` with value
``{'b': 'c'}``. Defaults to ``False``.
:type flatten:
bool, optional
:returns:
A pandas dataframe with all job metadata.
A :class:`pandas.DataFrame` with all job metadata.
:rtype:
:py:class:`pandas.DataFrame`
"""
import pandas

if usecols is None:
def usecols(column):
return True
elif not callable(usecols):
included_columns = set(usecols)

def usecols(column):
return column in included_columns

def _flatten(d):
return dict(_nested_dicts_to_dotted_keys(d)) if flatten else d

def _export_sp_and_doc(job):
for key, value in job.sp.items():
yield sp_prefix + key, value
for key, value in job.doc.items():
yield doc_prefix + key, value
for key, value in _flatten(job.sp).items():
prefixed_key = sp_prefix + key
if usecols(prefixed_key):
yield prefixed_key, value
for key, value in _flatten(job.doc).items():
prefixed_key = doc_prefix + key
if usecols(prefixed_key):
yield prefixed_key, value

return pandas.DataFrame.from_dict(
data={job._id: dict(_export_sp_and_doc(job)) for job in self},
Expand Down
83 changes: 83 additions & 0 deletions tests/test_pandas_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,86 @@ def test_jobs_iterator_to_dataframe(self):
assert len(df) == len(jobs)
assert 'sp.a' in df.columns
assert 'doc.b' in df.columns

def test_prefixes(self):
    """Check that custom (here: empty) prefixes are applied to column names."""
    for index in range(10):
        new_job = self.project.open_job({'a': index})
        new_job.doc.b = float(index)
    # With empty prefixes, state point and document keys appear verbatim.
    dataframe = self.project.to_dataframe(sp_prefix='', doc_prefix='')
    assert len(dataframe) == len(self.project)
    for column in ('a', 'b'):
        assert column in dataframe.columns

def test_usecols(self):
    """Exercise both the list-like and the callable forms of ``usecols``."""
    for index in range(10):
        new_job = self.project.open_job({'a': index, 'b': index * 2})
        new_job.doc.c = float(index)
        new_job.doc.d = float(index * 3)

    num_jobs = len(self.project)
    all_columns = ['sp.a', 'sp.b', 'doc.c', 'doc.d']

    # An empty include-list produces an empty DataFrame.
    dataframe = self.project.to_dataframe(usecols=[])
    assert len(dataframe.columns) == 0
    assert len(dataframe) == 0

    # A callable that rejects every column also yields an empty DataFrame.
    dataframe = self.project.to_dataframe(
        usecols=lambda name: name not in all_columns)
    assert len(dataframe.columns) == 0
    assert len(dataframe) == 0

    # Select a single state point column by name.
    dataframe = self.project.to_dataframe(usecols=['sp.a'])
    assert set(dataframe.columns) == {'sp.a'}
    assert len(dataframe) == num_jobs

    # Filter out a single state point column with a callable.
    dataframe = self.project.to_dataframe(
        usecols=lambda name: name != 'sp.b')
    assert set(dataframe.columns) == {'sp.a', 'doc.c', 'doc.d'}
    assert len(dataframe) == num_jobs

    # Select a single document column by name.
    dataframe = self.project.to_dataframe(usecols=['doc.c'])
    assert set(dataframe.columns) == {'doc.c'}
    assert len(dataframe) == num_jobs

    # Filter out a single document column with a callable.
    dataframe = self.project.to_dataframe(
        usecols=lambda name: name != 'doc.d')
    assert set(dataframe.columns) == {'sp.a', 'sp.b', 'doc.c'}
    assert len(dataframe) == num_jobs

def test_flatten(self):
    """Check column selection against flattened (dotted-key) nested data."""
    for index in range(10):
        new_job = self.project.open_job(
            {'a': {'b': index * 2, 'c': index * 3}, 'd': index})
        new_job.doc.e = {'f': float(index)}

    # Selecting no columns yields an empty DataFrame, even when flattening.
    dataframe = self.project.to_dataframe(usecols=[], flatten=True)
    assert len(dataframe.columns) == 0
    assert len(dataframe) == 0

    # Select a single flattened state point column.
    dataframe = self.project.to_dataframe(usecols=['sp.a.b'], flatten=True)
    assert set(dataframe.columns) == {'sp.a.b'}
    assert len(dataframe) == len(self.project)

    # Select a single flattened document column.
    dataframe = self.project.to_dataframe(usecols=['doc.e.f'], flatten=True)
    assert set(dataframe.columns) == {'doc.e.f'}
    assert len(dataframe) == len(self.project)