Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/sparse #209

Merged
merged 4 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions dhlab/api/dhlab_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pandas import DataFrame, Series

from dhlab.constants import BASE_URL
from scipy.sparse import dok_matrix

pd.options.display.max_rows = 100

Expand Down Expand Up @@ -512,9 +513,35 @@ def ngram_news(
# df.index = df.index.map(pd.Timestamp)
return df

def create_sparse_matrix(structure):
    """Build a sparse document-term DataFrame from an API counts object.

    :param dict structure: mapping of dhlabid -> {word: count}.
    :return: a pandas DataFrame backed by a sparse matrix, with one row
        per word and one column per dhlabid (a DTM).
    """
    # Row labels: every distinct word seen across all documents.
    vocabulary = list({w for counts in structure.values() for w in counts})
    # Column labels: one column per document id.
    doc_ids = list(structure)

    # Map each word to its row index for O(1) lookup while filling.
    row_of = {word: i for i, word in enumerate(vocabulary)}

    # DOK format supports efficient incremental cell-by-cell assignment.
    matrix = dok_matrix((len(vocabulary), len(doc_ids)), dtype=int)
    for col, doc_id in enumerate(doc_ids):
        for word, count in structure[doc_id].items():
            matrix[row_of[word], col] = count

    return pd.DataFrame.sparse.from_spmatrix(
        matrix, index=vocabulary, columns=doc_ids
    )

def get_document_frequencies(
urns: List[str] = None, cutoff: int = 0, words: List[str] = None
urns: List[str] = None, cutoff: int = 0, words: List[str] = None, sparse: bool = False
) -> DataFrame:
"""Fetch frequency counts of ``words`` in documents (``urns``).

Expand All @@ -525,6 +552,7 @@ def get_document_frequencies(
``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
:param int cutoff: minimum frequency of a word to be counted
:param list words: a list of words to be counted - if left None, whole document is returned. If not None both the counts and their relative frequency is returned.
:param bool sparse: create a sparse matrix for memory efficiency
"""
params = locals()
r = requests.post(f"{BASE_URL}/frequencies", json=params)
Expand All @@ -537,7 +565,12 @@ def get_document_frequencies(
structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
except IndexError:
pass
df = pd.DataFrame(structure)

if sparse == True:
df = create_sparse_matrix(structure)
else:
df = pd.DataFrame(structure)

df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
else:
df = pd.DataFrame(result)
Expand Down
45 changes: 40 additions & 5 deletions dhlab/text/conc_coll.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,13 @@ def from_df(cls, df):
class Counts(DhlabObj):
"""Provide counts for a corpus - shouldn't be too large"""

def __init__(self, corpus=None, words=None, cutoff=0):
def __init__(self, corpus=None, words=None, cutoff=0, sparse=False):
"""Get frequency list for Corpus

:param corpus: target Corpus, defaults to None
:param words: list of words to be counted, defaults to None
:param cutoff: frequency cutoff, will not include words with frequency <= cutoff
:param cutoff: frequency cutoff, will not include words with frequency < cutoff
:param sparse: return a sparse matrix for memory efficiency
"""
if corpus is None and words is None:
self.freq = pd.DataFrame()
Expand All @@ -190,7 +191,7 @@ def __init__(self, corpus=None, words=None, cutoff=0):
# count - if words is none result will be as if counting all words
# in the corpus
self.freq = get_document_frequencies(
urns=urnlist(corpus), cutoff=cutoff, words=words
urns=urnlist(corpus), cutoff=cutoff, words=words, sparse=sparse
)

# Include dhlab and title link in object
Expand All @@ -208,12 +209,46 @@ def __init__(self, corpus=None, words=None, cutoff=0):

super().__init__(self.freq)

def is_sparse(self):
    """Report whether the counts frame uses a sparse pandas backing.

    :return: True if ``self.freq`` is sparse-backed, False otherwise.
    """
    # The ``.sparse`` accessor raises AttributeError on a dense frame,
    # so reaching the attribute at all means the frame is sparse.
    # (Checking the truthiness of ``density`` would wrongly report an
    # all-zero sparse frame — density 0.0 — as dense.)
    try:
        self.freq.sparse.density
        return True
    except AttributeError:
        # Dense frame (or no frame at all): not sparse.
        return False

def sum(self):
    """Summarize Corpus frequencies.

    Sums each word's counts across all documents. Sparse frames are
    aggregated from the underlying COO matrix, because pandas densifies
    sparse data when summing — which defeats the memory saving.

    :return: frequency list (single ``freq`` column) for the Corpus
    """
    if not self.is_sparse():
        return self.from_df(self.counts.sum(axis=1).to_frame("freq"))

    # Convert to COO format so the nonzero cells can be iterated cheaply.
    coo = self.freq.sparse.to_coo()

    # Row index -> word label, aligned with the frame's index.
    words = list(self.freq.index)

    # Accumulate per-word totals; only nonzero cells are visited.
    totals = {}
    for row, value in zip(coo.row, coo.data):
        word = words[row]
        totals[word] = totals.get(word, 0) + value

    df = (
        pd.DataFrame(totals.items(), columns=["word", "freq"])
        .set_index("word")
        .sort_values(by="freq", ascending=False)
    )
    # Match the dense branch: an unnamed index.
    df.index.name = None
    return self.from_df(df)

def display_names(self):
"Display data with record names as column titles."
Expand Down
8 changes: 4 additions & 4 deletions dhlab/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,13 @@ def coll(
ignore_caps=ignore_caps,
)

def count(self, words=None, cutoff=0, sparse=False):
    """Get word frequencies for the corpus.

    :param words: list of words to count, defaults to None (count all words)
    :param cutoff: minimum frequency for a word to be included
    :param sparse: return a sparse frame for memory efficiency
    :return: a ``dh.Counts`` object for this corpus
    """
    return dh.Counts(self, words, cutoff, sparse)

def freq(self, words=None, cutoff=0, sparse=False):
    """Get word frequencies for the corpus.

    :param words: list of words to count, defaults to None (count all words)
    :param cutoff: minimum frequency for a word to be included
    :param sparse: return a sparse frame for memory efficiency
    :return: a ``dh.Counts`` object for this corpus
    """
    return dh.Counts(self, words, cutoff, sparse)

@staticmethod
def _is_Corpus(corpus: "Corpus") -> bool:
Expand Down
Loading