From 42f595f77ae29cc0ef7ccecd96a2adf48bac5ee9 Mon Sep 17 00:00:00 2001 From: gsheni Date: Mon, 25 Mar 2024 15:50:13 -0400 Subject: [PATCH] increased line length --- deepecho/demo.py | 4 +- deepecho/models/base.py | 19 ++---- deepecho/models/basic_gan.py | 72 ++++++--------------- deepecho/models/par.py | 122 +++++++++-------------------------- deepecho/sequences.py | 20 ++---- pyproject.toml | 2 +- tasks.py | 24 ++----- tests/unit/test_sequences.py | 8 +-- 8 files changed, 68 insertions(+), 203 deletions(-) diff --git a/deepecho/demo.py b/deepecho/demo.py index 229a39b..56984fa 100644 --- a/deepecho/demo.py +++ b/deepecho/demo.py @@ -9,6 +9,4 @@ def load_demo(): """Load the demo DataFrame.""" - return pd.read_csv( - os.path.join(_DATA_PATH, 'demo.csv'), parse_dates=['date'] - ) + return pd.read_csv(os.path.join(_DATA_PATH, 'demo.csv'), parse_dates=['date']) diff --git a/deepecho/models/base.py b/deepecho/models/base.py index 1bfda64..cfd3982 100644 --- a/deepecho/models/base.py +++ b/deepecho/models/base.py @@ -100,9 +100,7 @@ def _get_data_types(data, data_types, columns): elif kind == 'M': dtypes_list.append('datetime') else: - error = ( - f'Unsupported data_type for column {column}: {dtype}' - ) + error = f'Unsupported data_type for column {column}: {dtype}' raise ValueError(error) return dtypes_list @@ -146,14 +144,11 @@ def fit( such as integer values or datetimes. """ if not entity_columns and segment_size is None: - raise TypeError( - 'If the data has no `entity_columns`, `segment_size` must be given.' - ) + raise TypeError('If the data has no `entity_columns`, `segment_size` must be given.') if segment_size is not None and not isinstance(segment_size, int): if sequence_index is None: raise TypeError( - '`segment_size` must be of type `int` if ' - 'no `sequence_index` is given.' + '`segment_size` must be of type `int` if ' 'no `sequence_index` is given.' ) if data[sequence_index].dtype.kind != 'M': raise TypeError( @@ -176,9 +171,7 @@ def fit( self._data_columns.remove(sequence_index) data_types = self._get_data_types(data, data_types, self._data_columns) - context_types = self._get_data_types( - data, data_types, self._context_columns - ) + context_types = self._get_data_types(data, data_types, self._context_columns) sequences = assemble_sequences( data, self._entity_columns, @@ -236,9 +229,7 @@ def sample(self, num_entities=None, context=None, sequence_length=None): """ if context is None: if num_entities is None: - raise TypeError( - 'Either context or num_entities must be not None' - ) + raise TypeError('Either context or num_entities must be not None') context = self._context_values.sample(num_entities, replace=True) context = context.reset_index(drop=True) diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py index 734dafa..c6beadb 100644 --- a/deepecho/models/basic_gan.py +++ b/deepecho/models/basic_gan.py @@ -16,9 +16,7 @@ def _expand_context(data, context): return torch.cat( [ data, - context.unsqueeze(0).expand( - data.shape[0], context.shape[0], context.shape[1] - ), + context.unsqueeze(0).expand(data.shape[0], context.shape[0], context.shape[1]), ], dim=2, ) @@ -52,9 +50,7 @@ class BasicGenerator(torch.nn.Module): Device to which this Module is associated to. """ - def __init__( - self, context_size, latent_size, hidden_size, data_size, device - ): + def __init__(self, context_size, latent_size, hidden_size, data_size, device): super().__init__() self.latent_size = latent_size self.rnn = torch.nn.GRU(context_size + latent_size, hidden_size) @@ -251,31 +247,21 @@ def _analyze_data(self, sequences, context_types, data_types): - Index map and dimensions for the context. - Index map and dimensions for the data. """ - sequence_lengths = np.array([ - len(sequence['data'][0]) for sequence in sequences - ]) + sequence_lengths = np.array([len(sequence['data'][0]) for sequence in sequences]) self._max_sequence_length = np.max(sequence_lengths) - self._fixed_length = ( - sequence_lengths == self._max_sequence_length - ).all() + self._fixed_length = (sequence_lengths == self._max_sequence_length).all() # Concatenate all the context sequences together context = [] for column in range(len(context_types)): - context.append([ - sequence['context'][column] for sequence in sequences - ]) + context.append([sequence['context'][column] for sequence in sequences]) - self._context_map, self._context_size = self._index_map( - context, context_types - ) + self._context_map, self._context_size = self._index_map(context, context_types) # Concatenate all the data sequences together data = [] for column in range(len(data_types)): - data.append( - sum([sequence['data'][column] for sequence in sequences], []) - ) + data.append(sum([sequence['data'][column] for sequence in sequences], [])) self._data_map, self._data_size = self._index_map(data, data_types) @@ -388,9 +374,7 @@ def _tensor_to_data(self, tensor): for row in range(sequence_length): if column_type in ('continuous', 'count'): round_value = column_type == 'count' - value = self._denormalize( - tensor, row, properties, round_value=round_value - ) + value = self._denormalize(tensor, row, properties, round_value=round_value) elif column_type in ('categorical', 'ordinal'): value = self._one_hot_decode(tensor, row, properties) else: @@ -418,14 +402,10 @@ def _transform(self, data): if column_type in ('continuous', 'count'): value_idx, missing_idx = properties['indices'] data[:, :, value_idx] = torch.tanh(data[:, :, value_idx]) - data[:, :, missing_idx] = torch.sigmoid( - data[:, :, missing_idx] - ) + data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx]) elif column_type in ('categorical', 'ordinal'): indices = list(properties['indices'].values()) - data[:, :, indices] = torch.nn.functional.softmax( - data[:, :, indices] - ) + data[:, :, indices] = torch.nn.functional.softmax(data[:, :, indices]) return data @@ -454,9 +434,7 @@ def _generate(self, context, sequence_length=None): return generated - def _discriminator_step( - self, discriminator, discriminator_opt, data_context, context - ): + def _discriminator_step(self, discriminator, discriminator_opt, data_context, context): real_scores = discriminator(data_context) fake = self._generate(context) @@ -500,12 +478,8 @@ def _build_fit_artifacts(self): hidden_size=self._hidden_size, ).to(self._device) - generator_opt = torch.optim.Adam( - self._generator.parameters(), lr=self._gen_lr - ) - discriminator_opt = torch.optim.Adam( - discriminator.parameters(), lr=self._dis_lr - ) + generator_opt = torch.optim.Adam(self._generator.parameters(), lr=self._gen_lr) + discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=self._dis_lr) return discriminator, generator_opt, discriminator_opt @@ -547,17 +521,11 @@ def fit_sequences(self, sequences, context_types, data_types): """ self._analyze_data(sequences, context_types, data_types) - data = self._build_tensor( - self._data_to_tensor, sequences, 'data', dim=1 - ) - context = self._build_tensor( - self._context_to_tensor, sequences, 'context', dim=0 - ) + data = self._build_tensor(self._data_to_tensor, sequences, 'data', dim=1) + context = self._build_tensor(self._context_to_tensor, sequences, 'context', dim=0) data_context = _expand_context(data, context) - discriminator, generator_opt, discriminator_opt = ( - self._build_fit_artifacts() - ) + discriminator, generator_opt, discriminator_opt = self._build_fit_artifacts() iterator = range(self._epochs) if self._verbose: @@ -579,9 +547,7 @@ def fit_sequences(self, sequences, context_types, data_types): if self._verbose: d_loss = discriminator_score.item() g_loss = generator_score.item() - iterator.set_description( - f'Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}' - ) + iterator.set_description(f'Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}') def sample_sequence(self, context, sequence_length=None): """Sample a single sequence conditioned on context. @@ -596,9 +562,7 @@ def sample_sequence(self, context, sequence_length=None): A list of lists (data) corresponding to the types specified in data_types when fit was called. """ - context = ( - self._context_to_tensor(context).unsqueeze(0).to(self._device) - ) + context = self._context_to_tensor(context).unsqueeze(0).to(self._device) with torch.no_grad(): generated = self._generate(context, sequence_length) diff --git a/deepecho/models/par.py b/deepecho/models/par.py index 48a3094..57e2a68 100644 --- a/deepecho/models/par.py +++ b/deepecho/models/par.py @@ -30,32 +30,24 @@ def forward(self, x, c): x = torch.cat( [ x, - c.unsqueeze(0).expand( - x.shape[0], c.shape[0], c.shape[1] - ), + c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]), ], dim=2, ) x = self.down(x) - x = torch.nn.utils.rnn.pack_padded_sequence( - x, lengths, enforce_sorted=False - ) + x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False) x, _ = self.rnn(x) x, lengths = torch.nn.utils.rnn.pad_packed_sequence(x) x = self.up(x) - x = torch.nn.utils.rnn.pack_padded_sequence( - x, lengths, enforce_sorted=False - ) + x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False) else: if self.context_size: x = torch.cat( [ x, - c.unsqueeze(0).expand( - x.shape[0], c.shape[0], c.shape[1] - ), + c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]), ], dim=2, ) @@ -231,9 +223,7 @@ def _data_to_tensor(self, data): if pd.isnull(data[key][i]) or props['range'] == 0: x[r_idx] = 0.0 else: - x[r_idx] = (data[key][i] - props['min']) / props[ - 'range' - ] + x[r_idx] = (data[key][i] - props['min']) / props['range'] x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0 @@ -337,9 +327,7 @@ def fit_sequences(self, sequences, context_types, data_types): X.append(self._data_to_tensor(sequence['data'])) C.append(self._context_to_tensor(sequence['context'])) - X = torch.nn.utils.rnn.pack_sequence(X, enforce_sorted=False).to( - self.device - ) + X = torch.nn.utils.rnn.pack_sequence(X, enforce_sorted=False).to(self.device) if self._ctx_dims: C = torch.stack(C, dim=0).to(self.device) @@ -360,9 +348,7 @@ def fit_sequences(self, sequences, context_types, data_types): Y_padded, _ = torch.nn.utils.rnn.pad_packed_sequence(Y) optimizer.zero_grad() - loss = self._compute_loss( - X_padded[1:, :, :], Y_padded[:-1, :, :], seq_len - ) + loss = self._compute_loss(X_padded[1:, :, :], Y_padded[:-1, :, :], seq_len) loss.backward() epoch_loss_df = pd.DataFrame({ @@ -378,9 +364,7 @@ def fit_sequences(self, sequences, context_types, data_types): self.loss_values = epoch_loss_df if self.verbose: - iterator.set_description( - pbar_description.format(loss=loss.item()) - ) + iterator.set_description(pbar_description.format(loss=loss.item())) optimizer.step() @@ -417,9 +401,7 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): dist = torch.distributions.normal.Normal( mu[: seq_len[i], i], sigma[: seq_len[i], i] ) - log_likelihood += torch.sum( - dist.log_prob(X_padded[-seq_len[i] :, i, mu_idx]) - ) + log_likelihood += torch.sum(dist.log_prob(X_padded[-seq_len[i] :, i, mu_idx])) p_true = X_padded[: seq_len[i], i, missing_idx] p_pred = missing[: seq_len[i], i] @@ -430,25 +412,18 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] - r = ( - torch.nn.functional.softplus(Y_padded[:, :, r_idx]) - * props['range'] - ) + r = torch.nn.functional.softplus(Y_padded[:, :, r_idx]) * props['range'] p = torch.sigmoid(Y_padded[:, :, p_idx]) x = X_padded[:, :, r_idx] * props['range'] missing = torch.nn.LogSigmoid()(Y_padded[:, :, missing_idx]) for i in range(batch_size): - dist = ( - torch.distributions.negative_binomial.NegativeBinomial( - r[: seq_len[i], i], - p[: seq_len[i], i], - validate_args=False, - ) - ) - log_likelihood += torch.sum( - dist.log_prob(x[: seq_len[i], i]) + dist = torch.distributions.negative_binomial.NegativeBinomial( + r[: seq_len[i], i], + p[: seq_len[i], i], + validate_args=False, ) + log_likelihood += torch.sum(dist.log_prob(x[: seq_len[i], i])) p_true = X_padded[: seq_len[i], i, missing_idx] p_pred = missing[: seq_len[i], i] @@ -459,24 +434,18 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): elif props['type'] in ['categorical', 'ordinal']: idx = list(props['indices'].values()) - log_softmax = torch.nn.functional.log_softmax( - Y_padded[:, :, idx], dim=2 - ) + log_softmax = torch.nn.functional.log_softmax(Y_padded[:, :, idx], dim=2) for i in range(batch_size): target = X_padded[: seq_len[i], i, idx] predicted = log_softmax[: seq_len[i], i] target = torch.argmax(target, dim=1).unsqueeze(dim=1) - log_likelihood += torch.sum( - predicted.gather(dim=1, index=target) - ) + log_likelihood += torch.sum(predicted.gather(dim=1, index=target)) else: raise ValueError() - return -log_likelihood / ( - batch_size * len(self._data_map) * batch_size - ) + return -log_likelihood / (batch_size * len(self._data_map) * batch_size) def _tensor_to_data(self, x): # Force CPU on x @@ -497,19 +466,14 @@ def _tensor_to_data(self, x): if (x[i, 0, missing_idx] > 0) and props['nulls']: data[key].append(None) else: - data[key].append( - x[i, 0, mu_idx].item() * props['std'] + props['mu'] - ) + data[key].append(x[i, 0, mu_idx].item() * props['std'] + props['mu']) elif props['type'] in ['count']: r_idx, _p_idx, missing_idx = props['indices'] if x[i, 0, missing_idx] > 0 and props['nulls']: data[key].append(None) else: - sample = ( - x[i, 0, r_idx].item() * props['range'] - + props['min'] - ) + sample = x[i, 0, r_idx].item() * props['range'] + props['min'] data[key].append(int(sample)) elif props['type'] in ['categorical', 'ordinal']: @@ -541,40 +505,25 @@ def _sample_state(self, x): x[0, 0, sigma_idx] = 0.0 log_likelihood += torch.sum(dist.log_prob(x[0, 0, mu_idx])) - dist = torch.distributions.Bernoulli( - torch.sigmoid(x[0, 0, missing_idx]) - ) + dist = torch.distributions.Bernoulli(torch.sigmoid(x[0, 0, missing_idx])) x[0, 0, missing_idx] = dist.sample() - x[0, 0, mu_idx] = x[0, 0, mu_idx] * ( - 1.0 - x[0, 0, missing_idx] - ) - log_likelihood += torch.sum( - dist.log_prob(x[0, 0, missing_idx]) - ) + x[0, 0, mu_idx] = x[0, 0, mu_idx] * (1.0 - x[0, 0, missing_idx]) + log_likelihood += torch.sum(dist.log_prob(x[0, 0, missing_idx])) elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] - r = ( - torch.nn.functional.softplus(x[0, 0, r_idx]) - * props['range'] - ) + r = torch.nn.functional.softplus(x[0, 0, r_idx]) * props['range'] p = torch.sigmoid(x[0, 0, p_idx]) - dist = torch.distributions.negative_binomial.NegativeBinomial( - r, p - ) + dist = torch.distributions.negative_binomial.NegativeBinomial(r, p) x[0, 0, r_idx] = dist.sample() x[0, 0, p_idx] = 0.0 log_likelihood += torch.sum(dist.log_prob(x[0, 0, r_idx])) x[0, 0, r_idx] /= props['range'] - dist = torch.distributions.Bernoulli( - torch.sigmoid(x[0, 0, missing_idx]) - ) + dist = torch.distributions.Bernoulli(torch.sigmoid(x[0, 0, missing_idx])) x[0, 0, missing_idx] = dist.sample() x[0, 0, r_idx] = x[0, 0, r_idx] * (1.0 - x[0, 0, missing_idx]) - log_likelihood += torch.sum( - dist.log_prob(x[0, 0, missing_idx]) - ) + log_likelihood += torch.sum(dist.log_prob(x[0, 0, missing_idx])) elif props['type'] in ['categorical', 'ordinal']: idx = list(props['indices'].values()) @@ -600,19 +549,12 @@ def _sample_sequence(self, context, min_length, max_length): next_x, ll = self._sample_state(self._model(x, context)[-1:, :, :]) x = torch.cat([x, next_x], dim=0) log_likelihood += ll - if ( - next_x[0, 0, self._data_map['']['indices']['']] - > 0.0 - ): + if next_x[0, 0, self._data_map['']['indices']['']] > 0.0: if min_length <= step + 1 <= max_length: break # received end token - next_x[ - 0, 0, self._data_map['']['indices'][''] - ] = 1.0 - next_x[0, 0, self._data_map['']['indices']['']] = ( - 0.0 - ) + next_x[0, 0, self._data_map['']['indices']['']] = 1.0 + next_x[0, 0, self._data_map['']['indices']['']] = 0.0 return x[1:, :, :], log_likelihood @@ -647,9 +589,7 @@ def sample_sequence(self, context, sequence_length=None): best_x, best_ll = None, float('-inf') for _ in range(self.sample_size): with torch.no_grad(): - x, log_likelihood = self._sample_sequence( - context, min_length, max_length - ) + x, log_likelihood = self._sample_sequence(context, min_length, max_length) if log_likelihood > best_ll: best_x = x diff --git a/deepecho/sequences.py b/deepecho/sequences.py index 785bbe6..847c511 100644 --- a/deepecho/sequences.py +++ b/deepecho/sequences.py @@ -65,9 +65,7 @@ def segment_by_time(sequence, segment_size, sequence_index): return sequences -def segment_sequence( - sequence, segment_size, sequence_index, drop_sequence_index=True -): +def segment_sequence(sequence, segment_size, sequence_index, drop_sequence_index=True): """Segment the sequence in segments of the indicated time length or size. If a ``sequence_index`` is given, data will be sorted by it first. @@ -112,9 +110,7 @@ def _convert_to_dicts(segments, context_columns): if context_columns: context = segment[context_columns] if len(context.drop_duplicates()) > 1: - raise ValueError( - 'Context columns are not constant within each segment.' - ) + raise ValueError('Context columns are not constant within each segment.') context = context.iloc[0].values segment = segment.drop(context_columns, axis=1) @@ -179,21 +175,15 @@ def assemble_sequences( List of ``pandas.DataFrames`` containing each segment. """ if not entity_columns: - segments = segment_sequence( - data, segment_size, sequence_index, drop_sequence_index - ) + segments = segment_sequence(data, segment_size, sequence_index, drop_sequence_index) else: segments = [] - groupby_columns = ( - entity_columns[0] if len(entity_columns) == 1 else entity_columns - ) + groupby_columns = entity_columns[0] if len(entity_columns) == 1 else entity_columns for _, sequence in data.groupby(groupby_columns): sequence.drop(entity_columns, axis=1, inplace=True) if context_columns: if len(sequence[context_columns].drop_duplicates()) > 1: - raise ValueError( - 'Context columns are not constant within each entity.' - ) + raise ValueError('Context columns are not constant within each entity.') entity_segments = segment_sequence( sequence, segment_size, sequence_index, drop_sequence_index diff --git a/pyproject.toml b/pyproject.toml index df1502c..e712f26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,7 @@ build-backend = 'setuptools.build_meta' [tool.ruff] preview = true -line-length = 79 +line-length = 99 src = ["deepecho"] target-version = "py312" exclude = [ diff --git a/tasks.py b/tasks.py index 9c33b12..2ed53f3 100644 --- a/tasks.py +++ b/tasks.py @@ -31,9 +31,7 @@ def check_dependencies(c): @task def integration(c): - c.run( - 'python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml' - ) + c.run('python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml') @task @@ -52,18 +50,12 @@ def _get_minimum_versions(dependencies, python_version): req = Requirement(dependency) if ';' in dependency: marker = req.marker - if marker and not marker.evaluate({ - 'python_version': python_version - }): + if marker and not marker.evaluate({'python_version': python_version}): continue # Skip this dependency if the marker does not apply to the current Python version if req.name not in min_versions: min_version = next( - ( - spec.version - for spec in req.specifier - if spec.operator in ('>=', '==') - ), + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), None, ) if min_version: @@ -72,11 +64,7 @@ def _get_minimum_versions(dependencies, python_version): elif '@' not in min_versions[req.name]: existing_version = Version(min_versions[req.name].split('==')[1]) new_version = next( - ( - spec.version - for spec in req.specifier - if spec.operator in ('>=', '==') - ), + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), existing_version, ) if new_version > existing_version: @@ -126,9 +114,7 @@ def readme(c): @task def tutorials(c): - for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob( - 'tutorials/**/*.ipynb' - ): + for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob('tutorials/**/*.ipynb'): if '.ipynb_checkpoints' not in ipynb_file: c.run( ( diff --git a/tests/unit/test_sequences.py b/tests/unit/test_sequences.py index cca33cc..019a480 100644 --- a/tests/unit/test_sequences.py +++ b/tests/unit/test_sequences.py @@ -52,9 +52,7 @@ def test_segment_by_time(): 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'b': [9, 8, 7, 6, 5, 4, 3, 2, 1], }) - sequence_index = pd.date_range( - start='2001-01-01', periods=9, freq='1d' - ).to_series() + sequence_index = pd.date_range(start='2001-01-01', periods=9, freq='1d').to_series() segment_size = pd.to_timedelta('3d') out = segment_by_time(sequence, segment_size, sequence_index) @@ -276,9 +274,7 @@ def test__assemble_sequences_entity_and_time_segment_size(): 'c': [9, 8, 7, 6], 'time': pd.date_range(start='2001-01-01', periods=4, freq='1d'), }) - out = assemble_sequences( - data, entity_columns, context_columns, pd.to_timedelta('2d'), 'time' - ) + out = assemble_sequences(data, entity_columns, context_columns, pd.to_timedelta('2d'), 'time') assert isinstance(out, list) assert out == [