diff --git a/Makefile b/Makefile index 7989545..1ae4df3 100644 --- a/Makefile +++ b/Makefile @@ -81,9 +81,6 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint-deepecho lint-deepecho: ## check style with flake8 and isort - flake8 deepecho - isort -c --recursive deepecho - pylint deepecho --rcfile=setup.cfg .PHONY: lint-tests lint-tests: ## check style with flake8 and isort @@ -92,17 +89,15 @@ lint-tests: ## check style with flake8 and isort .PHONY: lint lint: ## Run all code style checks - invoke lint + ruff check . + ruff format . --check .PHONY: fix-lint -fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find deepecho tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive deepecho tests - isort --apply --atomic --recursive deepecho tests - +fix-lint: ## fix lint issues using ruff + ruff check --fix . + ruff format . # TEST TARGETS - .PHONY: test-unit test-unit: ## run unit tests using pytest invoke unit diff --git a/deepecho/demo.py b/deepecho/demo.py index 56984fa..229a39b 100644 --- a/deepecho/demo.py +++ b/deepecho/demo.py @@ -9,4 +9,6 @@ def load_demo(): """Load the demo DataFrame.""" - return pd.read_csv(os.path.join(_DATA_PATH, 'demo.csv'), parse_dates=['date']) + return pd.read_csv( + os.path.join(_DATA_PATH, 'demo.csv'), parse_dates=['date'] + ) diff --git a/deepecho/models/base.py b/deepecho/models/base.py index fa44058..1bfda64 100644 --- a/deepecho/models/base.py +++ b/deepecho/models/base.py @@ -6,7 +6,7 @@ from deepecho.sequences import assemble_sequences -class DeepEcho(): +class DeepEcho: """The base class for DeepEcho models.""" _verbose = True @@ -28,7 +28,13 @@ def _validate(sequences, context_types, data_types): data_types: See `fit`. """ - dtypes = set(['continuous', 'categorical', 'ordinal', 'count', 'datetime']) + dtypes = set([ + 'continuous', + 'categorical', + 'ordinal', + 'count', + 'datetime', + ]) assert all(dtype in dtypes for dtype in context_types) assert all(dtype in dtypes for dtype in data_types) @@ -94,13 +100,22 @@ def _get_data_types(data, data_types, columns): elif kind == 'M': dtypes_list.append('datetime') else: - error = f'Unsupported data_type for column {column}: {dtype}' + error = ( + f'Unsupported data_type for column {column}: {dtype}' + ) raise ValueError(error) return dtypes_list - def fit(self, data, entity_columns=None, context_columns=None, - data_types=None, segment_size=None, sequence_index=None): + def fit( + self, + data, + entity_columns=None, + context_columns=None, + data_types=None, + segment_size=None, + sequence_index=None, + ): """Fit the model to a dataframe containing time series data. Args: @@ -131,7 +146,9 @@ def fit(self, data, entity_columns=None, context_columns=None, such as integer values or datetimes. """ if not entity_columns and segment_size is None: - raise TypeError('If the data has no `entity_columns`, `segment_size` must be given.') + raise TypeError( + 'If the data has no `entity_columns`, `segment_size` must be given.' + ) if segment_size is not None and not isinstance(segment_size, int): if sequence_index is None: raise TypeError( @@ -159,9 +176,16 @@ def fit(self, data, entity_columns=None, context_columns=None, self._data_columns.remove(sequence_index) data_types = self._get_data_types(data, data_types, self._data_columns) - context_types = self._get_data_types(data, data_types, self._context_columns) + context_types = self._get_data_types( + data, data_types, self._context_columns + ) sequences = assemble_sequences( - data, self._entity_columns, self._context_columns, segment_size, sequence_index) + data, + self._entity_columns, + self._context_columns, + segment_size, + sequence_index, + ) # Validate and fit self._validate(sequences, context_types, data_types) @@ -212,7 +236,9 @@ def sample(self, num_entities=None, context=None, sequence_length=None): """ if context is None: if num_entities is None: - raise TypeError('Either context or num_entities must be not None') + raise TypeError( + 'Either context or num_entities must be not None' + ) context = self._context_values.sample(num_entities, replace=True) context = context.reset_index(drop=True) @@ -242,7 +268,7 @@ def sample(self, num_entities=None, context=None, sequence_length=None): # Reformat as a DataFrame group = pd.DataFrame( dict(zip(self._data_columns, sequence)), - columns=self._data_columns + columns=self._data_columns, ) group[self._entity_columns] = entity_values for column, value in zip(self._context_columns, context_values): diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py index 94a7979..734dafa 100644 --- a/deepecho/models/basic_gan.py +++ b/deepecho/models/basic_gan.py @@ -13,10 +13,15 @@ def _expand_context(data, context): - return torch.cat([ - data, - context.unsqueeze(0).expand(data.shape[0], context.shape[0], context.shape[1]) - ], dim=2) + return torch.cat( + [ + data, + context.unsqueeze(0).expand( + data.shape[0], context.shape[0], context.shape[1] + ), + ], + dim=2, + ) class BasicGenerator(torch.nn.Module): @@ -47,7 +52,9 @@ class BasicGenerator(torch.nn.Module): Device to which this Module is associated to. """ - def __init__(self, context_size, latent_size, hidden_size, data_size, device): + def __init__( + self, context_size, latent_size, hidden_size, data_size, device + ): super().__init__() self.latent_size = latent_size self.rnn = torch.nn.GRU(context_size + latent_size, hidden_size) @@ -65,7 +72,7 @@ def forward(self, context=None, sequence_length=None): """ latent = torch.randn( size=(sequence_length, context.size(0), self.latent_size), - device=self.device + device=self.device, ) latent = _expand_context(latent, context) @@ -150,8 +157,16 @@ class BasicGANModel(DeepEcho): _model_data_size = None _generator = None - def __init__(self, epochs=1024, latent_size=32, hidden_size=16, - gen_lr=1e-3, dis_lr=1e-3, cuda=True, verbose=True): + def __init__( + self, + epochs=1024, + latent_size=32, + hidden_size=16, + gen_lr=1e-3, + dis_lr=1e-3, + cuda=True, + verbose=True, + ): self._epochs = epochs self._gen_lr = gen_lr self._dis_lr = dis_lr @@ -211,7 +226,7 @@ def _index_map(columns, types): 'type': column_type, 'min': np.min(values), 'max': np.max(values), - 'indices': (dimensions, dimensions + 1) + 'indices': (dimensions, dimensions + 1), } dimensions += 2 @@ -221,10 +236,7 @@ def _index_map(columns, types): indices[value] = dimensions dimensions += 1 - mapping[column] = { - 'type': column_type, - 'indices': indices - } + mapping[column] = {'type': column_type, 'indices': indices} else: raise ValueError(f'Unsupported type: {column_type}') @@ -239,21 +251,31 @@ def _analyze_data(self, sequences, context_types, data_types): - Index map and dimensions for the context. - Index map and dimensions for the data. """ - sequence_lengths = np.array([len(sequence['data'][0]) for sequence in sequences]) + sequence_lengths = np.array([ + len(sequence['data'][0]) for sequence in sequences + ]) self._max_sequence_length = np.max(sequence_lengths) - self._fixed_length = (sequence_lengths == self._max_sequence_length).all() + self._fixed_length = ( + sequence_lengths == self._max_sequence_length + ).all() # Concatenate all the context sequences together context = [] for column in range(len(context_types)): - context.append([sequence['context'][column] for sequence in sequences]) + context.append([ + sequence['context'][column] for sequence in sequences + ]) - self._context_map, self._context_size = self._index_map(context, context_types) + self._context_map, self._context_size = self._index_map( + context, context_types + ) # Concatenate all the data sequences together data = [] for column in range(len(data_types)): - data.append(sum([sequence['data'][column] for sequence in sequences], [])) + data.append( + sum([sequence['data'][column] for sequence in sequences], []) + ) self._data_map, self._data_size = self._index_map(data, data_types) @@ -317,7 +339,7 @@ def _value_to_tensor(self, tensor, value, properties): self._one_hot_encode(tensor, value, properties) else: - raise ValueError() # Theoretically unreachable + raise ValueError() # Theoretically unreachable def _data_to_tensor(self, data): """Convert the input data to the corresponding tensor. @@ -366,11 +388,13 @@ def _tensor_to_data(self, tensor): for row in range(sequence_length): if column_type in ('continuous', 'count'): round_value = column_type == 'count' - value = self._denormalize(tensor, row, properties, round_value=round_value) + value = self._denormalize( + tensor, row, properties, round_value=round_value + ) elif column_type in ('categorical', 'ordinal'): value = self._one_hot_decode(tensor, row, properties) else: - raise ValueError() # Theoretically unreachable + raise ValueError() # Theoretically unreachable column_data.append(value) @@ -394,10 +418,14 @@ def _transform(self, data): if column_type in ('continuous', 'count'): value_idx, missing_idx = properties['indices'] data[:, :, value_idx] = torch.tanh(data[:, :, value_idx]) - data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx]) + data[:, :, missing_idx] = torch.sigmoid( + data[:, :, missing_idx] + ) elif column_type in ('categorical', 'ordinal'): indices = list(properties['indices'].values()) - data[:, :, indices] = torch.nn.functional.softmax(data[:, :, indices]) + data[:, :, indices] = torch.nn.functional.softmax( + data[:, :, indices] + ) return data @@ -412,7 +440,7 @@ def _truncate(self, generated): end_flag = sequence[:, self._data_size] if (end_flag == 1.0).any(): cut_idx = end_flag.detach().cpu().numpy().argmax() - sequence[cut_idx + 1:] = 0.0 + sequence[cut_idx + 1 :] = 0.0 def _generate(self, context, sequence_length=None): generated = self._generator( @@ -426,7 +454,9 @@ def _generate(self, context, sequence_length=None): return generated - def _discriminator_step(self, discriminator, discriminator_opt, data_context, context): + def _discriminator_step( + self, discriminator, discriminator_opt, data_context, context + ): real_scores = discriminator(data_context) fake = self._generate(context) @@ -470,8 +500,12 @@ def _build_fit_artifacts(self): hidden_size=self._hidden_size, ).to(self._device) - generator_opt = torch.optim.Adam(self._generator.parameters(), lr=self._gen_lr) - discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=self._dis_lr) + generator_opt = torch.optim.Adam( + self._generator.parameters(), lr=self._gen_lr + ) + discriminator_opt = torch.optim.Adam( + discriminator.parameters(), lr=self._dis_lr + ) return discriminator, generator_opt, discriminator_opt @@ -513,11 +547,17 @@ def fit_sequences(self, sequences, context_types, data_types): """ self._analyze_data(sequences, context_types, data_types) - data = self._build_tensor(self._data_to_tensor, sequences, 'data', dim=1) - context = self._build_tensor(self._context_to_tensor, sequences, 'context', dim=0) + data = self._build_tensor( + self._data_to_tensor, sequences, 'data', dim=1 + ) + context = self._build_tensor( + self._context_to_tensor, sequences, 'context', dim=0 + ) data_context = _expand_context(data, context) - discriminator, generator_opt, discriminator_opt = self._build_fit_artifacts() + discriminator, generator_opt, discriminator_opt = ( + self._build_fit_artifacts() + ) iterator = range(self._epochs) if self._verbose: @@ -539,7 +579,9 @@ def fit_sequences(self, sequences, context_types, data_types): if self._verbose: d_loss = discriminator_score.item() g_loss = generator_score.item() - iterator.set_description(f'Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}') + iterator.set_description( + f'Epoch {epoch + 1} | D Loss {d_loss} | G Loss {g_loss}' + ) def sample_sequence(self, context, sequence_length=None): """Sample a single sequence conditioned on context. @@ -554,7 +596,9 @@ def sample_sequence(self, context, sequence_length=None): A list of lists (data) corresponding to the types specified in data_types when fit was called. """ - context = self._context_to_tensor(context).unsqueeze(0).to(self._device) + context = ( + self._context_to_tensor(context).unsqueeze(0).to(self._device) + ) with torch.no_grad(): generated = self._generate(context, sequence_length) diff --git a/deepecho/models/par.py b/deepecho/models/par.py index a9009bf..48a3094 100644 --- a/deepecho/models/par.py +++ b/deepecho/models/par.py @@ -27,24 +27,38 @@ def forward(self, x, c): if isinstance(x, torch.nn.utils.rnn.PackedSequence): x, lengths = torch.nn.utils.rnn.pad_packed_sequence(x) if self.context_size: - x = torch.cat([ - x, - c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]) - ], dim=2) + x = torch.cat( + [ + x, + c.unsqueeze(0).expand( + x.shape[0], c.shape[0], c.shape[1] + ), + ], + dim=2, + ) x = self.down(x) - x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False) + x = torch.nn.utils.rnn.pack_padded_sequence( + x, lengths, enforce_sorted=False + ) x, _ = self.rnn(x) x, lengths = torch.nn.utils.rnn.pad_packed_sequence(x) x = self.up(x) - x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False) + x = torch.nn.utils.rnn.pack_padded_sequence( + x, lengths, enforce_sorted=False + ) else: if self.context_size: - x = torch.cat([ - x, - c.unsqueeze(0).expand(x.shape[0], c.shape[0], c.shape[1]) - ], dim=2) + x = torch.cat( + [ + x, + c.unsqueeze(0).expand( + x.shape[0], c.shape[0], c.shape[1] + ), + ], + dim=2, + ) x = self.down(x) x, _ = self.rnn(x) @@ -126,7 +140,7 @@ def _idx_map(self, x, t): 'mu': np.nanmean(x[i]), 'std': np.nanstd(x[i]), 'nulls': pd.isnull(x[i]).any(), - 'indices': (idx, idx + 1, idx + 2) + 'indices': (idx, idx + 1, idx + 2), } idx += 3 @@ -136,15 +150,12 @@ def _idx_map(self, x, t): 'min': np.nanmin(x[i]), 'range': np.nanmax(x[i]) - np.nanmin(x[i]), 'nulls': pd.isnull(x[i]).any(), - 'indices': (idx, idx + 1, idx + 2) + 'indices': (idx, idx + 1, idx + 2), } idx += 3 elif t == 'categorical' or t == 'ordinal': - idx_map[i] = { - 'type': t, - 'indices': {} - } + idx_map[i] = {'type': t, 'indices': {}} idx += 1 for v in set(x[i]): if pd.isnull(v): @@ -186,8 +197,8 @@ def _build(self, sequences, context_types, data_types): 'indices': { '': self._data_dims, '': self._data_dims + 1, - '': self._data_dims + 2 - } + '': self._data_dims + 2, + }, } self._data_dims += 3 @@ -220,12 +231,17 @@ def _data_to_tensor(self, data): if pd.isnull(data[key][i]) or props['range'] == 0: x[r_idx] = 0.0 else: - x[r_idx] = (data[key][i] - props['min']) / props['range'] + x[r_idx] = (data[key][i] - props['min']) / props[ + 'range' + ] x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0 - elif props['type'] in ['categorical', 'ordinal']: # categorical + elif props['type'] in [ + 'categorical', + 'ordinal', + ]: # categorical value = data[key][i] if pd.isnull(value): value = None @@ -250,15 +266,21 @@ def _context_to_tensor(self, context): for key, props in self._ctx_map.items(): if props['type'] in ['continuous', 'datetime']: mu_idx, sigma_idx, missing_idx = props['indices'] - x[mu_idx] = 0.0 if (pd.isnull(context[key]) or props['std'] == 0) else ( - context[key] - props['mu']) / props['std'] + x[mu_idx] = ( + 0.0 + if (pd.isnull(context[key]) or props['std'] == 0) + else (context[key] - props['mu']) / props['std'] + ) x[sigma_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] - x[r_idx] = 0.0 if (pd.isnull(context[key]) or props['range'] == 0) else ( - context[key] - props['min']) / props['range'] + x[r_idx] = ( + 0.0 + if (pd.isnull(context[key]) or props['range'] == 0) + else (context[key] - props['min']) / props['range'] + ) x[p_idx] = 0.0 x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0 @@ -315,7 +337,9 @@ def fit_sequences(self, sequences, context_types, data_types): X.append(self._data_to_tensor(sequence['data'])) C.append(self._context_to_tensor(sequence['context'])) - X = torch.nn.utils.rnn.pack_sequence(X, enforce_sorted=False).to(self.device) + X = torch.nn.utils.rnn.pack_sequence(X, enforce_sorted=False).to( + self.device + ) if self._ctx_dims: C = torch.stack(C, dim=0).to(self.device) @@ -336,22 +360,27 @@ def fit_sequences(self, sequences, context_types, data_types): Y_padded, _ = torch.nn.utils.rnn.pad_packed_sequence(Y) optimizer.zero_grad() - loss = self._compute_loss(X_padded[1:, :, :], Y_padded[:-1, :, :], seq_len) + loss = self._compute_loss( + X_padded[1:, :, :], Y_padded[:-1, :, :], seq_len + ) loss.backward() epoch_loss_df = pd.DataFrame({ 'Epoch': [epoch], - 'Loss': [loss.item()] + 'Loss': [loss.item()], }) if not self.loss_values.empty: - self.loss_values = pd.concat( - [self.loss_values, epoch_loss_df] - ).reset_index(drop=True) + self.loss_values = pd.concat([ + self.loss_values, + epoch_loss_df, + ]).reset_index(drop=True) else: self.loss_values = epoch_loss_df if self.verbose: - iterator.set_description(pbar_description.format(loss=loss.item())) + iterator.set_description( + pbar_description.format(loss=loss.item()) + ) optimizer.step() @@ -375,7 +404,7 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): This list contains the length of each sequence. """ log_likelihood = 0.0 - _, batch_size, input_size = X_padded.shape + _, batch_size, _input_size = X_padded.shape for key, props in self._data_map.items(): if props['type'] in ['continuous', 'timestamp']: @@ -386,47 +415,68 @@ def _compute_loss(self, X_padded, Y_padded, seq_len): for i in range(batch_size): dist = torch.distributions.normal.Normal( - mu[:seq_len[i], i], sigma[:seq_len[i], i]) - log_likelihood += torch.sum(dist.log_prob(X_padded[-seq_len[i]:, i, mu_idx])) - - p_true = X_padded[:seq_len[i], i, missing_idx] - p_pred = missing[:seq_len[i], i] + mu[: seq_len[i], i], sigma[: seq_len[i], i] + ) + log_likelihood += torch.sum( + dist.log_prob(X_padded[-seq_len[i] :, i, mu_idx]) + ) + + p_true = X_padded[: seq_len[i], i, missing_idx] + p_pred = missing[: seq_len[i], i] log_likelihood += torch.sum(p_true * p_pred) - log_likelihood += torch.sum((1.0 - p_true) * torch.log( - 1.0 - torch.exp(p_pred))) + log_likelihood += torch.sum( + (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) + ) elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] - r = torch.nn.functional.softplus(Y_padded[:, :, r_idx]) * props['range'] + r = ( + torch.nn.functional.softplus(Y_padded[:, :, r_idx]) + * props['range'] + ) p = torch.sigmoid(Y_padded[:, :, p_idx]) x = X_padded[:, :, r_idx] * props['range'] missing = torch.nn.LogSigmoid()(Y_padded[:, :, missing_idx]) for i in range(batch_size): - dist = torch.distributions.negative_binomial.NegativeBinomial( - r[:seq_len[i], i], p[:seq_len[i], i], validate_args=False) - log_likelihood += torch.sum(dist.log_prob(x[:seq_len[i], i])) - - p_true = X_padded[:seq_len[i], i, missing_idx] - p_pred = missing[:seq_len[i], i] + dist = ( + torch.distributions.negative_binomial.NegativeBinomial( + r[: seq_len[i], i], + p[: seq_len[i], i], + validate_args=False, + ) + ) + log_likelihood += torch.sum( + dist.log_prob(x[: seq_len[i], i]) + ) + + p_true = X_padded[: seq_len[i], i, missing_idx] + p_pred = missing[: seq_len[i], i] log_likelihood += torch.sum(p_true * p_pred) - log_likelihood += torch.sum((1.0 - p_true) * torch.log( - 1.0 - torch.exp(p_pred))) + log_likelihood += torch.sum( + (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)) + ) elif props['type'] in ['categorical', 'ordinal']: idx = list(props['indices'].values()) - log_softmax = torch.nn.functional.log_softmax(Y_padded[:, :, idx], dim=2) + log_softmax = torch.nn.functional.log_softmax( + Y_padded[:, :, idx], dim=2 + ) for i in range(batch_size): - target = X_padded[:seq_len[i], i, idx] - predicted = log_softmax[:seq_len[i], i] + target = X_padded[: seq_len[i], i, idx] + predicted = log_softmax[: seq_len[i], i] target = torch.argmax(target, dim=1).unsqueeze(dim=1) - log_likelihood += torch.sum(predicted.gather(dim=1, index=target)) + log_likelihood += torch.sum( + predicted.gather(dim=1, index=target) + ) else: raise ValueError() - return -log_likelihood / (batch_size * len(self._data_map) * batch_size) + return -log_likelihood / ( + batch_size * len(self._data_map) * batch_size + ) def _tensor_to_data(self, x): # Force CPU on x @@ -443,18 +493,23 @@ def _tensor_to_data(self, x): data[key] = [] for i in range(seq_len): if props['type'] in ['continuous', 'datetime']: - mu_idx, sigma_idx, missing_idx = props['indices'] + mu_idx, _sigma_idx, missing_idx = props['indices'] if (x[i, 0, missing_idx] > 0) and props['nulls']: data[key].append(None) else: - data[key].append(x[i, 0, mu_idx].item() * props['std'] + props['mu']) + data[key].append( + x[i, 0, mu_idx].item() * props['std'] + props['mu'] + ) elif props['type'] in ['count']: - r_idx, p_idx, missing_idx = props['indices'] + r_idx, _p_idx, missing_idx = props['indices'] if x[i, 0, missing_idx] > 0 and props['nulls']: data[key].append(None) else: - sample = x[i, 0, r_idx].item() * props['range'] + props['min'] + sample = ( + x[i, 0, r_idx].item() * props['range'] + + props['min'] + ) data[key].append(int(sample)) elif props['type'] in ['categorical', 'ordinal']: @@ -473,7 +528,7 @@ def _tensor_to_data(self, x): def _sample_state(self, x): log_likelihood = 0.0 - seq_len, batch_size, input_size = x.shape + seq_len, batch_size, _input_size = x.shape assert seq_len == 1 and batch_size == 1 for key, props in self._data_map.items(): @@ -486,25 +541,40 @@ def _sample_state(self, x): x[0, 0, sigma_idx] = 0.0 log_likelihood += torch.sum(dist.log_prob(x[0, 0, mu_idx])) - dist = torch.distributions.Bernoulli(torch.sigmoid(x[0, 0, missing_idx])) + dist = torch.distributions.Bernoulli( + torch.sigmoid(x[0, 0, missing_idx]) + ) x[0, 0, missing_idx] = dist.sample() - x[0, 0, mu_idx] = x[0, 0, mu_idx] * (1.0 - x[0, 0, missing_idx]) - log_likelihood += torch.sum(dist.log_prob(x[0, 0, missing_idx])) + x[0, 0, mu_idx] = x[0, 0, mu_idx] * ( + 1.0 - x[0, 0, missing_idx] + ) + log_likelihood += torch.sum( + dist.log_prob(x[0, 0, missing_idx]) + ) elif props['type'] in ['count']: r_idx, p_idx, missing_idx = props['indices'] - r = torch.nn.functional.softplus(x[0, 0, r_idx]) * props['range'] + r = ( + torch.nn.functional.softplus(x[0, 0, r_idx]) + * props['range'] + ) p = torch.sigmoid(x[0, 0, p_idx]) - dist = torch.distributions.negative_binomial.NegativeBinomial(r, p) + dist = torch.distributions.negative_binomial.NegativeBinomial( + r, p + ) x[0, 0, r_idx] = dist.sample() x[0, 0, p_idx] = 0.0 log_likelihood += torch.sum(dist.log_prob(x[0, 0, r_idx])) x[0, 0, r_idx] /= props['range'] - dist = torch.distributions.Bernoulli(torch.sigmoid(x[0, 0, missing_idx])) + dist = torch.distributions.Bernoulli( + torch.sigmoid(x[0, 0, missing_idx]) + ) x[0, 0, missing_idx] = dist.sample() x[0, 0, r_idx] = x[0, 0, r_idx] * (1.0 - x[0, 0, missing_idx]) - log_likelihood += torch.sum(dist.log_prob(x[0, 0, missing_idx])) + log_likelihood += torch.sum( + dist.log_prob(x[0, 0, missing_idx]) + ) elif props['type'] in ['categorical', 'ordinal']: idx = list(props['indices'].values()) @@ -530,12 +600,19 @@ def _sample_sequence(self, context, min_length, max_length): next_x, ll = self._sample_state(self._model(x, context)[-1:, :, :]) x = torch.cat([x, next_x], dim=0) log_likelihood += ll - if next_x[0, 0, self._data_map['']['indices']['']] > 0.0: + if ( + next_x[0, 0, self._data_map['']['indices']['']] + > 0.0 + ): if min_length <= step + 1 <= max_length: break # received end token - next_x[0, 0, self._data_map['']['indices']['']] = 1.0 - next_x[0, 0, self._data_map['']['indices']['']] = 0.0 + next_x[ + 0, 0, self._data_map['']['indices'][''] + ] = 1.0 + next_x[0, 0, self._data_map['']['indices']['']] = ( + 0.0 + ) return x[1:, :, :], log_likelihood @@ -570,7 +647,9 @@ def sample_sequence(self, context, sequence_length=None): best_x, best_ll = None, float('-inf') for _ in range(self.sample_size): with torch.no_grad(): - x, log_likelihood = self._sample_sequence(context, min_length, max_length) + x, log_likelihood = self._sample_sequence( + context, min_length, max_length + ) if log_likelihood > best_ll: best_x = x diff --git a/deepecho/sequences.py b/deepecho/sequences.py index e425053..785bbe6 100644 --- a/deepecho/sequences.py +++ b/deepecho/sequences.py @@ -65,7 +65,9 @@ def segment_by_time(sequence, segment_size, sequence_index): return sequences -def segment_sequence(sequence, segment_size, sequence_index, drop_sequence_index=True): +def segment_sequence( + sequence, segment_size, sequence_index, drop_sequence_index=True +): """Segment the sequence in segments of the indicated time length or size. If a ``sequence_index`` is given, data will be sorted by it first. @@ -110,27 +112,29 @@ def _convert_to_dicts(segments, context_columns): if context_columns: context = segment[context_columns] if len(context.drop_duplicates()) > 1: - raise ValueError('Context columns are not constant within each segment.') + raise ValueError( + 'Context columns are not constant within each segment.' + ) context = context.iloc[0].values segment = segment.drop(context_columns, axis=1) else: context = [] - lists = [ - list(row) - for _, row in segment.items() - ] - sequences.append({ - 'context': context, - 'data': lists - }) + lists = [list(row) for _, row in segment.items()] + sequences.append({'context': context, 'data': lists}) return sequences -def assemble_sequences(data, entity_columns, context_columns, segment_size, - sequence_index, drop_sequence_index=True): +def assemble_sequences( + data, + entity_columns, + context_columns, + segment_size, + sequence_index, + drop_sequence_index=True, +): """Build sequences from the data, grouping first by entity and then segmenting by size. Input is a ``pandas.DataFrame`` containing all the data, lists of entity and context @@ -175,18 +179,25 @@ def assemble_sequences(data, entity_columns, context_columns, segment_size, List of ``pandas.DataFrames`` containing each segment. """ if not entity_columns: - segments = segment_sequence(data, segment_size, sequence_index, drop_sequence_index) + segments = segment_sequence( + data, segment_size, sequence_index, drop_sequence_index + ) else: segments = [] - groupby_columns = entity_columns[0] if len(entity_columns) == 1 else entity_columns + groupby_columns = ( + entity_columns[0] if len(entity_columns) == 1 else entity_columns + ) for _, sequence in data.groupby(groupby_columns): sequence.drop(entity_columns, axis=1, inplace=True) if context_columns: if len(sequence[context_columns].drop_duplicates()) > 1: - raise ValueError('Context columns are not constant within each entity.') + raise ValueError( + 'Context columns are not constant within each entity.' + ) - entity_segments = segment_sequence(sequence, segment_size, - sequence_index, drop_sequence_index) + entity_segments = segment_sequence( + sequence, segment_size, sequence_index, drop_sequence_index + ) segments.extend(entity_segments) return _convert_to_dicts(segments, context_columns) diff --git a/pyproject.toml b/pyproject.toml index 89acad6..df1502c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,29 +57,7 @@ dev = [ 'watchdog>=0.8.3,<0.11', # style check - 'flake8>=3.7.7,<4', - 'flake8-absolute-import>=1.0,<2', - 'flake8-docstrings>=1.5.0,<2', - 'flake8-sfs>=0.0.3,<0.1', - 'isort>=4.3.4,<5', - 'pylint>=2.5.3,<3', - 'flake8-builtins>=1.5.3,<1.6', - 'flake8-debugger>=4.0.0,<4.1', - 'flake8-mock>=0.3,<0.4', - 'dlint>=0.11.0,<0.12', - 'flake8-eradicate>=1.1.0,<1.2', - 'flake8-mutable>=1.2.0,<1.3', - 'flake8-fixme>=1.1.1,<1.2', - 'flake8-multiline-containers>=0.0.18,<0.1', - 'flake8-quotes>=3.3.0,<4', - 'flake8-variables-names>=0.0.4,<0.1', - 'pep8-naming>=0.12.1,<0.13', - 'flake8-expression-complexity>=0.0.9,<0.1', - 'flake8-print>=4.0.0,<4.1', - - # fix style issues - 'autoflake>=1.1,<2', - 'autopep8>=1.4.3,<1.6', + 'ruff>=0.3.2,<1', # distribute on PyPI 'twine>=1.10.0,<4', @@ -175,3 +153,49 @@ replace = "__version__ = '{new_version}'" [build-system] requires = ['setuptools', 'wheel'] build-backend = 'setuptools.build_meta' + +[tool.ruff] +preview = true +line-length = 79 +src = ["deepecho"] +target-version = "py312" +exclude = [ + "docs", + ".tox", + ".git", + "__pycache__", + ".ipynb_checkpoints" +] + +[tool.ruff.lint] +select = [ + # Pyflakes + "F", + # Pycodestyle + "E", + "W", + # isort + "I001" +] +ignore = [ + "E501", + "D107", # Missing docstring in __init__ + "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 +] + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] + +[tool.ruff.lint.isort] +known-first-party = ["deepecho"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] + +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +preview = true + +[tool.ruff.lint.pydocstyle] +convention = "google" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 47cc2d5..0000000 --- a/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -[flake8] -max-line-length = 99 -exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints -docstring-convetion = google -extend-ignore = - SFS3, # String formating using f-string - VNE001, # Single letter variable names are not allowed. - D107, # Missing docstring in __init__ - D417 # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 -ignore-names = - X, - C, - X_padded, - Y, - Y_padded - -[aliases] -test = pytest - -[pylint] -persistent = no -extension-pkg-whitelist = numpy -generated-members = torch.* -min-similarity-lines = 5 -ignore-comments = yes -ignore-docstrings = yes -ignore-imports = yes -max-args = 10 -ignore = par.py -disable = R0914, R0902, R0903, C0102, C0209, W0703, W0223, E1102 -good-names = i, j, k, X, x, y, X_train, X_test, ex - diff --git a/tasks.py b/tasks.py index 91c3bce..9c33b12 100644 --- a/tasks.py +++ b/tasks.py @@ -16,7 +16,7 @@ '>=': operator.ge, '>': operator.gt, '<': operator.lt, - '<=': operator.le + '<=': operator.le, } @@ -31,7 +31,9 @@ def check_dependencies(c): @task def integration(c): - c.run('python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml') + c.run( + 'python -m pytest ./tests/integration --reruns 3 --cov=deepecho --cov-report=xml' + ) @task @@ -50,19 +52,37 @@ def _get_minimum_versions(dependencies, python_version): req = Requirement(dependency) if ';' in dependency: marker = req.marker - if marker and not marker.evaluate({'python_version': python_version}): + if marker and not marker.evaluate({ + 'python_version': python_version + }): continue # Skip this dependency if the marker does not apply to the current Python version if req.name not in min_versions: - min_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), None) + min_version = next( + ( + spec.version + for spec in req.specifier + if spec.operator in ('>=', '==') + ), + None, + ) if min_version: min_versions[req.name] = f'{req.name}=={min_version}' elif '@' not in min_versions[req.name]: existing_version = Version(min_versions[req.name].split('==')[1]) - new_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), existing_version) + new_version = next( + ( + spec.version + for spec in req.specifier + if spec.operator in ('>=', '==') + ), + existing_version, + ) if new_version > existing_version: - min_versions[req.name] = f'{req.name}=={new_version}' # Change when a valid newer version is found + min_versions[req.name] = ( + f'{req.name}=={new_version}' # Change when a valid newer version is found + ) return list(min_versions.values()) @@ -77,7 +97,8 @@ def install_minimum(c): minimum_versions = _get_minimum_versions(dependencies, python_version) if minimum_versions: - c.run(f'python -m pip install {" ".join(minimum_versions)}') + install_deps = ' '.join(minimum_versions) + c.run(f'python -m pip install {install_deps}') @task @@ -105,21 +126,24 @@ def readme(c): @task def tutorials(c): - for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob('tutorials/**/*.ipynb'): + for ipynb_file in glob.glob('tutorials/*.ipynb') + glob.glob( + 'tutorials/**/*.ipynb' + ): if '.ipynb_checkpoints' not in ipynb_file: - c.run(( - 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' - f'--to=html --stdout "{ipynb_file}"' - ), hide='out') + c.run( + ( + 'jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 ' + f'--to=html --stdout "{ipynb_file}"' + ), + hide='out', + ) @task def lint(c): check_dependencies(c) - c.run('flake8 deepecho') - c.run('flake8 tests') - c.run('isort -c --recursive deepecho tests') - c.run('pylint deepecho --rcfile=setup.cfg') + c.run('ruff check .') + c.run('ruff format . --check') def remove_readonly(func, path, _): diff --git a/tests/integration/test_basic_gan.py b/tests/integration/test_basic_gan.py index 8ed516d..c58d6a1 100644 --- a/tests/integration/test_basic_gan.py +++ b/tests/integration/test_basic_gan.py @@ -16,15 +16,15 @@ def test_basic(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [], 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] - } + ], + }, ] context_types = [] data_types = ['continuous', 'continuous'] @@ -41,15 +41,15 @@ def test_conditional(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'continuous'] @@ -66,15 +66,15 @@ def test_mixed(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'categorical'] @@ -91,15 +91,15 @@ def test_count(self): 'data': [ [0, 5, 5, 3, 1, 1], [0, 1, 2, 1, 0, 1], - ] + ], }, { 'context': [1.1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['continuous'] data_types = ['count', 'categorical'] @@ -116,15 +116,15 @@ def test_variable_length(self): 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], - ] + ], }, { 'context': [1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['count'] data_types = ['count', 'categorical'] diff --git a/tests/integration/test_par.py b/tests/integration/test_par.py index 4e172ec..55ce0c0 100644 --- a/tests/integration/test_par.py +++ b/tests/integration/test_par.py @@ -18,15 +18,15 @@ def test_basic(self): 'data': [ [0.0, np.nan, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], - ] + ], }, { 'context': [], 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1, np.nan], - ] - } + ], + }, ] context_types = [] data_types = ['continuous', 'continuous'] @@ -47,15 +47,15 @@ def test_conditional(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, np.nan, 0.0], - ] + ], }, { 'context': [1], 'data': [ [0.5, 0.4, 0.3, 0.2, 0.1, 0.0], [0.0, 0.1, np.nan, 0.3, 0.4, 0.5], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'continuous'] @@ -76,15 +76,15 @@ def test_mixed(self): 'data': [ [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], [0, 1, 0, 1, 0, 1], - ] + ], }, { 'context': [1], 'data': [ [0.5, np.nan, 0.3, 0.2, np.nan, 0.0], [0, 1, 0, 1, np.nan, 1], - ] - } + ], + }, ] context_types = ['categorical'] data_types = ['continuous', 'categorical'] @@ -105,15 +105,15 @@ def test_count(self): 'data': [ [0, 5, 5, np.nan, 1, 1], [0, 1, 2, 1, 0, 1], - ] + ], }, { 'context': [1.1], 'data': [ [1, 6, 6, 4, 2, 2], [0, 1, 0, 1, 0, 1], - ] - } + ], + }, ] context_types = ['continuous'] data_types = ['count', 'categorical'] @@ -134,15 +134,15 @@ def test_variable_length(self): 'data': [ [0, 5, 5, 3, 1, 1, 0], [0, 1, 2, 1, 0, 1, 2], - ] + ], }, { 'context': [1], 'data': [ [1, 6, 6, 4, 2, 2], [np.nan, 1, 0, 1, 0, np.nan], - ] - } + ], + }, ] context_types = ['count'] data_types = ['count', 'categorical'] diff --git a/tests/test_tasks.py b/tests/test_tasks.py index c78986c..9099d48 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,4 +1,5 @@ """Tests for the ``tasks.py`` file.""" + from tasks import _get_minimum_versions @@ -15,7 +16,7 @@ def test_get_minimum_versions(): "pandas>=1.2.0,<2;python_version<'3.10'", "pandas>=1.3.0,<2;python_version>='3.10'", 'humanfriendly>=8.2,<11', - 'pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas' + 'pandas @ git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', ] # Run diff --git a/tests/unit/test_sequences.py b/tests/unit/test_sequences.py index 8718b23..cca33cc 100644 --- a/tests/unit/test_sequences.py +++ b/tests/unit/test_sequences.py @@ -4,7 +4,11 @@ import pytest from deepecho.sequences import ( - assemble_sequences, segment_by_size, segment_by_time, segment_sequence) + assemble_sequences, + segment_by_size, + segment_by_time, + segment_sequence, +) def test_segment_by_size(): @@ -19,18 +23,27 @@ def test_segment_by_size(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[2], + ) def test_segment_by_time(): @@ -39,7 +52,9 @@ def test_segment_by_time(): 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'b': [9, 8, 7, 6, 5, 4, 3, 2, 1], }) - sequence_index = pd.date_range(start='2001-01-01', periods=9, freq='1d').to_series() + sequence_index = pd.date_range( + start='2001-01-01', periods=9, freq='1d' + ).to_series() segment_size = pd.to_timedelta('3d') out = segment_by_time(sequence, segment_size, sequence_index) @@ -47,18 +62,27 @@ def test_segment_by_time(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[2], + ) def test_segment_sequence(): @@ -73,18 +97,27 @@ def test_segment_sequence(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [7, 8, 9], - 'b': [3, 2, 1], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'a': [4, 5, 6], - 'b': [6, 5, 4], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [7, 8, 9], + 'b': [3, 2, 1], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'a': [4, 5, 6], + 'b': [6, 5, 4], + }), + out[2], + ) def test_segment_sequence_sequence_index(): @@ -99,15 +132,24 @@ def test_segment_sequence_sequence_index(): assert isinstance(out, list) assert len(out) == 3 - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [9, 8, 7], - }), out[0]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [6, 5, 4], - }), out[1]) - pd.testing.assert_frame_equal(pd.DataFrame({ - 'b': [3, 2, 1], - }), out[2]) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [9, 8, 7], + }), + out[0], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [6, 5, 4], + }), + out[1], + ) + pd.testing.assert_frame_equal( + pd.DataFrame({ + 'b': [3, 2, 1], + }), + out[2], + ) def test__assemble_sequences_no_entity_no_context(): @@ -234,7 +276,9 @@ def test__assemble_sequences_entity_and_time_segment_size(): 'c': [9, 8, 7, 6], 'time': pd.date_range(start='2001-01-01', periods=4, freq='1d'), }) - out = assemble_sequences(data, entity_columns, context_columns, pd.to_timedelta('2d'), 'time') + out = assemble_sequences( + data, entity_columns, context_columns, pd.to_timedelta('2d'), 'time' + ) assert isinstance(out, list) assert out == [