Fix prime dimensions (openproblems-bio#44)
* added function to find balanced divisors

* slight change

* added padding for the first_place method
ttunja authored May 22, 2024
1 parent c0fe3a5 commit baf9c73
Showing 5 changed files with 88 additions and 36 deletions.
31 changes: 0 additions & 31 deletions src/task/methods/lstm_gru_cnn_ensemble/closest_sqrt_factor.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/task/methods/lstm_gru_cnn_ensemble/config.vsh.yaml
@@ -44,7 +44,7 @@ functionality:
- path: predict.py
- path: prepare_data.py
- path: train.py
- path: closest_sqrt_factor.py
- path: divisor_finder.py

platforms:
- type: docker
67 changes: 67 additions & 0 deletions src/task/methods/lstm_gru_cnn_ensemble/divisor_finder.py
@@ -0,0 +1,67 @@
import math

def closest_sqrt_factor(x):
    """
    Finds the factor of x that is closest to the square root of x.
    Args:
        x: The number to find the closest factor of.
    Returns:
        The closest factor to the square root of x, or -1 if x is less than 1.
    """

    # Base cases
    if x < 1:
        return -1
    if x == 1:
        return 1

    # Start from the square root of x (rounded down)
    start = math.isqrt(x)

    # Check if the start value is a factor
    if x % start == 0:
        return start

    # Walk downward from start; the paired factor x // factor covers candidates above the root
    for factor in range(start, 0, -1):
        if x % factor == 0:
            other_factor = x // factor
            return min(factor, other_factor, key=lambda f: abs(f - math.sqrt(x)))

def find_balanced_divisors(n, threshold=100):
    """
    Finds the smallest number greater than or equal to n that has a pair of divisors
    whose difference is below a specified threshold.
    Args:
        n: The starting number to find balanced divisors for.
        threshold: The maximum allowable difference between the two divisors.
            Default is 100.
    Returns:
        A tuple containing the number with balanced divisors and a list of the two divisors.
    """

    # Start with the initial number n
    current_n = n
    while True:

        # Find the factor of current_n that is closest to its square root
        divisor1 = closest_sqrt_factor(current_n)

        # Calculate the corresponding divisor by dividing current_n by divisor1
        divisor2 = current_n // divisor1

        # Check that divisor2 is actually an integer
        # This is a safeguard to ensure divisor2 is a whole number
        if divisor2 != (current_n / divisor1):
            raise ValueError(f"divisor2 is not an integer: {divisor2}")

        # Check if the absolute difference between the two divisors is less than the threshold
        if abs(divisor1 - divisor2) < threshold:
            return current_n, [divisor1, divisor2]

        # If the difference is not acceptable, increment the number and try again
        current_n += 1
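For orientation, here is a small usage sketch of the new helpers; the values are worked out by hand from the definitions above, not taken from the repository's tests:

    from divisor_finder import closest_sqrt_factor, find_balanced_divisors

    # 127 is prime, so its only divisor pair is (1, 127). The difference 126
    # exceeds the default threshold of 100, so the search moves on to 128.
    print(closest_sqrt_factor(127))      # 1
    print(find_balanced_divisors(127))   # (128, [8, 16]), since 8 * 16 = 128

A prime feature dimension is therefore bumped up to the next length that factors into two roughly square divisors, which is what the commit title refers to.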
22 changes: 19 additions & 3 deletions src/task/methods/lstm_gru_cnn_ensemble/helper_functions.py
@@ -12,6 +12,7 @@
from sklearn.model_selection import KFold as KF
from models import Conv, LSTM, GRU
from helper_classes import Dataset
from divisor_finder import find_balanced_divisors

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -39,7 +40,18 @@ def one_hot_encode(data_train, data_test, out_dir):
    train_features = encoder.transform(data_train)
    test_features = encoder.transform(data_test)
    np.save(f"{out_dir}/one_hot_train.npy", train_features.toarray().astype(float))
    np.save(f"{out_dir}/one_hot_test.npy", test_features.toarray().astype(float))
    np.save(f"{out_dir}/one_hot_test.npy", test_features.toarray().astype(float))

def pad_to_balanced_shape(x, target_shape):
    current_size = list(x.shape)
    target_size = current_size[:-1] + [target_shape[0] * target_shape[1]]
    padding_needed = target_size[-1] - current_size[-1]
    if padding_needed > 0:
        padding = np.zeros(current_size[:-1] + [padding_needed], dtype=x.dtype)
        padded = np.concatenate((x, padding), axis=-1)
    else:
        padded = x
    return padded

def build_ChemBERTa_features(smiles_list):
chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
@@ -100,7 +112,12 @@ def combine_features(data_aug_dfs, chem_feats, main_df, one_hot_dfs=None, quanti
            vec_ = np.concatenate([vec_, chem_feat[i]])
        final_vec = np.concatenate([vec_,np.zeros(add_len-vec_.shape[0],)])
        new_vecs.append(final_vec)
    return np.stack(new_vecs, axis=0).astype(float).reshape(len(main_df), 1, add_len)

    new_final_vec = np.stack(new_vecs, axis=0).astype(float).reshape(len(main_df), 1, add_len)
    _, input_shape = find_balanced_divisors(new_final_vec.shape[-1])
    if input_shape[0]*input_shape[1] != new_final_vec.shape[-1]:
        new_final_vec = pad_to_balanced_shape(new_final_vec, (input_shape[0], input_shape[1]))
    return new_final_vec

def augment_data(x_, y_):
    copy_x = x_.copy()
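To show how combine_features uses the new padding, here is a hypothetical walk-through; the shapes are invented for illustration and assume the helpers above are importable from their modules:

    import numpy as np
    from divisor_finder import find_balanced_divisors
    from helper_functions import pad_to_balanced_shape

    # Suppose the concatenated feature vector has a prime length of 127.
    features = np.zeros((10, 1, 127))

    # find_balanced_divisors(127) returns (128, [8, 16]), so the target
    # flat length is 8 * 16 = 128 and one zero is appended per row.
    _, input_shape = find_balanced_divisors(features.shape[-1])
    padded = pad_to_balanced_shape(features, (input_shape[0], input_shape[1]))
    print(padded.shape)   # (10, 1, 128)

The downstream convolutional model can then reshape the last axis into an 8 x 16 grid instead of failing on a prime dimension.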
@@ -220,7 +237,6 @@ def train_validate(X_vec, X_vec_light, X_vec_heavy, y, cell_types_sm_names, conf
    trained_models = {'initial': [], 'light': [], 'heavy': []}
    print(paths["model_dir"])
    if not os.path.exists(paths["model_dir"]):
        print("MODEL DIR DID NOT EXIST!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        os.makedirs(paths["model_dir"], exist_ok=True)
    if not os.path.exists(paths["logs_dir"]):
        os.makedirs(paths["logs_dir"], exist_ok=True)
2 changes: 1 addition & 1 deletion src/task/methods/lstm_gru_cnn_ensemble/models.py
@@ -2,7 +2,7 @@
import torch.nn as nn
from helper_classes import LogCoshLoss
import numpy as np
from closest_sqrt_factor import closest_sqrt_factor
from divisor_finder import closest_sqrt_factor
from torchsummary import summary

class Conv(nn.Module):
