Skip to content

Resource allocation colormap #3453

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updates to @antgonza comments
  • Loading branch information
Gossty committed Jan 12, 2025
commit 2cb3f37194215af93cd73abfc8c77ce3fbe17f20
72 changes: 0 additions & 72 deletions notebooks/resource-allocation/upload_df.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this file? Can it be deleted?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can delete this file.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, thank you; then please rm.

This file was deleted.

6 changes: 3 additions & 3 deletions qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,7 +1343,7 @@ def test_minimize_const(self):
self.df[self.col_name] = self.df.samples * self.df['columns']
fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)

mem_models, time_models = qdb.util._retrieve_equations()
mem_models, time_models = qdb.util.retrieve_equations()
bm_name, bm, options = qdb.util._resource_allocation_plot_helper(
self.df, axs[0], 'MaxRSSRaw', mem_models, self.col_name)
# check that the algorithm chooses correct model for MaxRSSRaw and
Expand All @@ -1357,7 +1357,7 @@ def test_minimize_const(self):
msg=f"""Best memory model
doesn't match
{bm_name} != 'mem_model4'""")
self.assertEqual(bm, mem_models['mem_model4'],
self.assertEqual(bm, mem_models['mem_model4']['equation'],
msg=f"""Best memory model
doesn't match
Coefficients:{k} {a} {b}
Expand All @@ -1377,7 +1377,7 @@ def test_minimize_const(self):
doesn't match
{bm_name} != 'time_model4'""")

self.assertEqual(bm, time_models[bm_name],
self.assertEqual(bm, time_models[bm_name]['equation'],
msg=f"""Best time model
doesn't match
Coefficients:{k} {a} {b}
Expand Down
100 changes: 59 additions & 41 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,27 +82,6 @@
from scipy.optimize import minimize


# Human-readable equation for every known allocation model, keyed by the
# model identifier.
_MODEL_EQUATION_NAMES = {
    'mem_model1': "k * log(x) + x * a + b",
    'mem_model2': "k * log(x) + b * log(x)^2 + a",
    'mem_model3': "k * log(x) + b * log(x)^2 + a * log(x)^3",
    'mem_model4': "k * log(x) + b * log(x)^2 + a * log(x)^2.5",
    'time_model1': "a + b + log(x) * k",
    'time_model2': "a + b * x + log(x) * k",
    'time_model3': "a + b * log(x)^2 + log(x) * k",
    'time_model4': "a * log(x)^3 + b * log(x)^2 + log(x) * k",
}


def get_model_name(model):
    """Return the human-readable equation for an allocation model.

    Parameters
    ----------
    model : str
        Model identifier, e.g. 'mem_model1' or 'time_model4'.

    Returns
    -------
    str
        The readable equation for the model, or "Unknown model" when the
        identifier is not one of the eight known models.
    """
    try:
        return _MODEL_EQUATION_NAMES.get(model, "Unknown model")
    except TypeError:
        # An unhashable value can never equal a known identifier; the
        # comparison-based version returned the default for it as well.
        return "Unknown model"


def scrub_data(s):
r"""Scrubs data fields of characters not allowed by PostgreSQL

Expand Down Expand Up @@ -2369,7 +2348,7 @@ def resource_allocation_plot(df, col_name):
fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)

ax = axs[0]
mem_models, time_models = _retrieve_equations()
mem_models, time_models = retrieve_equations()

# models for memory
_resource_allocation_plot_helper(
Expand All @@ -2382,9 +2361,9 @@ def resource_allocation_plot(df, col_name):
return fig, axs


def _retrieve_equations():
def retrieve_equations():
'''
Helepr function for resource_allocation_plot.
Helper function for resource_allocation_plot.
Retrieves equations from db. Creates dictionary for memory and time models.

Returns
Expand All @@ -2397,16 +2376,40 @@ def _retrieve_equations():
'''
memory_models = {}
time_models = {}
res = []
with qdb.sql_connection.TRN:
sql = ''' SELECT * FROM qiita.allocation_equations; '''
qdb.sql_connection.TRN.add(sql)
res = qdb.sql_connection.TRN.execute_fetchindex()
for models in res:
if 'mem' in models[1]:
memory_models[models[1]] = lambda x, k, a, b: eval(models[2])
else:
time_models[models[1]] = lambda x, k, a, b: eval(models[2])
return (memory_models, time_models)
for models in res:
model_name = "Unknown model"
if models[1] == 'mem_model1':
model_name = "k * log(x) + x * a + b"
elif models[1] == 'mem_model2':
model_name = "k * log(x) + b * log(x)^2 + a"
elif models[1] == 'mem_model3':
model_name = "k * log(x) + b * log(x)^2 + a * log(x)^3"
elif models[1] == 'mem_model4':
model_name = "k * log(x) + b * log(x)^2 + a * log(x)^2.5"
elif models[1] == 'time_model1':
model_name = "a + b + log(x) * k"
elif models[1] == 'time_model2':
model_name = "a + b * x + log(x) * k"
elif models[1] == 'time_model3':
model_name = "a + b * log(x)^2 + log(x) * k"
elif models[1] == 'time_model4':
model_name = "a * log(x)^3 + b * log(x)^2 + log(x) * k"
if 'mem' in models[1]:
memory_models[models[1]] = {
"equation_name": model_name,
"equation": lambda x, k, a, b: eval(models[2])
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this? I mean the model_name is the expression column in the database, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have this for readability on the frontend. It was hard to read "np.log10(x)...". Maybe there's a way to automatically convert those from a lambda function to a readable equation?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I think leaving it as np.log, or using just log in the database, will be fine. Note that if you replace np.log with log in the database, you would need to change `import numpy as np` to `from numpy import log`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We previously also talked about adding parentheses to ensure the correct order of operations between systems, which makes the readability worse. Should we still keep the parentheses, or should we assume the regular order of operations? For example:
(k * (np.log(x))) + (b * ((np.log(x))**2)) + (a * ((np.log(x))**3))

Could just be k * log(x) + b * log(x)**2 + a * log(x)**3

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the extra parentheses, so we don't have to assume the order of operations.

else:
time_models[models[1]] = {
"equation_name": model_name,
"equation": lambda x, k, a, b: eval(models[2])
}
return (memory_models, time_models)


def retrieve_resource_data(cname, sname, version, columns):
Expand Down Expand Up @@ -2483,9 +2486,20 @@ def _resource_allocation_plot_helper(
Specifies x axis for the graph
curr: str, required
Either MaxRSSRaw or ElapsedRaw (y axis)
models: dictionary, required
Dictionary of functions that will be used for visualization
models: dictionary, required. Follows this structure
equation_name: string
Human readable representation of the equation
equation: Python lambda function
Lambda function representing equation to optimize

Returns
-------
best_model_name: string
the name of the best model from the table
best_model: function
best fitting function for the current dictionary models
options: object
object containing constants for the best model (e.g. k, a, b in kx+b*a)
"""

x_data, y_data = df[col_name], df[curr]
Expand Down Expand Up @@ -2560,7 +2574,7 @@ def _resource_allocation_plot_helper(
label=host)
ax.set_title(
f'k||a||b: {k}||{a}||{b}\n'
f'model: {get_model_name(best_model_name)}\n'
f'model: {models[best_model_name]["equation_name"]}\n'
f'real: {mini} || {maxi}\n'
f'calculated: {cmin} || {cmax}\n'
f'failures: {failures}')
Expand All @@ -2583,8 +2597,11 @@ def _resource_allocation_calculate(
current type (e.g. MaxRSSRaw)
col_name: str, required
Specifies x axis for the graph
models: dictionary, required
Dictionary of functions that will be used for visualization
models: dictionary, required. Follows this structure
equation_name: string
Human readable representation of the equation
equation: Python lambda function
Lambda function representing equation to optimize
depth: int, required
Maximum number of iterations in binary search
tolerance: int, required,
Expand All @@ -2607,6 +2624,7 @@ def _resource_allocation_calculate(
best_failures = np.inf
best_max = np.inf
for model_name, model in models.items():
model_equation = model['equation']
# start values for binary search, where sl is left, sr is right
# penalty weight must be positive & non-zero, hence, sl >= 1.
# the upper bound for error can be an arbitrary large number
Expand All @@ -2624,13 +2642,13 @@ def _resource_allocation_calculate(
while left < right and cnt < depth:
middle = (left + right) // 2
options = minimize(_resource_allocation_custom_loss, init,
args=(x, y, model, middle))
args=(x, y, model_equation, middle))
k, a, b = options.x
# important: here we take the 2nd (last) value of tuple since
# the helper function returns success, then failures.
failures_df = _resource_allocation_success_failures(
df, k, a, b, model, col_name, type_)[-1]
y_plot = model(x, k, a, b)
df, k, a, b, model_equation, col_name, type_)[-1]
y_plot = model_equation(x, k, a, b)
if not any(y_plot):
continue
cmax = max(y_plot)
Expand Down Expand Up @@ -2678,7 +2696,7 @@ def _resource_allocation_calculate(
best_failures = prev_failures
best_max = min_max
best_model_name = model_name
best_model = model
best_model = model_equation
best_result = res
return best_model_name, best_model, best_result

Expand All @@ -2695,8 +2713,8 @@ def _resource_allocation_custom_loss(params, x, y, model, p):
Represents x data for the function calculation
y: pandas.Series (pandas column), required
Represents y data for the function calculation
models: list, required
List of functions that will be used for visualization
model: Python function
Lambda function representing current equation
p: int, required
Penalty weight for custom loss function

Expand Down
Loading