Added an option to flatten_dims in RobustCostFunction. #503

Merged: 5 commits, May 8, 2023
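At a glance, the new flatten_dims option makes a single N-dimensional RobustCostFunction apply its robust loss to each error dimension separately, as if it were N independent 1-d robust costs. A minimal usage sketch, condensed from the test added below in this PR (the regression residual and variable names mirror test_flatten_dims; values are illustrative):

import torch
import theseus as th

n = 10
ab = th.Vector(2, name="ab")   # optim variable: A and B of y ~ A x^2 + B
xb = th.Vector(n, name="xb")   # batched data, one dimension per data point
yb = th.Vector(n, name="yb")

def residual_fn(optim_vars, aux_vars):
    ab = optim_vars[0]
    x, y = aux_vars
    return y.tensor - (ab.tensor[:, :1] * x.tensor.square() + ab.tensor[:, 1:])

objective = th.Objective()
objective.add(
    th.RobustCostFunction(
        th.AutoDiffCostFunction(
            (ab,), residual_fn, n, aux_vars=(xb, yb),
            cost_weight=th.ScaleCostWeight(0.5),
        ),
        th.HuberLoss,
        th.as_variable(0.5),   # log of the loss radius
        name="rcf",
        flatten_dims=True,     # new: robust loss applied per error dimension
    )
)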
87 changes: 87 additions & 0 deletions tests/core/test_robust_cost.py
@@ -136,3 +136,90 @@ def test_mask_jacobians(loss_cls):
torch.testing.assert_close(err, err_expected)
for j1, j2 in zip(jac, jac_expected):
torch.testing.assert_close(j1, j2)


def _data_model(a, b, x):
return a * x.square() + b


def _generate_data(num_points=100, a=1, b=0.5, noise_factor=0.01):
data_x = torch.rand((1, num_points))
noise = torch.randn((1, num_points)) * noise_factor
return data_x, _data_model(a, b, data_x) + noise


@pytest.mark.parametrize("batch_size", [1, 4])
def test_flatten_dims(batch_size):
# This creates two objectives for a regression problem
# and compares their linearization
# - Obj1: N 1d robust costs functions each evaluating one residual term
# - Obj2: 1 Nd robust cost function that evaluates all residuals at once
# Data for regression problem y ~ Normal(Ax^2 + B, sigma)
n = 10
data_x, data_y = _generate_data(num_points=n)
data_y[:, :5] = 1000 # include some extreme outliers for robust cost

# optimization variables are of type Vector with 2 degrees of freedom (dof)
# one for A and one for B
ab = th.Vector(2, name="ab")

def residual_fn(optim_vars, aux_vars):
ab = optim_vars[0]
x, y = aux_vars
return y.tensor - _data_model(ab.tensor[:, :1], ab.tensor[:, 1:], x.tensor)

w = th.ScaleCostWeight(0.5)
log_loss_radius = th.as_variable(0.5)

# First create an objective with individual cost functions per error term
# Need individual aux variables to represent data of each residual terms
xs = [th.Vector(1, name=f"x{i}") for i in range(n)]
ys = [th.Vector(1, name=f"y{i}") for i in range(n)]
obj_unrolled = th.Objective()
for i in range(n):
obj_unrolled.add(
th.RobustCostFunction(
th.AutoDiffCostFunction(
(ab,), residual_fn, 1, aux_vars=(xs[i], ys[i]), cost_weight=w
),
th.HuberLoss,
log_loss_radius,
name=f"rcf{i}",
)
)
lin_unrolled = th.DenseLinearization(obj_unrolled)
th_inputs = {f"x{i}": data_x[:, i].view(1, 1) for i in range(data_x.shape[1])}
th_inputs.update({f"y{i}": data_y[:, i].view(1, 1) for i in range(data_y.shape[1])})
th_inputs.update({"ab": torch.rand((batch_size, 2))})
obj_unrolled.update(th_inputs)
lin_unrolled.linearize()

# Now one with a single vectorized cost function, and flatten_dims=True
# Residual terms can all be represented with "batched" data variables
xb = th.Vector(n, name="xb")
yb = th.Vector(n, name="yb")
obj_flattened = th.Objective()
obj_flattened.add(
th.RobustCostFunction(
th.AutoDiffCostFunction(
(ab,), residual_fn, n, aux_vars=(xb, yb), cost_weight=w
),
th.HuberLoss,
log_loss_radius,
name="rcf",
flatten_dims=True,
)
)
lin_flattened = th.DenseLinearization(obj_flattened)
th_inputs = {
"xb": data_x,
"yb": data_y,
"ab": th_inputs["ab"], # reuse the previous random value
}
obj_flattened.update(th_inputs)
lin_flattened.linearize()

# Both objectives should result in the same error and linearizations
torch.testing.assert_close(obj_unrolled.error(), obj_flattened.error())
torch.testing.assert_close(lin_unrolled.b, lin_flattened.b)
torch.testing.assert_close(lin_unrolled.AtA, lin_flattened.AtA)
27 changes: 24 additions & 3 deletions theseus/core/robust_cost_function.py
@@ -41,9 +41,13 @@
# by `weighted_error()` and **NOT** the one returned by
# `weighted_jacobians_error()`.
#
# Since we apply the weight before the robust loss, we adopt the convention
# that `robust_cost_fn.jacobians() == robust_cost_fn.weighted_jacobians_error()`, and
# `robust_cost_fn.error() == robust_cost_fn.weighted_error()`.
#
# The flag `flatten_dims` can be used to apply the loss to each dimension of the error
# as if it was a separate error term (for example, if one writes a regression problem
# as a single CostFunction with each dimension being a residual term).
class RobustCostFunction(CostFunction):
_EPS = 1e-20

@@ -52,6 +56,7 @@ def __init__(
cost_function: CostFunction,
loss_cls: Type[RobustLoss],
log_loss_radius: Variable,
flatten_dims: bool = False,
name: Optional[str] = None,
):
self.cost_function = cost_function
@@ -70,6 +75,7 @@ def __init__(
self.log_loss_radius = log_loss_radius
self.register_aux_var("log_loss_radius")
self.loss = loss_cls()
self.flatten_dims = flatten_dims

def error(self) -> torch.Tensor:
warnings.warn(
@@ -80,9 +86,13 @@ def error(self) -> torch.Tensor:

def weighted_error(self) -> torch.Tensor:
weighted_error = self.cost_function.weighted_error()
if self.flatten_dims:
weighted_error = weighted_error.reshape(-1, 1)
squared_norm = torch.sum(weighted_error**2, dim=1, keepdim=True)
error_loss = self.loss.evaluate(squared_norm, self.log_loss_radius.tensor)

if self.flatten_dims:
return (error_loss.reshape(-1, self.dim()) + RobustCostFunction._EPS).sqrt()
Contributor: Not sure if something like `(error_loss.reshape(-1, self.dim()).sum(dim=-1, keepdims=True) + RobustCostFunction._EPS).sqrt()` is better and more consistent.

Contributor (author): Wouldn't this change the result? The test I added checks that the current behavior is the same as flatten_dims=False for separate cost functions, which is what we want.
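As a side note on this exchange, a small standalone sketch of the invariant at stake (using a generic Huber-style kernel and illustrative values, not Theseus' exact HuberLoss): returning the per-dimension square root keeps ||weighted_error||^2 equal to the sum of the per-dimension robust losses, so the objective's error computation does not need to change.

import torch

def rho(s, radius=1.0):
    # generic Huber-style kernel applied to the squared residual s
    return torch.where(s < radius, s, 2.0 * (s * radius).sqrt() - radius)

e = torch.tensor([[3.0, 0.1, -2.0]])   # weighted error, shape (batch=1, dim=3)

# flatten_dims=True: each dimension is treated as its own 1-d robust cost
per_dim_loss = rho(e.square())         # rho(e_i^2) for every dimension i
pseudo_error = per_dim_loss.sqrt()     # what weighted_error() returns
assert torch.allclose(pseudo_error.square().sum(), per_dim_loss.sum())

# flatten_dims=False: a single robust cost over the full squared norm
total_loss = rho(e.square().sum(dim=1, keepdim=True))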

# The return value is a hacky way to make it so that
# ||weighted_error||^2 = error_loss
# By doing this we avoid having to change the objective's error computation
@@ -107,15 +117,25 @@ def weighted_jacobians_error(self) -> Tuple[List[torch.Tensor], torch.Tensor]:
weighted_jacobians,
weighted_error,
) = self.cost_function.weighted_jacobians_error()
if self.flatten_dims:
weighted_error = weighted_error.reshape(-1, 1)
for i, wj in enumerate(weighted_jacobians):
weighted_jacobians[i] = wj.view(-1, 1, wj.shape[2])
squared_norm = torch.sum(weighted_error**2, dim=1, keepdim=True)
rescale = (
self.loss.linearize(squared_norm, self.log_loss_radius.tensor)
+ RobustCostFunction._EPS
).sqrt()

rescaled_jacobians = [
rescale.view(-1, 1, 1) * jacobian for jacobian in weighted_jacobians
Contributor: Following my previous comments, this will be something like `torch.einsum("ni, nij->nij", rescale.view(-1, self.dims()), jacobians)`.

Contributor (author): Similar comment as above.

Contributor: With `.diagonal`, it presumes that the Jacobian matrix is square. For a Jacobian it would be dE/dX, whose shape is (dim(E), dim(X)). My question is: will dim(E) and dim(X) necessarily be equal?

Contributor (author, @luisenp, Apr 29, 2023): Good observation! I forgot to mention that this implementation assumes that we want to model N robust cost functions of dim=1, each with $k$ variables of dof=1, using a single robust cost function of dimension N and $k$ variables of dof=N. Basically, this assumes you want to use a single RobustCostFunction object to model a problem of the form

$$J = \sum_{i=1}^{N} \rho\left(f_i(x_i, y_i, \ldots)^2\right)$$

Supporting cost functions of dim > 1 and other combinations of variable dofs gets messier, and I'm not sure it's worth the effort.

Note that this feature is mostly for programming convenience. The alternative of creating individual cost function objects doesn't hurt performance, because TheseusLayer by default vectorizes cost functions with the same pattern, so they are evaluated in a single call.

Contributor: But it seems this is not feasible for a case like the motion model in Tutorial 4 (first figure). In that case, the error is the combination of two vectors, which can be common and convenient in many cases. Therefore, in my original code I obtained the dimension of the input X through the Jacobian matrix, because the error itself only carries the batch size and the dimension of the error. And I believe that, based on the chain rule, dRobustLoss/dOriLoss should be able to be multiplied by the original Jacobian directly.
[three attached images showing the derivation]

Contributor: Yes, you are right. Currently, I do not see an optimization problem where dim(E) != dim(X) that cannot be decoupled into multiple cost functions satisfying dim(E) = dim(X). We can discuss further if someone encounters such a problem. Thank you! I have learned a lot from your project!

Contributor (author): No problem, and thank you for all your useful feedback!

Contributor (author): @xphsean12 I have to apologize, because it turns out I was wrong about some of the above! We definitely want to support the case where dim(error) != dim(X). For example, in the regression problem, dim(error) = N_points, but dim(X) = 1 for both A and B. Above, I was mixing up optim vars with aux vars.

Luckily, the fix is still pretty simple, and I have rewritten the test to check for the case above, using the same regression example. You can look at the last few commits if you are curious.

The code still assumes that flatten_dims acts as if there were dim(err) independent costs of dim=1.
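To make the shape bookkeeping concrete, here is a sketch of the flatten_dims path in weighted_error() and weighted_jacobians_error() above, with placeholder shapes and a stand-in kernel (loss_evaluate below is not Theseus' HuberLoss, and the real rescale comes from self.loss.linearize; only the reshapes matter here):

import torch

B, N, dof = 4, 10, 2
weighted_error = torch.randn(B, N)       # error from the wrapped cost function
weighted_jac = torch.randn(B, N, dof)    # jacobian w.r.t. a dof-2 variable (e.g. "ab")
log_radius = torch.tensor(0.5)

def loss_evaluate(squared_norm, log_radius):
    # stand-in kernel, reused below for the rescale just to show shapes
    radius = log_radius.exp()
    return torch.where(squared_norm < radius, squared_norm,
                       2.0 * (squared_norm * radius).sqrt() - radius)

# flatten_dims=True folds every error dimension into the batch, so the loss
# sees B*N independent 1-d "errors" instead of B errors of dim N.
flat_err = weighted_error.reshape(-1, 1)                     # (B*N, 1)
flat_jac = weighted_jac.view(-1, 1, dof)                     # (B*N, 1, dof)
squared_norm = torch.sum(flat_err**2, dim=1, keepdim=True)   # (B*N, 1): just e_i^2
error_loss = loss_evaluate(squared_norm, log_radius)         # per-dimension robust loss
out_err = (error_loss.reshape(-1, N) + 1e-20).sqrt()         # back to (B, N)
rescale = (error_loss + 1e-20).sqrt()                        # linearize-based weight in the real code
out_jac = (rescale.view(-1, 1, 1) * flat_jac).reshape(-1, N, dof)  # back to (B, N, dof)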

Contributor: Thanks for your feedback! By the way, I may have found a new issue with the radius. Since I do not have very concrete evidence, I am not going to open an issue for now. In my previous commit, I changed radius = radius.exp() into .square(); @fantaosha mentioned that exp() is better for differentiation. However, the easiest approach would be to just use radius = radius and require the user to pass in a radius >= 0.

The main differences are that (1) based on my observation, .exp() is much slower than radius = radius when I try to train the Huber radius, and (2) training a value that is then passed through exp() may be very difficult, because changing the trained value from 0 to 5 changes the actual radius from 1 to about 148. This exponential growth may make it hard to train an accurate radius. I am not sure whether my second concern is correct, just thinking out loud.
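For concreteness, the growth referred to above (illustrative values only):

import torch

log_radius = torch.tensor([0.0, 1.0, 5.0])
print(log_radius.exp())      # tensor([  1.0000,   2.7183, 148.4132])
print(log_radius.square())   # tensor([ 0.,  1., 25.])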

Contributor (author): @fantaosha Any comments? I don't think using exp() should be particularly problematic. Maybe you need to lower your learning rate?

]
rescaled_error = rescale * weighted_error
Contributor: This line might also need changes as well.

if self.flatten_dims:
return [
rj.reshape(-1, self.dim(), rj.shape[2]) for rj in rescaled_jacobians
], rescaled_error.reshape(-1, self.dim())
return rescaled_jacobians, rescaled_error

def dim(self) -> int:
return self.cost_function.dim()
@@ -126,6 +146,7 @@ def _copy_impl(self, new_name: Optional[str] = None) -> "RobustCostFunction":
type(self.loss),
self.log_loss_radius.copy(),
name=new_name,
flatten_dims=self.flatten_dims,
)

@property
6 changes: 6 additions & 0 deletions theseus/core/vectorizer.py
@@ -13,6 +13,7 @@

from .cost_function import AutoDiffCostFunction, CostFunction
from .objective import Objective
from .robust_cost_function import RobustCostFunction
from .variable import Variable

_CostFunctionSchema = Tuple[str, ...]
@@ -23,6 +24,11 @@ def _fullname(obj) -> str:
_name = f"{obj.__module__}.{obj.__class__.__name__}"
if isinstance(obj, AutoDiffCostFunction):
_name += f"__{id(obj._err_fn)}"
if isinstance(obj, RobustCostFunction):
_name += (
f"__{_fullname(obj.cost_function)}__"
f"{_fullname(obj.loss)}__{obj.flatten_dims}"
)
return _name

def _varinfo(var) -> str:
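The schema-name change above presumably ensures that RobustCostFunction instances are grouped for vectorization only when they wrap the same kind of cost function, use the same loss class, and share the same flatten_dims setting. A small sketch reproducing the composition rule from the diff with hypothetical stand-in classes (the printed string is not actual Theseus output):

# Hypothetical classes standing in for a wrapped cost function and a loss.
class FakeAutoDiffCost:
    pass

class FakeHuberLoss:
    pass

def _fullname(obj) -> str:
    # same composition rule as the diff above, minus the AutoDiffCostFunction id suffix
    return f"{obj.__module__}.{obj.__class__.__name__}"

cost, loss, flatten_dims = FakeAutoDiffCost(), FakeHuberLoss(), True
robust_name = (
    f"__main__.RobustCostFunction"
    f"__{_fullname(cost)}__{_fullname(loss)}__{flatten_dims}"
)
print(robust_name)
# __main__.RobustCostFunction____main__.FakeAutoDiffCost____main__.FakeHuberLoss__True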