soap

`optimus_dl.modules.optim.soap` ¶

SOAP optimizer

`SOAP` ¶

Bases: Optimizer

Implements SOAP algorithm (https://arxiv.org/abs/2409.11321).

Parameters:

Name	Type	Description	Default
`params`	`Iterable[nn.parameter.Parameter]`	Iterable of parameters to optimize or dictionaries defining parameter groups.	required
`lr`	`float`, optional, defaults to 0.003	The learning rate to use.	`0.003`
`betas`	`Tuple[float,float]`, optional, defaults to `(0.95, 0.95)`	Adam's betas parameters (b1, b2).	`(0.95, 0.95)`
`shampoo_beta`	`float`, optional, defaults to -1	If >= 0, use this beta for the preconditioner (L and R in paper, state['GG'] below) moving average instead of betas[1].	`-1`
`eps`	`float`, optional, defaults to 1e-08	Adam's epsilon for numerical stability.	`1e-08`
`weight_decay`	`float`, optional, defaults to 0.01	weight decay coefficient.	`0.01`
`precondition_frequency`	`int`, optional, defaults to 10	How often to update the preconditioner.	`10`
`max_precond_dim`	`int`, optional, defaults to 10000	Maximum dimension of the preconditioner. Set to 10000, so that we exclude most common vocab sizes while including layers.	`10000`
`merge_dims`	`bool`, optional, defaults to `False`	Whether or not to merge dimensions of the preconditioner.	`False`
`precondition_1d`	`bool`, optional, defaults to `False`	Whether or not to precondition 1D gradients.	`False`
`normalize_grads`	`bool`, optional, defaults to `False`	Whether or not to normalize gradients per layer. Helps at large precondition_frequency (~100 in our experiments), but hurts performance at small precondition_frequency (~10 in our experiments).	`False`
`data_format`	`str`, optional, defaults to `channels_first`	Data format of the input for convolutional layers. Should be "channels_last" for data_format of NHWC and "channels_first" for NCHW.	`'channels_first'`
`correct_bias`	`bool`, optional, defaults to `True`	Whether or not to use bias correction in Adam.	`True`

Source code in optimus_dl/modules/optim/soap.py

class SOAP(optim.Optimizer):
    """
    Implements SOAP algorithm (https://arxiv.org/abs/2409.11321).

    Parameters:
        params (`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (`float`, *optional*, defaults to 0.003):
            The learning rate to use.
        betas (`Tuple[float,float]`, *optional*, defaults to `(0.95, 0.95)`):
            Adam's betas parameters (b1, b2).
        shampoo_beta (`float`, *optional*, defaults to -1):
            If >= 0, use this beta for the preconditioner (L and R in paper, state['GG'] below) moving average instead of betas[1].
        eps (`float`, *optional*, defaults to 1e-08):
            Adam's epsilon for numerical stability.
        weight_decay (`float`, *optional*, defaults to 0.01): weight decay coefficient.
        precondition_frequency (`int`, *optional*, defaults to 10):
            How often to update the preconditioner.
        max_precond_dim (`int`, *optional*, defaults to 10000):
            Maximum dimension of the preconditioner.
            Set to 10000, so that we exclude most common vocab sizes while including layers.
        merge_dims (`bool`, *optional*, defaults to `False`):
            Whether or not to merge dimensions of the preconditioner.
        precondition_1d (`bool`, *optional*, defaults to `False`):
            Whether or not to precondition 1D gradients.
        normalize_grads (`bool`, *optional*, defaults to `False`):
            Whether or not to normalize gradients per layer.
            Helps at large precondition_frequency (~100 in our experiments),
            but hurts performance at small precondition_frequency (~10 in our experiments).
        data_format (`str`, *optional*, defaults to `channels_first`):
            Data format of the input for convolutional layers.
            Should be "channels_last" for data_format of NHWC and "channels_first" for NCHW.
        correct_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to use bias correction in Adam.
    """

    def __init__(
        self,
        params,
        lr: float = 3e-3,
        betas=(0.95, 0.95),
        shampoo_beta: float = -1,
        eps: float = 1e-8,
        weight_decay: float = 0.01,
        precondition_frequency: int = 10,
        max_precond_dim: int = 10000,  #
        merge_dims: bool = False,  # Merge dimensions till the product of the dimensions is less than or equal to max_precond_dim.
        precondition_1d: bool = False,
        normalize_grads: bool = False,
        data_format: str = "channels_first",
        correct_bias: bool = True,
    ):
        defaults = {
            "lr": lr,
            "betas": betas,
            "shampoo_beta": shampoo_beta,
            "eps": eps,
            "weight_decay": weight_decay,
            "precondition_frequency": precondition_frequency,
            "max_precond_dim": max_precond_dim,
            "merge_dims": merge_dims,
            "precondition_1d": precondition_1d,
            "normalize_grads": normalize_grads,
            "correct_bias": correct_bias,
        }
        super().__init__(params, defaults)
        self._data_format = data_format

    def merge_dims(self, grad, max_precond_dim):
        """
        Merges dimensions of the gradient tensor till the product of the dimensions is less than or equal to max_precond_dim.
        """
        assert self._data_format in ["channels_first", "channels_last"]
        if self._data_format == "channels_last" and grad.dim() == 4:
            grad = grad.permute(0, 3, 1, 2)
        shape = grad.shape
        new_shape = []

        curr_shape = 1
        for sh in shape:
            temp_shape = curr_shape * sh
            if temp_shape > max_precond_dim:
                if curr_shape > 1:
                    new_shape.append(curr_shape)
                    curr_shape = sh
                else:
                    new_shape.append(sh)
                    curr_shape = 1
            else:
                curr_shape = temp_shape

        if curr_shape > 1 or len(new_shape) == 0:
            new_shape.append(curr_shape)

        new_grad = grad.reshape(new_shape)
        return new_grad

    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
        """
        if closure is None:
            loss = None
        else:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad

                state = self.state[p]

                if "step" not in state:
                    state["step"] = 0

                # State initialization
                if "exp_avg" not in state:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(grad)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(grad)

                if "Q" not in state:
                    self.init_preconditioner(
                        grad,
                        state,
                        precondition_frequency=group["precondition_frequency"],
                        precondition_1d=group["precondition_1d"],
                        shampoo_beta=(
                            group["shampoo_beta"]
                            if group["shampoo_beta"] >= 0
                            else group["betas"][1]
                        ),
                        max_precond_dim=group["max_precond_dim"],
                        merge_dims=group["merge_dims"],
                    )
                    self.update_preconditioner(
                        grad,
                        state,
                        max_precond_dim=group["max_precond_dim"],
                        merge_dims=group["merge_dims"],
                        precondition_1d=group["precondition_1d"],
                    )
                    continue  # first step is skipped so that we never use the current gradients in the projection.

                # Projecting gradients to the eigenbases of Shampoo's preconditioner
                # i.e. projecting to the eigenbases of matrices in state['GG']
                grad_projected = self.project(
                    grad,
                    state,
                    merge_dims=group["merge_dims"],
                    max_precond_dim=group["max_precond_dim"],
                )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                exp_avg.mul_(beta1).add_(grad_projected, alpha=(1.0 - beta1))
                exp_avg_sq.mul_(beta2).add_(
                    grad_projected.square(), alpha=(1.0 - beta2)
                )

                denom = exp_avg_sq.sqrt().add_(group["eps"])

                # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
                # i.e. projecting to the eigenbases of matrices in state['GG']
                # exp_avg_projected = self.project(exp_avg, state, merge_dims=group["merge_dims"],
                #                                  max_precond_dim=group['max_precond_dim'])
                exp_avg_projected = exp_avg

                step_size = group["lr"]
                if group["correct_bias"]:
                    bias_correction1 = 1.0 - beta1 ** (state["step"])
                    bias_correction2 = 1.0 - beta2 ** (state["step"])
                    step_size = step_size * (bias_correction2**0.5) / bias_correction1

                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
                # to the original space
                norm_grad = self.project_back(
                    exp_avg_projected / denom,
                    state,
                    merge_dims=group["merge_dims"],
                    max_precond_dim=group["max_precond_dim"],
                )

                if group["normalize_grads"]:
                    norm_grad = norm_grad / (1e-30 + torch.mean(norm_grad**2) ** 0.5)

                p.add_(norm_grad, alpha=-step_size)

                # From AdamW code: Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                # Add weight decay at the end (fixed version)
                if group["weight_decay"] > 0.0:
                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))

                # Update is done after the gradient step to avoid using current gradients in the projection.
                self.update_preconditioner(
                    grad,
                    state,
                    max_precond_dim=group["max_precond_dim"],
                    merge_dims=group["merge_dims"],
                    precondition_1d=group["precondition_1d"],
                )

        return loss

    def init_preconditioner(
        self,
        grad,
        state,
        precondition_frequency=10,
        shampoo_beta=0.95,
        max_precond_dim=10000,
        precondition_1d=False,
        merge_dims=False,
    ):
        """
        Initializes the preconditioner matrices (L and R in the paper).
        """
        state["GG"] = (
            []
        )  # Will hold all the preconditioner matrices (L and R in the paper).
        if grad.dim() == 1:
            if not precondition_1d or grad.shape[0] > max_precond_dim:
                state["GG"].append([])
            else:
                state["GG"].append(
                    torch.zeros(grad.shape[0], grad.shape[0], device=grad.device)
                )
        else:
            if merge_dims:
                grad = self.merge_dims(grad, max_precond_dim)

            for sh in grad.shape:
                if sh > max_precond_dim:
                    state["GG"].append([])
                else:
                    state["GG"].append(torch.zeros(sh, sh, device=grad.device))

        state["Q"] = None  # Will hold all the eigenbases of the preconditioner.
        state["precondition_frequency"] = precondition_frequency
        state["shampoo_beta"] = shampoo_beta

    def project(self, grad, state, merge_dims=False, max_precond_dim=10000):
        """
        Projects the gradient to the eigenbases of the preconditioner.
        """
        original_shape = grad.shape
        if merge_dims:
            if grad.dim() == 4 and self._data_format == "channels_last":
                permuted_shape = grad.permute(0, 3, 1, 2).shape
            grad = self.merge_dims(grad, max_precond_dim)

        for mat in state["Q"]:
            if len(mat) > 0:
                grad = torch.tensordot(
                    grad,
                    mat,
                    dims=[[0], [0]],
                )
            else:
                permute_order = list(range(1, len(grad.shape))) + [0]
                grad = grad.permute(permute_order)

        if merge_dims:
            if self._data_format == "channels_last" and len(original_shape) == 4:
                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                grad = grad.reshape(original_shape)
        return grad

    def update_preconditioner(
        self,
        grad,
        state,
        max_precond_dim=10000,
        merge_dims=False,
        precondition_1d=False,
    ):
        """
        Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).
        """
        if state["Q"] is not None:
            state["exp_avg"] = self.project_back(
                state["exp_avg"],
                state,
                merge_dims=merge_dims,
                max_precond_dim=max_precond_dim,
            )
        if grad.dim() == 1:
            if precondition_1d and grad.shape[0] <= max_precond_dim:
                state["GG"][0].lerp_(
                    grad.unsqueeze(1) @ grad.unsqueeze(0), 1 - state["shampoo_beta"]
                )
        else:
            if merge_dims:
                new_grad = self.merge_dims(grad, max_precond_dim)
                for idx, sh in enumerate(new_grad.shape):
                    if sh <= max_precond_dim:
                        outer_product = torch.tensordot(
                            new_grad,
                            new_grad,
                            dims=[
                                [
                                    *chain(
                                        range(idx), range(idx + 1, len(new_grad.shape))
                                    )
                                ]
                            ]
                            * 2,
                        )
                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])
            else:
                for idx, sh in enumerate(grad.shape):
                    if sh <= max_precond_dim:
                        outer_product = torch.tensordot(
                            grad,
                            grad,
                            # Contracts across all dimensions except for k.
                            dims=[[*chain(range(idx), range(idx + 1, len(grad.shape)))]]
                            * 2,
                        )
                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])

        if state["Q"] is None:
            state["Q"] = self.get_orthogonal_matrix(state["GG"])
        if state["step"] > 0 and state["step"] % state["precondition_frequency"] == 0:
            state["Q"] = self.get_orthogonal_matrix_QR(
                state, max_precond_dim, merge_dims
            )
            # state['Q'] = self.get_fast_QR(state, max_precond_dim, merge_dims)

        if state["step"] > 0:
            state["exp_avg"] = self.project(
                state["exp_avg"],
                state,
                merge_dims=merge_dims,
                max_precond_dim=max_precond_dim,
            )

    def project_back(self, grad, state, merge_dims=False, max_precond_dim=10000):
        """
        Projects the gradient back to the original space.
        """
        original_shape = grad.shape
        if merge_dims:
            if self._data_format == "channels_last" and grad.dim() == 4:
                permuted_shape = grad.permute(0, 3, 1, 2).shape
            grad = self.merge_dims(grad, max_precond_dim)
        for mat in state["Q"]:
            if len(mat) > 0:
                grad = torch.tensordot(
                    grad,
                    mat,
                    dims=[[0], [1]],
                )
            else:
                permute_order = list(range(1, len(grad.shape))) + [0]
                grad = grad.permute(permute_order)

        if merge_dims:
            if self._data_format == "channels_last" and len(original_shape) == 4:
                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                grad = grad.reshape(original_shape)
        return grad

    def get_orthogonal_matrix(self, mat):
        """
        Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
        """
        matrix = []
        for m in mat:
            if len(m) == 0:
                matrix.append([])
                continue
            if m.data.dtype != torch.float:
                float_data = False
                original_type = m.data.dtype
                original_device = m.data.device
                matrix.append(m.data.float())
            else:
                float_data = True
                matrix.append(m.data)

        final = []
        for m in matrix:
            if len(m) == 0:
                final.append([])
                continue
            try:
                _, Q = torch.linalg.eigh(
                    m + 1e-30 * torch.eye(m.shape[0], device=m.device)
                )
            except:  # noqa: E722
                _, Q = torch.linalg.eigh(
                    m.to(torch.float64) + 1e-30 * torch.eye(m.shape[0], device=m.device)
                )
                Q = Q.to(m.dtype)
            Q = torch.flip(Q, [1])

            if not float_data:
                Q = Q.to(original_device).type(original_type)
            final.append(Q)
        return final

    def get_orthogonal_matrix_QR(self, state, max_precond_dim=10000, merge_dims=False):
        """
        Computes the eigenbases of the preconditioner using one round of power iteration
        followed by torch.linalg.qr decomposition.
        """
        precond_list = state["GG"]
        orth_list = state["Q"]

        matrix = []
        orth_matrix = []
        for m, o in zip(precond_list, orth_list, strict=False):
            if len(m) == 0:
                matrix.append([])
                orth_matrix.append([])
                continue
            if m.data.dtype != torch.float:
                float_data = False
                original_type = m.data.dtype
                original_device = m.data.device
                matrix.append(m.data.float())
                orth_matrix.append(o.data.float())
            else:
                float_data = True
                matrix.append(m.data.float())
                orth_matrix.append(o.data.float())

        orig_shape = state["exp_avg_sq"].shape
        if self._data_format == "channels_last" and len(orig_shape) == 4:
            permuted_shape = state["exp_avg_sq"].permute(0, 3, 1, 2).shape
        if merge_dims:
            exp_avg_sq = self.merge_dims(state["exp_avg_sq"], max_precond_dim)
        else:
            exp_avg_sq = state["exp_avg_sq"]

        final = []
        for ind, (m, o) in enumerate(zip(matrix, orth_matrix, strict=False)):
            if len(m) == 0:
                final.append([])
                continue
            est_eig = torch.diag(o.T @ m @ o)
            sort_idx = torch.argsort(est_eig, descending=True)
            exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
            o = o[:, sort_idx]
            power_iter = m @ o
            Q, _ = torch.linalg.qr(power_iter)

            if not float_data:
                Q = Q.to(original_device).type(original_type)
            final.append(Q)

        if merge_dims:
            if self._data_format == "channels_last" and len(orig_shape) == 4:
                exp_avg_sq = exp_avg_sq.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                exp_avg_sq = exp_avg_sq.reshape(orig_shape)

        state["exp_avg_sq"] = exp_avg_sq
        return final

`get_orthogonal_matrix(mat)` ¶

Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.

Source code in optimus_dl/modules/optim/soap.py

def get_orthogonal_matrix(self, mat):
    """
    Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
    """
    matrix = []
    for m in mat:
        if len(m) == 0:
            matrix.append([])
            continue
        if m.data.dtype != torch.float:
            float_data = False
            original_type = m.data.dtype
            original_device = m.data.device
            matrix.append(m.data.float())
        else:
            float_data = True
            matrix.append(m.data)

    final = []
    for m in matrix:
        if len(m) == 0:
            final.append([])
            continue
        try:
            _, Q = torch.linalg.eigh(
                m + 1e-30 * torch.eye(m.shape[0], device=m.device)
            )
        except:  # noqa: E722
            _, Q = torch.linalg.eigh(
                m.to(torch.float64) + 1e-30 * torch.eye(m.shape[0], device=m.device)
            )
            Q = Q.to(m.dtype)
        Q = torch.flip(Q, [1])

        if not float_data:
            Q = Q.to(original_device).type(original_type)
        final.append(Q)
    return final

`get_orthogonal_matrix_QR(state, max_precond_dim=10000, merge_dims=False)` ¶

Computes the eigenbases of the preconditioner using one round of power iteration followed by torch.linalg.qr decomposition.

Source code in optimus_dl/modules/optim/soap.py

def get_orthogonal_matrix_QR(self, state, max_precond_dim=10000, merge_dims=False):
    """
    Computes the eigenbases of the preconditioner using one round of power iteration
    followed by torch.linalg.qr decomposition.
    """
    precond_list = state["GG"]
    orth_list = state["Q"]

    matrix = []
    orth_matrix = []
    for m, o in zip(precond_list, orth_list, strict=False):
        if len(m) == 0:
            matrix.append([])
            orth_matrix.append([])
            continue
        if m.data.dtype != torch.float:
            float_data = False
            original_type = m.data.dtype
            original_device = m.data.device
            matrix.append(m.data.float())
            orth_matrix.append(o.data.float())
        else:
            float_data = True
            matrix.append(m.data.float())
            orth_matrix.append(o.data.float())

    orig_shape = state["exp_avg_sq"].shape
    if self._data_format == "channels_last" and len(orig_shape) == 4:
        permuted_shape = state["exp_avg_sq"].permute(0, 3, 1, 2).shape
    if merge_dims:
        exp_avg_sq = self.merge_dims(state["exp_avg_sq"], max_precond_dim)
    else:
        exp_avg_sq = state["exp_avg_sq"]

    final = []
    for ind, (m, o) in enumerate(zip(matrix, orth_matrix, strict=False)):
        if len(m) == 0:
            final.append([])
            continue
        est_eig = torch.diag(o.T @ m @ o)
        sort_idx = torch.argsort(est_eig, descending=True)
        exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
        o = o[:, sort_idx]
        power_iter = m @ o
        Q, _ = torch.linalg.qr(power_iter)

        if not float_data:
            Q = Q.to(original_device).type(original_type)
        final.append(Q)

    if merge_dims:
        if self._data_format == "channels_last" and len(orig_shape) == 4:
            exp_avg_sq = exp_avg_sq.reshape(permuted_shape).permute(0, 2, 3, 1)
        else:
            exp_avg_sq = exp_avg_sq.reshape(orig_shape)

    state["exp_avg_sq"] = exp_avg_sq
    return final

`init_preconditioner(grad, state, precondition_frequency=10, shampoo_beta=0.95, max_precond_dim=10000, precondition_1d=False, merge_dims=False)` ¶

Initializes the preconditioner matrices (L and R in the paper).

Source code in optimus_dl/modules/optim/soap.py

def init_preconditioner(
    self,
    grad,
    state,
    precondition_frequency=10,
    shampoo_beta=0.95,
    max_precond_dim=10000,
    precondition_1d=False,
    merge_dims=False,
):
    """
    Initializes the preconditioner matrices (L and R in the paper).
    """
    state["GG"] = (
        []
    )  # Will hold all the preconditioner matrices (L and R in the paper).
    if grad.dim() == 1:
        if not precondition_1d or grad.shape[0] > max_precond_dim:
            state["GG"].append([])
        else:
            state["GG"].append(
                torch.zeros(grad.shape[0], grad.shape[0], device=grad.device)
            )
    else:
        if merge_dims:
            grad = self.merge_dims(grad, max_precond_dim)

        for sh in grad.shape:
            if sh > max_precond_dim:
                state["GG"].append([])
            else:
                state["GG"].append(torch.zeros(sh, sh, device=grad.device))

    state["Q"] = None  # Will hold all the eigenbases of the preconditioner.
    state["precondition_frequency"] = precondition_frequency
    state["shampoo_beta"] = shampoo_beta

`merge_dims(grad, max_precond_dim)` ¶

Merges dimensions of the gradient tensor till the product of the dimensions is less than or equal to max_precond_dim.

Source code in optimus_dl/modules/optim/soap.py

def merge_dims(self, grad, max_precond_dim):
    """
    Merges dimensions of the gradient tensor till the product of the dimensions is less than or equal to max_precond_dim.
    """
    assert self._data_format in ["channels_first", "channels_last"]
    if self._data_format == "channels_last" and grad.dim() == 4:
        grad = grad.permute(0, 3, 1, 2)
    shape = grad.shape
    new_shape = []

    curr_shape = 1
    for sh in shape:
        temp_shape = curr_shape * sh
        if temp_shape > max_precond_dim:
            if curr_shape > 1:
                new_shape.append(curr_shape)
                curr_shape = sh
            else:
                new_shape.append(sh)
                curr_shape = 1
        else:
            curr_shape = temp_shape

    if curr_shape > 1 or len(new_shape) == 0:
        new_shape.append(curr_shape)

    new_grad = grad.reshape(new_shape)
    return new_grad

`project(grad, state, merge_dims=False, max_precond_dim=10000)` ¶

Projects the gradient to the eigenbases of the preconditioner.

Source code in optimus_dl/modules/optim/soap.py

def project(self, grad, state, merge_dims=False, max_precond_dim=10000):
    """
    Projects the gradient to the eigenbases of the preconditioner.
    """
    original_shape = grad.shape
    if merge_dims:
        if grad.dim() == 4 and self._data_format == "channels_last":
            permuted_shape = grad.permute(0, 3, 1, 2).shape
        grad = self.merge_dims(grad, max_precond_dim)

    for mat in state["Q"]:
        if len(mat) > 0:
            grad = torch.tensordot(
                grad,
                mat,
                dims=[[0], [0]],
            )
        else:
            permute_order = list(range(1, len(grad.shape))) + [0]
            grad = grad.permute(permute_order)

    if merge_dims:
        if self._data_format == "channels_last" and len(original_shape) == 4:
            grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
        else:
            grad = grad.reshape(original_shape)
    return grad

`project_back(grad, state, merge_dims=False, max_precond_dim=10000)` ¶

Projects the gradient back to the original space.

Source code in optimus_dl/modules/optim/soap.py

def project_back(self, grad, state, merge_dims=False, max_precond_dim=10000):
    """
    Projects the gradient back to the original space.
    """
    original_shape = grad.shape
    if merge_dims:
        if self._data_format == "channels_last" and grad.dim() == 4:
            permuted_shape = grad.permute(0, 3, 1, 2).shape
        grad = self.merge_dims(grad, max_precond_dim)
    for mat in state["Q"]:
        if len(mat) > 0:
            grad = torch.tensordot(
                grad,
                mat,
                dims=[[0], [1]],
            )
        else:
            permute_order = list(range(1, len(grad.shape))) + [0]
            grad = grad.permute(permute_order)

    if merge_dims:
        if self._data_format == "channels_last" and len(original_shape) == 4:
            grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
        else:
            grad = grad.reshape(original_shape)
    return grad

`step(closure=None)` ¶

Performs a single optimization step.

Parameters:

Name	Type	Description	Default
`closure`	`Callable`, optional	A closure that reevaluates the model and returns the loss.	`None`

Source code in optimus_dl/modules/optim/soap.py

@torch.no_grad()
def step(self, closure=None):
    """
    Performs a single optimization step.

    Arguments:
        closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
    """
    if closure is None:
        loss = None
    else:
        loss = closure()

    for group in self.param_groups:
        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad

            state = self.state[p]

            if "step" not in state:
                state["step"] = 0

            # State initialization
            if "exp_avg" not in state:
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(grad)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(grad)

            if "Q" not in state:
                self.init_preconditioner(
                    grad,
                    state,
                    precondition_frequency=group["precondition_frequency"],
                    precondition_1d=group["precondition_1d"],
                    shampoo_beta=(
                        group["shampoo_beta"]
                        if group["shampoo_beta"] >= 0
                        else group["betas"][1]
                    ),
                    max_precond_dim=group["max_precond_dim"],
                    merge_dims=group["merge_dims"],
                )
                self.update_preconditioner(
                    grad,
                    state,
                    max_precond_dim=group["max_precond_dim"],
                    merge_dims=group["merge_dims"],
                    precondition_1d=group["precondition_1d"],
                )
                continue  # first step is skipped so that we never use the current gradients in the projection.

            # Projecting gradients to the eigenbases of Shampoo's preconditioner
            # i.e. projecting to the eigenbases of matrices in state['GG']
            grad_projected = self.project(
                grad,
                state,
                merge_dims=group["merge_dims"],
                max_precond_dim=group["max_precond_dim"],
            )

            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
            beta1, beta2 = group["betas"]

            state["step"] += 1

            # Decay the first and second moment running average coefficient
            # In-place operations to update the averages at the same time
            exp_avg.mul_(beta1).add_(grad_projected, alpha=(1.0 - beta1))
            exp_avg_sq.mul_(beta2).add_(
                grad_projected.square(), alpha=(1.0 - beta2)
            )

            denom = exp_avg_sq.sqrt().add_(group["eps"])

            # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
            # i.e. projecting to the eigenbases of matrices in state['GG']
            # exp_avg_projected = self.project(exp_avg, state, merge_dims=group["merge_dims"],
            #                                  max_precond_dim=group['max_precond_dim'])
            exp_avg_projected = exp_avg

            step_size = group["lr"]
            if group["correct_bias"]:
                bias_correction1 = 1.0 - beta1 ** (state["step"])
                bias_correction2 = 1.0 - beta2 ** (state["step"])
                step_size = step_size * (bias_correction2**0.5) / bias_correction1

            # Projecting back the preconditioned (by Adam) exponential moving average of gradients
            # to the original space
            norm_grad = self.project_back(
                exp_avg_projected / denom,
                state,
                merge_dims=group["merge_dims"],
                max_precond_dim=group["max_precond_dim"],
            )

            if group["normalize_grads"]:
                norm_grad = norm_grad / (1e-30 + torch.mean(norm_grad**2) ** 0.5)

            p.add_(norm_grad, alpha=-step_size)

            # From AdamW code: Just adding the square of the weights to the loss function is *not*
            # the correct way of using L2 regularization/weight decay with Adam,
            # since that will interact with the m and v parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that doesn't interact
            # with the m/v parameters. This is equivalent to adding the square
            # of the weights to the loss with plain (non-momentum) SGD.
            # Add weight decay at the end (fixed version)
            if group["weight_decay"] > 0.0:
                p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))

            # Update is done after the gradient step to avoid using current gradients in the projection.
            self.update_preconditioner(
                grad,
                state,
                max_precond_dim=group["max_precond_dim"],
                merge_dims=group["merge_dims"],
                precondition_1d=group["precondition_1d"],
            )

    return loss

`update_preconditioner(grad, state, max_precond_dim=10000, merge_dims=False, precondition_1d=False)` ¶

Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).

Source code in optimus_dl/modules/optim/soap.py

def update_preconditioner(
    self,
    grad,
    state,
    max_precond_dim=10000,
    merge_dims=False,
    precondition_1d=False,
):
    """
    Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).
    """
    if state["Q"] is not None:
        state["exp_avg"] = self.project_back(
            state["exp_avg"],
            state,
            merge_dims=merge_dims,
            max_precond_dim=max_precond_dim,
        )
    if grad.dim() == 1:
        if precondition_1d and grad.shape[0] <= max_precond_dim:
            state["GG"][0].lerp_(
                grad.unsqueeze(1) @ grad.unsqueeze(0), 1 - state["shampoo_beta"]
            )
    else:
        if merge_dims:
            new_grad = self.merge_dims(grad, max_precond_dim)
            for idx, sh in enumerate(new_grad.shape):
                if sh <= max_precond_dim:
                    outer_product = torch.tensordot(
                        new_grad,
                        new_grad,
                        dims=[
                            [
                                *chain(
                                    range(idx), range(idx + 1, len(new_grad.shape))
                                )
                            ]
                        ]
                        * 2,
                    )
                    state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])
        else:
            for idx, sh in enumerate(grad.shape):
                if sh <= max_precond_dim:
                    outer_product = torch.tensordot(
                        grad,
                        grad,
                        # Contracts across all dimensions except for k.
                        dims=[[*chain(range(idx), range(idx + 1, len(grad.shape)))]]
                        * 2,
                    )
                    state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])

    if state["Q"] is None:
        state["Q"] = self.get_orthogonal_matrix(state["GG"])
    if state["step"] > 0 and state["step"] % state["precondition_frequency"] == 0:
        state["Q"] = self.get_orthogonal_matrix_QR(
            state, max_precond_dim, merge_dims
        )
        # state['Q'] = self.get_fast_QR(state, max_precond_dim, merge_dims)

    if state["step"] > 0:
        state["exp_avg"] = self.project(
            state["exp_avg"],
            state,
            merge_dims=merge_dims,
            max_precond_dim=max_precond_dim,
        )

`SoapConfig` `dataclass` ¶

Bases: RegistryConfigStrict

Configuration for SOAP optimizer. Refer to optimus_dl.modules.optim.soap.SOAP for more details.

Parameters:

Name	Type	Default
`lr`	`float`	`0.003`
`betas`	`tuple[float, float]`	`(0.95, 0.95)`
`shampoo_beta`	`float`	`-1`
`eps`	`float`	`1e-08`
`weight_decay`	`float`	`0.01`
`precondition_frequency`	`int`	`10`
`max_precond_dim`	`int`	`10000`
`merge_dims`	`bool`	`False`
`precondition_1d`	`bool`	`False`
`normalize_grads`	`bool`	`False`
`data_format`	`str`	`'channels_first'`
`correct_bias`	`bool`	`True`

Source code in optimus_dl/modules/optim/soap.py

@dataclass
class SoapConfig(RegistryConfigStrict):
    """
    Configuration for SOAP optimizer.
    Refer to `optimus_dl.modules.optim.soap.SOAP` for more details.
    """

    lr: float = 3e-3
    betas: tuple[float, float] = (0.95, 0.95)
    shampoo_beta: float = -1
    eps: float = 1e-8
    weight_decay: float = 0.01
    precondition_frequency: int = 10
    max_precond_dim: int = 10000
    merge_dims: bool = False
    precondition_1d: bool = False
    normalize_grads: bool = False
    data_format: str = "channels_first"
    correct_bias: bool = True

soap

optimus_dl.modules.optim.soap ¶

SOAP ¶

get_orthogonal_matrix(mat) ¶

get_orthogonal_matrix_QR(state, max_precond_dim=10000, merge_dims=False) ¶

init_preconditioner(grad, state, precondition_frequency=10, shampoo_beta=0.95, max_precond_dim=10000, precondition_1d=False, merge_dims=False) ¶

merge_dims(grad, max_precond_dim) ¶

project(grad, state, merge_dims=False, max_precond_dim=10000) ¶

project_back(grad, state, merge_dims=False, max_precond_dim=10000) ¶

step(closure=None) ¶

update_preconditioner(grad, state, max_precond_dim=10000, merge_dims=False, precondition_1d=False) ¶

SoapConfig dataclass ¶

`optimus_dl.modules.optim.soap` ¶

`SOAP` ¶

`get_orthogonal_matrix(mat)` ¶

`get_orthogonal_matrix_QR(state, max_precond_dim=10000, merge_dims=False)` ¶

`init_preconditioner(grad, state, precondition_frequency=10, shampoo_beta=0.95, max_precond_dim=10000, precondition_1d=False, merge_dims=False)` ¶

`merge_dims(grad, max_precond_dim)` ¶

`project(grad, state, merge_dims=False, max_precond_dim=10000)` ¶

`project_back(grad, state, merge_dims=False, max_precond_dim=10000)` ¶

`step(closure=None)` ¶

`update_preconditioner(grad, state, max_precond_dim=10000, merge_dims=False, precondition_1d=False)` ¶

`SoapConfig` `dataclass` ¶