bitsandbytes-rocm/bitsandbytes/autograd/_functions.py

import operator
import warnings

import torch
import bitsandbytes.functional as F

from dataclasses import dataclass
from functools import reduce  # Required in Python 3

# math.prod not compatible with python < 3.8
def prod(iterable):
    """Return the product of every item in ``iterable``.

    The running product starts at 1, so an empty iterable yields 1.
    Items are multiplied left to right.
    """
    result = 1
    for factor in iterable:
        result = result * factor
    return result

# Short alias for torch.Tensor; used in type annotations further down this module.
tensor = torch.Tensor

class GlobalOutlierPooler:
    """
    This class pools outlier dimensions across layers.
    This is particularly important for small models where outlier features
    are less systematic and occur with low frequency.

    The class is a singleton: obtain the shared object with
    ``get_instance()``. Calling the constructor directly raises
    ``RuntimeError``.
    """

    _instance = None

    def __init__(self):
        raise RuntimeError("Call get_instance() instead")

    def initialize(self):
        """Reset the pool: no recorded outliers and no reference feature dimension."""
        self.outliers = set()
        self.model_dim = None

    @classmethod
    def get_instance(cls):
        """Return the shared pooler, creating and initializing it on first use."""
        if cls._instance is None:
            # __new__ bypasses __init__, which deliberately raises.
            cls._instance = cls.__new__(cls)
            cls._instance.initialize()
        return cls._instance

    def add_outliers(self, outlier_idx, feature_dim):
        """Record outlier indices for layers whose feature dimension matches the pool.

        The first call fixes ``model_dim`` to ``feature_dim``; later calls with
        a different ``feature_dim`` are ignored.

        Args:
            outlier_idx: object exposing ``tolist()`` (e.g. an index tensor)
                whose elements are added to the pooled set.
            feature_dim: feature dimension of the layer the indices come from.
        """
        if self.model_dim is None:
            self.model_dim = feature_dim
        if feature_dim != self.model_dim:
            return  # we do not encode outliers for the 2nd FFN layer

        self.outliers.update(outlier_idx.tolist())

    def get_current_outlier_idx(self):
        """Return all pooled outlier indices as a 1-D ``torch.int64`` tensor.

        The tensor is built directly as int64. Going through the float32
        ``torch.Tensor`` constructor first would round indices above 2**24.
        The element order follows set iteration order and is not sorted.
        """
        return torch.tensor(list(self.outliers), dtype=torch.int64)


class MatMul8bit(torch.autograd.Function):
    """
    Autograd function for ``A @ B`` with optional vector-wise 8-bit quantization.

    ``precision`` is indexed three times and each entry is compared with 8:
    index 0 selects the forward product, index 1 the gradient w.r.t. ``B``
    and index 2 the gradient w.r.t. ``A``. An entry other than 8 makes that
    product fall back to a plain ``torch.matmul``; an entry equal to 8 routes
    it through the quantize / ``F.igemm`` / dequantize helpers.
    """

    @staticmethod
    def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):
        """Compute the product of ``A`` and ``B``.

        Args:
            ctx: autograd context; receives the saved inputs and settings.
            A: left operand.
            B: right operand (2-D, or batched with more dimensions).
            out: accepted for interface compatibility; not used here.
            quant_type: forwarded unchanged to the quantization helpers.
            precision: three-entry sequence, see the class docstring.
                NOTE: the default list is shared between calls; it is only
                read here, never mutated.

        Returns:
            The (de-quantized) matrix product.
        """
        if precision[0] != 8:
            with torch.no_grad():
                output = torch.matmul(A, B)
        else:
            # B is quantized along its reduction dimension: dim 0 for a 2-D
            # matrix, dim 1 when a leading batch dimension is present.
            if len(B.shape) == 2:
                dim = 0
            else:
                dim = 1
            qA, SA = F.vectorwise_quant(A, dim=-1, quant_type=quant_type)
            qB, SB = F.vectorwise_quant(B, dim=dim, quant_type=quant_type)
            iout = F.igemm(qA, qB)
            output = F.vectorwise_mm_dequant(iout, SA, SB, A.dtype, quant_type)

        if A.requires_grad or B.requires_grad:
            ctx.save_for_backward(A, B)

        ctx.quant_type = quant_type
        ctx.precision = precision

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``(A, B, out, quant_type, precision)``.

        Only ``A`` and ``B`` can receive a gradient; the other three slots
        are always ``None``.
        """
        A, B = ctx.saved_tensors
        quant_type = ctx.quant_type
        precision = ctx.precision
        grad_A = grad_B = None

        # Read the flags before anything is rebound: `A` may be replaced below
        # by a contiguous copy created inside backward, and such a copy does
        # not report requires_grad when autograd is not recording.
        needs_grad_A = A.requires_grad
        needs_grad_B = B.requires_grad

        if needs_grad_B:
            if len(A.shape) == 3:
                dims = [0, 1]
                # bsi -> bis
                permute_dim = [0, 2, 1]
            else:
                dims = [0]
                # bs -> sb
                permute_dim = [1, 0]

            if precision[1] != 8:
                with torch.no_grad():
                    grad_B = torch.matmul(A.permute(permute_dim), grad_output)
            else:
                if len(B.shape) == 2 and len(A.shape) == 3:
                    # Batched A with a shared 2-D B: fold batch and sequence
                    # into one dimension so a single 2-D product gives grad_B.
                    grad_output = grad_output.contiguous()
                    qgrad_output, S1 = F.vectorwise_quant(
                        grad_output.view(-1, grad_output.shape[2]),
                        dim=0,
                        quant_type=quant_type,
                    )
                    if not A.is_contiguous():
                        A = A.contiguous()
                    qA, S2 = F.vectorwise_quant(
                        A.view(-1, A.shape[2]), dim=0, quant_type=quant_type
                    )
                    igrad_B = F.igemm(qA.t(), qgrad_output)
                    grad_B = F.vectorwise_mm_dequant(
                        igrad_B, S2.t(), S1, grad_output.dtype, quant_type
                    )
                else:
                    qgrad_output, S1 = F.vectorwise_quant(
                        grad_output, dim=dims, quant_type=quant_type
                    )
                    qA, S2 = F.vectorwise_quant(
                        A, dim=dims, quant_type=quant_type
                    )
                    igrad_B = F.igemm(qA.permute(permute_dim), qgrad_output)
                    grad_B = F.vectorwise_mm_dequant(
                        igrad_B,
                        S2.permute(permute_dim),
                        S1,
                        grad_output.dtype,
                        quant_type,
                    )

        if needs_grad_A:
            if len(grad_output.shape) == 3:
                dims = [2]
            else:
                dims = [1]

            if len(B.shape) == 3:
                # bio -> boi
                permute_dim = [0, 2, 1]
                dim_B = dims
            else:
                # io -> oi
                permute_dim = [1, 0]
                dim_B = [1]

            if precision[2] != 8:
                with torch.no_grad():
                    grad_A = torch.matmul(grad_output, B.permute(permute_dim))
            else:
                qgrad_output, S1 = F.vectorwise_quant(
                    grad_output, dim=dims, quant_type=quant_type
                )
                qB, S3 = F.vectorwise_quant(B, dim=dim_B, quant_type=quant_type)
                igrad_A = F.igemm(qgrad_output, qB.permute(permute_dim))
                grad_A = F.vectorwise_mm_dequant(
                    igrad_A,
                    S1,
                    S3.permute(permute_dim),
                    grad_output.dtype,
                    quant_type,
                )

        return grad_A, grad_B, None, None, None


# Three module-level names bound to the same autograd entry point, MatMul8bit.apply.
mm_cublas = MatMul8bit.apply
bmm_cublas = MatMul8bit.apply
matmul_cublas = MatMul8bit.apply


@dataclass
class MatmulLtState:
    """Mutable per-layer bookkeeping passed to ``MatMul8bitLt``.

    None of the attributes below carries a type annotation, so ``@dataclass``
    registers no fields: they are plain class-level defaults and the generated
    ``__init__`` takes no arguments. Instances override them by assignment.
    """

    # Quantized weight B, its layout-transformed copy and their statistics.
    CB = CxB = SB = SCB = None

    # Transposed counterparts, consumed by the backward pass.
    CxBt = SBt = CBt = None

    # Outlier columns of B that are multiplied outside the 8-bit product.
    subB = None

    # Configuration flags and outlier bookkeeping.
    outlier_pool = None
    has_accumulated_gradients = False
    threshold = 0.0
    idx = None
    is_training = True
    has_fp16_weights = True
    memory_efficient_backward = False
    use_pool = False
    # Evaluated once, when the class body runs.
    formatB = F.get_special_format_str()

    def reset_grads(self):
        """Clear every cached quantized-weight buffer by setting it to ``None``."""
        for attr_name in ("CB", "CxB", "SB", "SCB", "CxBt", "SBt", "CBt"):
            setattr(self, attr_name, None)


class MatMul8bitLt(torch.autograd.Function):
    """
    Autograd function for an int8 matrix multiplication with optional
    mixed-precision outlier decomposition and optional bias.

    The quantization state of the weight ``B`` lives in a ``MatmulLtState``
    object that is read and mutated by both ``forward`` and ``backward``.
    """

    @staticmethod
    def forward(ctx, A, B, out=None, bias=None, state=None):
        """Multiply ``A`` by the weight ``B`` using 8-bit kernels.

        Args:
            ctx: autograd context.
            A: input activations, 2-D or 3-D; a 3-D input is flattened to 2-D
                for the product and the result is reshaped back.
            B: weight tensor. It is quantized here when
                ``state.has_fp16_weights`` is true; otherwise the quantized
                weight is taken from ``state``.
            out: accepted for interface compatibility; not used here.
            bias: optional bias added to the result.
            state: ``MatmulLtState`` carrying quantized buffers and flags.
                When omitted, a fresh state is created for this call.

        Returns:
            The product in ``A``'s dtype. For an input with zero elements an
            uninitialized tensor of the matching (empty) shape is returned.
        """
        # default to pytorch behavior if inputs are empty
        ctx.is_empty = False
        if A.numel() == 0:
            ctx.is_empty = True
            ctx.A = A
            ctx.B = B
            ctx.bias = bias
            # B may be laid out as (in, out) or (out, in); pick the output
            # width from whichever dimension is not the shared one.
            if A.shape[-1] == B.shape[0]:
                return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device)
            else:
                return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device)

        # Create the state per call. A default instance in the signature would
        # be built once at definition time and then shared, and mutated, by
        # every call that omits `state`.
        if state is None:
            state = MatmulLtState()

        # 1. Quantize A
        # 2. Quantize B
        # 3. Matmul
        # 4. Mixed-precision decomposition matmul
        # 5. Save state
        formatB = state.formatB
        input_shape = A.shape
        if state.outlier_pool is None:
            state.outlier_pool = GlobalOutlierPooler.get_instance()

        # Cast A to fp16
        if A.dtype != torch.float16:
            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

        # 1. Quantize A
        if len(A.shape) == 3:
            # reshape (not view) so that non-contiguous inputs are accepted,
            # matching how backward flattens grad_output.
            A = A.reshape(-1, A.shape[-1]).contiguous()
        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(
            A.to(torch.float16), threshold=state.threshold
        )

        if state.threshold > 0.0 and coo_tensorA is not None:
            if state.has_fp16_weights:
                # Zero the outlier columns in the int8 copies and keep them in
                # full precision in subA / state.subB.
                idx = torch.unique(coo_tensorA.colidx).long()
                CA[:, idx] = 0
                CAt[:, idx] = 0
                subA = A[:, idx]
                state.subB = B[:, idx].t().contiguous()
                state.idx = idx
            else:
                if state.CxB is None:
                    # B is in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions
                    # we also need to convert it to the turing/ampere format
                    state.CxB, state.SB = F.transform(state.CB, to_order=formatB)
        else:
            if not state.has_fp16_weights and state.CxB is None:
                state.CxB, state.SB = F.transform(state.CB, to_order=formatB)
            subA = None

        # 2. Quantize B
        if state.has_fp16_weights:
            has_grad = True if (getattr(B, "grad", None) is not None) else False
            is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1)
            if is_transposed:
                B = B.contiguous()

            # Re-quantize B when training without an accumulated gradient, or
            # when no quantized copy has been cached yet.
            if (state.is_training and not has_grad) or state.CxB is None:
                state.reset_grads()
                (
                    CB,
                    state.CBt,
                    state.SCB,
                    state.SCBt,
                    coo_tensorB,
                ) = F.double_quant(B.to(torch.float16))
                state.CxB, state.SB = F.transform(CB, to_order=formatB)
        else:
            has_grad = False

        if coo_tensorA is not None and not state.has_fp16_weights:
            # extract outliers

            outlier_idx = torch.unique(coo_tensorA.colidx)
            state.idx = outlier_idx
            # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1])
            # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]:
            #    # do not use pool for 2nd FFN layer
            #    state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device)
            # else:
            #    state.idx = outlier_idx
            outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int())
            # Rescale the extracted int8 outlier rows with SCB / 127 and cast
            # them to A's dtype for the full-precision side product.
            state.subB = (
                (outliers * state.SCB.view(-1, 1) / 127.0)
                .t()
                .contiguous()
                .to(A.dtype)
            )
            CA[:, state.idx.long()] = 0
            CAt[:, state.idx.long()] = 0
            subA = A[:, state.idx.long()]

        shapeB = state.SB[0]

        if len(input_shape) == 3:
            output_shape = (input_shape[0], input_shape[1], shapeB[0])
        else:
            output_shape = (input_shape[0], shapeB[0])

        # 3. Matmul
        C32A, SA = F.transform(CA, "col32")
        out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
        # we apply the fused bias here

        if bias is None or bias.dtype == torch.float16:
            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
            output = output.to(A.dtype)
        else:  # apply bias separately
            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None)
            output = output.to(A.dtype).add_(bias)

        # 4. Mixed-precision decomposition matmul
        if coo_tensorA is not None and subA is not None:
            output += torch.matmul(subA, state.subB)

        # 5. Save state
        ctx.state = state

        ctx.formatB = formatB
        ctx.grad_shape = input_shape
        ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

        if any(ctx.needs_input_grad[:2]):
            ctx.tensors = (CAt, subA)
            ctx.tensor_states = (SCAt, state.idx)
        else:
            ctx.tensors = [None, None]
            ctx.tensor_states = (None, None)
            ctx.save_for_backward(None, None)


        clone_func = torch.clone if len(output_shape) == 3 else lambda x : x
        return clone_func(output.view(output_shape))

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``(A, B, out, bias, state)``.

        ``out`` and ``state`` never receive a gradient. For an empty forward
        input, zero gradients are returned for ``A``, ``B`` and, when present,
        ``bias``.

        Raises:
            Exception: if a gradient for ``A`` is requested but the state
                holds neither ``CBt`` nor ``CB``.
        """
        if ctx.is_empty:
            bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias))
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
        req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
        CAt, subA = ctx.tensors
        SCAt, idx = ctx.tensor_states
        formatB = ctx.formatB
        state = ctx.state
        grad_A = grad_B = grad_bias = None

        if req_gradBias:
            # compute grad_bias first before changing grad_output dtype
            grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)

        # Cast grad_output to fp16
        if len(grad_output.shape) == 3:
            grad_output = grad_output.reshape(
                -1, grad_output.shape[-1]
            ).contiguous()

        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
        if req_gradB:
            CxAt, SAt = F.transform(CAt, formatB, transpose=True)
            C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
            gradB32, SgradB32 = F.igemmlt(C32grad, CxAt, Sgrad, SAt)
            grad_B = F.mm_dequant(gradB32, SgradB32, SCgradt, SCAt)
            if state.threshold > 0.0 and subA is not None:
                # Add the full-precision contribution of the outlier columns
                # that were zeroed out of the int8 product in forward.
                grad_B[:, idx] += torch.matmul(grad_output.t(), subA)

        if req_gradA:
            if state.CBt is not None:
                C32grad, Sgrad = F.transform(Cgrad, "col32")
                if state.CxBt is None:
                    state.CxBt, state.SBt = F.transform(
                        state.CBt, to_order=formatB, transpose=True
                    )
                gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt)
                grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)

            elif state.CB is not None:
                # No transposed int8 weight cached: rescale CB with SCB / 127
                # into A's dtype and use a regular matmul instead.
                CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. / 127.0))
                grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
            else:
                raise Exception('State must contain either CBt or CB matrix for backward')

        return grad_A, grad_B, None, grad_bias, None


def matmul(
    A: tensor,
    B: tensor,
    out: tensor = None,
    state: MatmulLtState = None,
    threshold=0.0,
    bias=None
):
    """Functional entry point for ``MatMul8bitLt``.

    Args:
        A: left operand.
        B: right operand (the weight).
        out: forwarded unchanged to ``MatMul8bitLt.apply``.
        state: quantization state to use; a new ``MatmulLtState`` is created
            when a falsy value is passed.
        threshold: when greater than 0.0, written to ``state.threshold``
            before the call; otherwise the state's own threshold is kept.
        bias: optional bias forwarded to ``MatMul8bitLt.apply``.

    Returns:
        Whatever ``MatMul8bitLt.apply`` returns for these arguments.
    """
    if not state:
        state = MatmulLtState()
    if threshold > 0.0:
        state.threshold = threshold
    result = MatMul8bitLt.apply(A, B, out, bias, state)
    return result
Removed prod for Python <= 3.7 compatibility. 2022-08-08 12:20:36 +00:00			`import operator`
some kind of warning or something when this is first executed to make people aware that a cast happens and the operation quantization is performed in fp16. 2022-09-17 17:46:04 +00:00			`import warnings`

Most tests passing. 2022-07-22 21:41:05 +00:00			`import torch`
			`import bitsandbytes.functional as F`

Removed prod for Python <= 3.7 compatibility. 2022-08-08 12:20:36 +00:00			`from dataclasses import dataclass`
			`from functools import reduce # Required in Python 3`

Fixed prod Python < 3.7 compatibility in function.py. 2022-08-08 16:13:22 +00:00			`# math.prod not compatible with python < 3.8`
Removed prod for Python <= 3.7 compatibility. 2022-08-08 12:20:36 +00:00			`def prod(iterable):`
			`return reduce(operator.mul, iterable, 1)`

Most tests passing. 2022-07-22 21:41:05 +00:00			`tensor = torch.Tensor`

ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`"""`
Most tests passing. 2022-07-22 21:41:05 +00:00			`This class pools outlier dimensions across layers.`
Remove trailing whitespace & ensure newline at EOF 2022-10-27 11:11:29 +00:00			`This is particularly important for small models where outlier features`
Most tests passing. 2022-07-22 21:41:05 +00:00			`are less systematic and occur with low frequency.`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`"""`
Simplify statements into equivalent, modern variants via pyupgrade --py37-plus. The changes e.g. are subclassing from object, calling super() with super(ThisClass, self), or old-style syntax formatting. 2022-10-27 11:14:13 +00:00			`class GlobalOutlierPooler:`
Most tests passing. 2022-07-22 21:41:05 +00:00			`_instance = None`

			`def __init__(self):`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`raise RuntimeError("Call get_instance() instead")`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`def initialize(self):`
			`self.outliers = set()`
			`self.model_dim = None`

			`@classmethod`
			`def get_instance(cls):`
			`if cls._instance is None:`
			`cls._instance = cls.__new__(cls)`
			`cls._instance.initialize()`
			`return cls._instance`

			`def add_outliers(self, outlier_idx, feature_dim):`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if self.model_dim is None:`
			`self.model_dim = feature_dim`
			`if feature_dim != self.model_dim:`
			`return # we do not encode outliers for the 2nd FFN layer`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`self.outliers.update(outlier_idx.tolist())`

			`def get_current_outlier_idx(self):`
			`return torch.Tensor(list(self.outliers)).to(torch.int64)`


ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`class MatMul8bit(torch.autograd.Function):`
Most tests passing. 2022-07-22 21:41:05 +00:00			`@staticmethod`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`if precision[0] != 8:`
			`with torch.no_grad():`
			`output = torch.matmul(A, B)`
			`else:`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if len(B.shape) == 2:`
			`dim = 0`
			`else:`
			`dim = 1`
Most tests passing. 2022-07-22 21:41:05 +00:00			`qA, SA = F.vectorwise_quant(A, dim=-1, quant_type=quant_type)`
			`qB, SB = F.vectorwise_quant(B, dim=dim, quant_type=quant_type)`
			`iout = F.igemm(qA, qB)`
			`output = F.vectorwise_mm_dequant(iout, SA, SB, A.dtype, quant_type)`

			`if A.requires_grad or B.requires_grad:`
			`ctx.save_for_backward(A, B)`

			`ctx.quant_type = quant_type`
			`ctx.precision = precision`

			`return output`

			`@staticmethod`
			`def backward(ctx, grad_output):`
			`A, B = ctx.saved_tensors`
			`quant_type = ctx.quant_type`
			`precision = ctx.precision`
			`grad_A = grad_B = None`

			`if B.requires_grad:`
			`if len(A.shape) == 3:`
			`dims = [0, 1]`
			`# bsi -> ibs`
			`permute_dim = [0, 2, 1]`
			`else:`
			`dims = [0]`
			`# bs -> sb`
			`permute_dim = [1, 0]`

			`if precision[1] != 8:`
			`with torch.no_grad():`
			`grad_B = torch.matmul(A.permute(permute_dim), grad_output)`
			`else:`
			`if len(B.shape) == 2 and len(A.shape) == 3:`
			`grad_output = grad_output.contiguous()`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if not grad_output.is_contiguous():`
			`grad_output.contiguous()`
			`qgrad_output, S1 = F.vectorwise_quant(`
			`grad_output.view(-1, grad_output.shape[2]),`
			`dim=0,`
			`quant_type=quant_type,`
			`)`
			`if not A.is_contiguous():`
			`A = A.contiguous()`
			`qA, S2 = F.vectorwise_quant(`
			`A.view(-1, A.shape[2]), dim=0, quant_type=quant_type`
			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00			`igrad_B = F.igemm(qA.t(), qgrad_output)`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`grad_B = F.vectorwise_mm_dequant(`
			`igrad_B, S2.t(), S1, grad_output.dtype, quant_type`
			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00			`else:`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`qgrad_output, S1 = F.vectorwise_quant(`
			`grad_output, dim=dims, quant_type=quant_type`
			`)`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`qA, S2 = F.vectorwise_quant(`
			`A, dim=dims, quant_type=quant_type`
			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00			`igrad_B = F.igemm(qA.permute(permute_dim), qgrad_output)`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`grad_B = F.vectorwise_mm_dequant(`
			`igrad_B,`
			`S2.permute(permute_dim),`
			`S1,`
			`grad_output.dtype,`
			`quant_type,`
			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`if A.requires_grad:`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if len(grad_output.shape) == 3:`
			`dims = [2]`
			`else:`
			`dims = [1]`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`if len(B.shape) == 3:`
			`# bio -> boi`
			`permute_dim = [0, 2, 1]`
			`dim_B = dims`
			`else:`
			`# io -> oi`
			`permute_dim = [1, 0]`
			`dim_B = [1]`

			`if precision[2] != 8:`
			`with torch.no_grad():`
			`grad_A = torch.matmul(grad_output, B.permute(permute_dim))`
			`else:`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`qgrad_output, S1 = F.vectorwise_quant(`
			`grad_output, dim=dims, quant_type=quant_type`
			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00			`qB, S3 = F.vectorwise_quant(B, dim=dim_B, quant_type=quant_type)`
			`igrad_A = F.igemm(qgrad_output, qB.permute(permute_dim))`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`grad_A = F.vectorwise_mm_dequant(`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`igrad_A,`
			`S1,`
			`S3.permute(permute_dim),`
			`grad_output.dtype,`
			`quant_type,`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`return grad_A, grad_B, None, None, None`


			`mm_cublas = MatMul8bit.apply`
			`bmm_cublas = MatMul8bit.apply`
			`matmul_cublas = MatMul8bit.apply`

ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00
Most tests passing. 2022-07-22 21:41:05 +00:00			`@dataclass`
			`class MatmulLtState:`
			`CB = None`
			`CxB = None`
			`SB = None`
			`SCB = None`

			`CxBt = None`
			`SBt = None`
			`CBt = None`

			`subB = None`

			`outlier_pool = None`
			`has_accumulated_gradients = False`
			`threshold = 0.0`
			`idx = None`
			`is_training = True`
			`has_fp16_weights = True`
refactoring 2022-09-11 03:26:15 +00:00			`memory_efficient_backward = False`
Most tests passing. 2022-07-22 21:41:05 +00:00			`use_pool = False`
			`formatB = F.get_special_format_str()`

			`def reset_grads(self):`
			`self.CB = None`
			`self.CxB = None`
			`self.SB = None`
			`self.SCB = None`

			`self.CxBt = None`
			`self.SBt = None`
refactoring 2022-09-11 03:26:15 +00:00			`self.CBt = None`
Most tests passing. 2022-07-22 21:41:05 +00:00

			`class MatMul8bitLt(torch.autograd.Function):`
			`@staticmethod`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00			`# default to pytorch behavior if inputs are empty`
			`ctx.is_empty = False`
Removed prod for Python <= 3.7 compatibility. 2022-08-08 12:20:36 +00:00			`if prod(A.shape) == 0:`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00			`ctx.is_empty = True`
			`ctx.A = A`
			`ctx.B = B`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`ctx.bias = bias`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00			`if A.shape[-1] == B.shape[0]:`
cast edge case 2022-09-17 21:35:42 +00:00			`return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device)`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00			`else:`
cast edge case 2022-09-17 21:35:42 +00:00			`return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device)`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00
Most tests passing. 2022-07-22 21:41:05 +00:00			`# 1. Quantize A`
			`# 2. Quantize B`
			`# 3. Matmul`
			`# 4. Mixed-precision decomposition matmul`
			`# 5. Save state`
			`formatB = state.formatB`
			`input_shape = A.shape`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if state.outlier_pool is None:`
			`state.outlier_pool = GlobalOutlierPooler.get_instance()`
req_gradA for casted & more efficient and accurate fp16 backward 2022-08-28 21:56:08 +00:00
			`# Cast A to fp16`
change typecast behavior 2022-09-17 21:07:05 +00:00			`if A.dtype != torch.float16:`
cast properly 2022-09-17 21:35:03 +00:00			`warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")`
req_gradA for casted & more efficient and accurate fp16 backward 2022-08-28 21:56:08 +00:00
Most tests passing. 2022-07-22 21:41:05 +00:00			`# 1. Quantize A`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if len(A.shape) == 3:`
			`A = A.view(-1, A.shape[-1]).contiguous()`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(`
change typecast behavior 2022-09-17 21:07:05 +00:00			`A.to(torch.float16), threshold=state.threshold`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`)`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`if state.threshold > 0.0 and coo_tensorA is not None:`
			`if state.has_fp16_weights:`
			`idx = torch.unique(coo_tensorA.colidx).long()`
			`CA[:, idx] = 0`
			`CAt[:, idx] = 0`
			`subA = A[:, idx]`
			`state.subB = B[:, idx].t().contiguous()`
			`state.idx = idx`
refactoring 2022-08-23 20:51:00 +00:00			`else:`
			`if state.CxB is None:`
			`# B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions`
			`# we also need to convert it to the turing/ampere format`
			`state.CxB, state.SB = F.transform(state.CB, to_order=formatB)`
Most tests passing. 2022-07-22 21:41:05 +00:00			`else:`
			`if not state.has_fp16_weights and state.CxB is None:`
			`state.CxB, state.SB = F.transform(state.CB, to_order=formatB)`
			`subA = None`

			`# 2. Quantize B`
			`if state.has_fp16_weights:`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`has_grad = True if (getattr(B, "grad", None) is not None) else False`
Most tests passing. 2022-07-22 21:41:05 +00:00			`is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1)`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`if is_transposed:`
			`B = B.contiguous()`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`if (state.is_training and not has_grad) or state.CxB is None:`
			`state.reset_grads()`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`(`
			`CB,`
			`state.CBt,`
			`state.SCB,`
			`state.SCBt,`
			`coo_tensorB,`
recast to fp16 2022-09-17 20:34:22 +00:00			`) = F.double_quant(B.to(torch.float16))`
Most tests passing. 2022-07-22 21:41:05 +00:00			`state.CxB, state.SB = F.transform(CB, to_order=formatB)`
			`else:`
			`has_grad = False`

Matmullt with direct outlier extraction for 8-bit inference. 2022-07-27 02:15:35 +00:00			`if coo_tensorA is not None and not state.has_fp16_weights:`
			`# extract outliers`

			`outlier_idx = torch.unique(coo_tensorA.colidx)`
Fixed direct extraction masking. 2022-07-27 08:46:35 +00:00			`state.idx = outlier_idx`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`# state.outlier_pool.add_outliers(outlier_idx, A.shape[-1])`
			`# if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]:`
			`# # do not use pool for 2nd FFN layer`
			`# state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device)`
			`# else:`
			`# state.idx = outlier_idx`
Fixed direct extraction masking. 2022-07-27 08:46:35 +00:00			`outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int())`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`state.subB = (`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`(outliers * state.SCB.view(-1, 1) / 127.0)`
			`.t()`
			`.contiguous()`
cast properly 2022-09-17 21:30:57 +00:00			`.to(A.dtype)`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`)`
Matmullt with direct outlier extraction for 8-bit inference. 2022-07-27 02:15:35 +00:00			`CA[:, state.idx.long()] = 0`
			`CAt[:, state.idx.long()] = 0`
			`subA = A[:, state.idx.long()]`

Most tests passing. 2022-07-22 21:41:05 +00:00			`shapeB = state.SB[0]`

			`if len(input_shape) == 3:`
			`output_shape = (input_shape[0], input_shape[1], shapeB[0])`
			`else:`
			`output_shape = (input_shape[0], shapeB[0])`

			`# 3. Matmul`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`C32A, SA = F.transform(CA, "col32")`
Most tests passing. 2022-07-22 21:41:05 +00:00			`out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`# we apply the fused bias here`
un-fuse bias 2022-09-17 20:44:28 +00:00
un-fuse bias 2022-09-17 20:46:37 +00:00			`if bias is None or bias.dtype == torch.float16:`
			`output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)`
change typecast behavior 2022-09-17 21:07:05 +00:00			`output = output.to(A.dtype)`
un-fuse bias 2022-09-17 20:46:37 +00:00			`else: # apply bias separately`
			`output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None)`
change typecast behavior 2022-09-17 21:07:05 +00:00			`output = output.to(A.dtype).add_(bias)`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`# 4. Mixed-precision decomposition matmul`
Fixed direct extraction masking. 2022-07-27 08:46:35 +00:00			`if coo_tensorA is not None and subA is not None:`
rollback 2022-09-17 21:43:56 +00:00			`output += torch.matmul(subA, state.subB)`
Most tests passing. 2022-07-22 21:41:05 +00:00
			`# 5. Save state`
			`ctx.state = state`

			`ctx.formatB = formatB`
			`ctx.grad_shape = input_shape`
change typecast behavior 2022-09-17 21:15:18 +00:00			`ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype`
Most tests passing. 2022-07-22 21:41:05 +00:00
cast edge case 2022-09-17 21:36:46 +00:00			`if any(ctx.needs_input_grad[:2]):`
Most tests passing. 2022-07-22 21:41:05 +00:00			`ctx.tensors = (CAt, subA)`
			`ctx.tensor_states = (SCAt, state.idx)`
			`else:`
			`ctx.tensors = [None, None]`
			`ctx.tensor_states = (None, None)`
			`ctx.save_for_backward(None, None)`

add dtype <-> fp16 cast 2022-08-26 01:11:40 +00:00
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`clone_func = torch.clone if len(output_shape) == 3 else lambda x : x`
Most tests passing. 2022-07-22 21:41:05 +00:00			`return clone_func(output.view(output_shape))`

add memory effcient backward option 2022-09-11 02:51:29 +00:00			`@staticmethod`
Most tests passing. 2022-07-22 21:41:05 +00:00			`def backward(ctx, grad_output):`
Added fixes for the case that matmullt dim A is zero, e.g. [0, 768]. 2022-08-03 18:54:01 +00:00			`if ctx.is_empty:`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias))`
			`return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None`
change typecast behavior 2022-09-17 21:15:18 +00:00			`req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`CAt, subA = ctx.tensors`
			`SCAt, idx = ctx.tensor_states`
			`formatB = ctx.formatB`
Most tests passing. 2022-07-22 21:41:05 +00:00			`state = ctx.state`
change order 2022-09-17 20:53:49 +00:00			`grad_A = grad_B = grad_bias = None`

			`if req_gradBias:`
			`# compute grad_bias first before changing grad_output dtype`
change typecast behavior 2022-09-17 21:19:22 +00:00			`grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)`
Most tests passing. 2022-07-22 21:41:05 +00:00
req_gradA for casted & more efficient and accurate fp16 backward 2022-08-28 21:56:08 +00:00			`# Cast grad_output to fp16`
Most tests passing. 2022-07-22 21:41:05 +00:00			`if len(grad_output.shape) == 3:`
add memory efficient backward 2022-08-23 20:39:54 +00:00			`grad_output = grad_output.reshape(`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`-1, grad_output.shape[-1]`
			`).contiguous()`
Most tests passing. 2022-07-22 21:41:05 +00:00
change typecast behavior 2022-09-17 21:15:18 +00:00			`Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`if req_gradB:`
			`CxAt, SAt = F.transform(CAt, formatB, transpose=True)`
			`C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)`
			`gradB32, SgradB32 = F.igemmlt(C32grad, CxAt, Sgrad, SAt)`
debugprint 2022-09-17 22:02:13 +00:00			`grad_B = F.mm_dequant(gradB32, SgradB32, SCgradt, SCAt)`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`if state.threshold > 0.0 and subA is not None:`
reduce diff 2022-09-17 21:47:58 +00:00			`grad_B[:, idx] += torch.matmul(grad_output.t(), subA)`
add memory effcient backward option 2022-09-11 02:51:29 +00:00
Most tests passing. 2022-07-22 21:41:05 +00:00			`if req_gradA:`
clarified an exception message 2022-09-11 03:18:44 +00:00			`if state.CBt is not None:`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`C32grad, Sgrad = F.transform(Cgrad, "col32")`
			`if state.CxBt is None:`
			`state.CxBt, state.SBt = F.transform(`
			`state.CBt, to_order=formatB, transpose=True`
			`)`
			`gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt)`
change typecast behavior 2022-09-17 21:21:15 +00:00			`grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)`
change typecast behavior 2022-09-17 21:15:18 +00:00
clarified an exception message 2022-09-11 03:18:44 +00:00			`elif state.CB is not None:`
review 2022-09-20 03:36:32 +00:00			`CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. / 127.0))`
change typecast behavior 2022-09-17 21:21:15 +00:00			`grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)`
add memory effcient backward option 2022-09-11 02:51:29 +00:00			`else:`
clarified an exception message 2022-09-11 03:18:44 +00:00			`raise Exception('State must contain either CBt or CB matrix for backward')`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00
			`return grad_A, grad_B, None, grad_bias, None`
Most tests passing. 2022-07-22 21:41:05 +00:00

ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`def matmul(`
reran black with linelength 80 for greater readability 2022-08-01 16:32:47 +00:00			`A: tensor,`
			`B: tensor,`
			`out: tensor = None,`
			`state: MatmulLtState = None,`
			`threshold=0.0,`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`bias=None`
ran black and isort for coherent code formatting 2022-08-01 10:31:48 +00:00			`):`
Most tests passing. 2022-07-22 21:41:05 +00:00			`state = state or MatmulLtState()`
			`if threshold > 0.0:`
			`state.threshold = threshold`
Added fused bias to matmullt. 2022-08-16 19:00:54 +00:00			`return MatMul8bitLt.apply(A, B, out, bias, state)`