From dd562c24f14a9ec4a325152644298b24e3cec4ca Mon Sep 17 00:00:00 2001
From: Tim Dettmers <tim.dettmers@gmail.com>
Date: Wed, 12 Apr 2023 11:24:44 -0700
Subject: [PATCH] Refactored simulated fp8 modules into research.nn.

---
 .../switchback}/README.md                     |   0
 .../switchback}/info_a100_py2.jsonl           |   0
 .../switchback}/make_plot_with_jsonl.py       |   0
 .../switchback}/plot_with_info.pdf            | Bin
 .../switchback}/speed_benchmark.py            |   0
 bitsandbytes/nn/__init__.py                   |   2 +-
 bitsandbytes/nn/modules.py                    | 176 +-----------------
 bitsandbytes/research/__init__.py             |   3 +-
 bitsandbytes/research/autograd/_functions.py  |  98 +---------
 bitsandbytes/research/nn/__init__.py          |   1 +
 bitsandbytes/research/nn/modules.py           |  64 +++++++
 examples/int8_inference_huggingface.py        |  27 +++
 tests/test_autograd.py                        |   4 +-
 tests/test_functional.py                      |   1 +
 tests/test_modules.py                         |   4 +-
 15 files changed, 108 insertions(+), 272 deletions(-)
 rename {speed_benchmark => benchmarking/switchback}/README.md (100%)
 rename {speed_benchmark => benchmarking/switchback}/info_a100_py2.jsonl (100%)
 rename {speed_benchmark => benchmarking/switchback}/make_plot_with_jsonl.py (100%)
 rename {speed_benchmark => benchmarking/switchback}/plot_with_info.pdf (100%)
 rename {speed_benchmark => benchmarking/switchback}/speed_benchmark.py (100%)
 create mode 100644 bitsandbytes/research/nn/__init__.py
 create mode 100644 bitsandbytes/research/nn/modules.py
 create mode 100644 examples/int8_inference_huggingface.py

diff --git a/speed_benchmark/README.md b/benchmarking/switchback/README.md
similarity index 100%
rename from speed_benchmark/README.md
rename to benchmarking/switchback/README.md
diff --git a/speed_benchmark/info_a100_py2.jsonl b/benchmarking/switchback/info_a100_py2.jsonl
similarity index 100%
rename from speed_benchmark/info_a100_py2.jsonl
rename to benchmarking/switchback/info_a100_py2.jsonl
diff --git a/speed_benchmark/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py
similarity index 100%
rename from speed_benchmark/make_plot_with_jsonl.py
rename to benchmarking/switchback/make_plot_with_jsonl.py
diff --git a/speed_benchmark/plot_with_info.pdf b/benchmarking/switchback/plot_with_info.pdf
similarity index 100%
rename from speed_benchmark/plot_with_info.pdf
rename to benchmarking/switchback/plot_with_info.pdf
diff --git a/speed_benchmark/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py
similarity index 100%
rename from speed_benchmark/speed_benchmark.py
rename to benchmarking/switchback/speed_benchmark.py
diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index 51bccbc..ec944a3 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -2,5 +2,5 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed
+from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, SwitchBackLinearBnb
 from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorized, StandardLinear
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 7150378..f79b75a 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -297,7 +297,7 @@ class Linear8bitLt(nn.Linear):
         return out
 
 
-class Linear8bitLtMixed(nn.Linear):
+class SwitchBackLinearBnb(nn.Linear):
     def __init__(
         self,
         input_features,
@@ -355,177 +355,3 @@ class Linear8bitLtMixed(nn.Linear):
                 del self.state.CxB
 
         return out
-    
-
-class Linear8bitLtThresh(Linear8bitLt):
-    def __init__(
-        self,
-        input_features,
-        output_features,
-        bias=True,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=6.0,
-        index=None,
-    ):
-        super().__init__(
-            input_features, 
-            output_features, 
-            bias=bias, 
-            has_fp16_weights=has_fp16_weights, 
-            memory_efficient_backward=memory_efficient_backward, 
-            threshold=6., 
-            index=index
-        )
-
-class LinearFP8(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.bw_code = None
-        self.fw_code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-        for i, k in enumerate(array):
-            if output_features > array[i + 1]:
-                self.bsz2 = k
-                break
-
-    def forward(self, x: torch.Tensor):
-        if self.fw_code is None:
-            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
-            self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device)
-
-        out = bnb.research.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
-
-class LinearFP8Mixed(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.bw_code = None
-        self.fw_code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-        for i, k in enumerate(array):
-            if output_features > array[i + 1]:
-                self.bsz2 = k
-                break
-
-    def forward(self, x: torch.Tensor):
-        if self.fw_code is None:
-            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
-            self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device)
-
-        out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
-
-class LinearFP8Global(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.bw_code = None
-        self.fw_code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-        for i, k in enumerate(array):
-            if output_features > array[i + 1]:
-                self.bsz2 = k
-                break
-
-    def forward(self, x: torch.Tensor):
-        if self.fw_code is None:
-            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
-            self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device)
-
-        out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
-
-class LinearInt8(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-        for i, k in enumerate(array):
-            if output_features > array[i + 1]:
-                self.bsz2 = k
-                break
-
-    def forward(self, x: torch.Tensor):
-        if self.code is None:
-            self.code = bnb.functional.create_linear_map(True, 8).to(x.device)
-
-        out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz, bsz2=self.bsz2)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
-
-# This is 4 bit version.
-class LinearInt8Cast(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-
-
-    def forward(self, x: torch.Tensor):
-        if self.code is None:
-            self.code = bnb.functional.create_linear_map(True, 4).to(x.device)
-
-        out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
-
-
-class LinearFP4(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True):
-        super().__init__(input_features, output_features, bias)
-        self.bw_code = None
-        self.fw_code = None
-        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
-        for i, k in enumerate(array):
-            if input_features > array[i + 1]:
-                self.bsz = k
-                break
-        for i, k in enumerate(array):
-            if output_features > array[i + 1]:
-                self.bsz2 = k
-                break
-
-    def forward(self, x: torch.Tensor):
-        if self.fw_code is None:
-            #self.bw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device)
-            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
-            self.fw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device)
-
-        out = bnb.matmul_fp4(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
-        if self.bias is not None:
-            out += self.bias
-
-        return out
diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py
index f5ab510..47b720d 100644
--- a/bitsandbytes/research/__init__.py
+++ b/bitsandbytes/research/__init__.py
@@ -1,6 +1,5 @@
-
+from . import nn
 from .autograd._functions import (
-    matmul_fp8,
     switchback_bnb,
     matmul_fp8_global,
     matmul_fp8_mixed,
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index b0a098d..4235989 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -16,88 +16,6 @@ def prod(iterable):
 
 tensor = torch.Tensor
 
-class MatMulFP8(torch.autograd.Function):
-    # forward is the same, but we added the fallback for pre-turing GPUs
-    # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
-
-    @staticmethod
-    def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024):
-        # default of pytorch behavior if inputs are empty
-        ctx.is_empty = False
-        if prod(A.shape) == 0:
-            ctx.is_empty = True
-            ctx.A = A
-            ctx.B = B
-
-            B_shape = B.shape
-            if A.shape[-1] == B_shape[0]:
-                return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device)
-            else:
-                return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device)
-
-        # 1. Dequantize
-        # 2. MatmulnN
-        cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz)
-        fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype)
-
-        cB, state = F.quantize(B.float(), code=fw_code)
-        fp8B = F.dequantize(cB, state).to(B.dtype)
-
-        output = torch.matmul(fp8A, fp8B)
-
-        # output is half
-
-        # 3. Save state
-        ctx.fw_code = fw_code
-        ctx.bw_code = bw_code
-        ctx.bsz = bsz
-        ctx.bsz2 = bsz2
-        ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype
-
-        if any(ctx.needs_input_grad[:2]):
-            # NOTE: we send back A, and re-quant.
-            ctx.tensors = (A, fp8B)
-        else:
-            ctx.tensors = (None, None)
-
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        if ctx.is_empty:
-            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None
-
-        req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad
-        A, B = ctx.tensors
-
-        grad_A, grad_B = None, None
-
-        cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2)
-        fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype)
-
-        cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code)
-        fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype)
-
-        # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
-        # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector')
-        # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose
-        # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])
-
-        # not supported by PyTorch. TODO: create work-around
-        if req_gradA: 
-            grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
-
-        if req_gradB:
-            if len(A.shape) == 3:
-                At = A.transpose(2, 1).contiguous()
-            else:
-                At = A.transpose(1, 0).contiguous()
-            cA, state = F.quantize(At.float(), code=ctx.fw_code)
-            fp8At = F.dequantize(cA, state).to(A.dtype)
-            grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype)
-
-        return grad_A, grad_B, None, None, None, None, None
-    
 class MatMulFP8Mixed(torch.autograd.Function):
     # forward is the same, but we added the fallback for pre-turing GPUs
     # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
@@ -171,7 +89,10 @@ class MatMulFP8Mixed(torch.autograd.Function):
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
-            At = A.transpose(2, 1).contiguous()
+            if len(A.shape) == 3:
+                At = A.transpose(2, 1).contiguous()
+            else:
+                At = A.transpose(1, 0).contiguous()
             # cA, state = F.quantize(At.float(), code=ctx.fw_code)
             # fp8At = F.dequantize(cA, state).to(A.dtype)
             grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype)
@@ -252,7 +173,10 @@ class MatMulFP8Global(torch.autograd.Function):
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
-            At = A.transpose(2, 1).contiguous()
+            if len(A.shape) == 3:
+                At = A.transpose(2, 1).contiguous()
+            else:
+                At = A.transpose(1, 0).contiguous()
             cA, state = F.quantize(At.float(), code=ctx.fw_code)
             fp8At = F.dequantize(cA, state).to(A.dtype)
             grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype)
@@ -465,11 +389,6 @@ def get_block_sizes(input_matrix, weight_matrix):
 
     return bsz, bsz2
 
-
-def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1):
-    if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B)
-    return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz, bsz2)
-
 def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1):
     if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B)
     return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2)
@@ -478,7 +397,6 @@ def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out
     if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B)
     return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2)
 
-
 def switchback_bnb(
     A: tensor,
     B: tensor,
diff --git a/bitsandbytes/research/nn/__init__.py b/bitsandbytes/research/nn/__init__.py
new file mode 100644
index 0000000..8faec10
--- /dev/null
+++ b/bitsandbytes/research/nn/__init__.py
@@ -0,0 +1 @@
+from .modules import LinearFP8Mixed, LinearFP8Global
diff --git a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py
new file mode 100644
index 0000000..2a46b40
--- /dev/null
+++ b/bitsandbytes/research/nn/modules.py
@@ -0,0 +1,64 @@
+from typing import Optional, TypeVar, Union, overload
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, device, dtype, nn
+
+import bitsandbytes as bnb
+from bitsandbytes.optim import GlobalOptimManager
+from bitsandbytes.utils import OutlierTracer, find_outlier_dims
+
+T = TypeVar("T", bound="torch.nn.Module")
+
+
+class LinearFP8Mixed(nn.Linear):
+    def __init__(self, input_features, output_features, bias=True):
+        super().__init__(input_features, output_features, bias)
+        self.bw_code = None
+        self.fw_code = None
+        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
+        for i, k in enumerate(array):
+            if input_features > array[i + 1]:
+                self.bsz = k
+                break
+        for i, k in enumerate(array):
+            if output_features > array[i + 1]:
+                self.bsz2 = k
+                break
+
+    def forward(self, x: torch.Tensor):
+        if self.fw_code is None:
+            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
+            self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device)
+
+        out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
+        if self.bias is not None:
+            out += self.bias
+
+        return out
+
+class LinearFP8Global(nn.Linear):
+    def __init__(self, input_features, output_features, bias=True):
+        super().__init__(input_features, output_features, bias)
+        self.bw_code = None
+        self.fw_code = None
+        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
+        for i, k in enumerate(array):
+            if input_features > array[i + 1]:
+                self.bsz = k
+                break
+        for i, k in enumerate(array):
+            if output_features > array[i + 1]:
+                self.bsz2 = k
+                break
+
+    def forward(self, x: torch.Tensor):
+        if self.fw_code is None:
+            self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device)
+            self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device)
+
+        out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2)
+        if self.bias is not None:
+            out += self.bias
+
+        return out
diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
new file mode 100644
index 0000000..dc80a44
--- /dev/null
+++ b/examples/int8_inference_huggingface.py
@@ -0,0 +1,27 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MAX_NEW_TOKENS = 128
+model_name = 'decapoda-research/llama-7b-hf'
+
+text = 'Hamburg is in which country?\n'
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+input_ids = tokenizer(text, return_tensors="pt").input_ids
+
+free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
+max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'
+
+n_gpus = torch.cuda.device_count()
+max_memory = {i: max_memory for i in range(n_gpus)}
+
+model = AutoModelForCausalLM.from_pretrained(
+  model_name,
+  device_map='auto',
+  load_in_8bit=True,
+  max_memory=max_memory
+)
+generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
+print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
+
+
+
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index ac2ae05..b1f8ffa 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -441,8 +441,8 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist()
 
 dim2.append(0)
 
-funcs = [(torch.matmul, bnb.research.matmul_fp8)]
-str_funcs = ["matmul"]
+funcs = [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)]
+str_funcs = ["matmul_fp8_mixed", 'matmul_fp8_global']
 req_grad = list(product([True, False], repeat=3))
 req_grad_str = []
 for c in req_grad:
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 5a24aeb..81c7535 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -190,6 +190,7 @@ def test_dynamic_blockwise_quantization():
 
 
 @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
+@pytest.mark.skip("Stochastic has some bugs, but will be deprecated soon anyways.")
 def test_dynamic_blockwise_stochastic_quantization(blocksize):
     diffs = []
     reldiffs = []
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 4fe8b54..67fbc21 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -532,9 +532,9 @@ def test_fp8linear():
     h = 1024
     inp = torch.randn(b, h).cuda()
     fp32 = torch.nn.Linear(h, h*2).cuda()
-    fp8 = bnb.nn.LinearFP8(h, h*2).cuda()
+    fp8 = bnb.research.nn.LinearFP8Mixed(h, h*2).cuda()
     fp32b = torch.nn.Linear(h*2, h).cuda()
-    fp8b = bnb.nn.LinearFP8(h*2, h).cuda()
+    fp8b = bnb.research.nn.LinearFP8Mixed(h*2, h).cuda()
 
     fp8.weight.data.copy_(fp32.weight.data)
     fp8.bias.data.copy_(fp32.bias.data)