2448 lines
80 KiB
Python
2448 lines
80 KiB
Python
# Copyright (c) Facebook, Inc. and its affiliates.
|
|
#
|
|
# This source code is licensed under the MIT license found in the
|
|
# LICENSE file in the root directory of this source tree.
|
|
import ctypes as ct
|
|
import itertools
|
|
import operator
|
|
import random
|
|
import torch
|
|
import itertools
|
|
import math
|
|
from scipy.stats import norm
|
|
import numpy as np
|
|
|
|
from functools import reduce # Required in Python 3
|
|
from typing import Tuple
|
|
from torch import Tensor
|
|
|
|
from .cextension import COMPILED_WITH_CUDA, lib
|
|
|
|
|
|
# math.prod not compatible with python < 3.8
|
|
def prod(iterable):
|
|
return reduce(operator.mul, iterable, 1)
|
|
|
|
name2qmap = {}
|
|
|
|
if COMPILED_WITH_CUDA:
|
|
"""C FUNCTIONS FOR OPTIMIZERS"""
|
|
str2optimizer32bit = {}
|
|
str2optimizer32bit["adam"] = (lib.cadam32bit_gfp32, lib.cadam32bit_gfp16, lib.cadam32bit_gbf16)
|
|
str2optimizer32bit["momentum"] = (
|
|
lib.cmomentum32bit_g32,
|
|
lib.cmomentum32bit_g16,
|
|
)
|
|
str2optimizer32bit["rmsprop"] = (
|
|
lib.crmsprop32bit_g32,
|
|
lib.crmsprop32bit_g16,
|
|
)
|
|
str2optimizer32bit["adagrad"] = (
|
|
lib.cadagrad32bit_g32,
|
|
lib.cadagrad32bit_g16,
|
|
)
|
|
|
|
str2optimizer8bit = {}
|
|
str2optimizer8bit["adam"] = (
|
|
lib.cadam_static_8bit_g32,
|
|
lib.cadam_static_8bit_g16,
|
|
)
|
|
str2optimizer8bit["momentum"] = (
|
|
lib.cmomentum_static_8bit_g32,
|
|
lib.cmomentum_static_8bit_g16,
|
|
)
|
|
str2optimizer8bit["rmsprop"] = (
|
|
lib.crmsprop_static_8bit_g32,
|
|
lib.crmsprop_static_8bit_g16,
|
|
)
|
|
str2optimizer8bit["lamb"] = (
|
|
lib.cadam_static_8bit_g32,
|
|
lib.cadam_static_8bit_g16,
|
|
)
|
|
str2optimizer8bit["lars"] = (
|
|
lib.cmomentum_static_8bit_g32,
|
|
lib.cmomentum_static_8bit_g16,
|
|
)
|
|
|
|
str2optimizer8bit_blockwise = {}
|
|
str2optimizer8bit_blockwise["adam"] = (
|
|
lib.cadam_8bit_blockwise_fp32,
|
|
lib.cadam_8bit_blockwise_fp16,
|
|
lib.cadam_8bit_blockwise_bf16,
|
|
)
|
|
str2optimizer8bit_blockwise["momentum"] = (
|
|
lib.cmomentum_8bit_blockwise_fp32,
|
|
lib.cmomentum_8bit_blockwise_fp16,
|
|
)
|
|
str2optimizer8bit_blockwise["rmsprop"] = (
|
|
lib.crmsprop_8bit_blockwise_fp32,
|
|
lib.crmsprop_8bit_blockwise_fp16,
|
|
)
|
|
str2optimizer8bit_blockwise["adagrad"] = (
|
|
lib.cadagrad_8bit_blockwise_fp32,
|
|
lib.cadagrad_8bit_blockwise_fp16,
|
|
)
|
|
|
|
class GlobalPageManager:
|
|
_instance = None
|
|
|
|
def __init__(self):
|
|
raise RuntimeError("Call get_instance() instead")
|
|
|
|
def initialize(self):
|
|
self.paged_tensors = []
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
if cls._instance is None:
|
|
cls._instance = cls.__new__(cls)
|
|
cls._instance.initialize()
|
|
return cls._instance
|
|
|
|
def prefetch_all(self, to_cpu=False):
|
|
for t in self.paged_tensors:
|
|
prefetch_tensor(t, to_cpu)
|
|
|
|
|
|
|
|
class CUBLAS_Context:
|
|
_instance = None
|
|
|
|
def __init__(self):
|
|
raise RuntimeError("Call get_instance() instead")
|
|
|
|
def initialize(self):
|
|
self.context = {}
|
|
# prev_device = torch.cuda.current_device()
|
|
# for i in range(torch.cuda.device_count()):
|
|
# torch.cuda.set_device(torch.device('cuda', i))
|
|
# self.context.append(ct.c_void_p(lib.get_context()))
|
|
# torch.cuda.set_device(prev_device)
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
if cls._instance is None:
|
|
cls._instance = cls.__new__(cls)
|
|
cls._instance.initialize()
|
|
return cls._instance
|
|
|
|
def get_context(self, device):
|
|
if device.index not in self.context:
|
|
prev_device = torch.cuda.current_device()
|
|
torch.cuda.set_device(device)
|
|
self.context[device.index] = ct.c_void_p(lib.get_context())
|
|
torch.cuda.set_device(prev_device)
|
|
return self.context[device.index]
|
|
|
|
|
|
class Cusparse_Context:
|
|
_instance = None
|
|
|
|
def __init__(self):
|
|
raise RuntimeError("Call get_instance() instead")
|
|
|
|
def initialize(self):
|
|
self.context = ct.c_void_p(lib.get_cusparse())
|
|
|
|
@classmethod
|
|
def get_instance(cls):
|
|
if cls._instance is None:
|
|
cls._instance = cls.__new__(cls)
|
|
cls._instance.initialize()
|
|
return cls._instance
|
|
|
|
dtype2bytes = {}
|
|
dtype2bytes[torch.float32] = 4
|
|
dtype2bytes[torch.float16] = 2
|
|
dtype2bytes[torch.bfloat16] = 2
|
|
dtype2bytes[torch.uint8] = 1
|
|
dtype2bytes[torch.int8] = 1
|
|
|
|
def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)):
|
|
num_bytes = dtype2bytes[dtype]*prod(shape)
|
|
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
|
|
c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
|
|
new_array = np.ctypeslib.as_array(c_ptr, shape=shape)
|
|
out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)).view(shape)
|
|
out.is_paged = True
|
|
out.page_deviceid = device.index
|
|
return out
|
|
|
|
def prefetch_tensor(A, to_cpu=False):
|
|
assert A.is_paged, 'Only paged tensors can be prefetched!'
|
|
if to_cpu:
|
|
deviceid = -1
|
|
else:
|
|
deviceid = A.page_deviceid
|
|
|
|
num_bytes = dtype2bytes[A.dtype]*A.numel()
|
|
lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid))
|
|
|
|
def elementwise_func(func_name, A, B, value, prefetch=True):
|
|
func = None
|
|
if A.dtype == torch.float32:
|
|
func = getattr(lib, f'c{func_name}_fp32', None)
|
|
cvalue = ct.c_float(value)
|
|
elif A.dtype == torch.uint8:
|
|
func = getattr(lib, f'c{func_name}_uint8', None)
|
|
cvalue = ct.c_uint8(value)
|
|
|
|
if func is None: raise NotImplementedError(f'Function not implemented: {func_name}')
|
|
|
|
is_managed = getattr(A, 'is_managed', False)
|
|
if is_managed and prefetch:
|
|
prefetch_tensor(A)
|
|
if B is not None: prefetch_tensor(B)
|
|
|
|
func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel()))
|
|
if A.is_paged or B.is_paged:
|
|
# paged function are fully asynchronous
|
|
# if we return from this function, we want to the tensor
|
|
# to be in the correct state, that is the final state after the
|
|
# operation occured. So we synchronize.
|
|
torch.cuda.synchronize()
|
|
|
|
def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
|
|
def arange(A, device=None): elementwise_func('arange', A, None, 0)
|
|
def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0)
|
|
|
|
|
|
def create_linear_map(signed=True, total_bits=8, add_zero=True):
|
|
sign = (-1.0 if signed else 0.0)
|
|
total_values = 2**total_bits
|
|
if add_zero or total_bits < 8:
|
|
# add a zero
|
|
# since we simulate less bits by having zeros in the data type, we
|
|
# we need to center the quantization around zero and as such lose
|
|
# a single value
|
|
total_values = (2**total_bits if not signed else 2**total_bits-1)
|
|
|
|
values = torch.linspace(sign, 1.0, total_values)
|
|
gap = 256 - values.numel()
|
|
if gap == 0:
|
|
return values
|
|
else:
|
|
l = values.numel()//2
|
|
#return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist())
|
|
return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist())
|
|
|
|
def create_custom_map(seed=0, scale=0.01):
|
|
v = [12, 10, 8, 6, 3, 2, 1]
|
|
# 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45
|
|
# 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48
|
|
|
|
# 13B 100 steps:
|
|
# - 4-bit evo: 86.02
|
|
# - 4-bit norm: 78.73
|
|
# - 4-bit FP4:
|
|
# - 16-bit:
|
|
|
|
# interval search on normal distribution
|
|
#v = [3.090232306167813, 1.4589770349449647, 1.064410327932115, 0.7896806653244509, 0.5646884166925807, 0.3653406435875121, 0.17964844284441311] # 0.999 26.5
|
|
#v = [2.3263478740408408, 1.4050715603096329, 1.0364333894937898, 0.7721932141886848, 0.5533847195556727, 0.3584587932511938, 0.1763741647808615] # 0.99 24.99
|
|
#v = [1.6448536269514722, 1.2040469600267016, 0.9208229763683788, 0.6971414348463417, 0.5039653672113453, 0.3280721075316511, 0.16184416680396213] # 0.95 24.53 22.97
|
|
#v = [1.4050715603096329, 1.0803193408149558, 0.8416212335729143, 0.643345405392917, 0.4676987991145084, 0.3054807880993974, 0.1509692154967774] # 0.92 24.81
|
|
#v = [1.2815515655446004, 1.0062699858608395, 0.7916386077433746, 0.6084981344998837, 0.4438613119262478, 0.29050677112339396, 0.14372923370582416] # 0.9 24.68
|
|
#v = [1.8807936081512509, 1.2980047163986055, 0.9769954022693226, 0.7341502955472268, 0.5285136765472481, 0.343225833559403, 0.16910470304375366] # 0.97 25.03
|
|
#v = [1.7506860712521692, 1.2496468758017434, 0.9485350408266378, 0.7155233557034365, 0.5162006366043174, 0.3356393360829622, 0.16547334454641704] # 0.96 24.85 23.01
|
|
#v = [1.5547735945968535, 1.1608220210715001, 0.893800631179489, 0.6789921163940618, 0.4918050830048072, 0.3205236191093902, 0.15821711945563585] # 0.94 24.47
|
|
#v = [1.475791028179171, 1.1196635980209986, 0.8674156943957149, 0.6610637542614526, 0.4797170937629045, 0.31299335020578195, 0.15459215234139795] # 0.93 24.85
|
|
#v = [1.5981931399228175, 1.1821583959486879, 0.9072289939325966, 0.6880384454306778, 0.49787602226482025, 0.3242955535308664, 0.160030379970179] # 0.945 24.287
|
|
##v = [1.6164363711150211, 1.1908453913294612, 0.9126463450304729, 0.6916727602238111, 0.5003095327012462, 0.3258056171348078, 0.1607558311941979] # 0.947 24.293
|
|
#v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207
|
|
#v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30
|
|
#v = [1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293
|
|
#v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88
|
|
|
|
# 7B evo start
|
|
#v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06
|
|
#v = [1.6143079205628337, 1.1888081407660314, 0.8990131955745421, 0.694373759813679, 0.5083033257326773, 0.3452499746844963, 0.1148939728228951]
|
|
#v = [1.614442766030303, 1.189401918639665, 0.8998038168964273, 0.6953094818279475, 0.5073264599048384, 0.3449003790823619, 0.11428378427205564]
|
|
|
|
# 13B evo start
|
|
#v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 0.4949093928311768, 0.30920472033044544, 0.15391602735952042]
|
|
#v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283]
|
|
v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908]
|
|
|
|
# mean evo 7B + 13B
|
|
#v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237]
|
|
|
|
# theoretically optiomal (0.93333)
|
|
#v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333
|
|
|
|
if seed > 0:
|
|
v = np.array(v)
|
|
np.random.seed(seed)
|
|
v += np.random.randn(7)*scale
|
|
print(v.tolist())
|
|
#v[0] += (np.random.randn(1)*0.001)[0]
|
|
#v[-1] += (np.random.randn(1)*0.001)[0]
|
|
#print(v[0], v[-1])
|
|
v = v.tolist()
|
|
values = v + [0]*(256-14) + \
|
|
v[::-1]
|
|
|
|
values = torch.Tensor(values)
|
|
values[0:7] *= -1
|
|
values = values.sort().values
|
|
values /= values.max()
|
|
assert values.numel() == 256
|
|
return values
|
|
|
|
def create_normal_map(offset=0.9677083, use_extra_value=True):
|
|
|
|
if use_extra_value:
|
|
# one more positive value, this is an asymmetric type
|
|
v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist()
|
|
v2 = [0]*(256-15) ## we have 15 non-zero values in this data type
|
|
v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
|
|
v = v1 + v2 + v3
|
|
else:
|
|
v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist()
|
|
v2 = [0]*(256-14) ## we have 14 non-zero values in this data type
|
|
v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
|
|
v = v1 + v2 + v3
|
|
|
|
values = torch.Tensor(v)
|
|
values = values.sort().values
|
|
values /= values.max()
|
|
assert values.numel() == 256
|
|
return values
|
|
|
|
def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8):
|
|
e = exponent_bits
|
|
p = precision_bits
|
|
has_sign = 1 if signed else 0
|
|
assert e+p == total_bits-has_sign
|
|
# the exponent is biased to 2^(e-1) -1 == 0
|
|
evalues = []
|
|
pvalues = []
|
|
for i, val in enumerate(range(-((2**(exponent_bits-has_sign))), 2**(exponent_bits-has_sign), 1)):
|
|
evalues.append(2**val)
|
|
|
|
|
|
values = []
|
|
lst = list(itertools.product([0, 1], repeat=precision_bits))
|
|
#for ev in evalues:
|
|
bias = 2**(exponent_bits-1)-1
|
|
for evalue in range(2**(exponent_bits)):
|
|
for bit_pattern in lst:
|
|
value = (1 if evalue != 0 else 0)
|
|
for i, pval in enumerate(list(bit_pattern)):
|
|
value += pval*(2**-(i+1))
|
|
if evalue == 0:
|
|
# subnormals
|
|
value = value*2**-(bias-1)
|
|
else:
|
|
# normals
|
|
value = value*2**-(evalue-bias-2)
|
|
values.append(value)
|
|
if signed:
|
|
values.append(-value)
|
|
|
|
|
|
assert len(values) == 2**total_bits
|
|
values.sort()
|
|
if total_bits < 8:
|
|
gap = 256 - len(values)
|
|
for i in range(gap):
|
|
values.append(0)
|
|
values.sort()
|
|
code = torch.Tensor(values)
|
|
code /= code.max()
|
|
|
|
return code
|
|
|
|
|
|
|
|
def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
|
|
"""
|
|
Creates the dynamic quantiztion map.
|
|
|
|
The dynamic data type is made up of a dynamic exponent and
|
|
fraction. As the exponent increase from 0 to -7 the number
|
|
of bits available for the fraction shrinks.
|
|
|
|
This is a generalization of the dynamic type where a certain
|
|
number of the bits and be reserved for the linear quantization
|
|
region (the fraction). n determines the maximum number of
|
|
exponent bits.
|
|
|
|
For more details see
|
|
(8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
|
|
"""
|
|
|
|
data = []
|
|
# these are additional items that come from the case
|
|
# where all the exponent bits are zero and no
|
|
# indicator bit is present
|
|
non_sign_bits = total_bits - (1 if signed else 0)
|
|
additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1
|
|
if not signed:
|
|
additional_items = 2 * additional_items
|
|
for i in range(max_exponent_bits):
|
|
fraction_items = int((2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1))
|
|
boundaries = torch.linspace(0.1, 1, fraction_items)
|
|
means = (boundaries[:-1] + boundaries[1:]) / 2.0
|
|
data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
|
|
if signed:
|
|
data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
|
|
|
|
if additional_items > 0:
|
|
boundaries = torch.linspace(0.1, 1, additional_items + 1)
|
|
means = (boundaries[:-1] + boundaries[1:]) / 2.0
|
|
data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
|
|
if signed:
|
|
data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
|
|
|
|
data.append(0)
|
|
data.append(1.0)
|
|
|
|
gap = 256 - len(data)
|
|
for i in range(gap):
|
|
data.append(0)
|
|
|
|
data.sort()
|
|
return Tensor(data)
|
|
|
|
def create_quantile_map(A, total_bits=8):
|
|
q = estimate_quantiles(A, num_quantiles=2**total_bits-1)
|
|
q = q.tolist()
|
|
q.append(0)
|
|
|
|
gap = 256 - len(q)
|
|
for i in range(gap):
|
|
q.append(0)
|
|
|
|
q.sort()
|
|
|
|
q = Tensor(q)
|
|
q = q/q.abs().max()
|
|
return q
|
|
|
|
def get_special_format_str():
|
|
if not torch.cuda.is_available(): return 'col_turing'
|
|
major, _minor = torch.cuda.get_device_capability()
|
|
if major <= 7:
|
|
return "col_turing"
|
|
if major == 8:
|
|
return "col_ampere"
|
|
return "col_turing"
|
|
|
|
|
|
|
|
def is_on_gpu(tensors):
|
|
on_gpu = True
|
|
gpu_ids = set()
|
|
for t in tensors:
|
|
if t is None: continue # NULL pointers are fine
|
|
is_paged = getattr(t, 'is_paged', False)
|
|
on_gpu &= (t.device.type == 'cuda' or is_paged)
|
|
if not is_paged:
|
|
gpu_ids.add(t.device.index)
|
|
if not on_gpu:
|
|
raise TypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}')
|
|
if len(gpu_ids) > 1:
|
|
raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}')
|
|
return on_gpu
|
|
|
|
def get_ptr(A: Tensor) -> ct.c_void_p:
|
|
"""
|
|
Get the ctypes pointer from a PyTorch Tensor.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.tensor
|
|
The PyTorch tensor.
|
|
|
|
Returns
|
|
-------
|
|
ctypes.c_void_p
|
|
"""
|
|
if A is None:
|
|
return None
|
|
else:
|
|
return ct.c_void_p(A.data.data_ptr())
|
|
|
|
|
|
def pre_call(device):
|
|
prev_device = torch.cuda.current_device()
|
|
torch.cuda.set_device(device)
|
|
return prev_device
|
|
|
|
|
|
def post_call(prev_device):
|
|
torch.cuda.set_device(prev_device)
|
|
|
|
|
|
def get_transform_func(dtype, orderA, orderOut, transpose=False):
|
|
name = f'ctransform_{(8 if dtype == torch.int8 else 32)}_{orderA}_to_{orderOut}_{"t" if transpose else "n"}'
|
|
if not hasattr(lib, name):
|
|
print(name)
|
|
raise ValueError(
|
|
f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}"
|
|
)
|
|
else:
|
|
return getattr(lib, name)
|
|
|
|
|
|
def get_transform_buffer(
|
|
shape, dtype, device, to_order, from_order="row", transpose=False
|
|
):
|
|
# init_func = torch.empty
|
|
init_func = torch.zeros
|
|
dims = len(shape)
|
|
|
|
if dims == 2:
|
|
rows = shape[0]
|
|
elif dims == 3:
|
|
rows = shape[0] * shape[1]
|
|
cols = shape[-1]
|
|
|
|
state = (shape, to_order)
|
|
if transpose:
|
|
# swap dims
|
|
tmp = rows
|
|
rows = cols
|
|
cols = tmp
|
|
state = (shape[::-1], to_order)
|
|
|
|
if to_order == "row" or to_order == "col":
|
|
return init_func(shape, dtype=dtype, device=device), state
|
|
elif to_order == "col32":
|
|
# blocks of 32 columns (padded)
|
|
cols = 32 * ((cols + 31) // 32)
|
|
return init_func((rows, cols), dtype=dtype, device=device), state
|
|
elif to_order == "col_turing":
|
|
# blocks of 32 columns and 8 rows
|
|
cols = 32 * ((cols + 31) // 32)
|
|
rows = 8 * ((rows + 7) // 8)
|
|
return init_func((rows, cols), dtype=dtype, device=device), state
|
|
elif to_order == "col_ampere":
|
|
# blocks of 32 columns and 32 rows
|
|
cols = 32 * ((cols + 31) // 32)
|
|
rows = 32 * ((rows + 31) // 32)
|
|
return init_func((rows, cols), dtype=dtype, device=device), state
|
|
else:
|
|
raise NotImplementedError(f"To_order not supported: {to_order}")
|
|
|
|
|
|
def nvidia_transform(
|
|
A,
|
|
to_order,
|
|
from_order="row",
|
|
out=None,
|
|
transpose=False,
|
|
state=None,
|
|
ld=None,
|
|
):
|
|
if state is None:
|
|
state = (A.shape, from_order)
|
|
else:
|
|
from_order = state[1]
|
|
if out is None:
|
|
out, new_state = get_transform_buffer(
|
|
state[0], A.dtype, A.device, to_order, state[1]
|
|
)
|
|
else:
|
|
new_state = (state[1], to_order)
|
|
func = get_transform_func(A.dtype, from_order, to_order, transpose)
|
|
|
|
shape = state[0]
|
|
if len(shape) == 2:
|
|
dim1 = ct.c_int32(shape[0])
|
|
dim2 = ct.c_int32(shape[1])
|
|
elif ld is not None:
|
|
n = prod(shape)
|
|
dim1 = prod([shape[i] for i in ld])
|
|
dim2 = ct.c_int32(n // dim1)
|
|
dim1 = ct.c_int32(dim1)
|
|
else:
|
|
dim1 = ct.c_int32(shape[0] * shape[1])
|
|
dim2 = ct.c_int32(shape[2])
|
|
|
|
ptr = CUBLAS_Context.get_instance().get_context(A.device)
|
|
func(ptr, get_ptr(A), get_ptr(out), dim1, dim2)
|
|
|
|
return out, new_state
|
|
|
|
|
|
def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor:
|
|
'''
|
|
Estimates 256 equidistant quantiles on the input tensor eCDF.
|
|
|
|
Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
|
|
via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
|
|
and the extreme quantiles close to 0 and 1 have high variance / large estimation
|
|
errors. These large errors can be avoided by using the offset variable which trims
|
|
the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
|
|
trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
|
|
usually has a much lower error but is not a minimum entropy encoding. Given an offset
|
|
of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input tensor. Any shape.
|
|
out : torch.Tensor
|
|
Tensor with the 256 estimated quantiles.
|
|
offset : float
|
|
The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
|
|
num_quantiles : int
|
|
The number of equally spaced quantiles.
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
The 256 quantiles in float32 datatype.
|
|
'''
|
|
if A.numel() < 256: raise NotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.')
|
|
if num_quantiles > 256: raise NotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}")
|
|
if num_quantiles < 256 and offset == 1/(512):
|
|
# override default arguments
|
|
offset = 1/(2*num_quantiles)
|
|
|
|
if out is None: out = torch.zeros((256,), dtype=torch.float32, device=A.device)
|
|
is_on_gpu([A, out])
|
|
device = pre_call(A.device)
|
|
if A.dtype == torch.float32:
|
|
lib.cestimate_quantiles_fp32(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
|
|
elif A.dtype == torch.float16:
|
|
lib.cestimate_quantiles_fp16(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
|
|
else:
|
|
raise NotImplementedError(f"Not supported data type {A.dtype}")
|
|
post_call(device)
|
|
|
|
if num_quantiles < 256:
|
|
step = round(256/num_quantiles)
|
|
idx = torch.linspace(0, 255, num_quantiles).long().to(A.device)
|
|
out = out[idx]
|
|
|
|
return out
|
|
|
|
|
|
def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, rand=None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor:
|
|
"""
|
|
Quantize tensor A in blocks of size 4096 values.
|
|
|
|
Quantizes tensor A by dividing it into blocks of 4096 values.
|
|
Then the absolute maximum value within these blocks is calculated
|
|
for the non-linear quantization.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input tensor.
|
|
code : torch.Tensor
|
|
The quantization map.
|
|
absmax : torch.Tensor
|
|
The absmax values.
|
|
rand : torch.Tensor
|
|
The tensor for stochastic rounding.
|
|
out : torch.Tensor
|
|
The output tensor (8-bit).
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
The 8-bit tensor.
|
|
tuple(torch.Tensor, torch.Tensor):
|
|
The quantization state to undo the quantization.
|
|
"""
|
|
|
|
|
|
if code is None:
|
|
if "dynamic" not in name2qmap:
|
|
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
|
|
code = name2qmap["dynamic"]
|
|
|
|
if absmax is None:
|
|
n = A.numel()
|
|
blocks = n // blocksize
|
|
blocks += 1 if n % blocksize > 0 else 0
|
|
absmax = torch.zeros((blocks,), device=A.device)
|
|
|
|
if out is None:
|
|
out = torch.zeros_like(A, dtype=torch.uint8)
|
|
|
|
if A.device.type != 'cpu':
|
|
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
|
|
cblocksize = ct.c_int32(blocksize)
|
|
prev_device = pre_call(A.device)
|
|
code = code.to(A.device)
|
|
if rand is not None:
|
|
is_on_gpu([code, A, out, absmax, rand])
|
|
assert blocksize==4096
|
|
assert rand.numel() >= 1024
|
|
rand_offset = random.randint(0, 1023)
|
|
if A.dtype == torch.float32:
|
|
lib.cquantize_blockwise_stochastic_fp32(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), ct.c_int(A.numel()))
|
|
elif A.dtype == torch.float16:
|
|
lib.cquantize_blockwise_stochastic_fp16(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), ct.c_int(A.numel()))
|
|
else:
|
|
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
|
|
else:
|
|
is_on_gpu([code, A, out, absmax])
|
|
if A.dtype == torch.float32:
|
|
lib.cquantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
|
|
elif A.dtype == torch.float16:
|
|
lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
|
|
else:
|
|
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
|
|
post_call(A.device)
|
|
else:
|
|
# cpu
|
|
code = code.cpu()
|
|
assert rand is None
|
|
lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
|
|
|
|
if nested:
|
|
offset = absmax.mean()
|
|
absmax -= offset
|
|
qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
|
|
state = [qabsmax, code, blocksize, nested, offset, state2]
|
|
else:
|
|
state = [absmax, code, blocksize, nested, None, None]
|
|
|
|
|
|
|
|
return out, state
|
|
|
|
|
|
def dequantize_blockwise(
|
|
A: Tensor,
|
|
quant_state: Tuple[Tensor, Tensor] = None,
|
|
absmax: Tensor = None,
|
|
code: Tensor = None,
|
|
out: Tensor = None,
|
|
blocksize: int = 4096,
|
|
nested=False
|
|
) -> Tensor:
|
|
"""
|
|
Dequantizes blockwise quantized values.
|
|
|
|
Dequantizes the tensor A with maximum absolute values absmax in
|
|
blocks of size 4096.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input 8-bit tensor.
|
|
quant_state : tuple(torch.Tensor, torch.Tensor)
|
|
Tuple of code and absmax values.
|
|
absmax : torch.Tensor
|
|
The absmax values.
|
|
code : torch.Tensor
|
|
The quantization map.
|
|
out : torch.Tensor
|
|
Dequantized output tensor (default: float32)
|
|
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
Dequantized tensor (default: float32)
|
|
"""
|
|
assert quant_state is not None or absmax is not None
|
|
if code is None and quant_state is None:
|
|
if "dynamic" not in name2qmap:
|
|
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
|
|
code = name2qmap["dynamic"]
|
|
|
|
if out is None:
|
|
out = torch.zeros_like(A, dtype=torch.float32)
|
|
if quant_state is None:
|
|
quant_state = (absmax, code, blocksize)
|
|
else:
|
|
absmax, code, blocksize, nested, offset, state2 = quant_state
|
|
if nested:
|
|
absmax = dequantize_blockwise(absmax, state2)
|
|
absmax += offset
|
|
|
|
if A.device.type != 'cpu':
|
|
device = pre_call(A.device)
|
|
code = code.to(A.device)
|
|
if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
|
|
raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
|
|
is_on_gpu([A, absmax, out])
|
|
if out.dtype == torch.float32:
|
|
lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
|
|
elif out.dtype == torch.float16:
|
|
lib.cdequantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
|
|
else:
|
|
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
|
|
post_call(A.device)
|
|
else:
|
|
code = code.cpu()
|
|
lib.cdequantize_blockwise_cpu_fp32(get_ptr(quant_state[1]), get_ptr(A), get_ptr(quant_state[0]), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
|
|
|
|
return out
|
|
|
|
def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
|
|
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
|
|
|
|
def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
|
|
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
|
|
|
|
def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
|
|
"""
|
|
Quantize tensor A in blocks of 4-bit values.
|
|
|
|
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input tensor.
|
|
absmax : torch.Tensor
|
|
The absmax values.
|
|
out : torch.Tensor
|
|
The output tensor (8-bit).
|
|
blocksize : int
|
|
The blocksize used in quantization.
|
|
quant_type : str
|
|
The 4-bit quantization data type {fp4, nf4}
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
The 8-bit tensor with packed 4-bit values.
|
|
tuple(torch.Tensor, torch.Size, torch.dtype, int):
|
|
The quantization state to undo the quantization.
|
|
"""
|
|
if A.device.type != 'cuda':
|
|
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
|
|
if quant_type not in ['fp4', 'nf4']:
|
|
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
|
|
|
|
n = A.numel()
|
|
input_shape = A.shape
|
|
|
|
if absmax is None:
|
|
blocks = n // blocksize
|
|
blocks += 1 if n % blocksize > 0 else 0
|
|
absmax = torch.zeros((blocks,), device=A.device)
|
|
|
|
|
|
if out is None:
|
|
out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
|
|
|
|
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
|
|
|
|
prev_device = pre_call(A.device)
|
|
is_on_gpu([A, out, absmax])
|
|
|
|
if A.dtype == torch.float32:
|
|
if quant_type == 'fp4':
|
|
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
|
|
else:
|
|
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
|
|
elif A.dtype == torch.float16:
|
|
if quant_type == 'fp4':
|
|
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
|
|
else:
|
|
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
|
|
else:
|
|
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
|
|
post_call(A.device)
|
|
|
|
if compress_statistics:
|
|
offset = absmax.mean()
|
|
absmax -= offset
|
|
#code = create_custom_map().to(absmax.device)
|
|
#qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
|
|
qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
|
|
del absmax
|
|
state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
|
|
else:
|
|
state = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
|
|
|
|
return out, state
|
|
|
|
def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
|
|
return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
|
|
|
|
def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
|
|
return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
|
|
|
|
def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
|
|
"""
|
|
Dequantizes FP4 blockwise quantized values.
|
|
|
|
Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input 8-bit tensor (packed 4-bit values).
|
|
quant_state : tuple(torch.Tensor, torch.Size, torch.dtype)
|
|
Tuple of absmax values, original tensor shape and original dtype.
|
|
absmax : torch.Tensor
|
|
The absmax values.
|
|
out : torch.Tensor
|
|
Dequantized output tensor.
|
|
blocksize : int
|
|
The blocksize used in quantization.
|
|
quant_type : str
|
|
The 4-bit quantization data type {fp4, nf4}
|
|
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
Dequantized tensor.
|
|
"""
|
|
if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
|
|
raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
|
|
if quant_type not in ['fp4', 'nf4']:
|
|
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
|
|
|
|
if quant_state is None:
|
|
assert absmax is not None and out is not None
|
|
shape = out.shape
|
|
dtype = out.dtype
|
|
else:
|
|
absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state
|
|
|
|
|
|
if compressed_stats is not None:
|
|
offset, state2 = compressed_stats
|
|
absmax = dequantize_blockwise(absmax, state2)
|
|
absmax += offset
|
|
|
|
if out is None:
|
|
out = torch.empty(shape, dtype=dtype, device=A.device)
|
|
|
|
n = out.numel()
|
|
|
|
|
|
device = pre_call(A.device)
|
|
is_on_gpu([A, absmax, out])
|
|
if out.dtype == torch.float32:
|
|
if quant_type == 'fp4':
|
|
lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
|
|
else:
|
|
lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
|
|
elif out.dtype == torch.float16:
|
|
if quant_type == 'fp4':
|
|
lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
|
|
else:
|
|
lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
|
|
else:
|
|
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
|
|
post_call(A.device)
|
|
|
|
is_transposed = (True if A.shape[0] == 1 else False)
|
|
if is_transposed: return out.t()
|
|
else: return out
|
|
|
|
|
|
def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
|
|
if code is None:
|
|
if "dynamic" not in name2qmap:
|
|
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
|
|
code = name2qmap["dynamic"]
|
|
code = code.to(A.device)
|
|
|
|
absmax = torch.abs(A).max()
|
|
inp = A / absmax
|
|
out = quantize_no_absmax(inp, code, out)
|
|
return out, (absmax, code)
|
|
|
|
|
|
def dequantize(
|
|
A: Tensor,
|
|
quant_state: Tuple[Tensor, Tensor] = None,
|
|
absmax: Tensor = None,
|
|
code: Tensor = None,
|
|
out: Tensor = None,
|
|
) -> Tensor:
|
|
assert quant_state is not None or absmax is not None
|
|
if code is None and quant_state is None:
|
|
if "dynamic" not in name2qmap:
|
|
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
|
|
code = name2qmap["dynamic"]
|
|
code = code.to(A.device)
|
|
|
|
if quant_state is None:
|
|
quant_state = (absmax, code)
|
|
out = dequantize_no_absmax(A, quant_state[1], out)
|
|
return out * quant_state[0]
|
|
|
|
|
|
def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
|
|
'''
|
|
Quantizes input tensor to 8-bit.
|
|
|
|
Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
|
|
`out` using the quantization map `code`.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The input tensor.
|
|
code : torch.Tensor
|
|
The quantization map.
|
|
out : torch.Tensor, optional
|
|
The output tensor. Needs to be of type byte.
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
Quantized 8-bit tensor.
|
|
'''
|
|
if out is None: out = torch.zeros_like(A, dtype=torch.uint8)
|
|
is_on_gpu([A, out])
|
|
lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
|
|
return out
|
|
|
|
|
|
def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
|
|
'''
|
|
Dequantizes the 8-bit tensor to 32-bit.
|
|
|
|
Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
|
|
the quantization map `code`.
|
|
|
|
Parameters
|
|
----------
|
|
A : torch.Tensor
|
|
The 8-bit input tensor.
|
|
code : torch.Tensor
|
|
The quantization map.
|
|
out : torch.Tensor
|
|
The 32-bit output tensor.
|
|
|
|
Returns
|
|
-------
|
|
torch.Tensor:
|
|
32-bit output tensor.
|
|
'''
|
|
if out is None: out = torch.zeros_like(A, dtype=torch.float32)
|
|
is_on_gpu([code, A, out])
|
|
lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
|
|
return out
|
|
|
|
|
|
def optimizer_update_32bit(
|
|
optimizer_name: str,
|
|
g: Tensor,
|
|
p: Tensor,
|
|
state1: Tensor,
|
|
beta1: float,
|
|
eps: float,
|
|
step: int,
|
|
lr: float,
|
|
state2: Tensor = None,
|
|
beta2: float = 0.0,
|
|
weight_decay: float = 0.0,
|
|
gnorm_scale: float = 1.0,
|
|
unorm_vec: Tensor = None,
|
|
max_unorm: float = 0.0,
|
|
skip_zeros=False,
|
|
) -> None:
|
|
"""
|
|
Performs an inplace optimizer update with one or two optimizer states.
|
|
|
|
Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.
|
|
|
|
Parameters
|
|
----------
|
|
optimizer_name : str
|
|
The name of the optimizer: {adam}.
|
|
g : torch.Tensor
|
|
Gradient tensor.
|
|
p : torch.Tensor
|
|
Parameter tensor.
|
|
state1 : torch.Tensor
|
|
Optimizer state 1.
|
|
beta1 : float
|
|
Optimizer beta1.
|
|
eps : float
|
|
Optimizer epsilon.
|
|
weight_decay : float
|
|
Weight decay.
|
|
step : int
|
|
Current optimizer step.
|
|
lr : float
|
|
The learning rate.
|
|
state2 : torch.Tensor
|
|
Optimizer state 2.
|
|
beta2 : float
|
|
Optimizer beta2.
|
|
gnorm_scale : float
|
|
The factor to rescale the gradient to the max clip value.
|
|
unorm_vec : torch.Tensor
|
|
The tensor for the update norm.
|
|
max_unorm : float
|
|
The maximum update norm relative to the weight norm.
|
|
skip_zeros : bool
|
|
Whether to skip zero-valued gradients or not (default: False).
|
|
"""
|
|
|
|
param_norm = 0.0
|
|
if max_unorm > 0.0:
|
|
param_norm = torch.norm(p.data.float())
|
|
|
|
|
|
optim_func = None
|
|
if g.dtype == torch.float32:
|
|
optim_func = str2optimizer32bit[optimizer_name][0]
|
|
elif g.dtype == torch.float16:
|
|
optim_func = str2optimizer32bit[optimizer_name][1]
|
|
elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3):
|
|
optim_func = str2optimizer32bit[optimizer_name][2]
|
|
else:
|
|
raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}")
|
|
|
|
is_on_gpu([g, p, state1, state2, unorm_vec])
|
|
prev_device = pre_call(g.device)
|
|
optim_func(
|
|
get_ptr(g),
|
|
get_ptr(p),
|
|
get_ptr(state1),
|
|
get_ptr(state2),
|
|
get_ptr(unorm_vec),
|
|
ct.c_float(max_unorm),
|
|
ct.c_float(param_norm),
|
|
ct.c_float(beta1),
|
|
ct.c_float(beta2),
|
|
ct.c_float(eps),
|
|
ct.c_float(weight_decay),
|
|
ct.c_int32(step),
|
|
ct.c_float(lr),
|
|
ct.c_float(gnorm_scale),
|
|
ct.c_bool(skip_zeros),
|
|
ct.c_int32(g.numel()))
|
|
post_call(prev_device)
|
|
|
|
|
|
def optimizer_update_8bit(
|
|
optimizer_name: str,
|
|
g: Tensor,
|
|
p: Tensor,
|
|
state1: Tensor,
|
|
state2: Tensor,
|
|
beta1: float,
|
|
beta2: float,
|
|
eps: float,
|
|
step: int,
|
|
lr: float,
|
|
qmap1: Tensor,
|
|
qmap2: Tensor,
|
|
max1: Tensor,
|
|
max2: Tensor,
|
|
new_max1: Tensor,
|
|
new_max2: Tensor,
|
|
weight_decay: float = 0.0,
|
|
gnorm_scale: float = 1.0,
|
|
unorm_vec: Tensor = None,
|
|
max_unorm: float = 0.0,
|
|
) -> None:
|
|
"""
|
|
Performs an inplace Adam update.
|
|
|
|
Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
|
|
Uses AdamW formulation if weight decay > 0.0.
|
|
|
|
Parameters
|
|
----------
|
|
optimizer_name : str
|
|
The name of the optimizer. Choices {adam, momentum}
|
|
g : torch.Tensor
|
|
Gradient tensor.
|
|
p : torch.Tensor
|
|
Parameter tensor.
|
|
state1 : torch.Tensor
|
|
Adam state 1.
|
|
state2 : torch.Tensor
|
|
Adam state 2.
|
|
beta1 : float
|
|
Adam beta1.
|
|
beta2 : float
|
|
Adam beta2.
|
|
eps : float
|
|
Adam epsilon.
|
|
weight_decay : float
|
|
Weight decay.
|
|
step : int
|
|
Current optimizer step.
|
|
lr : float
|
|
The learning rate.
|
|
qmap1 : torch.Tensor
|
|
Quantization map for first Adam state.
|
|
qmap2 : torch.Tensor
|
|
Quantization map for second Adam state.
|
|
max1 : torch.Tensor
|
|
Max value for first Adam state update.
|
|
max2 : torch.Tensor
|
|
Max value for second Adam state update.
|
|
new_max1 : torch.Tensor
|
|
Max value for the next Adam update of the first state.
|
|
new_max2 : torch.Tensor
|
|
Max value for the next Adam update of the second state.
|
|
gnorm_scale : float
|
|
The factor to rescale the gradient to the max clip value.
|
|
unorm_vec : torch.Tensor
|
|
The tensor for the update norm.
|
|
max_unorm : float
|
|
The maximum update norm relative to the weight norm.
|
|
"""
|
|
|
|
param_norm = 0.0
|
|
if max_unorm > 0.0:
|
|
param_norm = torch.norm(p.data.float())
|
|
|
|
if g.dtype == torch.float32 and state1.dtype == torch.uint8:
|
|
str2optimizer8bit[optimizer_name][0](
|
|
get_ptr(p),
|
|
get_ptr(g),
|
|
get_ptr(state1),
|
|
get_ptr(state2),
|
|
get_ptr(unorm_vec),
|
|
ct.c_float(max_unorm),
|
|
ct.c_float(param_norm),
|
|
ct.c_float(beta1),
|
|
ct.c_float(beta2),
|
|
ct.c_float(eps),
|
|
ct.c_int32(step),
|
|
ct.c_float(lr),
|
|
get_ptr(qmap1),
|
|
get_ptr(qmap2),
|
|
get_ptr(max1),
|
|
get_ptr(max2),
|
|
get_ptr(new_max1),
|
|
get_ptr(new_max2),
|
|
ct.c_float(weight_decay),
|
|
ct.c_float(gnorm_scale),
|
|
ct.c_int32(g.numel()),
|
|
)
|
|
elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
|
|
str2optimizer8bit[optimizer_name][1](
|
|
get_ptr(p),
|
|
get_ptr(g),
|
|
get_ptr(state1),
|
|
get_ptr(state2),
|
|
get_ptr(unorm_vec),
|
|
ct.c_float(max_unorm),
|
|
ct.c_float(param_norm),
|
|
ct.c_float(beta1),
|
|
ct.c_float(beta2),
|
|
ct.c_float(eps),
|
|
ct.c_int32(step),
|
|
ct.c_float(lr),
|
|
get_ptr(qmap1),
|
|
get_ptr(qmap2),
|
|
get_ptr(max1),
|
|
get_ptr(max2),
|
|
get_ptr(new_max1),
|
|
get_ptr(new_max2),
|
|
ct.c_float(weight_decay),
|
|
ct.c_float(gnorm_scale),
|
|
ct.c_int32(g.numel()),
|
|
)
|
|
else:
|
|
raise ValueError(
|
|
f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
|
|
)
|
|
|
|
|
|
def optimizer_update_8bit_blockwise(
|
|
optimizer_name: str,
|
|
g: Tensor,
|
|
p: Tensor,
|
|
state1: Tensor,
|
|
state2: Tensor,
|
|
beta1: float,
|
|
beta2: float,
|
|
eps: float,
|
|
step: int,
|
|
lr: float,
|
|
qmap1: Tensor,
|
|
qmap2: Tensor,
|
|
absmax1: Tensor,
|
|
absmax2: Tensor,
|
|
weight_decay: float = 0.0,
|
|
gnorm_scale: float = 1.0,
|
|
skip_zeros=False,
|
|
) -> None:
|
|
|
|
optim_func = None
|
|
if g.dtype == torch.float32 and state1.dtype == torch.uint8:
|
|
optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
|
|
elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
|
|
optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
|
|
elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and
|
|
len(str2optimizer8bit_blockwise[optimizer_name])==3):
|
|
optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
|
|
else:
|
|
raise ValueError(
|
|
f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
|
|
)
|
|
|
|
is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
|
|
|
|
prev_device = pre_call(g.device)
|
|
optim_func(
|
|
get_ptr(p),
|
|
get_ptr(g),
|
|
get_ptr(state1),
|
|
get_ptr(state2),
|
|
ct.c_float(beta1),
|
|
ct.c_float(beta2),
|
|
ct.c_float(eps),
|
|
ct.c_int32(step),
|
|
ct.c_float(lr),
|
|
get_ptr(qmap1),
|
|
get_ptr(qmap2),
|
|
get_ptr(absmax1),
|
|
get_ptr(absmax2),
|
|
ct.c_float(weight_decay),
|
|
ct.c_float(gnorm_scale),
|
|
ct.c_bool(skip_zeros),
|
|
ct.c_int32(g.numel()),
|
|
)
|
|
post_call(prev_device)
|
|
|
|
def percentile_clipping(
|
|
grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5
|
|
):
|
|
"""Applies percentile clipping
|
|
|
|
grad: torch.Tensor
|
|
The gradient tensor.
|
|
gnorm_vec: torch.Tensor
|
|
Vector of gradient norms. 100 elements expected.
|
|
step: int
|
|
The current optimiation steps (number of past gradient norms).
|
|
|
|
"""
|
|
is_on_gpu([grad, gnorm_vec])
|
|
if grad.dtype == torch.float32:
|
|
lib.cpercentile_clipping_g32(
|
|
get_ptr(grad),
|
|
get_ptr(gnorm_vec),
|
|
ct.c_int32(step),
|
|
ct.c_int32(grad.numel()),
|
|
)
|
|
elif grad.dtype == torch.float16:
|
|
lib.cpercentile_clipping_g16(
|
|
get_ptr(grad),
|
|
get_ptr(gnorm_vec),
|
|
ct.c_int32(step),
|
|
ct.c_int32(grad.numel()),
|
|
)
|
|
else:
|
|
raise ValueError(f"Gradient type {grad.dtype} not supported!")
|
|
|
|
current_gnorm = torch.sqrt(gnorm_vec[step % 100])
|
|
vals, idx = torch.sort(gnorm_vec)
|
|
clip_value = torch.sqrt(vals[percentile])
|
|
gnorm_scale = 1.0
|
|
|
|
if current_gnorm > clip_value:
|
|
gnorm_scale = clip_value / current_gnorm
|
|
|
|
return current_gnorm, clip_value, gnorm_scale
|
|
|
|
|
|
def histogram_scatter_add_2d(
|
|
histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor
|
|
):
|
|
assert len(histogram.shape) == 2
|
|
assert histogram.dtype == torch.float32
|
|
assert source.dtype == torch.float32
|
|
assert index1.dtype == torch.int32
|
|
assert index2.dtype == torch.int32
|
|
|
|
assert histogram.device.type == "cuda"
|
|
assert index1.device.type == "cuda"
|
|
assert index2.device.type == "cuda"
|
|
assert source.device.type == "cuda"
|
|
|
|
maxdim1 = ct.c_int32(histogram.shape[0])
|
|
n = ct.c_int32(index1.numel())
|
|
is_on_gpu([histogram, index1, index2, source])
|
|
lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n)
|
|
|
|
def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8):
|
|
if not torch.cuda.is_initialized(): torch.cuda.init()
|
|
if A.dtype != expected_type or B.dtype != expected_type:
|
|
raise TypeError(
|
|
f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}"
|
|
)
|
|
|
|
sA = A.shape
|
|
sB = B.shape
|
|
tA = transposed_A
|
|
tB = transposed_B
|
|
|
|
correct = True
|
|
|
|
if len(sA) == 2 and len(sB) == 2:
|
|
if not tA and not tB and A.shape[1] != B.shape[0]:
|
|
correct = False
|
|
elif tA and not tB and A.shape[0] != B.shape[0]:
|
|
correct = False
|
|
elif tA and tB and A.shape[0] != B.shape[1]:
|
|
correct = False
|
|
elif not tA and tB and A.shape[1] != B.shape[1]:
|
|
correct = False
|
|
elif len(sA) == 3 and len(sB) == 2:
|
|
if not tA and not tB and A.shape[2] != B.shape[0]:
|
|
correct = False
|
|
elif tA and not tB and A.shape[1] != B.shape[0]:
|
|
correct = False
|
|
elif tA and tB and A.shape[1] != B.shape[1]:
|
|
correct = False
|
|
elif not tA and tB and A.shape[2] != B.shape[1]:
|
|
correct = False
|
|
elif len(sA) == 3 and len(sB) == 3:
|
|
if not tA and not tB and A.shape[2] != B.shape[1]:
|
|
correct = False
|
|
elif tA and not tB and A.shape[1] != B.shape[1]:
|
|
correct = False
|
|
elif tA and tB and A.shape[1] != B.shape[2]:
|
|
correct = False
|
|
elif not tA and tB and A.shape[2] != B.shape[2]:
|
|
correct = False
|
|
|
|
if out is not None:
|
|
sout = out.shape
|
|
# special case common in backprop
|
|
if not correct and len(sA) == 3 and len(sB) == 3:
|
|
if (
|
|
sout[0] == sA[2]
|
|
and sout[1] == sB[2]
|
|
and sA[0] == sB[0]
|
|
and sA[1] == sB[1]
|
|
):
|
|
correct = True
|
|
else:
|
|
if len(sA) == 2 and len(sB) == 2:
|
|
if not tA and not tB:
|
|
sout = (sA[0], sB[1])
|
|
elif tA and tB:
|
|
sout = (sA[1], sB[0])
|
|
elif tA and not tB:
|
|
sout = (sA[1], sB[1])
|
|
elif not tA and tB:
|
|
sout = (sA[0], sB[0])
|
|
elif len(sA) == 3 and len(sB) == 2:
|
|
if not tA and not tB:
|
|
sout = (sA[0], sA[1], sB[1])
|
|
elif tA and tB:
|
|
sout = (sA[0], sA[2], sB[0])
|
|
elif tA and not tB:
|
|
sout = (sA[0], sA[2], sB[1])
|
|
elif not tA and tB:
|
|
sout = (sA[0], sA[1], sB[0])
|
|
elif len(sA) == 3 and len(sB) == 3:
|
|
if not tA and not tB:
|
|
sout = (sA[0], sA[1], sB[2])
|
|
elif tA and tB:
|
|
sout = (sA[0], sA[2], sB[1])
|
|
elif tA and not tB:
|
|
sout = (sA[0], sA[2], sB[2])
|
|
elif not tA and tB:
|
|
sout = (sA[0], sA[1], sB[1])
|
|
|
|
if not correct:
|
|
raise ValueError(
|
|
f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}."
|
|
)
|
|
|
|
return sout
|
|
|
|
def cutlass3_gemm(
|
|
A: Tensor,
|
|
B: Tensor,
|
|
out: Tensor = None,
|
|
transposed_A=False,
|
|
transposed_B=False,
|
|
state=None
|
|
):
|
|
#sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
|
|
if state is None:
|
|
Bshape = B.shape
|
|
bout = Bshape[1]
|
|
else:
|
|
Bshape = state[1]
|
|
bout = Bshape[0]
|
|
if out is None:
|
|
out = torch.zeros(size=(A.shape[0], bout), dtype=A.dtype, device=A.device)
|
|
|
|
sA = A.shape
|
|
sB = B.shape
|
|
if transposed_A and len(sA) == 2:
|
|
sA = (sA[1], sA[0])
|
|
elif transposed_A and len(sA) == 3:
|
|
sA = (sA[0], sA[2], sA[0])
|
|
if transposed_B and len(sB) == 2:
|
|
sB = (sB[1], sB[0])
|
|
elif transposed_B and len(sB) == 3:
|
|
sB = (sB[0], sB[2], sB[0])
|
|
# this is a mess: cuBLAS expect column major, but PyTorch is row major.
|
|
# So to perform the matrix multiplication, we have to treat A, B, and C matrices
|
|
# (transpose of row major is column major)
|
|
# This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
|
|
|
|
# matrices in the input arguments for cuBLAS
|
|
# column major: A @ B = C: [m, k] @ [k, n] = [m, n]
|
|
# row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
|
|
# column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
|
|
if len(sB) == 2:
|
|
if B.stride()[0] == B.shape[1]:
|
|
transposed_B = False
|
|
elif B.stride()[1] == B.shape[0]:
|
|
transposed_B = True
|
|
if len(A.shape) == 2:
|
|
if A.stride()[0] == A.shape[1]:
|
|
transposed_A = False
|
|
elif A.stride()[1] == A.shape[0]:
|
|
transposed_A = True
|
|
else:
|
|
if A.stride()[1] == A.shape[2]:
|
|
transposed_A = False
|
|
elif A.stride()[2] == A.shape[1]:
|
|
transposed_A = True
|
|
|
|
if len(sA) == 2:
|
|
n = sA[0]
|
|
ldb = A.stride()[1 if transposed_A else 0]
|
|
elif len(sA) == 3 and len(sB) == 2:
|
|
n = sA[0] * sA[1]
|
|
ldb = sA[2]
|
|
|
|
m = sB[1]
|
|
k = sB[0]
|
|
lda = B.stride()[0]
|
|
ldc = sB[1]
|
|
elif len(sB) == 3:
|
|
# special case
|
|
assert len(sA) == 3
|
|
if not (sA[0] == sB[0] and sA[1] == sB[1]):
|
|
raise ValueError(
|
|
f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
|
|
)
|
|
|
|
transposed_A = True
|
|
transposed_B = False
|
|
|
|
m = sB[2]
|
|
n = sA[2]
|
|
k = sB[0] * sB[1]
|
|
|
|
lda = n
|
|
ldb = sA[2]
|
|
ldc = m
|
|
|
|
ptr = CUBLAS_Context.get_instance().get_context(A.device)
|
|
|
|
# B^T @ A^T = C^T
|
|
# [km, nk -> mn]
|
|
#lda = ldb = ldc = 1
|
|
#lda = 1
|
|
if state is not None:
|
|
m = Bshape[0]
|
|
k = Bshape[1]
|
|
lda = Bshape[0]
|
|
ldc = Bshape[0]
|
|
ldb = (ldb+1)//2
|
|
#print(m, n, k, lda, ldb, ldc)
|
|
is_on_gpu([B, A, out])
|
|
m = ct.c_int32(m)
|
|
n = ct.c_int32(n)
|
|
k = ct.c_int32(k)
|
|
lda = ct.c_int32(lda)
|
|
ldb = ct.c_int32(ldb)
|
|
ldc = ct.c_int32(ldc)
|
|
|
|
if B.dtype == torch.uint8:
|
|
lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3]))
|
|
elif A.dtype == torch.float32:
|
|
lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
|
|
elif A.dtype == torch.float16:
|
|
lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
|
|
else:
|
|
raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}')
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
def igemm(
|
|
A: Tensor,
|
|
B: Tensor,
|
|
out: Tensor = None,
|
|
transposed_A=False,
|
|
transposed_B=False,
|
|
):
|
|
sout = check_matmul(A, B, out, transposed_A, transposed_B)
|
|
if out is None:
|
|
out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
|
|
if len(A.shape) == 3 and len(B.shape) == 3:
|
|
if A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1]:
|
|
return batched_igemm(A, B, out)
|
|
|
|
sA = A.shape
|
|
sB = B.shape
|
|
if transposed_A and len(sA) == 2:
|
|
sA = (sA[1], sA[0])
|
|
elif transposed_A and len(sA) == 3:
|
|
sA = (sA[0], sA[2], sA[0])
|
|
if transposed_B and len(sB) == 2:
|
|
sB = (sB[1], sB[0])
|
|
elif transposed_B and len(sB) == 3:
|
|
sB = (sB[0], sB[2], sB[0])
|
|
# this is a mess: cuBLAS expect column major, but PyTorch is row major.
|
|
# So to perform the matrix multiplication, we have to treat A, B, and C matrices
|
|
# (transpose of row major is column major)
|
|
# This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
|
|
|
|
# matrices in the input arguments for cuBLAS
|
|
# column major: A @ B = C: [m, k] @ [k, n] = [m, n]
|
|
# row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
|
|
# column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
|
|
if len(sB) == 2:
|
|
if B.stride()[0] == B.shape[1]:
|
|
transposed_B = False
|
|
elif B.stride()[1] == B.shape[0]:
|
|
transposed_B = True
|
|
if len(A.shape) == 2:
|
|
if A.stride()[0] == A.shape[1]:
|
|
transposed_A = False
|
|
elif A.stride()[1] == A.shape[0]:
|
|
transposed_A = True
|
|
else:
|
|
if A.stride()[1] == A.shape[2]:
|
|
transposed_A = False
|
|
elif A.stride()[2] == A.shape[1]:
|
|
transposed_A = True
|
|
|
|
if len(sA) == 2:
|
|
n = sA[0]
|
|
ldb = A.stride()[1 if transposed_A else 0]
|
|
elif len(sA) == 3 and len(sB) == 2:
|
|
n = sA[0] * sA[1]
|
|
ldb = sA[2]
|
|
|
|
m = sB[1]
|
|
k = sB[0]
|
|
lda = B.stride()[(1 if transposed_B else 0)]
|
|
ldc = sB[1]
|
|
elif len(sB) == 3:
|
|
# special case
|
|
assert len(sA) == 3
|
|
if not (sA[0] == sB[0] and sA[1] == sB[1]):
|
|
raise ValueError(
|
|
f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
|
|
)
|
|
|
|
transposed_A = True
|
|
transposed_B = False
|
|
|
|
m = sB[2]
|
|
n = sA[2]
|
|
k = sB[0] * sB[1]
|
|
|
|
lda = m
|
|
ldb = sA[2]
|
|
ldc = m
|
|
|
|
ptr = CUBLAS_Context.get_instance().get_context(A.device)
|
|
|
|
# B^T @ A^T = C^T
|
|
# [km, nk -> mn]
|
|
is_on_gpu([B, A, out])
|
|
lib.cigemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
|
|
get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc))
|
|
return out
|
|
|
|
|
|
def batched_igemm(
|
|
A: Tensor,
|
|
B: Tensor,
|
|
out: Tensor = None,
|
|
transposed_A=False,
|
|
transposed_B=False,
|
|
):
|
|
if not len(A.shape) == 3 or not len(B.shape) == 3:
|
|
raise ValueError(
|
|
f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}"
|
|
)
|
|
sout = check_matmul(A, B, out, transposed_A, transposed_B)
|
|
if out is None:
|
|
out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
|
|
|
|
if B.is_contiguous():
|
|
lda = B.stride()[1]
|
|
transposed_A = False
|
|
else:
|
|
s = B.stride()
|
|
if s[0] != B.shape[0]:
|
|
B = B.contiguous()
|
|
lda = B.stride()[1]
|
|
elif s[2] == B.shape[1]:
|
|
transposed_A = True
|
|
lda = B.stride()[2]
|
|
else:
|
|
if s[2] == 1:
|
|
B = B.contiguous()
|
|
lda = B.stride()[1]
|
|
elif s[1] == 1:
|
|
B = B.contiguous()
|
|
lda = B.stride()[1]
|
|
else:
|
|
B = B.contiguous()
|
|
lda = B.stride()[1]
|
|
|
|
if A.is_contiguous():
|
|
ldb = A.stride()[1]
|
|
transposed_B = False
|
|
else:
|
|
s = A.stride()
|
|
if s[0] != A.shape[0]:
|
|
A = A.contiguous()
|
|
ldb = A.stride()[1]
|
|
transposed_B = False
|
|
elif s[2] == A.shape[1]:
|
|
ldb = A.stride()[2]
|
|
transposed_B = True
|
|
else:
|
|
A = A.contiguous()
|
|
ldb = A.stride()[1]
|
|
transposed_B = False
|
|
|
|
# this is a mess: cuBLAS expect column major, but PyTorch is row major.
|
|
# So to perform the matrix multiplication, we have to treat A, B, and C matrices
|
|
# (transpose of row major is column major)
|
|
# This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
|
|
# matrices in the input arguments for cuBLAS
|
|
|
|
# column major: A @ B = C: [batch, m, k] @ [batch, k, n] = [batch, m, n]
|
|
# row major: B^T @ A^T = C^T: [batch, m, k] @ [batch, k, n] = [batch, m, n]
|
|
# column major with row major layout: B^T @ A^T = C^T: [batch, k, m] @ [batch, n, k] = [batch, n, m]
|
|
num_batch = A.shape[0]
|
|
n = A.shape[1]
|
|
m = B.shape[2]
|
|
k = B.shape[1]
|
|
|
|
ldc = m
|
|
|
|
strideA = B.shape[1] * B.shape[2]
|
|
strideB = A.shape[1] * A.shape[2]
|
|
strideC = A.shape[1] * B.shape[2]
|
|
|
|
ptr = CUBLAS_Context.get_instance().get_context(A.device)
|
|
|
|
is_on_gpu([B, A, out])
|
|
lib.cbatched_igemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
|
|
get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc),
|
|
ct.c_long(strideA), ct.c_long(strideB), ct.c_long(strideC), ct.c_uint32(num_batch))
|
|
return out
|
|
|
|
|
|
def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
|
|
shapeA = SA[0]
|
|
shapeB = SB[0]
|
|
dimsA = len(shapeA)
|
|
dimsB = len(shapeB)
|
|
assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
|
|
if dimsA == 2:
|
|
m = shapeA[0]
|
|
elif dimsA == 3:
|
|
m = shapeA[0] * shapeA[1]
|
|
|
|
rows = n = shapeB[0]
|
|
assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
|
|
|
|
# if the tensor is empty, return a transformed empty tensor with the right dimensions
|
|
if shapeA[0] == 0 and dimsA == 2:
|
|
return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
|
|
elif shapeA[1] == 0 and dimsA == 3:
|
|
return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
|
|
|
|
if dimsA == 2 and out is None:
|
|
out, Sout = get_transform_buffer(
|
|
(shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
|
|
)
|
|
elif dimsA == 3 and out is None:
|
|
out, Sout = get_transform_buffer(
|
|
(shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
|
|
)
|
|
|
|
assert dimsB != 3, "len(B.shape)==3 not supported"
|
|
assert A.device.type == "cuda"
|
|
assert B.device.type == "cuda"
|
|
assert A.dtype == torch.int8
|
|
assert B.dtype == torch.int8
|
|
assert out.dtype == dtype
|
|
assert SA[1] == "col32"
|
|
assert SB[1] in ["col_turing", "col_ampere"]
|
|
assert Sout[1] == "col32"
|
|
assert (
|
|
shapeA[-1] == shapeB[-1]
|
|
), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
|
|
formatB = SB[1]
|
|
prev_device = A.device
|
|
torch.cuda.set_device(A.device)
|
|
|
|
ptr = CUBLAS_Context.get_instance().get_context(A.device)
|
|
ptrA = get_ptr(A)
|
|
ptrB = get_ptr(B)
|
|
ptrC = get_ptr(out)
|
|
|
|
k = shapeA[-1]
|
|
lda = ct.c_int32(m * 32)
|
|
if formatB == "col_turing":
|
|
# turing: tiles with rows filled up to multiple of 8 rows by 32 columns
|
|
# n = rows
|
|
ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
|
|
else:
|
|
# ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
|
|
# n = rows
|
|
ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
|
|
|
|
ldc = ct.c_int32(m * 32)
|
|
m = ct.c_int32(m)
|
|
n = ct.c_int32(n)
|
|
k = ct.c_int32(k)
|
|
|
|
has_error = 0
|
|
ptrRowScale = get_ptr(None)
|
|
is_on_gpu([A, B, out])
|
|
if formatB == 'col_turing':
|
|
if dtype == torch.int32:
|
|
has_error = lib.cigemmlt_turing_32(
|
|
ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
|
|
)
|
|
else:
|
|
has_error = lib.cigemmlt_turing_8(
|
|
ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
|
|
)
|
|
elif formatB == "col_ampere":
|
|
if dtype == torch.int32:
|
|
has_error = lib.cigemmlt_ampere_32(
|
|
ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
|
|
)
|
|
else:
|
|
has_error = lib.cigemmlt_ampere_8(
|
|
ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
|
|
)
|
|
|
|
if has_error == 1:
|
|
print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
|
|
raise Exception('cublasLt ran into an error!')
|
|
|
|
torch.cuda.set_device(prev_device)
|
|
|
|
return out, Sout
|
|
|
|
|
|
def mm_dequant(
|
|
A,
|
|
quant_state,
|
|
row_stats,
|
|
col_stats,
|
|
out=None,
|
|
new_row_stats=None,
|
|
new_col_stats=None,
|
|
bias=None
|
|
):
|
|
assert A.dtype == torch.int32
|
|
if bias is not None: assert bias.dtype == torch.float16
|
|
out_shape = quant_state[0]
|
|
if len(out_shape) == 3:
|
|
out_shape = (out_shape[0] * out_shape[1], out_shape[2])
|
|
|
|
if out is None:
|
|
out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
|
|
if new_row_stats is None:
|
|
new_row_stats = torch.empty(
|
|
out_shape[0], dtype=torch.float32, device=A.device
|
|
)
|
|
if new_col_stats is None:
|
|
new_col_stats = torch.empty(
|
|
out_shape[1], dtype=torch.float32, device=A.device
|
|
)
|
|
assert (
|
|
new_row_stats.shape[0] == row_stats.shape[0]
|
|
), f"{new_row_stats.shape} vs {row_stats.shape}"
|
|
assert (
|
|
new_col_stats.shape[0] == col_stats.shape[0]
|
|
), f"{new_col_stats.shape} vs {col_stats.shape}"
|
|
|
|
prev_device = pre_call(A.device)
|
|
ptrA = get_ptr(A)
|
|
ptrOut = get_ptr(out)
|
|
ptrRowStats = get_ptr(row_stats)
|
|
ptrColStats = get_ptr(col_stats)
|
|
ptrNewRowStats = get_ptr(new_row_stats)
|
|
ptrNewColStats = get_ptr(new_col_stats)
|
|
ptrBias = get_ptr(bias)
|
|
numRows = ct.c_int32(out_shape[0])
|
|
numCols = ct.c_int32(out_shape[1])
|
|
|
|
is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
|
|
lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
|
|
post_call(prev_device)
|
|
|
|
return out
|
|
|
|
|
|
def get_colrow_absmax(
|
|
A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
|
|
):
|
|
assert A.dtype == torch.float16
|
|
device = A.device
|
|
|
|
cols = A.shape[-1]
|
|
if len(A.shape) == 3:
|
|
rows = A.shape[0] * A.shape[1]
|
|
else:
|
|
rows = A.shape[0]
|
|
|
|
col_tiles = (cols + 255) // 256
|
|
tiled_rows = ((rows + 15) // 16) * 16
|
|
if row_stats is None:
|
|
row_stats = torch.empty(
|
|
(rows,), dtype=torch.float32, device=device
|
|
).fill_(-50000.0)
|
|
if col_stats is None:
|
|
col_stats = torch.empty(
|
|
(cols,), dtype=torch.float32, device=device
|
|
).fill_(-50000.0)
|
|
|
|
if nnz_block_ptr is None and threshold > 0.0:
|
|
nnz_block_ptr = torch.zeros(
|
|
((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device
|
|
)
|
|
|
|
ptrA = get_ptr(A)
|
|
ptrRowStats = get_ptr(row_stats)
|
|
ptrColStats = get_ptr(col_stats)
|
|
ptrNnzrows = get_ptr(nnz_block_ptr)
|
|
rows = ct.c_int32(rows)
|
|
cols = ct.c_int32(cols)
|
|
|
|
prev_device = pre_call(A.device)
|
|
is_on_gpu([A, row_stats, col_stats, nnz_block_ptr])
|
|
lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols)
|
|
post_call(prev_device)
|
|
|
|
if threshold > 0.0:
|
|
nnz_block_ptr.cumsum_(0)
|
|
|
|
return row_stats, col_stats, nnz_block_ptr
|
|
|
|
|
|
class COOSparseTensor:
|
|
def __init__(self, rows, cols, nnz, rowidx, colidx, values):
|
|
assert rowidx.dtype == torch.int32
|
|
assert colidx.dtype == torch.int32
|
|
assert values.dtype == torch.float16
|
|
assert values.numel() == nnz
|
|
assert rowidx.numel() == nnz
|
|
assert colidx.numel() == nnz
|
|
|
|
self.rows = rows
|
|
self.cols = cols
|
|
self.nnz = nnz
|
|
self.rowidx = rowidx
|
|
self.colidx = colidx
|
|
self.values = values
|
|
|
|
|
|
class CSRSparseTensor:
|
|
def __init__(self, rows, cols, nnz, rowptr, colidx, values):
|
|
assert rowptr.dtype == torch.int32
|
|
assert colidx.dtype == torch.int32
|
|
assert values.dtype == torch.float16
|
|
assert values.numel() == nnz
|
|
assert colidx.numel() == nnz
|
|
assert rowptr.numel() == rows + 1
|
|
|
|
self.rows = rows
|
|
self.cols = cols
|
|
self.nnz = nnz
|
|
self.rowptr = rowptr
|
|
self.colidx = colidx
|
|
self.values = values
|
|
|
|
|
|
class CSCSparseTensor:
|
|
def __init__(self, rows, cols, nnz, colptr, rowidx, values):
|
|
assert colptr.dtype == torch.int32
|
|
assert rowidx.dtype == torch.int32
|
|
assert values.dtype == torch.float16
|
|
assert values.numel() == nnz
|
|
assert rowidx.numel() == nnz
|
|
assert colptr.numel() == cols + 1
|
|
|
|
self.rows = rows
|
|
self.cols = cols
|
|
self.nnz = nnz
|
|
self.colptr = colptr
|
|
self.rowidx = rowidx
|
|
self.values = values
|
|
|
|
|
|
def coo2csr(cooA):
|
|
values, counts = torch.unique(cooA.rowidx, return_counts=True)
|
|
values.add_(1)
|
|
rowptr = torch.zeros(
|
|
(cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device
|
|
)
|
|
rowptr.scatter_(index=values.long(), src=counts.int(), dim=0)
|
|
rowptr.cumsum_(0)
|
|
return CSRSparseTensor(
|
|
cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values
|
|
)
|
|
|
|
|
|
def coo2csc(cooA):
|
|
val, col2rowidx = torch.sort(cooA.colidx)
|
|
rowidx = cooA.rowidx[col2rowidx]
|
|
values = cooA.values[col2rowidx]
|
|
colvalues, counts = torch.unique(val, return_counts=True)
|
|
colvalues.add_(1)
|
|
colptr = torch.zeros(
|
|
(cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device
|
|
)
|
|
colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0)
|
|
colptr.cumsum_(0)
|
|
return CSCSparseTensor(
|
|
cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values
|
|
)
|
|
|
|
|
|
def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
|
|
rowidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
|
|
colidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
|
|
values = torch.zeros((nnz,), dtype=dtype, device=device)
|
|
return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
|
|
|
|
|
|
def double_quant(
|
|
A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
|
|
):
|
|
device = A.device
|
|
assert A.dtype == torch.half
|
|
assert device.type == "cuda"
|
|
prev_device = pre_call(A.device)
|
|
|
|
cols = A.shape[-1]
|
|
if len(A.shape) == 3:
|
|
rows = A.shape[0] * A.shape[1]
|
|
else:
|
|
rows = A.shape[0]
|
|
|
|
if row_stats is None or col_stats is None:
|
|
row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
|
|
A, threshold=threshold
|
|
)
|
|
|
|
if out_col is None:
|
|
out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
|
|
if out_row is None:
|
|
out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
|
|
|
|
coo_tensor = None
|
|
ptrA = get_ptr(A)
|
|
ptrColStats = get_ptr(col_stats)
|
|
ptrRowStats = get_ptr(row_stats)
|
|
ptrOutCol = get_ptr(out_col)
|
|
ptrOutRow = get_ptr(out_row)
|
|
|
|
is_on_gpu([A, col_stats, row_stats, out_col, out_row])
|
|
if threshold > 0.0:
|
|
nnz = nnz_row_ptr[-1].item()
|
|
if nnz > 0:
|
|
coo_tensor = coo_zeros(
|
|
A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
|
|
)
|
|
ptrRowIdx = get_ptr(coo_tensor.rowidx)
|
|
ptrColIdx = get_ptr(coo_tensor.colidx)
|
|
ptrVal = get_ptr(coo_tensor.values)
|
|
ptrRowPtr = get_ptr(nnz_row_ptr)
|
|
|
|
lib.cdouble_rowcol_quant(
|
|
ptrA,
|
|
ptrRowStats,
|
|
ptrColStats,
|
|
ptrOutCol,
|
|
ptrOutRow,
|
|
ptrRowIdx,
|
|
ptrColIdx,
|
|
ptrVal,
|
|
ptrRowPtr,
|
|
ct.c_float(threshold),
|
|
ct.c_int32(rows),
|
|
ct.c_int32(cols),
|
|
)
|
|
val, idx = torch.sort(coo_tensor.rowidx)
|
|
coo_tensor.rowidx = val
|
|
coo_tensor.colidx = coo_tensor.colidx[idx]
|
|
coo_tensor.values = coo_tensor.values[idx]
|
|
else:
|
|
lib.cdouble_rowcol_quant(
|
|
ptrA,
|
|
ptrRowStats,
|
|
ptrColStats,
|
|
ptrOutCol,
|
|
ptrOutRow,
|
|
None,
|
|
None,
|
|
None,
|
|
None,
|
|
ct.c_float(0.0),
|
|
ct.c_int32(rows),
|
|
ct.c_int32(cols),
|
|
)
|
|
else:
|
|
lib.cdouble_rowcol_quant(
|
|
ptrA,
|
|
ptrRowStats,
|
|
ptrColStats,
|
|
ptrOutCol,
|
|
ptrOutRow,
|
|
None,
|
|
None,
|
|
None,
|
|
None,
|
|
ct.c_float(threshold),
|
|
ct.c_int32(rows),
|
|
ct.c_int32(cols),
|
|
)
|
|
post_call(prev_device)
|
|
|
|
return out_row, out_col, row_stats, col_stats, coo_tensor
|
|
|
|
|
|
def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
|
|
prev_device = pre_call(A.device)
|
|
if state is None: state = (A.shape, from_order)
|
|
else: from_order = state[1]
|
|
if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
|
|
else: new_state = (state[0], to_order) # (shape, order)
|
|
|
|
shape = state[0]
|
|
if len(shape) == 2:
|
|
dim1 = ct.c_int32(shape[0])
|
|
dim2 = ct.c_int32(shape[1])
|
|
else:
|
|
dim1 = ct.c_int32(shape[0] * shape[1])
|
|
dim2 = ct.c_int32(shape[2])
|
|
|
|
is_on_gpu([A, out])
|
|
if to_order == 'col32':
|
|
if transpose:
|
|
lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
else:
|
|
lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
elif to_order == "col_turing":
|
|
if transpose:
|
|
lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
else:
|
|
lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
elif to_order == "col_ampere":
|
|
if transpose:
|
|
lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
else:
|
|
lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
elif to_order == "row":
|
|
if from_order == "col_turing":
|
|
lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
elif from_order == "col_ampere":
|
|
lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
|
|
else:
|
|
raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
|
|
|
|
post_call(prev_device)
|
|
|
|
return out, new_state
|
|
|
|
|
|
def spmm_coo(cooA, B, out=None):
|
|
if out is None:
|
|
out = torch.empty(
|
|
(cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype
|
|
)
|
|
nnz = cooA.nnz
|
|
assert cooA.rowidx.numel() == nnz
|
|
assert cooA.colidx.numel() == nnz
|
|
assert cooA.values.numel() == nnz
|
|
assert cooA.cols == B.shape[0]
|
|
|
|
transposed_B = False if B.is_contiguous() else True
|
|
|
|
ldb = B.stride()[(1 if transposed_B else 0)]
|
|
ldc = B.shape[1]
|
|
|
|
ptr = Cusparse_Context.get_instance().context
|
|
|
|
ptrRowidx = get_ptr(cooA.rowidx)
|
|
ptrColidx = get_ptr(cooA.colidx)
|
|
ptrValues = get_ptr(cooA.values)
|
|
ptrB = get_ptr(B)
|
|
ptrC = get_ptr(out)
|
|
cnnz = ct.c_int32(cooA.nnz)
|
|
crowsA = ct.c_int32(cooA.rows)
|
|
ccolsA = ct.c_int32(cooA.cols)
|
|
ccolsB = ct.c_int32(B.shape[1])
|
|
cldb = ct.c_int32(ldb)
|
|
cldc = ct.c_int32(ldc)
|
|
|
|
is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out])
|
|
lib.cspmm_coo(ptr, ptrRowidx, ptrColidx, ptrValues, cnnz, crowsA, ccolsA, ccolsB, cldb, ptrB, cldc, ptrC, ct.c_bool(transposed_B))
|
|
|
|
return out
|
|
|
|
|
|
def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
|
|
if out is None:
|
|
out = torch.zeros(
|
|
(cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype
|
|
)
|
|
nnz = cooA.nnz
|
|
assert cooA.rowidx.numel() == nnz
|
|
assert cooA.colidx.numel() == nnz
|
|
assert cooA.values.numel() == nnz
|
|
assert cooA.cols == B.shape[0], f"{cooA.cols} vs {B.shape}"
|
|
|
|
transposed_B = False if B.is_contiguous() else True
|
|
|
|
ldb = B.stride()[(1 if transposed_B else 0)]
|
|
ldc = B.shape[1]
|
|
|
|
values, counts = torch.unique(cooA.rowidx, return_counts=True)
|
|
offset = counts.cumsum(0).int()
|
|
max_count, max_idx = torch.sort(counts, descending=True)
|
|
max_idx = max_idx.int()
|
|
max_count = max_count.int()
|
|
assert (
|
|
max_count[0] <= 32
|
|
), f"Current max count per row is 8 but found {max_count[0]}."
|
|
assert B.dtype in [torch.float16, torch.int8]
|
|
ptrOffset = get_ptr(offset)
|
|
ptrMaxCount = get_ptr(max_count)
|
|
ptrMaxIdx = get_ptr(max_idx)
|
|
|
|
ptrRowidx = get_ptr(cooA.rowidx)
|
|
ptrColidx = get_ptr(cooA.colidx)
|
|
ptrValues = get_ptr(cooA.values)
|
|
ptrB = get_ptr(B)
|
|
ptrC = get_ptr(out)
|
|
ptrDequantStats = get_ptr(dequant_stats)
|
|
cnnz_rows = ct.c_int32(counts.numel())
|
|
cnnz = ct.c_int32(cooA.nnz)
|
|
crowsA = ct.c_int32(cooA.rows)
|
|
ccolsA = ct.c_int32(cooA.cols)
|
|
crowsB = ct.c_int32(B.shape[1])
|
|
ccolsB = ct.c_int32(B.shape[1])
|
|
cldb = ct.c_int32(ldb)
|
|
cldc = ct.c_int32(ldc)
|
|
|
|
is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats])
|
|
if B.dtype == torch.float16:
|
|
lib.cspmm_coo_very_sparse_naive_fp16(
|
|
ptrMaxCount,
|
|
ptrMaxIdx,
|
|
ptrOffset,
|
|
ptrRowidx,
|
|
ptrColidx,
|
|
ptrValues,
|
|
ptrB,
|
|
ptrC,
|
|
ptrDequantStats,
|
|
cnnz_rows,
|
|
cnnz,
|
|
crowsA,
|
|
crowsB,
|
|
ccolsB,
|
|
)
|
|
elif B.dtype == torch.int8:
|
|
lib.cspmm_coo_very_sparse_naive_int8(
|
|
ptrMaxCount,
|
|
ptrMaxIdx,
|
|
ptrOffset,
|
|
ptrRowidx,
|
|
ptrColidx,
|
|
ptrValues,
|
|
ptrB,
|
|
ptrC,
|
|
ptrDequantStats,
|
|
cnnz_rows,
|
|
cnnz,
|
|
crowsA,
|
|
crowsB,
|
|
ccolsB,
|
|
)
|
|
# else: assertion error
|
|
|
|
return out
|
|
|
|
|
|
C = 127.0
|
|
|
|
|
|
def vectorwise_quant(x, dim=1, quant_type="vector"):
|
|
if quant_type == "linear":
|
|
max1 = torch.abs(x).max().float()
|
|
xq = torch.round(x / max1 * 127).to(torch.int8)
|
|
return xq, max1
|
|
elif quant_type in ["vector", "row"]:
|
|
max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
|
|
xq = torch.round(x * (C / max1)).to(torch.int8)
|
|
return xq, max1
|
|
elif quant_type == "zeropoint":
|
|
dtype = x.dtype
|
|
x = x.float()
|
|
dyna = x.max() - x.min()
|
|
if dyna == 0:
|
|
dyna = 1
|
|
qx = 255.0 / dyna
|
|
minx = x.min()
|
|
zpx = torch.round(minx * qx)
|
|
x = torch.round(qx * x - zpx) + zpx
|
|
return x, qx
|
|
elif quant_type in ["vector-zeropoint", "row-zeropoint"]:
|
|
dtype = x.dtype
|
|
x = x.float()
|
|
dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(
|
|
x, dim=dim, keepdim=True
|
|
)
|
|
dyna[dyna == 0] = 1
|
|
qx = 255.0 / dyna
|
|
minx = torch.amin(x, dim=dim, keepdim=True)
|
|
zpx = torch.round(minx * qx)
|
|
x = torch.round(qx * x - zpx) + zpx
|
|
return x, qx
|
|
elif quant_type == "truncated-vector":
|
|
with torch.no_grad():
|
|
absx = torch.abs(x)
|
|
max1 = torch.amax(absx, dim=dim, keepdim=True)
|
|
max1 = max1 * 0.7
|
|
idx = absx > max1.expand_as(absx)
|
|
sign = torch.sign(x[idx])
|
|
x[idx] = max1.expand_as(absx)[idx] * sign
|
|
xq = torch.round(x / max1 * C).to(torch.int8)
|
|
return xq, max1
|
|
else:
|
|
return None
|
|
|
|
|
|
def vectorwise_dequant(xq, max1, quant_type="vector"):
|
|
if quant_type == "vector":
|
|
x = (xq / C * max1).to(torch.float32)
|
|
return x
|
|
else:
|
|
return None
|
|
|
|
|
|
def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"):
|
|
if quant_type == "linear":
|
|
norm = S1 * S2 / (C * C)
|
|
# double cast needed to prevent overflows
|
|
return (xq.float() * norm).to(dtype)
|
|
elif quant_type == "zeropoint":
|
|
norm = 1.0 / (S1 * S2)
|
|
return (xq.float() * norm).to(dtype)
|
|
elif quant_type == "row-zeropoint":
|
|
norm = 1.0 / (S1 * S2)
|
|
x = xq.float()
|
|
if len(S1.shape) == 3 and len(x.shape) == 2:
|
|
S1 = S1.squeeze(0)
|
|
if len(S2.shape) == 3 and len(x.shape) == 2:
|
|
S2 = S2.squeeze(0)
|
|
if len(S1.shape) == 2:
|
|
x *= norm
|
|
else:
|
|
x *= norm
|
|
return x.to(dtype)
|
|
elif quant_type == "vector-zeropoint":
|
|
x = xq.float()
|
|
if len(S1.shape) == 3 and len(x.shape) == 2:
|
|
S1 = S1.squeeze(0)
|
|
if len(S2.shape) == 3 and len(x.shape) == 2:
|
|
S2 = S2.squeeze(0)
|
|
if len(S1.shape) == 2:
|
|
x *= 1.0 / S1
|
|
else:
|
|
x *= 1.0 / S1
|
|
x *= 1.0 / S2.t()
|
|
return x.to(dtype)
|
|
elif quant_type == "row":
|
|
x = xq.float()
|
|
if len(S1.shape) == 3 and len(x.shape) == 2:
|
|
S1 = S1.squeeze(0)
|
|
if len(S2.shape) == 3 and len(x.shape) == 2:
|
|
S2 = S2.squeeze(0)
|
|
if len(S1.shape) == 2:
|
|
x *= S1 * S2 / (C * C)
|
|
else:
|
|
x *= S1 * S2 / (C * C)
|
|
return x.to(dtype)
|
|
elif quant_type in ["truncated-vector", "vector"]:
|
|
x = xq.float()
|
|
if len(S1.shape) == 3 and len(x.shape) == 2:
|
|
S1 = S1.squeeze(0)
|
|
if len(S2.shape) == 3 and len(x.shape) == 2:
|
|
S2 = S2.squeeze(0)
|
|
if len(S1.shape) == 2:
|
|
x *= S1 / C
|
|
else:
|
|
x *= S1 / C
|
|
x *= S2 / C
|
|
return x.to(dtype)
|
|
else:
|
|
return None
|
|
|
|
|
|
def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
|
|
offset = B.float().t().sum(0) * (SA[0] + SA[1])
|
|
x = xq.float()
|
|
if len(xq.shape) == 2 and len(SB.shape) == 3:
|
|
SB = SB.squeeze(0)
|
|
if len(SB.shape) == 2:
|
|
x *= SB.t() / 127
|
|
else:
|
|
x *= SB / 127
|
|
x *= SA[1] / 127
|
|
x += offset
|
|
return x.to(dtype)
|
|
|
|
|
|
def extract_outliers(A, SA, idx):
|
|
shapeA = SA[0]
|
|
formatA = SA[1]
|
|
assert formatA in ["col_turing", "col_ampere"]
|
|
assert A.device.type == "cuda"
|
|
|
|
out = torch.zeros(
|
|
(shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
|
|
)
|
|
|
|
idx_size = ct.c_int32(idx.numel())
|
|
rows = ct.c_int32(shapeA[0])
|
|
cols = ct.c_int32(shapeA[1])
|
|
ptrA = get_ptr(A)
|
|
ptrIdx = get_ptr(idx)
|
|
ptrOut = get_ptr(out)
|
|
|
|
prev_device = pre_call(A.device)
|
|
if formatA == 'col_turing':
|
|
lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
|
|
elif formatA == "col_ampere":
|
|
lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
|
|
post_call(prev_device)
|
|
|
|
return out
|
|
|
|
def pipeline_test(A, batch_size):
|
|
out = torch.zeros_like(A)
|
|
lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
|
|
return out
|