From c9f505064ef416d20336415ce381fc21a54658b1 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 28 Jan 2023 17:05:22 -0800 Subject: [PATCH 01/97] Added outlier detector and fake quantization layer. --- bitsandbytes/functional.py | 6 +- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 78 +++++++++++++++++++++ bitsandbytes/utils.py | 136 ++++++++++++++++++++++++++++++++++++ csrc/kernels.cu | 2 + tests/test_functional.py | 6 +- 6 files changed, 225 insertions(+), 5 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 95a7c4f..371f85c 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -168,7 +168,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) values = [] lst = list(itertools.product([0, 1], repeat=precision_bits)) #for ev in evalues: - bias = 2**(exponent_bits-1)-1 + bias = 2**(exponent_bits-1) for evalue in range(2**(exponent_bits)): for bit_pattern in lst: value = (1 if evalue != 0 else 0) @@ -176,10 +176,10 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) value += pval*(2**-(i+1)) if evalue == 0: # subnormals - value = value*2**-(bias-1) + value = value*2**-(bias) else: # normals - value = value*2**-(evalue-bias-2) + value = value*2**-(evalue-bias-1) values.append(value) if signed: values.append(-value) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index edc595a..221b5f7 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index a623bf1..4746a4a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -10,6 +10,7 @@ from torch import Tensor, device, dtype, nn import bitsandbytes as bnb from bitsandbytes.optim import GlobalOptimManager +from bitsandbytes.utils import OutlierTracer, find_outlier_dims T = TypeVar("T", bound="torch.nn.Module") @@ -133,6 +134,83 @@ class Embedding(torch.nn.Embedding): return emb +class OutlierAwareLinear(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.outlier_dim = None + self.is_quantized = False + + def forward_with_outliers(self, x, outlier_idx): + raise NotImplementedError('Please override the `forward_with_outliers(self, x, outlier_idx)` function') + + def quantize_weight(self, w, outlier_idx): + raise NotImplementedError('Please override the `quantize_weights(self, w, outlier_idx)` function') + + def forward(self, x): + if self.outlier_dim is None: + tracer = OutlierTracer.get_instance() + if not tracer.is_initialized(): + print('Please use OutlierTracer.initialize(model) before using the OutlierAwareLinear layer') + outlier_idx = tracer.get_outliers(self.weight) + #print(outlier_idx, tracer.get_hvalue(self.weight)) + self.outlier_dim = outlier_idx + + if not self.is_quantized: + w = self.quantize_weight(self.weight, self.outlier_dim) + self.weight.data.copy_(w) + self.is_quantized = True + + return self.forward_with_outliers(x, self.outlier_dim) + + +class Fake4bitLinear(OutlierAwareLinear): + def __init__(self, input_features, output_features, bias=True, 
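+                               # assumed intent: the default codebook is a 4-bit fp8-style map (1 sign bit, 3 exponent bits, no fraction bits)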
codebook=bnb.functional.create_fp8_map(True, 3, 0, total_bits=4)): + super().__init__(input_features, output_features, bias) + self.codebook = codebook + + def quantize_weight(self, w, outlier_idx): + if outlier_idx.numel() > 0: + subw = w[:, outlier_idx].clone() + w[:, outlier_idx] = 0 + wdtype = w.dtype + code = self.codebook.to(w.device) + cw, state = bnb.functional.quantize_blockwise(w, code=code, blocksize=64) + w = bnb.functional.dequantize_blockwise(cw, state, blocksize=64) + w = w.to(wdtype) + if outlier_idx.numel() > 0: + w[:, outlier_idx] = subw + self.is_quantized = True + return w + + def forward_with_outliers(self, x, outlier_idx): + dims = torch.abs(x> 4).sum(dim=list(range(len(x.shape)-1))) + outlier_idx2 = torch.where(dims > 0)[0] + outlier_idx = torch.cat([outlier_idx, outlier_idx2]).unique() + n = x.shape[-1] + idx = torch.arange(n, device=x.device) + idx[outlier_idx] = -1 + inverse_idx = torch.where(idx >= 0)[0] + if outlier_idx.numel() > 0: + subx = x[..., outlier_idx].clone() + #print(1, subx, 1) + #x[..., outlier_idx] = 0 + inverse_x = x[...,inverse_idx] + xdtype = x.dtype + #code = bnb.functional.create_fp8_map(True, 4-3, 2, 4).to(x.device) + #code = bnb.functional.create_quantile_map(x, 4).to(x.device) + code = bnb.functional.create_dynamic_map(True, total_bits=4.0).to(x.device) + c, state = bnb.functional.quantize_blockwise(inverse_x, code=code, blocksize=64) + inverse_x = bnb.functional.dequantize_blockwise(c, state, blocksize=64) + #c, state = bnb.functional.quantize_blockwise(x, code=code, blocksize=64) + #x = bnb.functional.dequantize_blockwise(c, state, blocksize=64) + x = x.to(xdtype) + x[..., inverse_idx] = inverse_x.to(x.dtype) + #if outlier_idx.numel() > 0: + #x[..., outlier_idx] = subx + + return torch.nn.functional.linear(x, self.weight, self.bias) + + class Int8Params(torch.nn.Parameter): def __new__( diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 1cd90e3..30d9e10 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -1,7 +1,143 @@ import shlex import subprocess +import torch from typing import Tuple +def outlier_hook(module, input): + assert isinstance(module, torch.nn.Linear) + tracer = OutlierTracer.get_instance() + hvalue = tracer.get_hvalue(module.weight) + if hvalue not in tracer.hvalue2outlier_idx: + outlier_idx = find_outlier_dims(module.weight) + tracer.outliers.append(outlier_idx) + tracer.hvalues.append(hvalue) + if len(tracer.outliers) > 1: + # assign the current layer the outlier idx found from the weight + # of the previous linear layer + if tracer.outliers[-1].numel() > 0: + assert tracer.outliers[-1].max() < module.weight.shape[1] + tracer.hvalue2outlier_idx[hvalue] = tracer.outliers[-1] + + else: + # first layer, we cannot use the weight for outlier detection + # we follow a mixed approach: + # (1) zscore test of std of hidden dimension + # (2) magnitude > 6 test + merged = input[0].view(-1, input[0].shape[-1]) + # (1) zscore test of std of hidden dimension + outlier_idx = find_outlier_dims(merged, reduction_dim=1, zscore=3) + # (2) magnitude > 6 test + dims = (torch.abs(input[0])> 6).sum(dim=list(range(len(input[0].shape)-1))) + outlier_idx2 = torch.where(dims > 0)[0] + outlier_idx = torch.cat([outlier_idx, outlier_idx2]).unique() + tracer.hvalue2outlier_idx[hvalue] = outlier_idx + else: + for hook in tracer.hooks: + hook.remove() + + +class OutlierTracer(object): + _instance = None + + def __init__(self): + raise RuntimeError("Call get_instance() instead") + + def initialize(self, model): + self.last_w = 
None + self.current_outlier_dims = None + self.hvalues = [] + self.outliers = [] + self.hvalue2outlier_idx = {} + self.initialized = True + self.hooks = [] + + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear): + self.hooks.append(m.register_forward_pre_hook(outlier_hook)) + + def is_initialized(self): + return getattr(self, 'initialized', False) + + def get_hvalue(self, weight): + return weight.data.storage().data_ptr() + + def get_outliers(self, weight): + if not self.is_initialized(): + print('Outlier tracer is not initialized...') + return None + hvalue = self.get_hvalue(weight) + if hvalue in self.hvalue2outlier_idx: + return self.hvalue2outlier_idx[hvalue] + else: + return None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = cls.__new__(cls) + return cls._instance + +def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False): + if rdm: + return torch.randint(0, weight.shape[1], size=(topk,), device=weight.device).long() + + m = weight.mean(reduction_dim) + mm = m.mean() + mstd = m.std() + zm = (m-mm)/mstd + + std = weight.std(reduction_dim) + stdm = std.mean() + stdstd = std.std() + + zstd = (std-stdm)/stdstd + + if topk is not None: + val, idx = torch.topk(std.abs(), k=topk, dim=0) + else: + idx = torch.where(zstd > zscore)[0] + + return idx + +def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None): + """ + Replace linear modules with a new Linear module. + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + linear_replacement (`torch.nn.Module`): + The linear module that replaces the old one. Only expects standard arguments. + If other arguments need to be passed, use a lambda. + skip_modules (`List[str]`, *optional*, defaults to `lm_head`): + List of modules names not to convert. Defaults to `lm_head`. + copy_weights (`bool`): + Copy the weights from the old linear module to the new one + post_processing_fun_name (`str`): + A function name of the replacement linear class that is called + after processing. 
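+
+    Example (illustrative):
+
+        model = replace_linear(model, bnb.nn.Fake4bitLinear)
+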
+ """ + for name, module in model.named_children(): + if len(list(module.children())) > 0: + replace_linear(module, linear_replacement, skip_modules, copy_weights, post_processing_function) + + if isinstance(module, torch.nn.Linear) and name not in skip_modules: + old_module = model._modules[name] + model._modules[name] = linear_replacement( + module.in_features, + module.out_features, + module.bias is not None, + ) + if copy_weights: + model._modules[name].weight = old_module.weight + model._modules[name].bias = old_module.bias + + if post_processing_function is not None: + func = getattr(module, post_processing_function, None) + if func is not None: func(module) + return model + + def execute_and_return(command_string: str) -> Tuple[str, str]: def _decode(subprocess_err_out_tuple): diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 08b9b44..b32b39c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -543,7 +543,9 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs // load code through read-only cache via __ldg #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH; j++) + { vals[j] = __ldg(&code[qvals[j]])*local_abs_max; + } __syncthreads(); StoreT(storet).Store(&(out[i]), vals, valid_items); diff --git a/tests/test_functional.py b/tests/test_functional.py index 69c200a..70fa4d0 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2109,6 +2109,7 @@ def test_few_bit_quant(): ebits = math.ceil(bits/2) pbits = bits-ebits-1 code = F.create_fp8_map(True, ebits, pbits, bits).cuda() + print(code) elif method == 'dynamic': code = F.create_dynamic_map(True, bits-0, bits).cuda() elif method == 'quantile': @@ -2181,7 +2182,9 @@ def test_kbit_quantile_estimation(): def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() - qa, SA = F.quantize_blockwise(a) + code =F.create_fp8_map(True, 3, 0, 4).cuda() + qa, SA = F.quantize_blockwise(a, code=code) + print(qa.max()) max_theoretical_mu = 1024*1024*2/1024**3/672*1000*1000 #print(max_theoretical_mu) @@ -2193,3 +2196,4 @@ def test_bench_dequantization(): torch.cuda.synchronize() #print((time.time()-t0)/1e6) + From 3ac5840c03c829f8a77f740a3ce1887df472d1fa Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 4 Feb 2023 14:52:04 -0800 Subject: [PATCH 02/97] Added fp4 quant/dequant and dequant optimizations. 
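
The FP4 format implemented below (dQuantizeFP4/dDequantizeFP4) uses one sign
bit and three magnitude bits, packing two values per byte. A pure-PyTorch
sketch of the mapping, for reference only; the value table and the 12.0
scaling factor are taken from the kernel code in this patch:

    import torch

    # value of each 3-bit magnitude pattern, indexed by e1*4 + e2*2 + p1
    FP4_VALUES = torch.tensor([0.0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0])

    def fp4_quantize_block(x):
        # scale the block so its largest magnitude maps to 12, then round to
        # the nearest representable value (dQuantizeFP4 does the same via
        # midpoint threshold comparisons)
        absmax = x.abs().max()
        scaled = x.abs() / absmax * 12.0
        idx = (scaled.unsqueeze(-1) - FP4_VALUES).abs().argmin(-1)
        return idx + (x < 0).long() * 8, absmax  # high bit stores the sign

    def fp4_dequantize_block(idx, absmax):
        sign = 1.0 - 2.0 * (idx >= 8).float()
        # dDequantizeFP4 receives absmax*0.083333, i.e. absmax/12
        return sign * FP4_VALUES[idx % 8] * absmax / 12.0
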
--- bitsandbytes/cextension.py | 2 +- bitsandbytes/cuda_setup/main.py | 4 + bitsandbytes/functional.py | 123 +++++++++++++- csrc/kernels.cu | 284 ++++++++++++++++++++++++-------- csrc/kernels.cuh | 4 +- csrc/ops.cu | 54 +++--- csrc/ops.cuh | 4 +- csrc/pythonInterface.c | 21 ++- tests/test_functional.py | 85 +++++++++- 9 files changed, 468 insertions(+), 113 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 7a62c1e..e2ca978 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -9,7 +9,7 @@ from bitsandbytes.cuda_setup.main import CUDASetup setup = CUDASetup.get_instance() -if setup.initialized != True: +if not setup.initialized: setup.run_cuda_setup() if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': setup.print_log_stack() diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index cd9573f..6bebd93 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -35,6 +35,9 @@ class CUDASetup: raise RuntimeError("Call get_instance() instead") def generate_instructions(self): + if getattr(self, 'error', False): return + print(self.error) + self.error = True if self.cuda is None: self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected.') self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.') @@ -84,6 +87,7 @@ class CUDASetup: self.has_printed = False self.lib = None self.initialized = False + self.error = False def run_cuda_setup(self): self.initialized = True diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 95a7c4f..da9e743 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -168,7 +168,8 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) values = [] lst = list(itertools.product([0, 1], repeat=precision_bits)) #for ev in evalues: - bias = 2**(exponent_bits-1)-1 + bias = 2**(exponent_bits-1)+1 + print(bias) for evalue in range(2**(exponent_bits)): for bit_pattern in lst: value = (1 if evalue != 0 else 0) @@ -176,10 +177,12 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) value += pval*(2**-(i+1)) if evalue == 0: # subnormals - value = value*2**-(bias-1) + value = value*2**-(bias) else: # normals - value = value*2**-(evalue-bias-2) + print(value, 1) + value = value*2**-(evalue-bias-1) + print(value, 2) values.append(value) if signed: values.append(-value) @@ -193,7 +196,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) values.append(0) values.sort() code = torch.Tensor(values) - code /= code.max() + #code /= code.max() return code @@ -587,7 +590,7 @@ def dequantize_blockwise( code = code.to(A.device) if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") - is_on_gpu([A, out]) + is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) elif out.dtype == torch.float16: @@ -602,6 +605,116 @@ def dequantize_blockwise( return out +def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64) -> Tensor: + """ + Quantize tensor A in blocks of FP4 values. 
+ + Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. + + Parameters + ---------- + A : torch.Tensor + The input tensor. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + The output tensor (8-bit). + blocksize : int + The blocksize used in quantization. + + Returns + ------- + torch.Tensor: + The 8-bit tensor with packed 4-bit values. + tuple(torch.Tensor, torch.Size, torch.dtype): + The quantization state to undo the quantization. + """ + if A.device.type != 'cuda': + raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}') + + n = A.numel() + input_shape = A.shape + + if absmax is None: + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + absmax = torch.zeros((blocks,), device=A.device) + + state = (absmax, input_shape, A.dtype) + + if out is None: + out = torch.zeros(((n+1)//2,), dtype=torch.uint8, device=A.device) + + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + + prev_device = pre_call(A.device) + is_on_gpu([A, out, absmax]) + + if A.dtype == torch.float32: + lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + elif A.dtype == torch.float16: + lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + else: + raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") + post_call(A.device) + + return out, state + + +def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: + """ + Dequantizes FP4 blockwise quantized values. + + Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize. + + Parameters + ---------- + A : torch.Tensor + The input 8-bit tensor (packed 4-bit values). + quant_state : tuple(torch.Tensor, torch.Size, torch.dtype) + Tuple of absmax values, original tensor shape and original dtype. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + Dequantized output tensor. + + + Returns + ------- + torch.Tensor: + Dequantized tensor. + """ + if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: + raise ValueError(f"The blockwise of {blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") + + if quant_state is None: + assert absmax is not None and out is not None + shape = out.shape + dtype = out.dtype + else: + absmax, shape, dtype = quant_state + + + if out is None: + out = torch.empty(shape, dtype=dtype, device=A.device) + + n = out.numel() + + device = pre_call(A.device) + is_on_gpu([A, absmax, out]) + if out.dtype == torch.float32: + lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + elif out.dtype == torch.float16: + lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + else: + raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") + post_call(A.device) + + return out + + + + def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: if code is None: if "dynamic" not in name2qmap: diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 08b9b44..a1eec68 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -43,6 +43,79 @@ __device__ float atomicMin(float* address, float val) { return __int_as_float(old); } +__device__ float dDequantizeFP4(unsigned char val, float absmax) +{ + float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f; + if((val & 0b0110) == 0) + { + // subnormal + if((val & 0b0001) == 0) + return 0.0f; + else + return sign*0.0625f*absmax; + } + else + { + // normal + float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f); + float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f; + + return sign*exponent*fraction*absmax; + } +} + +__device__ unsigned char dQuantizeFP4(float x) +{ + // FP4 with bias of 3 + // first bit is a sign + // subnormals + // 0b000 = 0 + // 0b001 = 0.0625 + // 0b110 = 2 + // 0b111 = 3 + // 0b100 = 4 + // 0b101 = 6 + // 0b010 = 8 + // 0b011 = 12 + + int sign = x < 0 ? 0b1000 : 0b0000; + x = fabsf(x); + if(x > 3.5f) + { + if( x > 7.0f) + { + if( x > 10.0f) + return 0b0011+sign; + else + return 0b0010+sign; + } + else + { + if(x > 5.0f) + return 0b101+sign; + else + return 0b100+sign; + } + } + else + { + if(x > 1.03125f) + { + if(x > 2.5f) + return 0b0111+sign; + else + return 0b0110+sign; + } + else + { + if(x > 0.03125f) + return 0b0001+sign; + else + return 0b0000+sign; + } + } +} + template __device__ unsigned char dQuantize(float* smem_code, const float rand, float x) { @@ -427,7 +500,7 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c } } -template +template //__launch_bounds__(TH, 4) __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n) { @@ -437,13 +510,13 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float T vals[NUM_PER_TH]; float rand_vals[NUM_PER_TH]; - unsigned char qvals[NUM_PER_TH]; + unsigned char qvals[FP4 ? 
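+                     // in FP4 mode two 4-bit codes are packed per output byte, so only half as many bytes are stored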
NUM_PER_TH/2 : NUM_PER_TH]; //float local_abs_max = -FLT_MAX; float local_abs_max = 0.0f; int local_rand_idx = 0; typedef cub::BlockLoad LoadT; - typedef cub::BlockStore StoreChar; + typedef cub::BlockStore StoreChar; typedef cub::BlockReduce BlockReduce; typedef cub::BlockLoad LoadFloat; @@ -454,8 +527,9 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float __shared__ float smem_code[256]; __shared__ float smem_absmax_value[1]; - for(int i = threadIdx.x; i < 256; i+=blockDim.x) - smem_code[i] = code[i]; + if(!FP4) + for(int i = threadIdx.x; i < 256; i+=blockDim.x) + smem_code[i] = code[i]; for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE) { @@ -495,61 +569,138 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float LoadFloat(loadf).Load(&rand[local_rand_idx], rand_vals, BLOCK_SIZE, 0); } - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH; j++) + if(FP4) { - if(!STOCHASTIC) - qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j])*local_abs_max); - else - qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j])*local_abs_max); + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH/2; j++) + { + unsigned char packed_fp4 = 0; + packed_fp4 |= dQuantizeFP4(((float)vals[2*j])*local_abs_max*12.0f) << 4; + packed_fp4 |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max*12.0f); + qvals[j] = packed_fp4; + } + } + else + { + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + { + if(!STOCHASTIC) + qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j])*local_abs_max); + else + qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j])*local_abs_max); + } } __syncthreads(); - StoreChar(storec).Store(&(out[i]), qvals, valid_items); + StoreChar(storec).Store(&(out[FP4 ? i/2 : i]), qvals, FP4 ? (valid_items+1)/2 : valid_items); } } -template -__global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n) +template +__global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n) { - const int n_full = gridDim.x * BLOCK_SIZE; - int valid_items = 0; - const int base_idx = (blockIdx.x * BLOCK_SIZE); + const int n_load = (gridDim.x * TILE_SIZE); + int valid_items_load = 0; + int valid_items_store = 0; + const int base_idx = (blockIdx.x * TILE_SIZE); - T vals[NUM_PER_TH]; + T vals[NUM_PER_TH*(FP4 ? 2 : 1)]; unsigned char qvals[NUM_PER_TH]; float local_abs_max = -FLT_MAX; typedef cub::BlockLoad LoadChar; - typedef cub::BlockStore StoreT; + typedef cub::BlockStore StoreT; __shared__ typename LoadChar::TempStorage loadchar; __shared__ typename StoreT::TempStorage storet; - //__shared__ float smem_code[256]; - //float local_code[16]; - //if(threadIdx.x < 256) - //smem_code[threadIdx.x] = code[threadIdx.x]; - - for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE) + for (unsigned int i = base_idx; i < n_load; i += gridDim.x*TILE_SIZE) { - valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i; - local_abs_max = absmax[i/BLOCK_SIZE]; + if(FP4) + { + valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i; + valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2; + } + else + { + valid_items_load = n - i > TILE_SIZE ? TILE_SIZE : n - i; + valid_items_store = n - i > TILE_SIZE ? 
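+      // non-FP4 path: one code byte per element, so load and store counts are identical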
TILE_SIZE : n - i; + } + local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(blocksize)]); __syncthreads(); - LoadChar(loadchar).Load(&(A[i]), qvals, valid_items, 128); + LoadChar(loadchar).Load(&(A[i]), qvals, valid_items_load, 128); - // load code through read-only cache via __ldg - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH; j++) - vals[j] = __ldg(&code[qvals[j]])*local_abs_max; + + if(FP4) + { + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + { + vals[j*2] = dDequantizeFP4(qvals[j] >> 4, local_abs_max*0.083333f); + vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*0.083333); + } + } + else + { + // load code through read-only cache via __ldg + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + vals[j] = __ldg(&code[qvals[j]])*local_abs_max; + } __syncthreads(); - StoreT(storet).Store(&(out[i]), vals, valid_items); + StoreT(storet).Store(&(out[FP4 ? i*2 : i]), vals, valid_items_store); } } +//template +//__global__ void kDequantizeBlockwiseFP4(unsigned char * A, float * absmax, T *out, const int n_store) +//{ +// +// const int n_load = n_store/2; +// const int base_idx = (blockIdx.x * TILE_SIZE); +// +// T vals[NUM_PER_TH*2]; +// unsigned char qvals[NUM_PER_TH]; +// +// int valid_items = (base_idx + TILE_SIZE) > n_load ? ((base_idx+TILE_SIZE) - n_load) : TILE_SIZE; +// int idx = base_idx + (threadIdx.x*NUM_PER_TH); +// +// float local_abs_max = __ldg(&absmax[idx/BLOCK_SIZE]); +// +// if(valid_items == TILE_SIZE) +// { +// // we do 64 byte loads so we can 128 byte stores +// reinterpret_cast(qvals)[0] = reinterpret_cast(A)[idx/8]; +// } +// else +// { +// #pragma unroll +// for(int j = 0; j < NUM_PER_TH; j++) +// if(idx+j < n_load) +// qvals[j] = A[idx+j]; +// else +// qvals[j] = 0; +// } +// +// +// #pragma unroll NUM_PER_TH +// for(int j = 0; j < NUM_PER_TH; j++) +// { +// vals[j*2] = dDequantizeFP4(qvals[j] & 0xF0, local_abs_max*12.0f); +// vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*12.0f); +// } +// +// +// reinterpret_cast(qvals)[0] = reinterpret_cast(A)[idx/8]; +// reinterpret_cast(A)[idx/16] = reinterpret_cast(local_valC)[j/num_items]; +// +// +//} + __global__ void kDequantize(float *code, unsigned char *A, float *out, const int n) { @@ -2523,7 +2674,6 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o // 4. 
Multiply the tile -> accumulate outputs in shared memory until 128 bytes it reached int idx = idx_col_B + (warp_idx*SPMM_ITEMS) + j; if(idx >= colsB){ break; } - //printf("%i %i\n", (row_offset+idx) % num_items, row_offset+idx); if((idx+num_items < colsB)) { if(BITS == 8) @@ -2543,8 +2693,6 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o #pragma unroll num_items for(int k = 0; k < num_items; k++) { - //if((float)local_valsB[k] != 0.0) - // printf("%f %i %i %i\n", (float)local_valsB[k], k, idx, colsB); if(BITS == 8 && dequant_stats != NULL) // we do texture cache reads (__ldg) on dequant_stats which should be super fast { @@ -2789,38 +2937,42 @@ MAKE_optimizerStatic8bit2State(ADAM, float) template __global__ void kPercentileClipping(float * __restrict__ g, float *gnorm_vec, int step, const int n); template __global__ void kPercentileClipping(half * __restrict__ g, float *gnorm_vec, int step, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const 
A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, 
const int rand_offset, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, 
float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); #define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \ diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index d90ea13..23aad6c 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -14,8 +14,8 @@ template__global__ void kEstimateQuantiles(T *__restrict__ const A, __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n); __global__ void kDequantize(float *code, unsigned char *A, float *out, const int n); -template __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n); +template __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n); template __global__ void kPreconditionOptimizer32bit2State(T* g, T* p, diff --git a/csrc/ops.cu b/csrc/ops.cu index e770e10..483d915 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -50,7 +50,7 @@ void dequantize(float *code, unsigned char *A, float *out, int n) CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n) +template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n) { int num_blocks = n/blocksize; num_blocks = n % blocksize == 0 ? 
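   // round up so a partial trailing block still gets its own absmax entry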
num_blocks : num_blocks + 1; @@ -58,42 +58,34 @@ template void quantizeBlockwise(float * code, T *A, assert(blocksize == 4096); if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) { int num_blocks = n/blocksize; num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; - if(blocksize == 4096) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 2048) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 1024) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 512) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 256) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 128) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); - else if(blocksize == 64) - kDequantizeBlockwise<<>>(code, A, absmax, out, n); + int tile_size = FP4 ? 
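+  // tile_size counts output elements: an FP4 tile of 1024 elements is decoded from 512 packed input bytes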
1024 : 512; + + if(FP4) + kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize/2, n); + else + kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } @@ -688,12 +680,16 @@ template void transformRowToFormat(char * A, char *out, int rows, template void estimateQuantiles(half *A, float *code, float offset, int n); template void estimateQuantiles(float *A, float *code, float offset, int n); -template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); #define MAKE_optimizer32bit(name, gtype) \ template void optimizer32bit(gtype* g, gtype* p, \ diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 31d4dd8..b3e2424 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -128,8 +128,8 @@ template void estimateQuantiles(T *A, float *code, float offset, in void quantize(float *code, float *A, unsigned char *out, int n); void dequantize(float *code, unsigned char *A, float *out, int n); -template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n); +template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int 
blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n); template void optimizer32bit(T* g, T* p, float* state1, float* state2, float *unorm, float max_unorm, float param_norm, diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index d8b2290..6a4bb0d 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -75,13 +75,17 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32) void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } -void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } -void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } +void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ +void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } +void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, 
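+// the code table is unused on the FP4 path (values are decoded arithmetically), hence the NULL argument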
out, blocksize, n); } \ +void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } #define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \ void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \ @@ -148,6 +152,11 @@ extern "C" void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } + void cquantize_blockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); } + void cquantize_blockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } + void cdequantize_blockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); } + void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } + #define MAKE_CFUNC32(name, gtype, gbits) \ void c##name##32bit_g##gbits(gtype *g, gtype *p, \ float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \ diff --git a/tests/test_functional.py b/tests/test_functional.py index 69c200a..efdda54 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -152,7 +152,7 @@ def test_dynamic_quantization(): def test_dynamic_blockwise_quantization(): #print('') - for blocksize in [4096, 2048, 1024, 512]: + for blocksize in [4096, 2048, 1024, 512, 256, 128, 64]: diffs = [] reldiffs = [] for i in range(100): @@ -2189,7 +2189,88 @@ def test_bench_dequantization(): torch.cuda.synchronize() t0 = time.time() for i in range(100): - F.dequantize_blockwise(qa, SA, blocksize=2048) + #F.dequantize_blockwise(qa, SA, blocksize=2048) + qa, SA = F.quantize_blockwise(a) torch.cuda.synchronize() #print((time.time()-t0)/1e6) + + +def test_fp4_quant(): + vals = list(product([0, 1], repeat=4)) + + code = {} + for bits in vals: + result = 0 + bias = 3 + sign, e1, e2, p1 = bits + idx = sign*8 + e1*4 + e2*2 + p1*1 + sign = -1.0 if sign else 1.0 + exp = e1*2 + e2*1 + if exp == 0: + # sub-normal + if p1 == 0: result = 0 + else: result = sign*0.0625 + else: + # normal + exp = 2**(-exp + bias + 1) + frac = 1.5 if p1 else 1.0 + result = sign*exp*frac + code[idx] = result + + A1 = torch.randn(1024, 1024, device='cuda').half() + qa, SA = F.quantize_fp4(A1, blocksize=64) + A2 = F.dequantize_fp4(qa, SA) + #qa, SA = F.quantize_fp4(A1, blocksize=128) + #A2 = F.dequantize_fp4(qa, SA, blocksize=128) + + #A1 = A1.flatten().sort()[0] + #A2 = A2.flatten().sort()[0] + + #print(A1) + #print(A2) + + err = (A1 - A2).abs().float() + relerr = (err/A1.abs().float()).mean() + err = err.mean() + + print(err, relerr) + + + + + #assert err.item() < 0.1 + #assert relerr.item() < 0.28 + + +def test_bench_fp4_dequant(): + blocksize = 256 + a = torch.rand(1024*12*4, 1024*12, device='cuda').half() + qa, 
SA = F.quantize_fp4(a, blocksize=blocksize) + + input_size = a.numel()/2 + output_size = a.numel()*2 + num_bytes = input_size+output_size + GB = num_bytes/1e9 + max_theoretical_s = GB/768 + print(max_theoretical_s*1e6) + b = torch.randn(128, 1024*12, device='cuda').half() + + iters = 5 + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + F.dequantize_fp4(qa, SA, blocksize=blocksize) + #b.copy_(a) + torch.cuda.synchronize() + print((time.time()-t0)/iters*1e6) + + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + torch.matmul(b, a.t()) + torch.cuda.synchronize() + print((time.time()-t0)/iters*1e6) + + + From 160a83580d3e159d00fa3004c8b98a64d08fb732 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 4 Feb 2023 21:11:21 -0800 Subject: [PATCH 03/97] Forward matmul_fp4 tests pass. --- bitsandbytes/__init__.py | 1 + bitsandbytes/autograd/_functions.py | 67 +++++++++++++++- bitsandbytes/functional.py | 15 ++-- bitsandbytes/nn/modules.py | 62 +++++++++++++++ tests/test_autograd.py | 115 ++++++++++++++++++++++++++++ tests/test_functional.py | 17 +--- 6 files changed, 254 insertions(+), 23 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 041df4b..c83b7ff 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -10,6 +10,7 @@ from .autograd._functions import ( matmul, matmul_cublas, mm_cublas, + matmul_fp4 ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 376fb8a..a098d4b 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -2,7 +2,7 @@ import operator import warnings from dataclasses import dataclass from functools import reduce # Required in Python 3 -from typing import Tuple, Optional +from typing import Tuple, Optional, List import torch @@ -474,6 +474,67 @@ class MatMul8bitLt(torch.autograd.Function): return grad_A, grad_B, None, grad_bias, None +class MatMulFP4(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, bias=None, state=None): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + ctx.bias = bias + B_shape = state[1] + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + + # 1. Dequantize + # 2. Matmul + output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype), bias) + + # 3. 
Save state + ctx.state = state + ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype + + if any(ctx.needs_input_grad[:2]): + ctx.tensors = A + else: + ctx.tensors = [None, None] + ctx.tensor_states = (None, None) + ctx.save_for_backward(None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None + + req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad + A = ctx.tensors + state = ctx.state + + if req_gradBias: + # compute grad_bias first before changing grad_output dtype + grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) + + # Cast grad_output to fp16 + if len(grad_output.shape) == 3: + grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + + if req_gradB: grad_B = torch.matmul(grad_output.t(), A) + if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(ctx.dtype_A)) + + return grad_A, grad_B, None, grad_bias, None + + def matmul( A: tensor, B: tensor, @@ -486,3 +547,7 @@ def matmul( if threshold > 0.0: state.threshold = threshold return MatMul8bitLt.apply(A, B, out, bias, state) + + +def matmul_fp4(A: tensor, B: tensor, out: tensor = None, quant_state: List = None, bias=None): + return MatMulFP4.apply(A, B, out, bias, quant_state) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index da9e743..92ac670 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -626,7 +626,7 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize ------- torch.Tensor: The 8-bit tensor with packed 4-bit values. - tuple(torch.Tensor, torch.Size, torch.dtype): + tuple(torch.Tensor, torch.Size, torch.dtype, int): The quantization state to undo the quantization. 
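+        Concretely: (absmax, input_shape, input_dtype, blocksize).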
""" if A.device.type != 'cuda': @@ -640,10 +640,10 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=A.device) - state = (absmax, input_shape, A.dtype) + state = (absmax, input_shape, A.dtype, blocksize) if out is None: - out = torch.zeros(((n+1)//2,), dtype=torch.uint8, device=A.device) + out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device) assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] @@ -692,7 +692,7 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: shape = out.shape dtype = out.dtype else: - absmax, shape, dtype = quant_state + absmax, shape, dtype, blocksize = quant_state if out is None: @@ -700,6 +700,7 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: n = out.numel() + device = pre_call(A.device) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: @@ -710,9 +711,9 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) - return out - - + is_transposed = (True if A.shape[0] == 1 else False) + if is_transposed: return out.t() + else: return out def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 45df35e..6dfb06c 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -133,6 +133,67 @@ class Embedding(torch.nn.Embedding): return emb +class FP4Params(torch.nn.Parameter): + def __new__(cls, data=None, requires_grad=True, quant_state=None): + cls.quant_state = None + if data is None: + data = torch.empty(0) + return torch.Tensor._make_subclass(cls, data, requires_grad) + + def cuda(self, device): + w = self.data.contiguous().half().cuda(device) + w_fp4, quant_state = bnb.functional.quantize_fp4(w) + self.data = w_fp4 + self.quant_state = quant_state + + return self + + @overload + def to(self: T, device: Optional[Union[int, device]] = ..., dtype: Optional[Union[dtype, str]] = ..., non_blocking: bool = ...,) -> T: + ... + + @overload + def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: + ... + + @overload + def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: + ... + + def to(self, *args, **kwargs): + device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) + + if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"): + return self.cuda(device) + else: + new_param = FP4Params(super().to(device=device, dtype=dtype, non_blocking=non_blocking), + requires_grad=self.requires_grad, quant_state=self.quant_state) + + return new_param + + +class LinearFP4(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.state = bnb.MatmulLtState() + self.weight = FP4Params(self.weight.data, requires_grad=False) + + def init_8bit_state(self): + pass + + def forward(self, x: torch.Tensor): + self.state.is_training = self.training + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + if getattr(self.weight, 'state', None) is None: + print('FP4 state not initialized. 
Please call .cuda() or .to(device) on the LinearFP4 layer first.') + out = bnb.matmul_fp(x, self.weight, bias=self.bias, state=self.weight.state) + + return out + class Int8Params(torch.nn.Parameter): def __new__( @@ -208,6 +269,7 @@ class Int8Params(torch.nn.Parameter): return new_param + class Linear8bitLt(nn.Linear): def __init__(self, input_features, output_features, bias=True, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0, index=None): diff --git a/tests/test_autograd.py b/tests/test_autograd.py index c67126d..ba75d76 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -429,3 +429,118 @@ def test_matmullt( if req_grad[2]: torch.testing.assert_allclose(gradBias1, gradBias2) + + +n = 1 +k = 3 +dim1 = torch.randint(16, 64, size=(n,)).tolist() +dim2 = torch.randint(32, 96, size=(n,)).tolist() +dim3 = torch.randint(32, 96, size=(n,)).tolist() +dim4 = torch.randint(32, 96, size=(n,)).tolist() + +dim2.append(0) + +funcs = [(torch.matmul, bnb.matmul_fp4)] +str_funcs = ["matmul"] +req_grad = list(product([True, False], repeat=3)) +req_grad_str = [] +for c in req_grad: + strval = '' + for v in c: + if v == True: strval += 'T' + else: strval += 'F' + req_grad_str.append(strval) + +transpose = [(False, True), (False, False)] +str_transpose = ["NT", "NN"] +dtype = [torch.float16, torch.float32] +has_fp16_weights = [True, False] +has_bias = [True, False] +values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias)) +str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias)) +names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}".format(*vals) for vals in str_values] +@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias", values, ids=names) +def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias): + dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) + dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) + if has_bias == False: + req_grad = list(req_grad) + req_grad[2] = False + + for i in range(k): + # normal multiply + if funcs[0] in [torch.mm, torch.matmul]: + A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) + B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) + target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype) + bias = None + bias2 = None + if has_bias: + bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) + bias2 = bias.clone() + torch.nn.init.xavier_uniform_(B) + B2 = B.clone() + + B2, quant_state = bnb.functional.quantize_fp4(B) + + if not transpose[0] and transpose[1]: + out_torch = funcs[0](A, B.t()) + out_bnb = funcs[1](A, B2, quant_state=quant_state, bias=bias2) + elif not transpose[0] and not transpose[1]: + out_torch = funcs[0](A, B) + out_bnb = funcs[1](A, B2.t(), quant_state=quant_state, bias=bias2) + + if has_bias: + out_torch += bias + + assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}" + + n = out_bnb.numel() + err = torch.abs(out_bnb - out_torch).float().mean().item() + if n > 0: + assert err < 0.11 + + if any(req_grad): + out_bnb.data.copy_(out_torch) + torch.cuda.synchronize() + loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() + loss_bnb.backward() 
+ gradA1 = A.grad + gradB1 = B.grad + A.grad = None + B.grad = None + if has_bias: + gradBias1 = bias.grad + bias.grad = None + + loss_torch = torch.nn.functional.mse_loss( out_torch, target ).mean() + loss_torch.backward() + gradA2 = A.grad + gradB2 = B.grad + A.grad = None + B.grad = None + if has_bias: + gradBias2 = bias.grad + bias.grad = None + + if req_grad[0]: + torch.testing.assert_allclose( gradA1, gradA2, atol=0.015, rtol=0.1) + if req_grad[1]: + n = gradB1.numel() + if dim2 > 0: + assert torch.abs(gradB1).sum() > 0.0 + assert torch.abs(gradB2).sum() > 0.0 + else: + assert torch.abs(gradB1).sum() == 0.0 + assert torch.abs(gradB2).sum() == 0.0 + idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) + + assert (idx == 0).sum().item() <= n * 0.1 + idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) + assert (idx == 0).sum().item() <= n * 0.02 + torch.testing.assert_allclose(gradB1, gradB2, atol=0.18, rtol=0.3 + ) + + if req_grad[2]: + torch.testing.assert_allclose(gradBias1, gradBias2) diff --git a/tests/test_functional.py b/tests/test_functional.py index efdda54..e6b7b81 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2221,26 +2221,13 @@ def test_fp4_quant(): A1 = torch.randn(1024, 1024, device='cuda').half() qa, SA = F.quantize_fp4(A1, blocksize=64) A2 = F.dequantize_fp4(qa, SA) - #qa, SA = F.quantize_fp4(A1, blocksize=128) - #A2 = F.dequantize_fp4(qa, SA, blocksize=128) - - #A1 = A1.flatten().sort()[0] - #A2 = A2.flatten().sort()[0] - - #print(A1) - #print(A2) err = (A1 - A2).abs().float() relerr = (err/A1.abs().float()).mean() err = err.mean() - print(err, relerr) - - - - - #assert err.item() < 0.1 - #assert relerr.item() < 0.28 + assert err.item() < 0.1 + assert relerr.item() < 0.28 def test_bench_fp4_dequant(): From 13c0a4dc5d4be33bf0461d8bcc24e982b17dcb11 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 4 Feb 2023 21:35:43 -0800 Subject: [PATCH 04/97] Backward matmul_fp4 passes. --- bitsandbytes/autograd/_functions.py | 15 ++++++++------- tests/test_autograd.py | 16 ---------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index a098d4b..29c0b93 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -503,11 +503,9 @@ class MatMulFP4(torch.autograd.Function): ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = A + ctx.tensors = (A, B) else: - ctx.tensors = [None, None] - ctx.tensor_states = (None, None) - ctx.save_for_backward(None, None) + ctx.tensors = (None, None) return output @@ -517,10 +515,12 @@ class MatMulFP4(torch.autograd.Function): bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad - A = ctx.tensors + req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad + A, B = ctx.tensors state = ctx.state + grad_A, grad_B, grad_bias = None, None, None + if req_gradBias: # compute grad_bias first before changing grad_output dtype grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) @@ -529,7 +529,8 @@ class MatMulFP4(torch.autograd.Function): if len(grad_output.shape) == 3: grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - if req_gradB: grad_B = torch.matmul(grad_output.t(), A) + # not supported by PyTorch. 
TODO: create work-around + #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(ctx.dtype_A)) return grad_A, grad_B, None, grad_bias, None diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ba75d76..ccbcc87 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -480,7 +480,6 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) bias2 = bias.clone() torch.nn.init.xavier_uniform_(B) - B2 = B.clone() B2, quant_state = bnb.functional.quantize_fp4(B) @@ -526,21 +525,6 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, if req_grad[0]: torch.testing.assert_allclose( gradA1, gradA2, atol=0.015, rtol=0.1) - if req_grad[1]: - n = gradB1.numel() - if dim2 > 0: - assert torch.abs(gradB1).sum() > 0.0 - assert torch.abs(gradB2).sum() > 0.0 - else: - assert torch.abs(gradB1).sum() == 0.0 - assert torch.abs(gradB2).sum() == 0.0 - idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) - - assert (idx == 0).sum().item() <= n * 0.1 - idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) - assert (idx == 0).sum().item() <= n * 0.02 - torch.testing.assert_allclose(gradB1, gradB2, atol=0.18, rtol=0.3 - ) if req_grad[2]: torch.testing.assert_allclose(gradBias1, gradBias2) From cfe4705e321d884bae48ce785f29d4a0aff5518b Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 4 Feb 2023 22:00:04 -0800 Subject: [PATCH 05/97] Added matmul_fp4 to the benchmark. --- bitsandbytes/autograd/_functions.py | 5 +- bitsandbytes/functional.py | 5 +- tests/test_autograd.py | 6 +-- tests/test_functional.py | 84 +++++++++++++++++------------ 4 files changed, 56 insertions(+), 44 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 29c0b93..01d1eb2 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -495,7 +495,7 @@ class MatMulFP4(torch.autograd.Function): # 1. Dequantize - # 2. Matmul + # 2. MatmulnN output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype), bias) # 3. 
Save state @@ -550,5 +550,6 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_fp4(A: tensor, B: tensor, out: tensor = None, quant_state: List = None, bias=None): +def matmul_fp4(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None): + assert quant_state is not None return MatMulFP4.apply(A, B, out, bias, quant_state) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 92ac670..b38ba1d 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -169,7 +169,6 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) lst = list(itertools.product([0, 1], repeat=precision_bits)) #for ev in evalues: bias = 2**(exponent_bits-1)+1 - print(bias) for evalue in range(2**(exponent_bits)): for bit_pattern in lst: value = (1 if evalue != 0 else 0) @@ -180,9 +179,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) value = value*2**-(bias) else: # normals - print(value, 1) value = value*2**-(evalue-bias-1) - print(value, 2) values.append(value) if signed: values.append(-value) @@ -196,7 +193,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) values.append(0) values.sort() code = torch.Tensor(values) - #code /= code.max() + code /= code.max() return code diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ccbcc87..a8b9207 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -485,10 +485,10 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) - out_bnb = funcs[1](A, B2, quant_state=quant_state, bias=bias2) + out_bnb = funcs[1](A, B2, quant_state, bias=bias2) elif not transpose[0] and not transpose[1]: out_torch = funcs[0](A, B) - out_bnb = funcs[1](A, B2.t(), quant_state=quant_state, bias=bias2) + out_bnb = funcs[1](A, B2.t(), quant_state, bias=bias2) if has_bias: out_torch += bias @@ -498,7 +498,7 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, n = out_bnb.numel() err = torch.abs(out_bnb - out_torch).float().mean().item() if n > 0: - assert err < 0.11 + assert err < 0.115 if any(req_grad): out_bnb.data.copy_(out_torch) diff --git a/tests/test_functional.py b/tests/test_functional.py index e6b7b81..49022dc 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1788,18 +1788,14 @@ batch_size = 1 seqdim = 1 values = [] values.append((batch_size, seqdim, 768, 4 * 768)) -# values.append((batch_size, seqdim, 1024, 4*1024)) -# values.append((batch_size, seqdim, 1536, 4*1536)) -# values.append((batch_size, seqdim, 2048, 4*2048)) -# values.append((batch_size, seqdim, 2560, 4*2560)) -# values.append((batch_size, seqdim, 4096, 4*4096)) -# values.append((batch_size, seqdim, 5140, 4*5140)) +#values.append((batch_size, seqdim, 1024, 4*1024)) +#values.append((batch_size, seqdim, 1536, 4*1536)) +#values.append((batch_size, seqdim, 2048, 4*2048)) +#values.append((batch_size, seqdim, 2560, 4*2560)) +#values.append((batch_size, seqdim, 4096, 4*4096)) +#values.append((batch_size, seqdim, 5140, 4*5140)) #values.append((batch_size, seqdim, 12288, 4*12288)) -names = [ - "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values -] - - +names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): iters = 128 @@ -1809,17 
+1805,20 @@ def test_bench_matmul(batch, seq, model, hidden): B = torch.empty(hidden, model, dtype=torch.float16, device="cuda") torch.nn.init.xavier_uniform_(B) + B_fp4, state = F.quantize_fp4(B) + linear8bit = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() linear8bit.eval() outliers = torch.randint(0, model, size=(5,)).cuda() A[:, :, outliers] = 8.0 - linearMixedBit = ( - bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half() - ) + linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half()) linearMixedBit.eval() + linear8bit_train = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() + linear8bit_train_thresh = bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half() + # warmup for i in range(iters): torch.matmul(A, B.t()) @@ -1831,9 +1830,14 @@ def test_bench_matmul(batch, seq, model, hidden): for i in range(iters): torch.matmul(A, B.t()) torch.cuda.synchronize() - print( - f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" - ) + print( f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + bnb.matmul_fp4(A, B_fp4, quant_state=state) + torch.cuda.synchronize() + print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) torch.cuda.synchronize() t0 = time.time() @@ -1872,7 +1876,7 @@ def test_bench_matmul(batch, seq, model, hidden): Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) F.vectorwise_mm_dequant(Cout, statsA, statsB.t()) torch.cuda.synchronize() - #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear") CxB, SB = F.nvidia_transform(CB, to_order=formatB) @@ -1886,7 +1890,7 @@ def test_bench_matmul(batch, seq, model, hidden): Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) out = Cout * statsB * statsA * (1.0 / (127 * 127)) torch.cuda.synchronize() - #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") linear8bit(A) torch.cuda.synchronize() @@ -1894,9 +1898,7 @@ def test_bench_matmul(batch, seq, model, hidden): for i in range(iters): linear8bit(A) torch.cuda.synchronize() - print( - f"bnb linear8bitlt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" - ) + print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") linearMixedBit(A) torch.cuda.synchronize() @@ -1904,9 +1906,23 @@ def test_bench_matmul(batch, seq, model, hidden): for i in range(iters): linearMixedBit(A) torch.cuda.synchronize() - print( - f"bnb linear8bitlt with threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" - ) + print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + + linear8bit_train(A) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + 
linear8bit_train(A) + torch.cuda.synchronize() + print( f"bnb linear8bitlt (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + + linear8bit_train_thresh(A) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + linear8bit_train(A) + torch.cuda.synchronize() + print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") def test_zeropoint(): def quant_zp(x): @@ -2050,7 +2066,6 @@ def test_fp8_quant(): p_bits = 7-e_bits code = F.create_fp8_map(True, e_bits, p_bits).cuda() - print(e_bits, p_bits) abserr = [] relerr = [] for i in range(100): @@ -2189,7 +2204,6 @@ def test_bench_dequantization(): torch.cuda.synchronize() t0 = time.time() for i in range(100): - #F.dequantize_blockwise(qa, SA, blocksize=2048) qa, SA = F.quantize_blockwise(a) torch.cuda.synchronize() #print((time.time()-t0)/1e6) @@ -2240,7 +2254,7 @@ def test_bench_fp4_dequant(): num_bytes = input_size+output_size GB = num_bytes/1e9 max_theoretical_s = GB/768 - print(max_theoretical_s*1e6) + #print(max_theoretical_s*1e6) b = torch.randn(128, 1024*12, device='cuda').half() iters = 5 @@ -2250,14 +2264,14 @@ def test_bench_fp4_dequant(): F.dequantize_fp4(qa, SA, blocksize=blocksize) #b.copy_(a) torch.cuda.synchronize() - print((time.time()-t0)/iters*1e6) + #print((time.time()-t0)/iters*1e6) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - torch.matmul(b, a.t()) - torch.cuda.synchronize() - print((time.time()-t0)/iters*1e6) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # torch.matmul(b, a.t()) + #torch.cuda.synchronize() + #print((time.time()-t0)/iters*1e6) From c361f84239d52844ddae724e40c2c9a5d49284d5 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 5 Feb 2023 06:16:56 -0800 Subject: [PATCH 06/97] Fixed matmul_fp4 transpose. --- bitsandbytes/autograd/_functions.py | 4 ++-- tests/test_autograd.py | 4 ++-- tests/test_functional.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 01d1eb2..6db90f5 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -496,7 +496,7 @@ class MatMulFP4(torch.autograd.Function): # 1. Dequantize # 2. MatmulnN - output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype), bias) + output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype).t(), bias) # 3. Save state ctx.state = state @@ -531,7 +531,7 @@ class MatMulFP4(torch.autograd.Function): # not supported by PyTorch. 
TODO: create work-around #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) - if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(ctx.dtype_A)) + if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(ctx.dtype_A).t()) return grad_A, grad_B, None, grad_bias, None diff --git a/tests/test_autograd.py b/tests/test_autograd.py index a8b9207..436c6b1 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -485,10 +485,10 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) - out_bnb = funcs[1](A, B2, quant_state, bias=bias2) + out_bnb = funcs[1](A, B2.t(), quant_state, bias=bias2) elif not transpose[0] and not transpose[1]: out_torch = funcs[0](A, B) - out_bnb = funcs[1](A, B2.t(), quant_state, bias=bias2) + out_bnb = funcs[1](A, B2, quant_state, bias=bias2) if has_bias: out_torch += bias diff --git a/tests/test_functional.py b/tests/test_functional.py index 49022dc..23b7558 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1835,7 +1835,7 @@ def test_bench_matmul(batch, seq, model, hidden): torch.cuda.synchronize() t0 = time.time() for i in range(iters): - bnb.matmul_fp4(A, B_fp4, quant_state=state) + bnb.matmul_fp4(A, B_fp4.t(), quant_state=state) torch.cuda.synchronize() print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) From c0c352b3791a5aab14263108595479b9db58fa1f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 5 Feb 2023 06:29:52 -0800 Subject: [PATCH 07/97] Added bias test for LinearFP4 and basic test. --- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 6 +++--- tests/test_modules.py | 43 +++++++++++-------------------------- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index edc595a..79fb51e 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding +from .modules import Int8Params, Linear8bitLt, StableEmbedding, LinearFP4 diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 6dfb06c..4c719c6 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -188,9 +188,9 @@ class LinearFP4(nn.Linear): if self.bias is not None and self.bias.dtype != x.dtype: self.bias.data = self.bias.data.to(x.dtype) - if getattr(self.weight, 'state', None) is None: - print('FP4 state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.') - out = bnb.matmul_fp(x, self.weight, bias=self.bias, state=self.weight.state) + if getattr(self.weight, 'quant_state', None) is None: + print('FP4 quantization state not initialized. 
Please call .cuda() or .to(device) on the LinearFP4 layer first.') + out = bnb.matmul_fp4(x, self.weight.t(), bias=self.bias, quant_state=self.weight.quant_state) return out diff --git a/tests/test_modules.py b/tests/test_modules.py index d78f0c9..ba67bfc 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -330,12 +330,8 @@ def test_linear8bitlt_inference(threshold): def test_linear8bitlt_accumulated_gradient(): - l1 = torch.nn.Sequential( - *[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)] - ) - l2 = torch.nn.Sequential( - *[torch.nn.Linear(32, 32).cuda().half() for i in range(2)] - ) + l1 = torch.nn.Sequential(*[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)]) + l2 = torch.nn.Sequential(*[torch.nn.Linear(32, 32).cuda().half() for i in range(2)]) l2[0].weight = torch.nn.Parameter(l1[0].weight.clone()) l2[0].bias = torch.nn.Parameter(l1[0].bias.clone()) l2[1].weight = torch.nn.Parameter(l1[1].weight.clone()) @@ -376,21 +372,10 @@ def test_linear8bitlt_accumulated_gradient(): torch.testing.assert_allclose(l1[1].weight.grad, l2[1].weight.grad) -threshold = [0.0, 2.0] -values = threshold -names = [f"threshold_{vals}" for vals in values] - - -@pytest.mark.parametrize("threshold", values, ids=names) +@pytest.mark.parametrize("threshold", [0.0, 2.0]) @pytest.mark.parametrize("memory_efficient_backward", [False]) def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): - l1 = ( - bnb.nn.Linear8bitLt( - 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward - ) - .cuda() - .half() - ) + l1 = ( bnb.nn.Linear8bitLt( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).cuda().half()) assert l1.weight.dtype == torch.int8 l1.eval() @@ -446,13 +431,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert mlp.fc1.weight.dtype == torch.int8 assert mlp.fc2.weight.dtype == torch.int8 - mlp = ( - MLP8bit( - 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward - ) - .half() - .to("cuda") - ) + mlp = ( MLP8bit( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).half().to("cuda")) for i in range(100): b1 = torch.randn(16, 8, 32, device="cuda").half() @@ -504,10 +483,11 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert (idx == 0).sum().item() <= b1.numel() * 0.005 -def test_linear8bitlt_fp32_bias(): +@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4']) +def test_linear_kbit_fp32_bias(module): # casts model to fp16 -> int8 automatically - l1 = bnb.nn.Linear8bitLt(32, 64, has_fp16_weights=False).cuda() - assert l1.weight.dtype == torch.int8 + l1 = module(32, 64).cuda() + assert l1.weight.dtype in [torch.int8, torch.uint8] assert l1.bias.dtype == torch.float32 for i in range(100): @@ -517,11 +497,12 @@ def test_linear8bitlt_fp32_bias(): assert l1.bias.dtype == torch.float16 # casts model to fp16 -> int8 automatically - l1 = bnb.nn.Linear8bitLt(32, 64, has_fp16_weights=False, bias=False).cuda() - assert l1.weight.dtype == torch.int8 + l1 = module(32, 64, bias=False).cuda() + assert l1.weight.dtype in [torch.int8, torch.uint8] assert l1.bias is None for i in range(100): b1 = torch.randn(16, 8, 32, device="cuda").half() o1 = l1(b1) assert l1.bias is None + From 
7f0773aede92a8be5bf0645185de4f5707b3a2a8 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 5 Feb 2023 06:49:54 -0800 Subject: [PATCH 08/97] Added backprop test for Linear8bitLt and LinearFP4. --- tests/test_modules.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/test_modules.py b/tests/test_modules.py index ba67bfc..41cc050 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -375,7 +375,7 @@ def test_linear8bitlt_accumulated_gradient(): @pytest.mark.parametrize("threshold", [0.0, 2.0]) @pytest.mark.parametrize("memory_efficient_backward", [False]) def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): - l1 = ( bnb.nn.Linear8bitLt( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).cuda().half()) + l1 = (bnb.nn.Linear8bitLt( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).cuda().half()) assert l1.weight.dtype == torch.int8 l1.eval() @@ -506,3 +506,41 @@ def test_linear_kbit_fp32_bias(module): o1 = l1(b1) assert l1.bias is None +@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@pytest.mark.parametrize("module", [bnb.nn.Linear8bitLt, bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4']) +def test_kbit_backprop(module): + b = 17 + dim1 = 37 + dim2 = 83 + + ref = nn.Sequential(*[torch.nn.Linear(dim1, dim2), torch.nn.Linear(dim2, 10)]) + ref[1].weight.requires_grad = False + kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 10)]) + kbit[0].weight.detach().copy_(ref[0].weight) + kbit[1].weight.detach().copy_(ref[1].weight) + kbit[0].bias.detach().copy_(ref[0].bias) + kbit[1].bias.detach().copy_(ref[1].bias) + ref = ref.half().cuda() + kbit = kbit.half().cuda() + + for i in range(100): + batch = torch.randn(b, dim1).half().cuda() + out1 = ref(batch) + out2 = kbit(batch) + out1.mean().backward() + out2.mean().backward() + + grad1 = ref[0].weight.grad + grad2 = kbit[0].weight.grad + bgrad1 = ref[0].bias.grad + bgrad2 = kbit[0].bias.grad + + torch.testing.assert_allclose(grad1, grad2, atol=0.008, rtol=0.05) + torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.008, rtol=0.05) + ref.zero_grad() + kbit.zero_grad() + + assert kbit[0].weight.grad.sum().item() == 0 + assert kbit[0].bias.grad.sum().item() == 0 + + From 6bdb6c351e49886d227ff63ed4fc0cc76d78a420 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 13 Feb 2023 16:53:07 -0800 Subject: [PATCH 09/97] Added fp8 simulation layer. 
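Editor's note: "simulation" here means the FP8 matmul is emulated rather than executed in 8 bits: both operands are round-tripped through an FP8 lookup code with blockwise quantization, and the matmul itself still runs in fp16/fp32. A minimal standalone sketch of that round-trip, assuming the create_fp8_map / quantize_blockwise / dequantize_blockwise APIs already present in this series (simulate_fp8_matmul is an illustrative helper, not a library function):

    import torch
    import bitsandbytes.functional as F

    def simulate_fp8_matmul(A, B, code, blocksize=1024):
        # Snap every value to its nearest representable FP8 code entry,
        # then reconstruct in the original dtype before multiplying.
        cA, sA = F.quantize_blockwise(A, code=code, blocksize=blocksize)
        fp8A = F.dequantize_blockwise(cA, sA, blocksize=blocksize).to(A.dtype)
        cB, sB = F.quantize_blockwise(B, code=code, blocksize=blocksize)
        fp8B = F.dequantize_blockwise(cB, sB, blocksize=blocksize).to(B.dtype)
        # Only the *values* are FP8; the arithmetic below is full precision.
        return torch.matmul(fp8A, fp8B)

    # E4M3-style forward code as in this patch: sign, 4 exponent bits, 3 mantissa bits.
    fw_code = F.create_fp8_map(True, 4, 3, 8).cuda()
    A = torch.randn(16, 64, device='cuda', dtype=torch.float16)
    B = torch.randn(64, 32, device='cuda', dtype=torch.float16)
    out = simulate_fp8_matmul(A, B, fw_code)

Note that as introduced here, MatMulFP8.backward still reads the undefined ctx.state and misspells grad_output as grad_ouput; both slips are corrected in PATCH 10/97 below.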
--- bitsandbytes/__init__.py | 1 + bitsandbytes/autograd/_functions.py | 92 +++++++++++++++++++++++++ bitsandbytes/nn/modules.py | 16 +++++ tests/test_autograd.py | 100 ++++++++++++++++++++++++++++ 4 files changed, 209 insertions(+) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 041df4b..21cfbb0 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -10,6 +10,7 @@ from .autograd._functions import ( matmul, matmul_cublas, mm_cublas, + matmul_fp8 ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index a115437..fc027f2 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -390,6 +390,98 @@ class MatMul8bitLt(torch.autograd.Function): return grad_A, grad_B, None, grad_bias, None +class MatMulFP8(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, bias=None, fw_code=None, bw_code=None): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + ctx.bias = bias + B_shape = state[1] + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + + # 1. Dequantize + # 2. MatmulnN + + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=1024) + fp8A = F.dequantize_blockwise(cA, state) + + cB, state = F.quantize_blockwise(B, code=fw_code, blocksize=1024) + fp8B = F.dequantize_blockwise(cB, state) + + output = torch.nn.functional.linear(fp8A, fp8B) + + + # 3. Save state + ctx.bw_code = bw_code + ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype + + if any(ctx.needs_input_grad[:2]): + ctx.tensors = (fp8A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None + + req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad + fp8A, B = ctx.tensors + state = ctx.state + + grad_A, grad_B, grad_bias = None, None, None + + cgrad_out, state = F.quantize_blockwise(grad_ouput, code=ctx.bw_code, blocksize=1024) + fp8out = F.dequantize_blockwise(cgrad_out, state) + + if req_gradBias: + # compute grad_bias first before changing grad_output dtype + grad_bias = fp8out.sum(0, dtype=ctx.dtype_bias) + + # Cast grad_output to fp16 + if len(grad_output.shape) == 3: + grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + + # not supported by PyTorch. 
TODO: create work-around + #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) + if req_gradA: grad_A = torch.matmul(fp8out, B.t()) + if req_gradB: grad_B = torch.matmul(fp8A.t(), fp8out) + + return grad_A, grad_B, None, grad_bias, None, None + + +def matmul( + A: tensor, + B: tensor, + out: tensor = None, + state: MatmulLtState = None, + threshold=0.0, + bias=None +): + state = state or MatmulLtState() + if threshold > 0.0: + state.threshold = threshold + return MatMul8bitLt.apply(A, B, out, bias, state) + + +def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bias=None): + assert quant_state is not None + return MatMulFP8.apply(A, B, out, bias, fw_code, bw_code) + def matmul( A: tensor, diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 4746a4a..b1d5355 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -343,3 +343,19 @@ class Linear8bitLt(nn.Linear): del self.state.CxB return out + +class LinearFP8(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + self.bw_code = F.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = F.create_fp8_map(True, 4, 3, 8).to(x.device) + + out = bnb.matmul_fp8(x, self.weight.t(), bias=self.bias, fw_code=self.fw_code, code=self.bw_code) + + return out + diff --git a/tests/test_autograd.py b/tests/test_autograd.py index c67126d..0def35d 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -429,3 +429,103 @@ def test_matmullt( if req_grad[2]: torch.testing.assert_allclose(gradBias1, gradBias2) + + + +n = 1 +k = 3 +dim1 = torch.randint(16, 64, size=(n,)).tolist() +dim2 = torch.randint(32, 96, size=(n,)).tolist() +dim3 = torch.randint(32, 96, size=(n,)).tolist() +dim4 = torch.randint(32, 96, size=(n,)).tolist() + +dim2.append(0) + +funcs = [(torch.matmul, bnb.matmul_fp8)] +str_funcs = ["matmul"] +req_grad = list(product([True, False], repeat=3)) +req_grad_str = [] +for c in req_grad: + strval = '' + for v in c: + if v == True: strval += 'T' + else: strval += 'F' + req_grad_str.append(strval) + +transpose = [(False, True), (False, False)] +str_transpose = ["NT", "NN"] +dtype = [torch.float16, torch.float32] +has_fp16_weights = [True, False] +has_bias = [True, False] +values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias)) +str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias)) +names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}".format(*vals) for vals in str_values] +@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias", values, ids=names) +def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias): + dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) + dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) + if has_bias == False: + req_grad = list(req_grad) + req_grad[2] = False + + for i in range(k): + # normal multiply + if funcs[0] in [torch.mm, torch.matmul]: + A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) + B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) + target = 
torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype) + bias = None + bias2 = None + if has_bias: + bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) + bias2 = bias.clone() + torch.nn.init.xavier_uniform_(B) + + B2, quant_state = bnb.functional.quantize_fp8(B) + + if not transpose[0] and transpose[1]: + out_torch = funcs[0](A, B.t()) + out_bnb = funcs[1](A, B2.t(), quant_state, bias=bias2) + elif not transpose[0] and not transpose[1]: + out_torch = funcs[0](A, B) + out_bnb = funcs[1](A, B2, quant_state, bias=bias2) + + if has_bias: + out_torch += bias + + assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}" + + n = out_bnb.numel() + err = torch.abs(out_bnb - out_torch).float().mean().item() + if n > 0: + assert err < 0.115 + + if any(req_grad): + out_bnb.data.copy_(out_torch) + torch.cuda.synchronize() + loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() + loss_bnb.backward() + gradA1 = A.grad + gradB1 = B.grad + A.grad = None + B.grad = None + if has_bias: + gradBias1 = bias.grad + bias.grad = None + + loss_torch = torch.nn.functional.mse_loss( out_torch, target ).mean() + loss_torch.backward() + gradA2 = A.grad + gradB2 = B.grad + A.grad = None + B.grad = None + if has_bias: + gradBias2 = bias.grad + bias.grad = None + + if req_grad[0]: + torch.testing.assert_allclose( gradA1, gradA2, atol=0.015, rtol=0.1) + + if req_grad[2]: + torch.testing.assert_allclose(gradBias1, gradBias2) From ca3236587ad285b8a43a96629516d3362045bb99 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 13 Feb 2023 17:20:52 -0800 Subject: [PATCH 10/97] Added forward/backward tests; removed bias. --- bitsandbytes/autograd/_functions.py | 36 +++++++---------- bitsandbytes/nn/modules.py | 4 +- tests/test_autograd.py | 61 +++++++++++++++-------------- 3 files changed, 48 insertions(+), 53 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index fc027f2..c2b8773 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -395,15 +395,14 @@ class MatMulFP8(torch.autograd.Function): # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod - def forward(ctx, A, B, out=None, bias=None, fw_code=None, bw_code=None): + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None): # default of pytorch behavior if inputs are empty ctx.is_empty = False if prod(A.shape) == 0: ctx.is_empty = True ctx.A = A ctx.B = B - ctx.bias = bias - B_shape = state[1] + B_shape = B.shape if A.shape[-1] == B_shape[0]: return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) else: @@ -414,17 +413,17 @@ class MatMulFP8(torch.autograd.Function): # 2. MatmulnN cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=1024) - fp8A = F.dequantize_blockwise(cA, state) + fp8A = F.dequantize_blockwise(cA, state).to(A.dtype) cB, state = F.quantize_blockwise(B, code=fw_code, blocksize=1024) - fp8B = F.dequantize_blockwise(cB, state) + fp8B = F.dequantize_blockwise(cB, state).to(B.dtype) - output = torch.nn.functional.linear(fp8A, fp8B) + output = torch.matmul(fp8A, fp8B) # 3. 
Save state ctx.bw_code = bw_code - ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype if any(ctx.needs_input_grad[:2]): ctx.tensors = (fp8A, fp8B) @@ -436,21 +435,15 @@ class MatMulFP8(torch.autograd.Function): @staticmethod def backward(ctx, grad_output): if ctx.is_empty: - bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None - req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad + req_gradA, req_gradB, _, _, _ = ctx.needs_input_grad fp8A, B = ctx.tensors - state = ctx.state - grad_A, grad_B, grad_bias = None, None, None + grad_A, grad_B = None, None - cgrad_out, state = F.quantize_blockwise(grad_ouput, code=ctx.bw_code, blocksize=1024) - fp8out = F.dequantize_blockwise(cgrad_out, state) - - if req_gradBias: - # compute grad_bias first before changing grad_output dtype - grad_bias = fp8out.sum(0, dtype=ctx.dtype_bias) + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=1024) + fp8out = F.dequantize_blockwise(cgrad_out, state).to(grad_output.dtype) # Cast grad_output to fp16 if len(grad_output.shape) == 3: @@ -461,7 +454,7 @@ class MatMulFP8(torch.autograd.Function): if req_gradA: grad_A = torch.matmul(fp8out, B.t()) if req_gradB: grad_B = torch.matmul(fp8A.t(), fp8out) - return grad_A, grad_B, None, grad_bias, None, None + return grad_A, grad_B, None, None, None def matmul( @@ -478,9 +471,8 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bias=None): - assert quant_state is not None - return MatMulFP8.apply(A, B, out, bias, fw_code, bw_code) +def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None): + return MatMulFP8.apply(A, B, out, fw_code, bw_code) def matmul( diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index b1d5355..5e12ddb 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -355,7 +355,9 @@ class LinearFP8(nn.Linear): self.bw_code = F.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = F.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), bias=self.bias, fw_code=self.fw_code, code=self.bw_code) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, code=self.bw_code) + if self.bias is not None: + out += self.bias return out diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 0def35d..4d3e67a 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -456,18 +456,16 @@ transpose = [(False, True), (False, False)] str_transpose = ["NT", "NN"] dtype = [torch.float16, torch.float32] has_fp16_weights = [True, False] -has_bias = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}".format(*vals) for vals in str_values] +values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose)) +str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose)) +names = 
["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values] @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias", values, ids=names) -def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias): +@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names) +def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) - if has_bias == False: - req_grad = list(req_grad) - req_grad[2] = False + req_grad = list(req_grad) + req_grad[2] = False for i in range(k): # normal multiply @@ -475,32 +473,24 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype) - bias = None - bias2 = None - if has_bias: - bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) - bias2 = bias.clone() torch.nn.init.xavier_uniform_(B) - B2, quant_state = bnb.functional.quantize_fp8(B) + fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(A.device) + bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(A.device) if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) - out_bnb = funcs[1](A, B2.t(), quant_state, bias=bias2) + out_bnb = funcs[1](A, B.t(), fw_code, bw_code) elif not transpose[0] and not transpose[1]: out_torch = funcs[0](A, B) - out_bnb = funcs[1](A, B2, quant_state, bias=bias2) - - if has_bias: - out_torch += bias + out_bnb = funcs[1](A, B, fw_code, bw_code) assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}" n = out_bnb.numel() err = torch.abs(out_bnb - out_torch).float().mean().item() if n > 0: - assert err < 0.115 - + assert err < 0.20 if any(req_grad): out_bnb.data.copy_(out_torch) torch.cuda.synchronize() @@ -510,9 +500,6 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, gradB1 = B.grad A.grad = None B.grad = None - if has_bias: - gradBias1 = bias.grad - bias.grad = None loss_torch = torch.nn.functional.mse_loss( out_torch, target ).mean() loss_torch.backward() @@ -520,12 +507,26 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, gradB2 = B.grad A.grad = None B.grad = None - if has_bias: - gradBias2 = bias.grad - bias.grad = None if req_grad[0]: torch.testing.assert_allclose( gradA1, gradA2, atol=0.015, rtol=0.1) - if req_grad[2]: - torch.testing.assert_allclose(gradBias1, gradBias2) + if req_grad[1]: + n = gradB1.numel() + if dim2 > 0: + assert torch.abs(gradB1).sum() > 0.0 + assert torch.abs(gradB2).sum() > 0.0 + else: + assert torch.abs(gradB1).sum() == 0.0 + assert torch.abs(gradB2).sum() == 0.0 + idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) + + assert (idx == 0).sum().item() <= n * 0.1 + idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) + assert (idx == 0).sum().item() <= n * 0.02 + grad_err = (gradB1-gradB2).abs().mean() + assert grad_err.item() < 0.003 + torch.testing.assert_allclose( + gradB1, gradB2, atol=0.18, rtol=0.3 + ) + From 
fa255cbc5621538f25abe45a6d372b6d395dba7e Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 13 Feb 2023 17:29:39 -0800 Subject: [PATCH 11/97] Added missing import. --- bitsandbytes/nn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 221b5f7..7c2b552 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8 From 2dfa3ce16dc52a240228d1d230d1e0af037ef748 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 13 Feb 2023 17:48:52 -0800 Subject: [PATCH 12/97] Fixed LinearFP8 and added tests. --- bitsandbytes/nn/modules.py | 6 +++--- tests/test_modules.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 5e12ddb..c8a3ecc 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -352,10 +352,10 @@ class LinearFP8(nn.Linear): def forward(self, x: torch.Tensor): if self.fw_code is None: - self.bw_code = F.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = F.create_fp8_map(True, 4, 3, 8).to(x.device) + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, code=self.bw_code) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code) if self.bias is not None: out += self.bias diff --git a/tests/test_modules.py b/tests/test_modules.py index ffcf304..4fe8b54 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -525,3 +525,40 @@ def test_linear8bitlt_fp32_bias(): b1 = torch.randn(16, 8, 32, device="cuda").half() o1 = l1(b1) assert l1.bias is None + +def test_fp8linear(): + + b = 10 + h = 1024 + inp = torch.randn(b, h).cuda() + fp32 = torch.nn.Linear(h, h*2).cuda() + fp8 = bnb.nn.LinearFP8(h, h*2).cuda() + fp32b = torch.nn.Linear(h*2, h).cuda() + fp8b = bnb.nn.LinearFP8(h*2, h).cuda() + + fp8.weight.data.copy_(fp32.weight.data) + fp8.bias.data.copy_(fp32.bias.data) + fp8b.weight.data.copy_(fp32b.weight.data) + fp8b.bias.data.copy_(fp32b.bias.data) + + a = fp32b(torch.nn.functional.gelu(fp32(inp))) + b = fp8b(torch.nn.functional.gelu(fp8(inp))) + + err = (a-b).abs().mean() + + a.mean().backward() + b.mean().backward() + + graderr = (fp8.weight.grad-fp32.weight.grad).abs().mean() + bgraderr = (fp8.bias.grad-fp32.bias.grad).abs().mean() + + assert err < 0.05 + assert graderr < 0.00002 + assert bgraderr < 0.00002 + + + + + + + From c93a90d07595c143e87831228815d88a1e6d32e7 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 14 Feb 2023 13:31:39 -0800 Subject: [PATCH 13/97] Fixed FP4 import and data type conversion in backward. 
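Editor's note: the substantive change is in MatMulFP4.backward. Since PATCH 06 the forward pass computes out = A @ D with D = dequantize_fp4(B, state), so the input gradient is grad_A = grad_output @ D.t(). This patch casts the reconstructed weight to grad_output's dtype instead of the saved input dtype (torch.matmul needs both operands in the same dtype, and grad_output's dtype is the one the rest of the backward pass uses), and it drops the 3-D reshape of grad_output: torch.matmul already handles batched inputs, and flattening would have produced a 2-D grad_A for a 3-D input A. A sketch of the corrected rule, assuming the quantize_fp4/dequantize_fp4 signatures from PATCH 03 (fp4_grad_A is an illustrative helper, not a library function):

    import torch
    import bitsandbytes.functional as F

    def fp4_grad_A(grad_output, B_quant, quant_state):
        # Reconstruct the FP4 weight, match grad_output's dtype, then
        # apply the usual linear rule grad_A = grad_out @ W.t().
        W = F.dequantize_fp4(B_quant, quant_state)
        return torch.matmul(grad_output, W.to(grad_output.dtype).t())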
--- bitsandbytes/autograd/_functions.py | 6 +----- bitsandbytes/nn/__init__.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 6db90f5..ffe19c5 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -525,13 +525,9 @@ class MatMulFP4(torch.autograd.Function): # compute grad_bias first before changing grad_output dtype grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) - # Cast grad_output to fp16 - if len(grad_output.shape) == 3: - grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # not supported by PyTorch. TODO: create work-around #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) - if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(ctx.dtype_A).t()) + if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(grad_output.dtype).t()) return grad_A, grad_B, None, grad_bias, None diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 79fb51e..954a67f 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding, LinearFP4 +from .modules import Int8Params, Linear8bitLt, StableEmbedding, LinearFP4, FP4Params From 2489d819c5009e88a1572809a2f3306dace84051 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 14 Feb 2023 13:55:17 -0800 Subject: [PATCH 14/97] Added more blocksizes for stochastic rounding; fixed dequant blocksize. --- bitsandbytes/autograd/_functions.py | 6 +++--- bitsandbytes/functional.py | 5 ++--- csrc/kernels.cu | 12 ++++++++++++ csrc/ops.cu | 14 ++++++-------- csrc/pythonInterface.c | 8 ++++---- tests/test_functional.py | 16 ++++++++++------ 6 files changed, 37 insertions(+), 24 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index c2b8773..b8b2dbc 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -413,10 +413,10 @@ class MatMulFP8(torch.autograd.Function): # 2. 
MatmulnN cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=1024) - fp8A = F.dequantize_blockwise(cA, state).to(A.dtype) + fp8A = F.dequantize_blockwise(cA, state, blocksize=1024).to(A.dtype) cB, state = F.quantize_blockwise(B, code=fw_code, blocksize=1024) - fp8B = F.dequantize_blockwise(cB, state).to(B.dtype) + fp8B = F.dequantize_blockwise(cB, state, blocksize=1024).to(B.dtype) output = torch.matmul(fp8A, fp8B) @@ -443,7 +443,7 @@ class MatMulFP8(torch.autograd.Function): grad_A, grad_B = None, None cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=1024) - fp8out = F.dequantize_blockwise(cgrad_out, state).to(grad_output.dtype) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=1024).to(grad_output.dtype) # Cast grad_output to fp16 if len(grad_output.shape) == 3: diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 371f85c..dbc2828 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -508,13 +508,12 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra code = code.to(A.device) if rand is not None: is_on_gpu([code, A, out, absmax, rand]) - assert blocksize==4096 assert rand.numel() >= 1024 rand_offset = random.randint(0, 1023) if A.dtype == torch.float32: - lib.cquantize_blockwise_stochastic_fp32(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), ct.c_int(A.numel())) + lib.cquantize_blockwise_stochastic_fp32(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), cblocksize, ct.c_int(A.numel())) elif A.dtype == torch.float16: - lib.cquantize_blockwise_stochastic_fp16(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), ct.c_int(A.numel())) + lib.cquantize_blockwise_stochastic_fp16(get_ptr(code), get_ptr(A),get_ptr(absmax), get_ptr(out), get_ptr(rand), ct.c_int32(rand_offset), cblocksize, ct.c_int(A.numel())) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") else: diff --git a/csrc/kernels.cu b/csrc/kernels.cu index b32b39c..99224ad 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2797,16 +2797,28 @@ template __global__ void kQuantizeBlockwise(float * code, half template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, 
float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int 
rand_offset, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int n); diff --git a/csrc/ops.cu b/csrc/ops.cu index e770e10..9e01588 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -54,23 +54,21 @@ template void quantizeBlockwise(float * code, T *A, { int num_blocks = n/blocksize; num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; - if(STOCHASTIC == 1) - assert(blocksize == 4096); if(blocksize == 4096) kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index d8b2290..d1055cd 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -77,8 +77,8 @@ void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } -void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, blocksize, n); } +void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, blocksize, n); } void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, 
out, blocksize, n); } @@ -142,8 +142,8 @@ extern "C" void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); } void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); } void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } - void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); } - void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); } + void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, blocksize, n); } + void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, blocksize, n); } void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } diff --git a/tests/test_functional.py b/tests/test_functional.py index 70fa4d0..5a24aeb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -188,21 +188,25 @@ def test_dynamic_blockwise_quantization(): #print('rand', blocksize, sum(reldiffs)/len(reldiffs)) -def test_dynamic_blockwise_stochastic_quantization(): + +@pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) +def test_dynamic_blockwise_stochastic_quantization(blocksize): diffs = [] reldiffs = [] rand = torch.rand(1024).cuda() + err = 0 for i in range(100): A1 = torch.randn(1024, 1024, device="cuda") - C1, S1 = F.quantize_blockwise(A1, rand=rand) - C2, S2 = F.quantize_blockwise(A1) + C1, S1 = F.quantize_blockwise(A1, rand=rand, blocksize=blocksize) + C2, S2 = F.quantize_blockwise(A1, blocksize=blocksize) + A2 = F.dequantize_blockwise(C1, S1, blocksize=blocksize) + err += (A1-A2).abs().mean().item()/100 # a maximum distance of 1 between quantized values torch.testing.assert_allclose(C1, C2, atol=1, rtol=0) fraction_smaller = (C1 < C2).float().sum() / C1.numel() fraction_larger = (C1 > C2).float().sum() / C1.numel() - torch.testing.assert_allclose( - fraction_larger, fraction_smaller, atol=0.01, rtol=0 - ) + torch.testing.assert_allclose(fraction_larger, fraction_smaller, atol=0.01, rtol=0) + assert err < 0.019 @pytest.mark.parametrize( From 7b764d35698eb77f20768e3f62b0e53f3044fb5f Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Tue, 21 Feb 2023 03:53:44 +0000 Subject: [PATCH 15/97] adding half() cast --- bitsandbytes/autograd/_functions.py | 14 ++++--- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 59 +++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 9
deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index b8b2dbc..aa50b21 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -415,8 +415,8 @@ class MatMulFP8(torch.autograd.Function): cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=1024) fp8A = F.dequantize_blockwise(cA, state, blocksize=1024).to(A.dtype) - cB, state = F.quantize_blockwise(B, code=fw_code, blocksize=1024) - fp8B = F.dequantize_blockwise(cB, state, blocksize=1024).to(B.dtype) + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) output = torch.matmul(fp8A, fp8B) @@ -450,9 +450,13 @@ class MatMulFP8(torch.autograd.Function): grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() # not supported by PyTorch. TODO: create work-around - #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) - if req_gradA: grad_A = torch.matmul(fp8out, B.t()) - if req_gradB: grad_B = torch.matmul(fp8A.t(), fp8out) + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(fp8A.dtype) + if req_gradB: + if fp8A.ndim == 3: + fp8At = fp8A.transpose(2, 1) + elif fp8A.ndim == 2: + fp8At = fp8A.t() + grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) return grad_A, grad_B, None, None, None diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 7c2b552..ae9eb8c 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8 +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index c8a3ecc..23f391a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -326,10 +326,11 @@ class Linear8bitLt(nn.Linear): self.init_8bit_state() # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != torch.float16: - self.bias.data = self.bias.data.half() + # if self.bias is not None and self.bias.dtype != torch.float16: + # self.bias.data = self.bias.data.half() - out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) + #out = bnb.matmul(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias + out = bnb.matmul(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias if not self.state.has_fp16_weights: if not self.state.memory_efficient_backward and self.state.CB is not None: @@ -344,6 +345,28 @@ class Linear8bitLt(nn.Linear): return out + +class Linear8bitLtThresh(Linear8bitLt): + def __init__( + self, + input_features, + output_features, + bias=True, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=6.0, + index=None, + ): + super().__init__( + input_features, + output_features, + bias=bias, + has_fp16_weights=has_fp16_weights, + memory_efficient_backward=memory_efficient_backward, + threshold=threshold, + index=index + ) + class LinearFP8(nn.Linear): def __init__(self, input_features, output_features, bias=True): super().__init__(input_features, output_features, bias) @@ -361,3 +384,33 @@ class LinearFP8(nn.Linear): return out +class LinearInt8(nn.Linear): + def 
__init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.code = None + + def forward(self, x: torch.Tensor): + if self.code is None: + self.code = bnb.functional.create_linear_map(True, 8).to(x.device) + + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code) + if self.bias is not None: + out += self.bias + + return out + +class LinearInt8Cast(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.code = None + + def forward(self, x: torch.Tensor): + if self.code is None: + self.code = bnb.functional.create_linear_map(True, 8).to(x.device) + + out = bnb.matmul_fp8(x.half(), self.weight.half().t(), fw_code=self.code, bw_code=self.code) + if self.bias is not None: + out += self.bias + + return out + From 3fbf60ad83e845677e77c807b884393f25f40c8e Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Thu, 23 Feb 2023 08:27:15 +0000 Subject: [PATCH 16/97] sim now worse than real --- bitsandbytes/autograd/_functions.py | 55 ++++++++++-------- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 90 +++++++++++++++++++++++++++-- 3 files changed, 118 insertions(+), 29 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index aa50b21..6de595e 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -395,38 +395,41 @@ class MatMulFP8(torch.autograd.Function): # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None): + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024): # default of pytorch behavior if inputs are empty ctx.is_empty = False if prod(A.shape) == 0: ctx.is_empty = True ctx.A = A ctx.B = B + B_shape = B.shape if A.shape[-1] == B_shape[0]: return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) else: return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - # 1. Dequantize # 2. MatmulnN - - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=1024) - fp8A = F.dequantize_blockwise(cA, state, blocksize=1024).to(A.dtype) + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) + fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) cB, state = F.quantize(B.float(), code=fw_code) fp8B = F.dequantize(cB, state).to(B.dtype) output = torch.matmul(fp8A, fp8B) + # output is half # 3. Save state + ctx.fw_code = fw_code ctx.bw_code = bw_code + ctx.bsz = bsz ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = (fp8A, fp8B) + # NOTE: we send back A, and re-quant. 
+ ctx.tensors = (A, fp8B) else: ctx.tensors = (None, None) @@ -435,30 +438,36 @@ class MatMulFP8(torch.autograd.Function): @staticmethod def backward(ctx, grad_output): if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - req_gradA, req_gradB, _, _, _ = ctx.needs_input_grad - fp8A, B = ctx.tensors + req_gradA, req_gradB, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors grad_A, grad_B = None, None - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=1024) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=1024).to(grad_output.dtype) + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz).to(grad_output.dtype) - # Cast grad_output to fp16 - if len(grad_output.shape) == 3: - grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. TODO: create work-around - if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(fp8A.dtype) + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + if req_gradB: - if fp8A.ndim == 3: - fp8At = fp8A.transpose(2, 1) - elif fp8A.ndim == 2: - fp8At = fp8A.t() - grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) + At = A.transpose(2, 1).contiguous() + cA, state = F.quantize(At.float(), code=ctx.fw_code) + fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) - return grad_A, grad_B, None, None, None + return grad_A, grad_B, None, None, None, None def matmul( @@ -475,8 +484,8 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None): - return MatMulFP8.apply(A, B, out, fw_code, bw_code) +def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1): + return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz) def matmul( diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index ae9eb8c..9c70642 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2 diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 23f391a..5c0d0d4 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -346,6 +346,68 @@ class Linear8bitLt(nn.Linear): return out +# Not in use for now... +class Linear8bitLt2(nn.Linear): + def __init__( + self, + input_features, + output_features, + bias=True, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__( + input_features, output_features, bias + ) + self.state = bnb.MatmulLtState() + self.index = index + + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + self.state.memory_efficient_backward = memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights + ) + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x): + self.state.is_training = self.training + + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + # if self.bias is not None and self.bias.dtype != torch.float16: + # self.bias.data = self.bias.data.half() + + #out = bnb.matmul(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias + out = bnb.matmul(x, self.weight, bias=None, state=self.state) + self.bias + #out = torch.matmul(x.half(), W.half().t()) + self.bias + + if not self.state.has_fp16_weights: + if not self.state.memory_efficient_backward and self.state.CB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + elif self.state.memory_efficient_backward and self.state.CxB is not None: + # For memory efficient backward, we convert 8-bit row major to turing/ampere format at each inference pass. + # Thus, we delete CxB from the state. 
+ del self.state.CxB + + return out + + class Linear8bitLtThresh(Linear8bitLt): def __init__( self, @@ -363,7 +425,7 @@ class Linear8bitLtThresh(Linear8bitLt): bias=bias, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward, - threshold=threshold, + threshold=6., index=index ) @@ -372,13 +434,19 @@ class LinearFP8(nn.Linear): super().__init__(input_features, output_features, bias) self.bw_code = None self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + print('block size is', self.bsz) def forward(self, x: torch.Tensor): if self.fw_code is None: self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz) if self.bias is not None: out += self.bias @@ -388,27 +456,39 @@ class LinearInt8(nn.Linear): def __init__(self, input_features, output_features, bias=True): super().__init__(input_features, output_features, bias) self.code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break def forward(self, x: torch.Tensor): if self.code is None: self.code = bnb.functional.create_linear_map(True, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz) if self.bias is not None: out += self.bias return out +# This is 4 bit version. class LinearInt8Cast(nn.Linear): def __init__(self, input_features, output_features, bias=True): super().__init__(input_features, output_features, bias) self.code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + def forward(self, x: torch.Tensor): if self.code is None: - self.code = bnb.functional.create_linear_map(True, 8).to(x.device) + self.code = bnb.functional.create_linear_map(True, 4).to(x.device) - out = bnb.matmul_fp8(x.half(), self.weight.half().t(), fw_code=self.code, bw_code=self.code) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz) if self.bias is not None: out += self.bias From c5c38ca19c27fe4fbf0ebf2db77183c0ff5cfb01 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Thu, 23 Feb 2023 10:45:18 -0800 Subject: [PATCH 17/97] Added matmul_mixed. 
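For intuition, a minimal sketch of the mixed-precision decomposition this patch's MatMul8bitMixed performs in the forward pass: activation columns holding outliers stay in 16-bit, everything else goes through an absmax int8 product, and the two partial results are summed. The sketch below is illustrative only (mixed_matmul_sketch is not a library API, and the real path uses igemmlt with col32/turing transforms rather than this simulation):

    import torch

    def mixed_matmul_sketch(A, B, threshold=6.0):
        # A: (tokens, hidden) activations; B: (hidden, out) weights, both 2D.
        # Columns of A that contain any |value| > threshold are outliers.
        outlier_cols = (A.abs() > threshold).any(dim=0)
        # 1. Outlier columns stay in high precision.
        out = A[:, outlier_cols] @ B[outlier_cols, :]
        # 2. The rest goes through simulated absmax int8 quantization.
        A8, B8 = A[:, ~outlier_cols], B[~outlier_cols, :]
        sA = A8.abs().amax(dim=1, keepdim=True).clamp_min(1e-8) / 127.0
        sB = B8.abs().amax(dim=0, keepdim=True).clamp_min(1e-8) / 127.0
        qA = (A8 / sA).round().clamp(-127, 127)
        qB = (B8 / sB).round().clamp(-127, 127)
        # 3. Dequantize the int8 product and add the outlier part.
        return out + (qA @ qB) * sA * sB

Keeping the few outlier dimensions in 16-bit matters because they would otherwise dominate the absmax scales and crush the resolution left for all other values.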
--- bitsandbytes/__init__.py | 3 +- bitsandbytes/autograd/_functions.py | 188 +++++++++++++++++++++++++++- tests/test_autograd.py | 2 +- 3 files changed, 189 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 21cfbb0..ddd9bf0 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -10,7 +10,8 @@ from .autograd._functions import ( matmul, matmul_cublas, mm_cublas, - matmul_fp8 + matmul_fp8, + matmul_mixed ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index aa50b21..c68b18b 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -461,6 +461,190 @@ class MatMulFP8(torch.autograd.Function): return grad_A, grad_B, None, None, None +class MatMul8bitMixed(torch.autograd.Function): + @staticmethod + def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): + # default to pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + ctx.bias = bias + if A.shape[-1] == B.shape[0]: + return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device) + + # 1. Quantize A + # 2. Quantize B + # 3. Matmul + # 4. Mixed-precision decomposition matmul + # 5. Save state + formatB = state.formatB + input_shape = A.shape + if state.outlier_pool is None: + state.outlier_pool = GlobalOutlierPooler.get_instance() + + # Cast A to fp16 + if A.dtype != torch.float16: + warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") + + # 1. Quantize A + if len(A.shape) == 3: + A = A.view(-1, A.shape[-1]).contiguous() + CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant( + A.to(torch.float16), threshold=state.threshold + ) + + if state.threshold > 0.0 and coo_tensorA is not None: + if state.has_fp16_weights: + idx = torch.unique(coo_tensorA.colidx).long() + CA[:, idx] = 0 + CAt[:, idx] = 0 + subA = A[:, idx] + state.subB = B[:, idx].t().contiguous() + state.idx = idx + else: + if state.CxB is None: + # B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions + # we also need to convert it to the turing/ampere format + state.CxB, state.SB = F.transform(state.CB, to_order=formatB) + else: + if not state.has_fp16_weights and state.CxB is None: + state.CxB, state.SB = F.transform(state.CB, to_order=formatB) + subA = None + + # 2. 
Quantize B + if state.has_fp16_weights: + has_grad = True if (getattr(B, "grad", None) is not None) else False + is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) + if is_transposed: + B = B.contiguous() + + if (state.is_training and not has_grad) or state.CxB is None: + state.reset_grads() + ( + CB, + state.CBt, + state.SCB, + state.SCBt, + coo_tensorB, + ) = F.double_quant(B.to(torch.float16)) + state.CxB, state.SB = F.transform(CB, to_order=formatB) + else: + has_grad = False + + if coo_tensorA is not None and not state.has_fp16_weights: + # extract outliers + + outlier_idx = torch.unique(coo_tensorA.colidx) + state.idx = outlier_idx + # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1]) + # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]: + # # do not use pool for 2nd FFN layer + # state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device) + # else: + # state.idx = outlier_idx + outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int()) + state.subB = ( + (outliers * state.SCB.view(-1, 1) / 127.0) + .t() + .contiguous() + .to(A.dtype) + ) + CA[:, state.idx.long()] = 0 + CAt[:, state.idx.long()] = 0 + subA = A[:, state.idx.long()] + + shapeB = state.SB[0] + + if len(input_shape) == 3: + output_shape = (input_shape[0], input_shape[1], shapeB[0]) + else: + output_shape = (input_shape[0], shapeB[0]) + + # 3. Matmul + C32A, SA = F.transform(CA, "col32") + out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB) + # we apply the fused bias here + + if bias is None or bias.dtype == torch.float16: + output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias) + output = output.to(A.dtype) + else: # apply bias separately + output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None) + output = output.to(A.dtype).add_(bias) + + # 4. Mixed-precision decomposition matmul + if coo_tensorA is not None and subA is not None: + output += torch.matmul(subA, state.subB) + + # 5. 
Save state + ctx.state = state + + ctx.formatB = formatB + ctx.grad_shape = input_shape + ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype + + if any(ctx.needs_input_grad[:2]): + ctx.tensors = (CAt, subA, A) + ctx.tensor_states = (SCAt, state.idx) + else: + ctx.tensors = [None, None, None] + ctx.tensor_states = (None, None) + ctx.save_for_backward(None, None) + + + clone_func = torch.clone if len(output_shape) == 3 else lambda x : x + return clone_func(output.view(output_shape)) + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias)) + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None + req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad + CAt, subA, A = ctx.tensors + SCAt, idx = ctx.tensor_states + formatB = ctx.formatB + state = ctx.state + grad_A = grad_B = grad_bias = None + + if req_gradBias: + # compute grad_bias first before changing grad_output dtype + grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) + + # Cast grad_output to fp16 + if len(grad_output.shape) == 3: + grad_output = grad_output.reshape( + -1, grad_output.shape[-1] + ).contiguous() + + Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) + + if req_gradB: + grad_B = torch.matmul(grad_output.t(), A) + + if req_gradA: + if state.CBt is not None: + C32grad, Sgrad = F.transform(Cgrad, "col32") + if state.CxBt is None: + state.CxBt, state.SBt = F.transform( + state.CBt, to_order=formatB, transpose=True + ) + gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt) + grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A) + + elif state.CB is not None: + CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. 
/ 127.0)) + grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) + else: + raise Exception('State must contain either CBt or CB matrix for backward') + + return grad_A, grad_B, None, grad_bias, None + + def matmul( A: tensor, B: tensor, @@ -479,7 +663,7 @@ def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tens return MatMulFP8.apply(A, B, out, fw_code, bw_code) -def matmul( +def matmul_mixed( A: tensor, B: tensor, out: tensor = None, @@ -490,4 +674,4 @@ def matmul( state = state or MatmulLtState() if threshold > 0.0: state.threshold = threshold - return MatMul8bitLt.apply(A, B, out, bias, state) + return MatMul8bitMixed.apply(A, B, out, bias, state) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 4d3e67a..d05b4a6 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -239,7 +239,7 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist() dim2.append(0) decomp = [0.0, 6.0] -funcs = [(torch.matmul, bnb.matmul)] +funcs = [(torch.matmul, bnb.matmul_mixed)] str_funcs = ["matmul"] req_grad = [(False, False), (True, False), (True, True), (False, True)] req_grad = list(product([True, False], repeat=3)) From 75377d125e59f6ce183ff89b6231082aa70b492e Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Fri, 24 Feb 2023 00:10:15 +0000 Subject: [PATCH 18/97] new experiments --- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 9c70642..5ec46b3 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2 +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 5c0d0d4..94c9aa2 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -407,6 +407,65 @@ class Linear8bitLt2(nn.Linear): return out +class Linear8bitLtMixed(nn.Linear): + def __init__( + self, + input_features, + output_features, + bias=True, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__( + input_features, output_features, bias + ) + self.state = bnb.MatmulLtState() + self.index = index + + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + self.state.memory_efficient_backward = memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights + ) + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x): + self.state.is_training = self.training + + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + # if self.bias is not None and self.bias.dtype != torch.float16: + # self.bias.data = self.bias.data.half() + + #out = bnb.matmul(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias + out = bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias + + if not self.state.has_fp16_weights: + if not self.state.memory_efficient_backward and self.state.CB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + elif self.state.memory_efficient_backward and self.state.CxB is not None: + # For memory efficient backward, we convert 8-bit row major to turing/ampere format at each inference pass. + # Thus, we delete CxB from the state. + del self.state.CxB + + return out + class Linear8bitLtThresh(Linear8bitLt): def __init__( From 9851a10b46d54bf1b2ae9b37d59f55f3d6580625 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 24 Feb 2023 10:17:57 -0800 Subject: [PATCH 19/97] Added cast to fp4 layer for speed. 
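The forward pass now remembers the incoming dtype, runs the 4-bit matmul on fp16 operands, and casts the result back, so fp32 callers keep their dtype while the kernel stays on the fast half-precision path. A minimal sketch of the round-trip pattern, where quantized_matmul is only a stand-in for the actual bnb.matmul_fp4 call:

    import torch

    def halfcast_forward(x: torch.Tensor, quantized_matmul) -> torch.Tensor:
        inp_dtype = x.dtype                          # remember the caller's dtype
        out = quantized_matmul(x.to(torch.float16))  # kernel runs in fp16
        return out.to(inp_dtype)                     # hand back the original dtype

The bias gets the same treatment in the diff below (self.bias.half()), since the kernel expects matching fp16 operands.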
--- bitsandbytes/autograd/_functions.py | 7 ++++--- bitsandbytes/nn/modules.py | 6 +++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index ffe19c5..8070ff8 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -404,10 +404,10 @@ class MatMul8bitLt(torch.autograd.Function): ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = (CAt, subA) + ctx.tensors = (CAt, subA, A) ctx.tensor_states = (SCAt, state.idx) else: - ctx.tensors = [None, None] + ctx.tensors = [None, None, A] ctx.tensor_states = (None, None) ctx.save_for_backward(None, None) @@ -420,7 +420,7 @@ class MatMul8bitLt(torch.autograd.Function): bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad - CAt, subA = ctx.tensors + CAt, subA, A = ctx.tensors SCAt, idx = ctx.tensor_states formatB = ctx.formatB state = ctx.state @@ -436,6 +436,7 @@ class MatMul8bitLt(torch.autograd.Function): Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) if req_gradB: + #grad_B = torch.matmul(grad_output.t(), A) CxAt, SAt = F.transform(CAt, formatB, transpose=True) C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True) gradB32, SgradB32 = F.igemmlt(C32grad, CxAt, Sgrad, SAt) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 4c719c6..ad3f4f7 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -190,7 +190,11 @@ class LinearFP4(nn.Linear): if getattr(self.weight, 'quant_state', None) is None: print('FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.') - out = bnb.matmul_fp4(x, self.weight.t(), bias=self.bias, quant_state=self.weight.quant_state) + + inp_dtype = x.dtype + x = x.to(torch.float16) + out = bnb.matmul_fp4(x, self.weight.t(), bias=self.bias.half(), quant_state=self.weight.quant_state) + out = out.to(inp_dtype) return out From 6c31a5fe991169d1caad2426b1cee479af6afd13 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 27 Feb 2023 14:23:21 -0800 Subject: [PATCH 20/97] t5 model fix --- bitsandbytes/nn/modules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index ad3f4f7..5d6d19c 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -190,10 +190,10 @@ class LinearFP4(nn.Linear): if getattr(self.weight, 'quant_state', None) is None: print('FP4 quantization state not initialized. 
Please call .cuda() or .to(device) on the LinearFP4 layer first.') - inp_dtype = x.dtype x = x.to(torch.float16) - out = bnb.matmul_fp4(x, self.weight.t(), bias=self.bias.half(), quant_state=self.weight.quant_state) + bias = None if self.bias is None else self.bias.half() + out = bnb.matmul_fp4(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) out = out.to(inp_dtype) return out From 51f8bb713368ef00d48496ce76c0428e976236a9 Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Fri, 24 Mar 2023 05:44:42 +0000 Subject: [PATCH 21/97] pre-triton update --- bitsandbytes/__init__.py | 5 +- bitsandbytes/autograd/_functions.py | 274 +++++++++++++++++++++++++++- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 92 +++++++++- 4 files changed, 360 insertions(+), 13 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index ddd9bf0..5d80df9 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -11,7 +11,10 @@ from .autograd._functions import ( matmul_cublas, mm_cublas, matmul_fp8, - matmul_mixed + matmul_mixed, + matmul_fp8_global, + matmul_fp4, + matmul_fp8_mixed, ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 2d30a86..b7da7b0 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -395,7 +395,7 @@ class MatMulFP8(torch.autograd.Function): # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024): + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): # default of pytorch behavior if inputs are empty ctx.is_empty = False if prod(A.shape) == 0: @@ -425,6 +425,7 @@ class MatMulFP8(torch.autograd.Function): ctx.fw_code = fw_code ctx.bw_code = bw_code ctx.bsz = bsz + ctx.bsz2 = bsz2 ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype if any(ctx.needs_input_grad[:2]): @@ -440,14 +441,13 @@ class MatMulFP8(torch.autograd.Function): if ctx.is_empty: return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - req_gradA, req_gradB, _, _, _, _ = ctx.needs_input_grad + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad A, B = ctx.tensors grad_A, grad_B = None, None - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz).to(grad_output.dtype) + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) @@ -467,7 +467,249 @@ class MatMulFP8(torch.autograd.Function): fp8At = F.dequantize(cA, state).to(A.dtype) grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) - return grad_A, grad_B, None, None, None, None + return grad_A, grad_B, None, None, None, None, None + +class MatMulFP8Mixed(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of 
pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) + fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. + ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) + + # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + At = A.transpose(2, 1).contiguous() + # cA, state = F.quantize(At.float(), code=ctx.fw_code) + # fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None + +class MatMulFP4(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) + fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. 
Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. + ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) + + cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + At = A.transpose(2, 1).contiguous() + cA, state = F.quantize(At.float(), code=ctx.bw_code) + fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None + + + +class MatMulFP8Global(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize(A.float(), code=fw_code) + fp8A = F.dequantize(cA, state).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. 
+ ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize(grad_output.float(), code=ctx.bw_code) + fp8out = F.dequantize(cgrad_out, state).to(grad_output.dtype) + + # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + At = A.transpose(2, 1).contiguous() + cA, state = F.quantize(At.float(), code=ctx.fw_code) + fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None class MatMul8bitMixed(torch.autograd.Function): @@ -520,12 +762,14 @@ class MatMul8bitMixed(torch.autograd.Function): # we also need to convert it to the turing/ampere format state.CxB, state.SB = F.transform(state.CB, to_order=formatB) else: + #print('A shape', A.shape) if not state.has_fp16_weights and state.CxB is None: state.CxB, state.SB = F.transform(state.CB, to_order=formatB) subA = None # 2. 
Quantize B if state.has_fp16_weights: + #print('B shape', B.shape) has_grad = True if (getattr(B, "grad", None) is not None) else False is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) if is_transposed: @@ -633,6 +877,8 @@ class MatMul8bitMixed(torch.autograd.Function): Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) if req_gradB: + # print('back A shape', A.shape) + # print('grad output t shape', grad_output.t().shape) grad_B = torch.matmul(grad_output.t(), A) if req_gradA: @@ -642,6 +888,8 @@ class MatMul8bitMixed(torch.autograd.Function): state.CxBt, state.SBt = F.transform( state.CBt, to_order=formatB, transpose=True ) + # print('back B shape', state.CxBt.shape) + # print('back grad shape', C32grad.shape) gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt) grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A) @@ -668,8 +916,18 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1): - return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz) +def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + +def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + +def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + + +def matmul_fp4(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + return MatMulFP4.apply(A, B, out, fw_code, bw_code, bsz, bsz2) def matmul_mixed( diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 5ec46b3..8be7674 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 94c9aa2..9cdcb4a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -498,14 +498,69 @@ class LinearFP8(nn.Linear): if input_features > array[i + 1]: self.bsz = k break - print('block size is', self.bsz) + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break def forward(self, x: torch.Tensor): if self.fw_code is None: self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + if self.bias is not None: + out += self.bias + + return out + +class LinearFP8Mixed(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) + + out = bnb.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + if self.bias is not None: + out += self.bias + + return out + +class LinearFP8Global(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) + + out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) if self.bias is not None: out += self.bias @@ -520,12 +575,16 @@ class LinearInt8(nn.Linear): if input_features > array[i + 1]: self.bsz = k break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break def forward(self, x: torch.Tensor): if self.code is None: self.code = bnb.functional.create_linear_map(True, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz) + out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz, bsz2=self.bsz2) if self.bias is not None: out += self.bias 
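# Aside on the bsz/bsz2 fields threaded through LinearFP8, LinearFP8Mixed,
# LinearFP8Global and LinearInt8 above: the descending ladder loop picks, in
# effect, the smallest blocksize that covers the feature dimension, capped at
# 4096 and floored at 64. A hedged standalone sketch of the same selection
# (pick_blocksize is an illustrative helper, not library API):
def pick_blocksize(dim, ladder=(4096, 2048, 1024, 512, 256, 128, 64)):
    # Return the smallest ladder entry >= dim; fall back to the cap beyond it.
    for k in reversed(ladder):
        if k >= dim:
            return k
    return ladder[0]
# e.g. pick_blocksize(768) == 1024 and pick_blocksize(5140) == 4096, matching
# what the inline loops assign to self.bsz (from input_features) and
# self.bsz2 (from output_features).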
@@ -553,3 +612,30 @@ class LinearInt8Cast(nn.Linear): return out + +class LinearFP4(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + #self.bw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device) + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device) + + out = bnb.matmul_fp4(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + if self.bias is not None: + out += self.bias + + return out \ No newline at end of file From 69810521d37ed419452aac573f1c3b283290668c Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 27 Mar 2023 09:12:57 -0700 Subject: [PATCH 22/97] Some small changes. --- bitsandbytes/nn/modules.py | 8 +- bitsandbytes/utils.py | 40 +++++++++ csrc/kernels.cu | 2 + csrc/ops.cu | 2 + tests/test_functional.py | 170 ++++++++++++++++++------------------- 5 files changed, 135 insertions(+), 87 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 5d6d19c..a550ec1 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -173,10 +173,11 @@ class FP4Params(torch.nn.Parameter): class LinearFP4(nn.Linear): - def __init__(self, input_features, output_features, bias=True): + def __init__(self, input_features, output_features, bias=True, compute_dtype=None): super().__init__(input_features, output_features, bias) self.state = bnb.MatmulLtState() self.weight = FP4Params(self.weight.data, requires_grad=False) + self.compute_dtype = compute_dtype def init_8bit_state(self): pass @@ -191,9 +192,12 @@ class LinearFP4(nn.Linear): if getattr(self.weight, 'quant_state', None) is None: print('FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.') inp_dtype = x.dtype - x = x.to(torch.float16) + if self.compute_dtype is not None: + x = x.to(self.compute_dtype) + bias = None if self.bias is None else self.bias.half() out = bnb.matmul_fp4(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) + out = out.to(inp_dtype) return out diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 1cd90e3..d6cc966 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -21,3 +21,43 @@ def execute_and_return(command_string: str) -> Tuple[str, str]: std_out, std_err = execute_and_return_decoded_std_streams(command_string) return std_out, std_err + + + +def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None): + """ + Replace linear modules with a new Linear module. + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + linear_replacement (`torch.nn.Module`): + The linear module that replaces the old one. Only expects standard arguments. + If other arguments need to be passed, use a lambda. + skip_modules (`List[str]`, *optional*, defaults to `lm_head`): + List of modules names not to convert. Defaults to `lm_head`. 
+ copy_weights (`bool`): + Copy the weights from the old linear module to the new one + post_processing_fun_name (`str`): + A function name of the replacement linear class that is called + after processing. + """ + for name, module in model.named_children(): + if len(list(module.children())) > 0: + replace_linear(module, linear_replacement, skip_modules, copy_weights, post_processing_function) + + if isinstance(module, torch.nn.Linear) and name not in skip_modules: + old_module = model._modules[name] + model._modules[name] = linear_replacement( + module.in_features, + module.out_features, + module.bias is not None, + ) + if copy_weights: + model._modules[name].weight = old_module.weight + model._modules[name].bias = old_module.bias + + if post_processing_function is not None: + func = getattr(module, post_processing_function, None) + if func is not None: func(module) + return model + diff --git a/csrc/kernels.cu b/csrc/kernels.cu index a1eec68..a2691be 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2968,6 +2968,8 @@ template __global__ void kQuantizeBlockwise(float * code, ha template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +//template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +//template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); diff --git a/csrc/ops.cu b/csrc/ops.cu index 483d915..07ef850 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -71,6 +71,8 @@ template void quantizeBlockwise(float * co kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 64) kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + //else if(blocksize == 32) + //kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); diff --git a/tests/test_functional.py b/tests/test_functional.py index 23b7558..54cecca 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1784,17 +1784,17 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 1 -seqdim = 1 +batch_size = 4 +seqdim = 256 values = [] values.append((batch_size, seqdim, 768, 4 * 768)) -#values.append((batch_size, seqdim, 1024, 4*1024)) -#values.append((batch_size, seqdim, 1536, 4*1536)) -#values.append((batch_size, seqdim, 2048, 4*2048)) -#values.append((batch_size, seqdim, 2560, 4*2560)) -#values.append((batch_size, seqdim, 4096, 4*4096)) -#values.append((batch_size, seqdim, 5140, 4*5140)) -#values.append((batch_size, 
seqdim, 12288, 4*12288)) +values.append((batch_size, seqdim, 1024, 4*1024)) +values.append((batch_size, seqdim, 1536, 4*1536)) +values.append((batch_size, seqdim, 2048, 4*2048)) +values.append((batch_size, seqdim, 2560, 4*2560)) +values.append((batch_size, seqdim, 4096, 4*4096)) +values.append((batch_size, seqdim, 5140, 4*5140)) +values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): @@ -1839,90 +1839,90 @@ def test_bench_matmul(batch, seq, model, hidden): torch.cuda.synchronize() print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - bnb.matmul(A, B) - torch.cuda.synchronize() - print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # bnb.matmul(A, B) + #torch.cuda.synchronize() + #print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - bnb.matmul(A, B, threshold=6.0) - torch.cuda.synchronize() - print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # bnb.matmul(A, B, threshold=6.0) + #torch.cuda.synchronize() + #print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0) - C32A, SA = F.transform(CA, "col32") - CB, CBt, SCB, SCBt, coo_tensorB = F.double_quant(B) - CxB, SB = F.transform(CB, to_order=formatB) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) - torch.cuda.synchronize() - print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0) + #C32A, SA = F.transform(CA, "col32") + #CB, CBt, SCB, SCBt, coo_tensorB = F.double_quant(B) + #CxB, SB = F.transform(CB, to_order=formatB) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) + #torch.cuda.synchronize() + #print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - BA, statsB = F.vectorwise_quant(B, dim=1) - CxB, SB = F.nvidia_transform(CB, to_order=formatB) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - A2 = A.view(-1, A.shape[-1]).contiguous() - CA, statsA = F.vectorwise_quant(A2, dim=1) - C32A, SA = F.nvidia_transform(CA, "col32") - out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) - Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) - F.vectorwise_mm_dequant(Cout, statsA, statsB.t()) - torch.cuda.synchronize() - print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #BA, statsB = F.vectorwise_quant(B, dim=1) + #CxB, SB = F.nvidia_transform(CB, 
to_order=formatB) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # A2 = A.view(-1, A.shape[-1]).contiguous() + # CA, statsA = F.vectorwise_quant(A2, dim=1) + # C32A, SA = F.nvidia_transform(CA, "col32") + # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) + # Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) + # F.vectorwise_mm_dequant(Cout, statsA, statsB.t()) + #torch.cuda.synchronize() + #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear") - CxB, SB = F.nvidia_transform(CB, to_order=formatB) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - A2 = A.view(-1, A.shape[-1]).contiguous() - CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear") - C32A, SA = F.nvidia_transform(CA, "col32") - out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) - Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) - out = Cout * statsB * statsA * (1.0 / (127 * 127)) - torch.cuda.synchronize() - print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear") + #CxB, SB = F.nvidia_transform(CB, to_order=formatB) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # A2 = A.view(-1, A.shape[-1]).contiguous() + # CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear") + # C32A, SA = F.nvidia_transform(CA, "col32") + # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) + # Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) + # out = Cout * statsB * statsA * (1.0 / (127 * 127)) + #torch.cuda.synchronize() + #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - linear8bit(A) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - linear8bit(A) - torch.cuda.synchronize() - print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #linear8bit(A) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # linear8bit(A) + #torch.cuda.synchronize() + #print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - linearMixedBit(A) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - linearMixedBit(A) - torch.cuda.synchronize() - print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #linearMixedBit(A) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # linearMixedBit(A) + #torch.cuda.synchronize() + #print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - linear8bit_train(A) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - linear8bit_train(A) - torch.cuda.synchronize() - print( f"bnb linear8bitlt (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #linear8bit_train(A) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # linear8bit_train(A) + #torch.cuda.synchronize() + #print( f"bnb linear8bitlt (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: 
{time.time()-t0:.4f}s") - linear8bit_train_thresh(A) - torch.cuda.synchronize() - t0 = time.time() - for i in range(iters): - linear8bit_train(A) - torch.cuda.synchronize() - print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + #linear8bit_train_thresh(A) + #torch.cuda.synchronize() + #t0 = time.time() + #for i in range(iters): + # linear8bit_train(A) + #torch.cuda.synchronize() + #print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") def test_zeropoint(): def quant_zp(x): From 5f3d9ada8dabbd9a449f134141f14546f9ce911e Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Wed, 29 Mar 2023 06:47:08 +0000 Subject: [PATCH 23/97] triton-v1 --- bitsandbytes/nn/__init__.py | 1 + bitsandbytes/nn/triton_based_modules.py | 247 ++++++++++++ bitsandbytes/nn/triton_utils/v0/__init__.py | 0 .../nn/triton_utils/v0/fused_gelu_quantize.py | 190 +++++++++ .../v0/int8_matmul_mixed_dequanitze.py | 276 +++++++++++++ .../v0/int8_matmul_rowwise_dequantize.py | 149 +++++++ .../v0/int8_matmul_rowwise_dequantize_bias.py | 160 ++++++++ .../quantize_columnwise_nogroup_transpose.py | 122 ++++++ .../nn/triton_utils/v0/quantize_global.py | 130 +++++++ .../v0/quantize_rowwise_nogroup.py | 174 +++++++++ tests/triton_tests/attn_decomp.py | 363 ++++++++++++++++++ tests/triton_tests/attn_info_ln.jsonl | 20 + tests/triton_tests/full_matrix_decomp.py | 353 +++++++++++++++++ tests/triton_tests/info.jsonl | 142 +++++++ tests/triton_tests/info_mlp.jsonl | 20 + tests/triton_tests/info_mlp_autocast.jsonl | 20 + tests/triton_tests/info_mlp_autocast_ln.jsonl | 23 ++ tests/triton_tests/make_plot_with_info.py | 137 +++++++ tests/triton_tests/mlp.py | 64 +++ tests/triton_tests/mlp_decomp_autocast.py | 166 ++++++++ tests/triton_tests/mlp_decomp_autocast_ln.py | 165 ++++++++ tests/triton_tests/plot1.pdf | Bin 0 -> 34302 bytes tests/triton_tests/plot1.png | Bin 0 -> 121873 bytes tests/triton_tests/plot2.pdf | Bin 0 -> 16044 bytes tests/triton_tests/plot2.png | Bin 0 -> 51996 bytes tests/triton_tests/plot2.py | 69 ++++ tests/triton_tests/plot3.pdf | Bin 0 -> 20122 bytes tests/triton_tests/plot3.png | Bin 0 -> 58335 bytes tests/triton_tests/plot3.py | 193 ++++++++++ tests/triton_tests/rowwise.py | 43 +++ 30 files changed, 3227 insertions(+) create mode 100644 bitsandbytes/nn/triton_based_modules.py create mode 100644 bitsandbytes/nn/triton_utils/v0/__init__.py create mode 100644 bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py create mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py create mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py create mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py create mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py create mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_global.py create mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py create mode 100644 tests/triton_tests/attn_decomp.py create mode 100644 tests/triton_tests/attn_info_ln.jsonl create mode 100644 tests/triton_tests/full_matrix_decomp.py create mode 100644 tests/triton_tests/info.jsonl create mode 100644 tests/triton_tests/info_mlp.jsonl create mode 100644 tests/triton_tests/info_mlp_autocast.jsonl create mode 100644 tests/triton_tests/info_mlp_autocast_ln.jsonl create mode 100644 
tests/triton_tests/make_plot_with_info.py create mode 100644 tests/triton_tests/mlp.py create mode 100644 tests/triton_tests/mlp_decomp_autocast.py create mode 100644 tests/triton_tests/mlp_decomp_autocast_ln.py create mode 100644 tests/triton_tests/plot1.pdf create mode 100644 tests/triton_tests/plot1.png create mode 100644 tests/triton_tests/plot2.pdf create mode 100644 tests/triton_tests/plot2.png create mode 100644 tests/triton_tests/plot2.py create mode 100644 tests/triton_tests/plot3.pdf create mode 100644 tests/triton_tests/plot3.png create mode 100644 tests/triton_tests/plot3.py create mode 100644 tests/triton_tests/rowwise.py diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 8be7674..8e3a598 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -3,3 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed +from .triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py new file mode 100644 index 0000000..9fe0b69 --- /dev/null +++ b/bitsandbytes/nn/triton_based_modules.py @@ -0,0 +1,247 @@ +import torch +import torch.nn as nn +import time + +from .triton_utils.v0.quantize_rowwise_nogroup import quantize_rowwise_nogroup +from .triton_utils.v0.quantize_columnwise_nogroup_transpose import quantize_columnwise_nogroup_transpose +from .triton_utils.v0.int8_matmul_rowwise_dequantize_bias import int8_matmul_rowwise_dequantize_bias +from .triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize +from .triton_utils.v0.quantize_global import quantize_global, quantize_global_transpose +from .triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze, int8_matmul_mixed_dequanitze_bias +from .triton_utils.v0.fused_gelu_quantize import quantize_rowwise_nogroup_gelu, quantize_rowwise_nogroup_back_gelu + +class _switchback(torch.autograd.Function): + + @staticmethod + def forward(ctx, X_3D, W, bias): + + X = X_3D.view(-1, X_3D.size(-1)) + + ctx.save_for_backward = X, W + X_int8, state_X = quantize_rowwise_nogroup(X) + W_int8, state_W = quantize_rowwise_nogroup(W) + return int8_matmul_rowwise_dequantize_bias( + X_int8, W_int8.t(), state_X, state_W, bias + ).view(*X_3D.size()[:-1], -1) + + @staticmethod + def backward(ctx, G_3D): + X, W = ctx.save_for_backward + + G = G_3D.reshape(-1, G_3D.size(-1)) + + grad_X = grad_W = grad_bias = None + + if ctx.needs_input_grad[0]: + G_int8, state_G = quantize_rowwise_nogroup(G) + W_int8, state_W = quantize_columnwise_nogroup_transpose(W) + grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W).view( + *G_3D.size()[:-1], -1 + ) + if ctx.needs_input_grad[1]: + grad_W = torch.matmul(G.t(), X.to(G.dtype)) + if ctx.needs_input_grad[2]: + grad_bias = G.sum(dim=0) + + return grad_X, grad_W, grad_bias + +class SwitchBackLinear(nn.Linear): + + def prepare_for_eval(self): + state_W = self.weight.abs().max(dim=1, keepdim=True)[0] + W_int8 = (127 * self.weight.float() / state_W).round().to(torch.int8) + state_W = state_W.squeeze() + + self.register_buffer("W_int8", W_int8) + self.register_buffer("state_W", state_W) + + del self.weight + + def 
forward(self, x): + if self.training: + return _switchback.apply(x, self.weight, self.bias) + else: + if not hasattr(self, "state_W"): + self.prepare_for_eval() + X = x.view(-1, x.size(-1)) + X_int8, state_X = quantize_rowwise_nogroup(X) + return int8_matmul_rowwise_dequantize_bias( + X_int8, self.W_int8.t(), state_X, self.state_W, self.bias + ).view(*x.size()[:-1], -1) + + +class _switchback_global(torch.autograd.Function): + + @staticmethod + def forward(ctx, X_3D, W, bias): + + X = X_3D.view(-1, X_3D.size(-1)) + + X_int8, state_X = quantize_rowwise_nogroup(X) + W_int8, state_W = quantize_global(W) + ctx.save_for_backward = X, W + return int8_matmul_mixed_dequanitze_bias( + X_int8, W_int8.t(), state_X, state_W, bias + ).view(*X_3D.size()[:-1], -1) + + @staticmethod + def backward(ctx, G_3D): + + G = G_3D.reshape(-1, G_3D.size(-1)) + + grad_X = grad_W = grad_bias = None + + X, W = ctx.save_for_backward + if ctx.needs_input_grad[0]: + G_int8, state_G = quantize_rowwise_nogroup(G) + W_int8, state_W = quantize_global_transpose(W) + grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W).view( + *G_3D.size()[:-1], -1 + ) + if ctx.needs_input_grad[1]: + grad_W = torch.matmul(G.t(), X.to(G.dtype)) + if ctx.needs_input_grad[2]: + grad_bias = G.sum(dim=0) + + return grad_X, grad_W, grad_bias + + + +class SwitchBackGlobalLinear(nn.Linear): + + def prepare_for_eval(self): + state_W = self.weight.abs().max() + W_int8 = (127 * self.weight.float() / state_W).round().to(torch.int8) + + self.register_buffer("W_int8", W_int8) + self.register_buffer("state_W", state_W) + + del self.weight + + def forward(self, x): + if self.training: + return _switchback_global.apply(x, self.weight, self.bias) + else: + if not hasattr(self, "state_W"): + self.prepare_for_eval() + X = x.view(-1, x.size(-1)) + X_int8, state_X = quantize_rowwise_nogroup(X) + return int8_matmul_mixed_dequanitze_bias( + X_int8, self.W_int8.t(), state_X, self.state_W, self.bias + ).view(*x.size()[:-1], -1) + + + + +class LinearFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input, weight, bias=None): + X = input.view(-1, input.size(-1)) + + ctx.save_for_backward(X, weight, bias) + output = input.matmul(weight.t()) + if bias is not None: + output += bias.unsqueeze(0).expand_as(output) + return output.view(*input.size()[:-1], -1) + + @staticmethod + def backward(ctx, grad_output_3D): + input, weight, bias = ctx.saved_tensors + + grad_output = grad_output_3D.reshape(-1, grad_output_3D.size(-1)) + + grad_input = grad_weight = grad_bias = None + + if ctx.needs_input_grad[0]: + grad_input = grad_output.matmul(weight.to(grad_output.dtype)).view(*grad_output_3D.size()[:-1], -1) + if ctx.needs_input_grad[1]: + grad_weight = grad_output.t().matmul(input.to(grad_output.dtype)) + if bias is not None and ctx.needs_input_grad[2]: + grad_bias = grad_output.sum(0) + + return grad_input, grad_weight, grad_bias + +class MyLinear(nn.Linear): + + def forward(self, x): + return LinearFunction.apply(x, self.weight, self.bias) + + + + +class _switchback_mlp(torch.autograd.Function): + + + @staticmethod + def forward(ctx, X_3D, W1, B1, W2, B2): + + X1 = X_3D.view(-1, X_3D.size(-1)) + + X1_int8, state_X1 = quantize_rowwise_nogroup(X1) + W1_int8, state_W1 = quantize_global(W1) + + X2_pre = int8_matmul_mixed_dequanitze_bias( + X1_int8, W1_int8.t(), state_X1, state_W1, B1 + ) + + # X2_v1 = torch.nn.functional.gelu(X2) + # X2_int8, state_X2, = quantize_rowwise_nogroup(X2_v1) + X2_int8, state_X2, X2 = 
quantize_rowwise_nogroup_gelu(X2_pre) + + W2_int8, state_W2 = quantize_global(W2) + + out = int8_matmul_mixed_dequanitze_bias( + X2_int8, W2_int8.t(), state_X2, state_W2, B2 + ) + + ctx.save_for_backward = X1, W1, X2, X2_pre, W2 + + return out.view(*X_3D.size()[:-1], -1) + + @staticmethod + def backward(ctx, G_3D): + + G2 = G_3D.reshape(-1, G_3D.size(-1)) + + grad_X1 = grad_W1 = grad_B1 = grad_W2 = grad_B2 = None + + X1, W1, X2, X2_pre, W2 = ctx.save_for_backward + + G2_int8, state_G2 = quantize_rowwise_nogroup(G2) + W2_int8, state_W2 = quantize_global_transpose(W2) + + G1 = int8_matmul_mixed_dequanitze(G2_int8, W2_int8.t(), state_G2, state_W2).view( + *G_3D.size()[:-1], -1 + ) + + grad_W2 = torch.matmul(G2.t(), X2.to(G2.dtype)) + grad_B2 = G2.sum(dim=0) + + G1_int8, state_G1, G1 = quantize_rowwise_nogroup_back_gelu(G1, X2_pre) + + if ctx.needs_input_grad[0]: + + W1_int8, state_W1 = quantize_global_transpose(W1) + grad_X1 = int8_matmul_mixed_dequanitze(G1_int8, W1_int8.t(), state_G1, state_W1).view( + *G_3D.size()[:-1], -1 + ) + if ctx.needs_input_grad[1]: + grad_W1 = torch.matmul(G1.t(), X1.to(G1.dtype)) + if ctx.needs_input_grad[2]: + grad_B1 = G1.sum(dim=0) + + return grad_X1, grad_W1, grad_B1, grad_W2, grad_B2 + + +class SwitchBackGlobalMLP(nn.Module): + + + def __init__(self, dim_in, dim_hidden): + super().__init__() + self.linear1 = nn.Linear(dim_in, dim_hidden) + self.linear2 = nn.Linear(dim_hidden, dim_in) + + + def forward(self, x): + return _switchback_mlp.apply(x, self.linear1.weight, self.linear1.bias, self.linear2.weight, self.linear2.bias) + \ No newline at end of file diff --git a/bitsandbytes/nn/triton_utils/v0/__init__.py b/bitsandbytes/nn/triton_utils/v0/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py b/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py new file mode 100644 index 0000000..50451cb --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py @@ -0,0 +1,190 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +tl.libdevice + +# TODO: autotune this better. +@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_rowwise_nogroup_gelu( + x_ptr, + output_ptr, + output_maxs, + output_fp16, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + cdf = 0.5 * (1.0 + tl.libdevice.tanh(x * 0.7978845608 * (1 + 0.044715 * x * x))) + x_new = x * cdf + + tl.store(output_fp16 + offsets, x_new, mask=row_mask) + + abs_x = tl.abs(x_new) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. 
* (x_new / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_rowwise_nogroup_gelu(x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_fp16 = torch.empty(*x.shape, device=x.device, dtype=torch.float16) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise_nogroup_gelu[grid](x, output, output_maxs, output_fp16, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs, output_fp16 + + + +# TODO: autotune this better. +@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_rowwise_nogroup_back_gelu( + x_ptr, + in_ptr, + output_ptr, + output_maxs, + output_fp16, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x_out = tl.load(x_ptr + offsets, mask=row_mask) + x_in = tl.load(in_ptr + offsets, mask=row_mask) + + cdf = 0.5 * (1.0 + tl.libdevice.tanh(x_in * 0.7978845608 * (1 + 0.044715 * x_in * x_in))) + intermediate = tl.libdevice.tanh(x_in * 0.7978845608 * (1 + 0.044715 * x_in * x_in)) + dcdf = 0.5 * (0.7978845608 + 0.1070322243 * x_in * x_in) * (1 - intermediate * intermediate) + x = x_out * (cdf + x_in * dcdf) + + tl.store(output_fp16 + offsets, x, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. 
* (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_rowwise_nogroup_back_gelu(x: torch.Tensor, y : torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_fp16 = torch.empty(*x.shape, device=x.device, dtype=torch.float16) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise_nogroup_back_gelu[grid](x, y, output, output_maxs, output_fp16, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs, output_fp16 + + + +# if __name__ == '__main__': +# torch.manual_seed(0) + +# x = torch.randn(1280, 768).cuda().to(torch.float16) +# out = quantize_rowwise_nogroup(x) + +# x_real = (127 * x.float() / x.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) +# max2 = x.abs().max(1)[0] + +# print(torch.allclose(out[1], max2)) +# print( (x_real == out[0]).float().mean() ) + +# # for i in range(x.shape[0]): +# # print( (x_real[i, :] == out[0][i, :]).float().mean() ) + +# # print(out[0]) +# # print(x_real) +# # import pdb; pdb.set_trace() +# # print(out[2]) +# # print(out[2][:10]) +# sums = x.sum(dim=0) +# #print(sums[:10]) +# #print( (sums == out[2]).float().mean() ) + +# import pdb; pdb.set_trace() +# # import pdb; pdb.set_trace() +# # exit() + +# # repeat = 16 + +# # for _ in range(8): +# # out = quantize_rowwise_nogroup(x) + +# # triton_graph = torch.cuda.CUDAGraph() +# # with torch.cuda.graph(triton_graph): +# # out = quantize_rowwise_nogroup(x) + +# # triton_graph.replay() + +# # torch.cuda.synchronize() +# # start = time.time() +# # for _ in range(repeat): +# # triton_graph.replay() +# # torch.cuda.synchronize() +# # end = time.time() + +# # print(out[0]) +# # print(out[1]) +# # print(x / x.abs().max(dim=1, keepdim=True)[0]) +# # max1 = out[1] +# # max2 = x.abs().max(1)[0] +# # print(max1, max2) +# # print(torch.allclose(max1, max2)) + +# #print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py new file mode 100644 index 0000000..2ecfcb8 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py @@ -0,0 +1,276 @@ +import torch + +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, 
num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr) + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, 
tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_mixed_dequanitze(a, b, state_x, state_w): + device = a.device + divfactor = 1. / (127. * 127.) + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _kernel[grid](a, b, c, state_x, state_w, M, N, K, divfactor, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c + + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 
'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _kernel_bias(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr) + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_mixed_dequanitze_bias(a, b, state_x, state_w, bias): + device = a.device + divfactor = 1. / (127. * 127.) 
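+    # divfactor undoes both int8 scalings at dequantization time: the
+    # per-row 127/absmax applied to the activations and the single global
+    # 127/absmax applied to the weight matrix.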
+ has_bias = 0 if bias is None else 1 + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _kernel_bias[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py new file mode 100644 index 0000000..fa0b516 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py @@ -0,0 +1,149 @@ +import torch + +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 
'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr + rbn)[None, :] + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_rowwise_dequantize(a, b, state_x, state_w): + divfactor = 1. / (127. * 127.) 
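+    # Both operands were quantized rowwise here, so the kernel rescales each
+    # output element by the activation row's absmax and the weight column's
+    # absmax, times divfactor.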
+ + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _kernel[grid](a, b, c, state_x, state_w, M, N, K, divfactor, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py new file mode 100644 index 0000000..5f524c1 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py @@ -0,0 +1,160 @@ +import torch + +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, 
num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _kernel(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr + rbn)[None, :] + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_rowwise_dequantize_bias(a, b, state_x, state_w, bias): + + #print(bias) + divfactor = 1. / (127. * 127.) 
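+    # Same rowwise dequantization as int8_matmul_rowwise_dequantize above;
+    # the bias is added inside the kernel after the rescale, in the fp16
+    # output dtype.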
+ + has_bias = 0 if bias is None else 1 + + if bias is not None: + bias = bias.contiguous() + + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _kernel[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py b/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py new file mode 100644 index 0000000..fa3a9a9 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py @@ -0,0 +1,122 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# TODO: autotune this better. +@triton.autotune( + configs=[ + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_stages=16), + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=16, num_warps=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_columnwise_nogroup_transpose( + x_ptr, + output_ptr, + output_maxs, + n_elements, + M : tl.constexpr, N : tl.constexpr, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid + p2_arange = tl.arange(0, P2) + p2_arange_mask = p2_arange < M + arange = p2_arange * N + offsets = block_start + arange + x = tl.load(x_ptr + offsets, mask=p2_arange_mask) + abs_x = tl.abs(x) + max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. * (x / max_val)) + + new_start = pid * M + new_offsets = new_start + p2_arange + tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_columnwise_nogroup_transpose(x: torch.Tensor): + M, N = x.shape + output = torch.empty(N, M, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(M)))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_columnwise_nogroup_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) + return output, output_maxs + + + +if __name__ == '__main__': + torch.manual_seed(0) + + x = torch.randn(1280, 768).cuda().to(torch.float16) + out = quantize_columnwise_nogroup_transpose(x) + + + x_real = x.t().float() + x_real_int8 = (127. 
* x_real / x_real.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) + maxs = x_real.abs().max(dim=1, keepdim=True)[0].half() + + #print(out[0][2,:]) + + print((out[0] == x_real_int8).float().mean()) + print((out[1] == maxs[:, 0]).float().mean()) + + # print(out[0]) + # print(out[1]) + + # print(out[0][2,:]) + # print(x_real[2, :]) + + # print((out[0] != x_real).nonzero()) + + #import pdb; pdb.set_trace() + # repeat = 16 + + # for _ in range(8): + # out = quantize_columnwise_nogroup_transpose(x) + + # triton_graph = torch.cuda.CUDAGraph() + # with torch.cuda.graph(triton_graph): + # out = quantize_columnwise_nogroup_transpose(x) + + # triton_graph.replay() + + # torch.cuda.synchronize() + # start = time.time() + # for _ in range(repeat): + # triton_graph.replay() + # torch.cuda.synchronize() + # end = time.time() + + # print(out[0]) + # print(out[1]) + # print(x / x.abs().max(dim=0, keepdim=True)[0]) + # x_real = (127 * (x / x.abs().max(dim=0, keepdim=True)[0])).round().to(torch.int8) + # max1 = out[1] + # max2 = x.abs().max(0)[0] + # print(max1, max2) + # import pdb; pdb.set_trace() + # print(torch.allclose(max1, max2)) + + # print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_global.py b/bitsandbytes/nn/triton_utils/v0/quantize_global.py new file mode 100644 index 0000000..6d23aac --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/quantize_global.py @@ -0,0 +1,130 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# TODO: autotune this better. +@triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), + triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), + + ], + key=['n_elements'] +) +@triton.jit +def _quantize_global( + x_ptr, + absmax_inv_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + output = tl.libdevice.llrint(127. * (x * absmax_inv)) + tl.store(output_ptr + offsets, output, mask=mask) + +def quantize_global(x: torch.Tensor): + absmax = x.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_global[grid](x, absmax_inv, output, n_elements) + return output, absmax + + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + + # ... 
+ ], + key=['M', 'N'] +) +@triton.jit +def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, + GROUP_M : tl.constexpr): + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // group_size + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) + mask = (rm < M)[:, None] & (rn < N)[None, :] + a = tl.load(A, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + + # rematerialize to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + + output = tl.libdevice.llrint(127. * (a * absmax_inv)) + + tl.store(B, output, mask=mask) + +def quantize_global_transpose(input): + absmax = input.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + M, N = input.shape + out = torch.empty(N, M, device='cuda', dtype=torch.int8) + + assert out.size(0) == N and out.size(1) == M + assert input.stride(0) == 1 or input.stride(1) == 1 + assert out.stride(0) == 1 or out.stride(1) == 1 + + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) + _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) + return out, absmax + +if __name__ == '__main__': + + + w = torch.randn(768, 1280).cuda().to(torch.float16) + W_int8, state_w = quantize_global(w) + r_state_w = w.abs().max() + r_W_int8 = ((127 * w.float()) / state_w).round().to(torch.int8) + print((r_W_int8 == W_int8).float().mean()) + + # print(r_W_int8) + # print(W_int8) + exit() + repeat = 16 + + for _ in range(8): + out = quantize_global(w) + + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph): + out = quantize_global(w) + + triton_graph.replay() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + triton_graph.replay() + torch.cuda.synchronize() + end = time.time() + + print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py new file mode 100644 index 0000000..7e63f74 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py @@ -0,0 +1,174 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# TODO: autotune this better. 
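For review, the rowwise kernel added below should agree with a plain eager-mode PyTorch reference (a sanity-check sketch only, not part of the patch; `reference_quantize_rowwise` is an illustrative name):

    import torch

    def reference_quantize_rowwise(x: torch.Tensor):
        # Per-row absmax scale to int8; torch.round matches the kernel's
        # llrint behavior (round-half-to-even).
        absmax = x.abs().max(dim=1, keepdim=True)[0]
        x_int8 = (127.0 * (x.float() / absmax)).round().to(torch.int8)
        return x_int8, absmax.squeeze(1).to(torch.float16)

This mirrors the check in this file's own __main__ block, which compares the kernel output against exactly this computation via `(out[0] == x_real_int8).float().mean()`.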
+@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_rowwise_nogroup( + x_ptr, + output_ptr, + output_maxs, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. * (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_rowwise_nogroup(x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise_nogroup[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _experimental_quantize_rowwise_nogroup( + x_ptr, + output_ptr, + bias_grad_ptr, + output_maxs, + n_elements, + M: tl.constexpr, N: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, + P2M: tl.constexpr, +): + pid = tl.program_id(axis=0) + if pid < M: + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. 
* (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + else: + real_pid = pid - M + arange_new = tl.arange(0, P2M) + mask_new = arange_new < M + offsets_new = real_pid + arange_new * N + new_x = tl.load(x_ptr + offsets_new, mask=mask_new) + s = tl.sum(tl.where(mask_new, new_x, 0).to(tl.float32), axis=0) + tl.store(bias_grad_ptr + real_pid, s) + +def experimental_quantize_rowwise_nogroup(x: torch.Tensor): + M, N = x.shape + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + bias_grad = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + P2M = int(2 ** (math.ceil(math.log2(x.shape[0])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0] + x.shape[1],) + _experimental_quantize_rowwise_nogroup[grid](x, output, bias_grad, output_maxs, n_elements, M, N, BLOCK_SIZE=x.shape[1], P2=P2, P2M=P2M) + return output, output_maxs, bias_grad + + +if __name__ == '__main__': + torch.manual_seed(0) + + x = torch.randn(1280, 768).cuda().to(torch.float16) + out = quantize_rowwise_nogroup(x) + + x_real = (127 * x.float() / x.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) + max2 = x.abs().max(1)[0] + + print(torch.allclose(out[1], max2)) + print( (x_real == out[0]).float().mean() ) + + # for i in range(x.shape[0]): + # print( (x_real[i, :] == out[0][i, :]).float().mean() ) + + # print(out[0]) + # print(x_real) + # import pdb; pdb.set_trace() + # print(out[2]) + # print(out[2][:10]) + sums = x.sum(dim=0) + #print(sums[:10]) + #print( (sums == out[2]).float().mean() ) + + import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() + # exit() + + # repeat = 16 + + # for _ in range(8): + # out = quantize_rowwise_nogroup(x) + + # triton_graph = torch.cuda.CUDAGraph() + # with torch.cuda.graph(triton_graph): + # out = quantize_rowwise_nogroup(x) + + # triton_graph.replay() + + # torch.cuda.synchronize() + # start = time.time() + # for _ in range(repeat): + # triton_graph.replay() + # torch.cuda.synchronize() + # end = time.time() + + # print(out[0]) + # print(out[1]) + # print(x / x.abs().max(dim=1, keepdim=True)[0]) + # max1 = out[1] + # max2 = x.abs().max(1)[0] + # print(max1, max2) + # print(torch.allclose(max1, max2)) + + #print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/tests/triton_tests/attn_decomp.py b/tests/triton_tests/attn_decomp.py new file mode 100644 index 0000000..9e8ed28 --- /dev/null +++ b/tests/triton_tests/attn_decomp.py @@ -0,0 +1,363 @@ + +import torch +import json +from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear +import time + +# class AttentionOld(torch.nn.Module): +# def __init__( +# self, +# dim, +# num_heads=8, +# qkv_bias=True, +# scaled_cosine=False, +# scale_heads=False, +# attn_drop=0., +# proj_drop=0., +# linear_module=torch.nn.Linear, +# ): +# super().__init__() +# self.scaled_cosine = scaled_cosine +# self.scale_heads = scale_heads +# assert dim % num_heads == 0, 'dim should be divisible by num_heads' +# self.num_heads = num_heads +# self.head_dim = dim // num_heads +# self.scale = self.head_dim ** -0.5 + +# self.in_proj_linear = linear_module(dim, 3 * dim, bias = qkv_bias) + +# self.attn_drop = torch.nn.Dropout(attn_drop) +# if self.scale_heads: +# self.head_scale = torch.nn.Parameter(torch.ones((num_heads, 1, 1))) +# else: +# 
self.head_scale = None +# self.out_proj = linear_module(dim, dim) +# self.out_drop = torch.nn.Dropout(proj_drop) + +# def forward(self, x, attn_mask = None): +# L, N, C = x.shape + +# q, k, v = self.in_proj_linear(x).chunk(3, dim=-1) + +# q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) +# k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) +# v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) + +# q = q * self.scale +# attn = torch.bmm(q, k.transpose(-1, -2)) + +# if attn_mask is not None: +# if attn_mask.dtype == torch.bool: +# new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype) +# new_attn_mask.masked_fill_(attn_mask, float("-inf")) +# attn_mask = new_attn_mask +# attn += attn_mask + +# attn = attn.softmax(dim=-1) +# attn = self.attn_drop(attn) + +# x = torch.bmm(attn, v) +# x = x.transpose(0, 1).reshape(L, N, C) + +# x = self.out_proj(x) +# x = self.out_drop(x) +# return x + +class Attention(torch.nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=True, + scaled_cosine=False, + scale_heads=False, + attn_drop=0., + proj_drop=0., + linear_module=torch.nn.Linear, + ): + super().__init__() + self.scaled_cosine = scaled_cosine + self.scale_heads = scale_heads + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim ** -0.5 + + self.ln = torch.nn.LayerNorm(dim) + + self.in_proj_linear = linear_module(dim, 3 * dim, bias = qkv_bias) + + self.attn_drop = torch.nn.Dropout(attn_drop) + if self.scale_heads: + self.head_scale = torch.nn.Parameter(torch.ones((num_heads, 1, 1))) + else: + self.head_scale = None + self.out_proj = linear_module(dim, dim) + self.out_drop = torch.nn.Dropout(proj_drop) + + def forward(self, x, attn_mask = None): + q, k, v = self.in_proj_linear(self.ln(x)).chunk(3, dim=-1) + x = torch.compile(torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask)) + x = self.out_proj(x) + return x + +if __name__ == '__main__': + + + for dim in [1024, 1280, 1408, 1664, 2048]: + for batch in [2**14, 2**15, 2**16, 2**17]: + + # if dim != 4096 or batch != 2**17: + # continue + + x1 = torch.randn( batch // 256, 256, dim ).cuda().requires_grad_(True) + qu = torch.randn( batch // 256, 256, dim ).cuda().requires_grad_(True) + ke = torch.randn( batch // 256, 256, dim ).cuda().requires_grad_(True) + va = torch.randn( batch // 256, 256, dim ).cuda().requires_grad_(True) + + standard = Attention(dim).cuda() + my_standard = Attention(dim, linear_module=MyLinear).cuda() + sb = Attention(dim, linear_module=SwitchBackGlobalLinear).cuda() + standard_compiled = torch.compile(standard) + ln_model = torch.nn.Sequential( + torch.nn.LayerNorm(dim), + torch.nn.LayerNorm(dim), + ).cuda() + ln_model_compiled = torch.compile( + ln_model + ) + gelu_model = torch.nn.Sequential( + torch.nn.GELU(), + ).cuda() + gelu_model_compiled = torch.compile( + gelu_model + ) + + + print('Model part 2') + + repeat = 32 + + info = {'repeat' : repeat, 'batch_size' : batch, 'dim' : dim} + + + k = 'attn' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_attn = torch.nn.functional.scaled_dot_product_attention(qu, ke, va) + ((2 ** 16) * out_attn).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_attn = torch.nn.functional.scaled_dot_product_attention(qu, ke, va) + ((2 ** 16) * out_attn).abs().mean().backward() + + 
torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'ln' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out = ln_model(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out = ln_model(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + k = 'ln_compiled' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out = ln_model_compiled(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out = ln_model_compiled(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'gelu' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out = gelu_model(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out = gelu_model(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + k = 'gelu_compiled' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out = gelu_model_compiled(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out = gelu_model_compiled(x1) + ((2 ** 16) * out).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + x1.grad.zero_() + + k = 'standard' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_standard = standard(x1) + ((2 ** 16) * out_standard).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_standard = standard(x1) + ((2 ** 16) * out_standard).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + k = 'my_standard' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_my_standard = my_standard(x1) + ((2 ** 16) * out_my_standard).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_my_standard = my_standard(x1) + ((2 ** 16) * out_my_standard).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + # + # + + x1.grad.zero_() + + + k = 'standard_compiled' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_standard_compiled = standard_compiled(x1) + ((2 ** 16) * out_standard_compiled).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_standard_compiled = standard_compiled(x1) + ((2 ** 16) * 
out_standard_compiled).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + + k = 'sb' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + info_json = json.dumps(info) + + + with open("tests/triton_tests/attn_info_ln.jsonl", "a") as file: + file.write(info_json + "\n") + + + #exit() + + # err_fused = (out_standard - out_fused).abs().mean() + # err_sb = (out_standard - out_sb).abs().mean() + # print('OUT', err_fused, err_sb) + + # err_fused = (standard[d].weight.grad - fused_mlp.linear2.weight.grad).abs().mean() + # err_sb = (standard[d].weight.grad - sb[d].weight.grad).abs().mean() + + # print('GW2', err_fused, err_sb) + + # err_fused = (standard[0].weight.grad - fused_mlp.linear1.weight.grad).abs().mean() + # err_sb = (standard[0].weight.grad - sb[0].weight.grad).abs().mean() + + # print('GW1', err_fused, err_sb) + + # err_fused = (x1.grad - x2.grad).abs().mean() + # err_sb = (x1.grad - x3.grad).abs().mean() + + # print('GX1', err_fused, err_sb) + + # import pdb; pdb.set_trace() + + + # # NO GELU, ST GRADIENTS, EVERYTHING FINE. \ No newline at end of file diff --git a/tests/triton_tests/attn_info_ln.jsonl b/tests/triton_tests/attn_info_ln.jsonl new file mode 100644 index 0000000..c2f239b --- /dev/null +++ b/tests/triton_tests/attn_info_ln.jsonl @@ -0,0 +1,20 @@ +{"repeat": 32, "batch_size": 16384, "dim": 1024, "attn": 2.1414458751678467, "ln": 1.6365647315979004, "ln_compiled": 1.799367368221283, "gelu": 1.0930374264717102, "gelu_compiled": 1.094818115234375, "standard": 4.159651696681976, "my_standard": 4.696495831012726, "standard_compiled": 3.675594925880432, "sb": 4.1465312242507935} +{"repeat": 32, "batch_size": 32768, "dim": 1024, "attn": 4.100345075130463, "ln": 3.1594187021255493, "ln_compiled": 3.437422215938568, "gelu": 2.109348773956299, "gelu_compiled": 2.11450457572937, "standard": 7.706902921199799, "my_standard": 8.799396455287933, "standard_compiled": 6.735652685165405, "sb": 7.66376405954361} +{"repeat": 32, "batch_size": 65536, "dim": 1024, "attn": 7.953710854053497, "ln": 6.236426532268524, "ln_compiled": 6.746955215930939, "gelu": 4.164382815361023, "gelu_compiled": 4.171714186668396, "standard": 14.894917607307434, "my_standard": 17.042435705661774, "standard_compiled": 12.985721230506897, "sb": 14.6140456199646} +{"repeat": 32, "batch_size": 131072, "dim": 1024, "attn": 15.638880431652069, "ln": 12.333884835243225, "ln_compiled": 13.272866606712341, "gelu": 8.228793740272522, "gelu_compiled": 8.243747055530548, "standard": 29.425136744976044, "my_standard": 35.08377820253372, "standard_compiled": 25.69487690925598, "sb": 28.760001063346863} +{"repeat": 32, "batch_size": 16384, "dim": 1280, "attn": 2.627238631248474, "ln": 2.0098239183425903, "ln_compiled": 2.4197474122047424, "gelu": 1.3455823063850403, "gelu_compiled": 1.35069340467453, "standard": 5.554787814617157, "my_standard": 6.2290579080581665, "standard_compiled": 5.132324993610382, "sb": 5.4178386926651} +{"repeat": 32, "batch_size": 32768, "dim": 1280, "attn": 5.0596073269844055, "ln": 
3.903590142726898, "ln_compiled": 4.719957709312439, "gelu": 2.6203468441963196, "gelu_compiled": 2.627365291118622, "standard": 10.546617209911346, "my_standard": 11.850126087665558, "standard_compiled": 9.685918688774109, "sb": 10.088451206684113} +{"repeat": 32, "batch_size": 65536, "dim": 1280, "attn": 9.845800697803497, "ln": 7.711298763751984, "ln_compiled": 9.292080998420715, "gelu": 5.172915756702423, "gelu_compiled": 5.180932581424713, "standard": 21.371990442276, "my_standard": 23.921720683574677, "standard_compiled": 19.669152796268463, "sb": 20.267993211746216} +{"repeat": 32, "batch_size": 131072, "dim": 1280, "attn": 19.375711679458618, "ln": 15.333592891693115, "ln_compiled": 18.245264887809753, "gelu": 10.264746844768524, "gelu_compiled": 10.283775627613068, "standard": 41.79700464010239, "my_standard": 45.84744572639465, "standard_compiled": 38.35208714008331, "sb": 38.35364431142807} +{"repeat": 32, "batch_size": 16384, "dim": 1408, "attn": 2.9110386967658997, "ln": 2.1998360753059387, "ln_compiled": 2.581551671028137, "gelu": 1.4731436967849731, "gelu_compiled": 1.478634774684906, "standard": 6.764143705368042, "my_standard": 7.331632077693939, "standard_compiled": 6.24605268239975, "sb": 6.325609982013702} +{"repeat": 32, "batch_size": 32768, "dim": 1408, "attn": 5.542516708374023, "ln": 4.289716482162476, "ln_compiled": 5.065307021141052, "gelu": 2.8742849826812744, "gelu_compiled": 2.882353961467743, "standard": 12.749537825584412, "my_standard": 13.79828155040741, "standard_compiled": 11.728867888450623, "sb": 11.642806231975555} +{"repeat": 32, "batch_size": 65536, "dim": 1408, "attn": 10.80312579870224, "ln": 8.471302688121796, "ln_compiled": 9.96796041727066, "gelu": 5.681410431861877, "gelu_compiled": 5.6905597448349, "standard": 25.19702911376953, "my_standard": 27.226239442825317, "standard_compiled": 23.22910726070404, "sb": 22.682294249534607} +{"repeat": 32, "batch_size": 131072, "dim": 1408, "attn": 21.284908056259155, "ln": 16.85701310634613, "ln_compiled": 19.643358886241913, "gelu": 11.292420327663422, "gelu_compiled": 11.314474046230316, "standard": 50.06787180900574, "my_standard": 54.29378151893616, "standard_compiled": 44.58653926849365, "sb": 45.359253883361816} +{"repeat": 32, "batch_size": 16384, "dim": 1664, "attn": 3.382459282875061, "ln": 2.6206374168395996, "ln_compiled": 2.9666870832443237, "gelu": 1.7263293266296387, "gelu_compiled": 1.7317384481430054, "standard": 8.414775133132935, "my_standard": 9.117811918258667, "standard_compiled": 7.7542513608932495, "sb": 7.70898163318634} +{"repeat": 32, "batch_size": 32768, "dim": 1664, "attn": 6.468378007411957, "ln": 5.125559866428375, "ln_compiled": 5.791269242763519, "gelu": 3.3864825963974, "gelu_compiled": 3.3920034766197205, "standard": 16.016244888305664, "my_standard": 17.25083589553833, "standard_compiled": 14.60808515548706, "sb": 14.347739517688751} +{"repeat": 32, "batch_size": 65536, "dim": 1664, "attn": 12.645229697227478, "ln": 10.13532280921936, "ln_compiled": 11.427387595176697, "gelu": 6.6957250237464905, "gelu_compiled": 6.711684167385101, "standard": 31.792201101779938, "my_standard": 34.31189805269241, "standard_compiled": 29.10037338733673, "sb": 28.3128023147583} +{"repeat": 32, "batch_size": 131072, "dim": 1664, "attn": 24.970605969429016, "ln": 20.182937383651733, "ln_compiled": 22.7489173412323, "gelu": 13.326868414878845, "gelu_compiled": 13.345755636692047, "standard": 63.46555054187775, "my_standard": 70.19880414009094, "standard_compiled": 56.40875548124313, "sb": 
56.22846633195877} +{"repeat": 32, "batch_size": 16384, "dim": 2048, "attn": 4.080049693584442, "ln": 3.2655522227287292, "ln_compiled": 3.3329352736473083, "gelu": 2.108432352542877, "gelu_compiled": 2.114713191986084, "standard": 11.370822787284851, "my_standard": 12.234866619110107, "standard_compiled": 10.377615690231323, "sb": 10.209612548351288} +{"repeat": 32, "batch_size": 32768, "dim": 2048, "attn": 7.74645060300827, "ln": 6.418220698833466, "ln_compiled": 6.55733048915863, "gelu": 4.163652658462524, "gelu_compiled": 4.171028733253479, "standard": 21.39316499233246, "my_standard": 23.04024249315262, "standard_compiled": 19.431106746196747, "sb": 18.732361495494843} +{"repeat": 32, "batch_size": 65536, "dim": 2048, "attn": 15.235155820846558, "ln": 12.684382498264313, "ln_compiled": 12.895286083221436, "gelu": 8.228868246078491, "gelu_compiled": 8.242718875408173, "standard": 42.55136102437973, "my_standard": 45.82635313272476, "standard_compiled": 38.663335144519806, "sb": 36.76284849643707} +{"repeat": 32, "batch_size": 131072, "dim": 2048, "attn": 30.24454414844513, "ln": 25.25731921195984, "ln_compiled": 25.67601203918457, "gelu": 16.384944319725037, "gelu_compiled": 16.409948468208313, "standard": 84.26841348409653, "my_standard": 91.10662341117859, "standard_compiled": 76.89539343118668, "sb": 71.73164188861847} diff --git a/tests/triton_tests/full_matrix_decomp.py b/tests/triton_tests/full_matrix_decomp.py new file mode 100644 index 0000000..de37b95 --- /dev/null +++ b/tests/triton_tests/full_matrix_decomp.py @@ -0,0 +1,353 @@ +import json + +import time +import torch +import torch.nn as nn +import bitsandbytes.nn as bnn +from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, MyLinear + +from bitsandbytes.nn.triton_utils.v0.quantize_rowwise_nogroup import quantize_rowwise_nogroup +from bitsandbytes.nn.triton_utils.v0.quantize_columnwise_nogroup_transpose import quantize_columnwise_nogroup_transpose +from bitsandbytes.nn.triton_utils.v0.int8_matmul_rowwise_dequantize_bias import int8_matmul_rowwise_dequantize_bias +from bitsandbytes.nn.triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize +from bitsandbytes.nn.triton_utils.v0.quantize_global import quantize_global, quantize_global_transpose +from bitsandbytes.nn.triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze, int8_matmul_mixed_dequanitze_bias + +# KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large. +# not that big of an issue. 
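One note on the kernel flagged above: `w_quantize_colwise_transpose` quantizes each column of w with its own absmax and writes the result transposed. An eager-mode sketch of the same computation, matching the check in quantize_columnwise_nogroup_transpose.py's __main__ block (illustrative only, not part of the patch; `reference_quantize_colwise_transpose` is an assumed name):

    import torch

    def reference_quantize_colwise_transpose(w: torch.Tensor):
        # Per-column absmax scale to int8, returned transposed as (N, M),
        # with one float16 absmax per original column.
        wt = w.t().float()
        absmax = wt.abs().max(dim=1, keepdim=True)[0]
        wt_int8 = (127.0 * wt / absmax).round().to(torch.int8)
        return wt_int8, absmax.squeeze(1).to(torch.float16)

Comparing the benchmark's `w_quantize_colwise_transpose` timings against `w_quantize_rowwise` below makes the flagged overhead visible for large embedding dims.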
+ +def get_time_standard_fwd(k, v): + + x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() + g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() + + ##### time matmul 1 + for _ in range(repeat // 2): + g.t().matmul(x) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + g.t().matmul(x) + + torch.cuda.synchronize() + end = time.time() + print(f"time {k}: {(end - start) / repeat * 1000:.3f} ms") + return (end - start) / repeat * 1000 + +if __name__ == '__main__': + torch.manual_seed(0) + #for (dim, wm) in [(1024, 4), (1280, 4), (1408, 4.3637), (1664, 4.9231), (2048, 4), (4096, 4), (8096, 4)] + for (dim, wm) in [(1408, 4), (1664, 4),]: + + for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]: + #for batch_size in [256*256, 256*512]: + + for switch in [False, True]: + + + # hparams + repeat = 64 + batch_size = batch_size + dim_out = dim * wm + dim_in = dim + if switch: + dim_out = dim + dim_in = wm * dim + + dim_in = round(dim_in) + dim_out = round(dim_out) + + + # simulate forward pass + x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() + g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() + w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() + + x_int8 = x.clone().to(torch.int8) + g_int8 = g.clone().to(torch.int8) + w_int8 = w.clone().to(torch.int8) + wt_int8 = w.t().contiguous().clone().to(torch.int8) + state_x_rowwise = x.max(dim=1)[0] + state_g_rowwise = g.max(dim=1)[0] + state_w_columnwise = w.max(dim=0)[0] + state_w_rowwise = w.max(dim=1)[0] + state_w_global = w.max() + + info = {'repeat' : repeat, 'batch_size' : batch_size, 'dim_out' : dim_out, 'dim_in' : dim_in, 'wm' : wm, 'switch' : switch} + + k = 'standard_fwd' + for _ in range(repeat // 2): + x.matmul(w.t()) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + x.matmul(w.t()) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'standard_gw' + for _ in range(repeat // 2): + g.t().matmul(x) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + g.t().matmul(x) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'standard_gx' + for _ in range(repeat // 2): + g.matmul(w) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + g.matmul(w) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + + k = 'rowwise_fwd' + for _ in range(repeat // 2): + int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_columnwise) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_columnwise) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'rowwise_bwd' + for _ in range(repeat // 2): + int8_matmul_rowwise_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_rowwise) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + int8_matmul_rowwise_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_rowwise) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'global_fwd' + for _ in 
range(repeat // 2): + int8_matmul_mixed_dequanitze(x_int8, w_int8.t(), state_x_rowwise, state_w_global) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + int8_matmul_mixed_dequanitze(x_int8, w_int8.t(), state_x_rowwise, state_w_global) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'global_bwd' + for _ in range(repeat // 2): + int8_matmul_mixed_dequanitze(g_int8, wt_int8.t(), state_x_rowwise, state_w_global) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + int8_matmul_mixed_dequanitze(g_int8, wt_int8.t(), state_x_rowwise, state_w_global) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'x_quantize_rowwise' + for _ in range(repeat // 2): + quantize_rowwise_nogroup(x) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_rowwise_nogroup(x) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'g_quantize_rowwise' + for _ in range(repeat // 2): + quantize_rowwise_nogroup(g) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_rowwise_nogroup(g) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'w_quantize_rowwise' + for _ in range(repeat // 2): + quantize_rowwise_nogroup(w) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_rowwise_nogroup(w) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'w_quantize_colwise_transpose' + for _ in range(repeat // 2): + quantize_columnwise_nogroup_transpose(w) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_columnwise_nogroup_transpose(w) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'w_quantize_global' + for _ in range(repeat // 2): + quantize_global(w) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_global(w) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + k = 'w_quantize_global_transpose' + for _ in range(repeat // 2): + quantize_global_transpose(w) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_global_transpose(w) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + k = 'cast_x' + for _ in range(repeat // 2): + newx = x.to(torch.int8) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + newx = x.to(torch.int8) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + + k = 'cast_g' + for _ in range(repeat // 2): + newx = g.to(torch.int8) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + newx = g.to(torch.int8) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + + k = 'cast_w' + for _ in range(repeat // 2): + newx = 
w.to(torch.int8) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + newx = w.to(torch.int8) + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + time_standard = info['standard_fwd'] + info['standard_gx'] + info['standard_gw'] + time_rowwise = info['x_quantize_rowwise'] + info['g_quantize_rowwise'] + info['w_quantize_colwise_transpose'] + info['w_quantize_rowwise'] + info['standard_gw'] + info['rowwise_fwd'] + info['rowwise_bwd'] + time_global = info['x_quantize_rowwise'] + info['g_quantize_rowwise'] + info['w_quantize_global'] + info['w_quantize_global_transpose'] + info['standard_gw'] + info['global_fwd'] + info['global_bwd'] + + print('TOTAL STANDARD', time_standard) + print('TOTAL ROWWISE', time_rowwise) + print('TOTAL GLOBAL', time_global) + + print('speedup', -100*(time_global - time_standard)/time_standard) + + info['time_standard'] = time_standard + info['time_rowwise'] = time_rowwise + info['time_global'] = time_global + + + + info_json = json.dumps(info) + + + with open("tests/triton_tests/info.jsonl", "a") as file: + file.write(info_json + "\n") \ No newline at end of file diff --git a/tests/triton_tests/info.jsonl b/tests/triton_tests/info.jsonl new file mode 100644 index 0000000..879a65f --- /dev/null +++ b/tests/triton_tests/info.jsonl @@ -0,0 +1,142 @@ +{"repeat": 64, "batch_size": 1024, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.047907233238220215, "standard_gw": 0.04326179623603821, "standard_gx": 0.042986124753952026, "rowwise_fwd": 0.03902614116668701, "rowwise_bwd": 0.038955360651016235, "global_fwd": 0.03974884748458862, "global_bwd": 0.0391639769077301, "x_quantize_rowwise": 0.02619624137878418, "g_quantize_rowwise": 0.02695620059967041, "w_quantize_rowwise": 0.02631545066833496, "w_quantize_colwise_transpose": 0.08677691221237183, "w_quantize_global": 0.07359683513641357, "w_quantize_global_transpose": 0.08226558566093445, "cast_x": 0.007815659046173096, "cast_g": 0.016041100025177002, "cast_w": 0.01600012183189392, "time_standard": 0.13415515422821045, "time_rowwise": 0.28748810291290283, "time_global": 0.33118948340415955} +{"repeat": 64, "batch_size": 1024, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.04236400127410889, "standard_gw": 0.04898756742477417, "standard_gx": 0.04731118679046631, "rowwise_fwd": 0.03933534026145935, "rowwise_bwd": 0.03947317600250244, "global_fwd": 0.03688037395477295, "global_bwd": 0.039167702198028564, "x_quantize_rowwise": 0.02533942461013794, "g_quantize_rowwise": 0.02516806125640869, "w_quantize_rowwise": 0.02528354525566101, "w_quantize_colwise_transpose": 0.0903792679309845, "w_quantize_global": 0.0997595489025116, "w_quantize_global_transpose": 0.10209530591964722, "cast_x": 0.01626834273338318, "cast_g": 0.011973083019256592, "cast_w": 0.016044825315475464, "time_standard": 0.13866275548934937, "time_rowwise": 0.2939663827419281, "time_global": 0.37739798426628113} +{"repeat": 64, "batch_size": 2048, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.07753819227218628, "standard_gw": 0.08026883006095886, "standard_gx": 0.0906921923160553, "rowwise_fwd": 0.0630207359790802, "rowwise_bwd": 0.058263540267944336, "global_fwd": 0.06167963147163391, "global_bwd": 0.05801767110824585, "x_quantize_rowwise": 0.034205615520477295, "g_quantize_rowwise": 0.03341957926750183, "w_quantize_rowwise": 0.03244727849960327, 
"w_quantize_colwise_transpose": 0.08665025234222412, "w_quantize_global": 0.09483471512794495, "w_quantize_global_transpose": 0.10108202695846558, "cast_x": 0.012032687664031982, "cast_g": 0.03752484917640686, "cast_w": 0.01605972647666931, "time_standard": 0.24849921464920044, "time_rowwise": 0.3882758319377899, "time_global": 0.46350806951522827} +{"repeat": 64, "batch_size": 2048, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.09099021553993225, "standard_gw": 0.0799819827079773, "standard_gx": 0.07644668221473694, "rowwise_fwd": 0.05840510129928589, "rowwise_bwd": 0.06359070539474487, "global_fwd": 0.057831406593322754, "global_bwd": 0.06148591637611389, "x_quantize_rowwise": 0.03434717655181885, "g_quantize_rowwise": 0.03361701965332031, "w_quantize_rowwise": 0.03209337592124939, "w_quantize_colwise_transpose": 0.09028613567352295, "w_quantize_global": 0.0944770872592926, "w_quantize_global_transpose": 0.0994168221950531, "cast_x": 0.03769621253013611, "cast_g": 0.012010335922241211, "cast_w": 0.01600012183189392, "time_standard": 0.24741888046264648, "time_rowwise": 0.39232149720191956, "time_global": 0.4611574113368988} +{"repeat": 64, "batch_size": 4096, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.14450401067733765, "standard_gw": 0.14326348900794983, "standard_gx": 0.14762207865715027, "rowwise_fwd": 0.10525062680244446, "rowwise_bwd": 0.09800493717193604, "global_fwd": 0.10229647159576416, "global_bwd": 0.09718164801597595, "x_quantize_rowwise": 0.03429874777793884, "g_quantize_rowwise": 0.04567950963973999, "w_quantize_rowwise": 0.03365054726600647, "w_quantize_colwise_transpose": 0.08654966950416565, "w_quantize_global": 0.09663775563240051, "w_quantize_global_transpose": 0.10383129119873047, "cast_x": 0.01605972647666931, "cast_g": 0.08305534720420837, "cast_w": 0.01624971628189087, "time_standard": 0.43538957834243774, "time_rowwise": 0.5466975271701813, "time_global": 0.6231889128684998} +{"repeat": 64, "batch_size": 4096, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.14496594667434692, "standard_gw": 0.1412704586982727, "standard_gx": 0.14446303248405457, "rowwise_fwd": 0.10041892528533936, "rowwise_bwd": 0.10674074292182922, "global_fwd": 0.09856373071670532, "global_bwd": 0.10319426655769348, "x_quantize_rowwise": 0.045571476221084595, "g_quantize_rowwise": 0.03273040056228638, "w_quantize_rowwise": 0.033464282751083374, "w_quantize_colwise_transpose": 0.09154900908470154, "w_quantize_global": 0.0964440405368805, "w_quantize_global_transpose": 0.1031048595905304, "cast_x": 0.0835023820400238, "cast_g": 0.016242265701293945, "cast_w": 0.016283243894577026, "time_standard": 0.4306994378566742, "time_rowwise": 0.5517452955245972, "time_global": 0.6208792328834534} +{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.28106942772865295, "standard_gw": 0.2841465175151825, "standard_gx": 0.301852822303772, "rowwise_fwd": 0.19879266619682312, "rowwise_bwd": 0.16228482127189636, "global_fwd": 0.19488856196403503, "global_bwd": 0.1607760787010193, "x_quantize_rowwise": 0.033974647521972656, "g_quantize_rowwise": 0.08221715688705444, "w_quantize_rowwise": 0.03248825669288635, "w_quantize_colwise_transpose": 0.08646398782730103, "w_quantize_global": 0.0939294695854187, "w_quantize_global_transpose": 0.09895861148834229, "cast_x": 0.03753975033760071, "cast_g": 0.15900656580924988, "cast_w": 0.01603737473487854, "time_standard": 
0.8670687675476074, "time_rowwise": 0.8803680539131165, "time_global": 0.9488910436630249} +{"repeat": 64, "batch_size": 8192, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.26415660977363586, "standard_gw": 0.2679601311683655, "standard_gx": 0.30617788434028625, "rowwise_fwd": 0.180121511220932, "rowwise_bwd": 0.21555647253990173, "global_fwd": 0.17506256699562073, "global_bwd": 0.2116672694683075, "x_quantize_rowwise": 0.08289515972137451, "g_quantize_rowwise": 0.033795833587646484, "w_quantize_rowwise": 0.03366544842720032, "w_quantize_colwise_transpose": 0.09965524077415466, "w_quantize_global": 0.09595602750778198, "w_quantize_global_transpose": 0.1024976372718811, "cast_x": 0.1602955162525177, "cast_g": 0.03787502646446228, "cast_w": 0.016216188669204712, "time_standard": 0.8382946252822876, "time_rowwise": 0.9136497974395752, "time_global": 0.9698346257209778} +{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.5719438195228577, "standard_gw": 0.524863600730896, "standard_gx": 0.6005167961120605, "rowwise_fwd": 0.3750324249267578, "rowwise_bwd": 0.28166547417640686, "global_fwd": 0.3674700856208801, "global_bwd": 0.2798214554786682, "x_quantize_rowwise": 0.04655122756958008, "g_quantize_rowwise": 0.1555122435092926, "w_quantize_rowwise": 0.03437697887420654, "w_quantize_colwise_transpose": 0.08634477853775024, "w_quantize_global": 0.09759142994880676, "w_quantize_global_transpose": 0.10081753134727478, "cast_x": 0.0828765332698822, "cast_g": 0.31184032559394836, "cast_w": 0.016063451766967773, "time_standard": 1.6973242163658142, "time_rowwise": 1.5043467283248901, "time_global": 1.5726275742053986} +{"repeat": 64, "batch_size": 16384, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.5423910915851593, "standard_gw": 0.5674734711647034, "standard_gx": 0.5907565355300903, "rowwise_fwd": 0.3149174153804779, "rowwise_bwd": 0.3899820148944855, "global_fwd": 0.2909451723098755, "global_bwd": 0.3783814609050751, "x_quantize_rowwise": 0.15584751963615417, "g_quantize_rowwise": 0.04688650369644165, "w_quantize_rowwise": 0.031463801860809326, "w_quantize_colwise_transpose": 0.09072571992874146, "w_quantize_global": 0.09774044156074524, "w_quantize_global_transpose": 0.10405108332633972, "cast_x": 0.3111511468887329, "cast_g": 0.08282437920570374, "cast_w": 0.015992671251296997, "time_standard": 1.700621098279953, "time_rowwise": 1.5972964465618134, "time_global": 1.6413256525993347} +{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 1.2115389108657837, "standard_gw": 1.1259466409683228, "standard_gx": 1.1027492582798004, "rowwise_fwd": 0.7407031953334808, "rowwise_bwd": 0.5539208650588989, "global_fwd": 0.7214657962322235, "global_bwd": 0.5515590310096741, "x_quantize_rowwise": 0.08765608072280884, "g_quantize_rowwise": 0.3022328019142151, "w_quantize_rowwise": 0.03347545862197876, "w_quantize_colwise_transpose": 0.08694455027580261, "w_quantize_global": 0.09706243872642517, "w_quantize_global_transpose": 0.10102614760398865, "cast_x": 0.1592189073562622, "cast_g": 0.6166175007820129, "cast_w": 0.01607835292816162, "time_standard": 3.440234810113907, "time_rowwise": 2.930879592895508, "time_global": 2.986948937177658} +{"repeat": 64, "batch_size": 32768, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 1.1010989546775818, "standard_gw": 1.1352524161338806, "standard_gx": 1.1676251888275146, 
"rowwise_fwd": 0.5864761769771576, "rowwise_bwd": 0.7485374808311462, "global_fwd": 0.5547590553760529, "global_bwd": 0.7249303162097931, "x_quantize_rowwise": 0.3021731972694397, "g_quantize_rowwise": 0.08751824498176575, "w_quantize_rowwise": 0.033952295780181885, "w_quantize_colwise_transpose": 0.09011104702949524, "w_quantize_global": 0.09443238377571106, "w_quantize_global_transpose": 0.10376051068305969, "cast_x": 0.6167255342006683, "cast_g": 0.15922263264656067, "cast_w": 0.016070902347564697, "time_standard": 3.403976559638977, "time_rowwise": 2.984020859003067, "time_global": 3.0028261244297028} +{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 2.472013235092163, "standard_gw": 2.218998968601227, "standard_gx": 2.2116564214229584, "rowwise_fwd": 1.466125249862671, "rowwise_bwd": 1.0577328503131866, "global_fwd": 1.431729644536972, "global_bwd": 1.0476894676685333, "x_quantize_rowwise": 0.16929209232330322, "g_quantize_rowwise": 0.5952082574367523, "w_quantize_rowwise": 0.032100826501846313, "w_quantize_colwise_transpose": 0.08670613169670105, "w_quantize_global": 0.09590759873390198, "w_quantize_global_transpose": 0.10358169674873352, "cast_x": 0.31175464391708374, "cast_g": 1.2264922261238098, "cast_w": 0.016067177057266235, "time_standard": 6.902668625116348, "time_rowwise": 5.626164376735687, "time_global": 5.662407726049423} +{"repeat": 64, "batch_size": 65536, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 2.181064337491989, "standard_gw": 2.2256113588809967, "standard_gx": 2.3229196667671204, "rowwise_fwd": 1.0886266827583313, "rowwise_bwd": 1.4654062688350677, "global_fwd": 1.0472461581230164, "global_bwd": 1.433148980140686, "x_quantize_rowwise": 0.5954094231128693, "g_quantize_rowwise": 0.16921386122703552, "w_quantize_rowwise": 0.03442913293838501, "w_quantize_colwise_transpose": 0.09007751941680908, "w_quantize_global": 0.09575113654136658, "w_quantize_global_transpose": 0.10503828525543213, "cast_x": 1.2264810502529144, "cast_g": 0.3119036555290222, "cast_w": 0.01605600118637085, "time_standard": 6.729595363140106, "time_rowwise": 5.668774247169495, "time_global": 5.671419203281403} +{"repeat": 64, "batch_size": 1024, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.08157268166542053, "standard_gw": 0.07601454854011536, "standard_gx": 0.09059160947799683, "rowwise_fwd": 0.053066760301589966, "rowwise_bwd": 0.04787370562553406, "global_fwd": 0.05243346095085144, "global_bwd": 0.04809349775314331, "x_quantize_rowwise": 0.02571195363998413, "g_quantize_rowwise": 0.025898218154907227, "w_quantize_rowwise": 0.02714991569519043, "w_quantize_colwise_transpose": 0.19773468375205994, "w_quantize_global": 0.07273256778717041, "w_quantize_global_transpose": 0.08068978786468506, "cast_x": 0.008046627044677734, "cast_g": 0.0252649188041687, "cast_w": 0.0393986701965332, "time_standard": 0.24817883968353271, "time_rowwise": 0.4534497857093811, "time_global": 0.38157403469085693} +{"repeat": 64, "batch_size": 1024, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.09134411811828613, "standard_gw": 0.07602199912071228, "standard_gx": 0.09555742144584656, "rowwise_fwd": 0.047691166400909424, "rowwise_bwd": 0.05320459604263306, "global_fwd": 0.04759058356285095, "global_bwd": 0.0521540641784668, "x_quantize_rowwise": 0.025313347578048706, "g_quantize_rowwise": 0.025119632482528687, "w_quantize_rowwise": 0.0269375741481781, 
"w_quantize_colwise_transpose": 0.1857280731201172, "w_quantize_global": 0.07451698184013367, "w_quantize_global_transpose": 0.08009746670722961, "cast_x": 0.02547726035118103, "cast_g": 0.007897615432739258, "cast_w": 0.039536505937576294, "time_standard": 0.26292353868484497, "time_rowwise": 0.44001638889312744, "time_global": 0.3808140754699707} +{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 4.940010607242584, "standard_gw": 4.434864968061447, "standard_gx": 4.4097937643527985, "rowwise_fwd": 2.9467344284057617, "rowwise_bwd": 2.09181010723114, "global_fwd": 2.8806477785110474, "global_bwd": 2.0816922187805176, "x_quantize_rowwise": 0.33279508352279663, "g_quantize_rowwise": 1.1817067861557007, "w_quantize_rowwise": 0.03306567668914795, "w_quantize_colwise_transpose": 0.08666515350341797, "w_quantize_global": 0.0957287847995758, "w_quantize_global_transpose": 0.10242313146591187, "cast_x": 0.6165988743305206, "cast_g": 2.446405589580536, "cast_w": 0.016100704669952393, "time_standard": 13.78466933965683, "time_rowwise": 11.107642203569412, "time_global": 11.109858751296997} +{"repeat": 64, "batch_size": 131072, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 4.293464124202728, "standard_gw": 4.461295902729034, "standard_gx": 4.638340324163437, "rowwise_fwd": 2.116892486810684, "rowwise_bwd": 2.9479674994945526, "global_fwd": 2.0760856568813324, "global_bwd": 2.8755851089954376, "x_quantize_rowwise": 1.1818408966064453, "g_quantize_rowwise": 0.33276528120040894, "w_quantize_rowwise": 0.03287568688392639, "w_quantize_colwise_transpose": 0.09038299322128296, "w_quantize_global": 0.09598955512046814, "w_quantize_global_transpose": 0.100649893283844, "cast_x": 2.4467408657073975, "cast_g": 0.6165951490402222, "cast_w": 0.016082078218460083, "time_standard": 13.3931003510952, "time_rowwise": 11.164020746946335, "time_global": 11.12421229481697} +{"repeat": 64, "batch_size": 2048, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.1699887216091156, "standard_gw": 0.14045089483261108, "standard_gx": 0.17407909035682678, "rowwise_fwd": 0.10082125663757324, "rowwise_bwd": 0.08344277739524841, "global_fwd": 0.09941309690475464, "global_bwd": 0.08352473378181458, "x_quantize_rowwise": 0.025317072868347168, "g_quantize_rowwise": 0.03849714994430542, "w_quantize_rowwise": 0.02596527338027954, "w_quantize_colwise_transpose": 0.19767135381698608, "w_quantize_global": 0.07257238030433655, "w_quantize_global_transpose": 0.08127838373184204, "cast_x": 0.012032687664031982, "cast_g": 0.06345659494400024, "cast_w": 0.03953278064727783, "time_standard": 0.48451870679855347, "time_rowwise": 0.612165778875351, "time_global": 0.5410537123680115} +{"repeat": 64, "batch_size": 2048, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.14855340123176575, "standard_gw": 0.15553459525108337, "standard_gx": 0.16282498836517334, "rowwise_fwd": 0.09259581565856934, "rowwise_bwd": 0.11080875992774963, "global_fwd": 0.09166449308395386, "global_bwd": 0.10796263813972473, "x_quantize_rowwise": 0.03939121961593628, "g_quantize_rowwise": 0.025227665901184082, "w_quantize_rowwise": 0.027202069759368896, "w_quantize_colwise_transpose": 0.1940988004207611, "w_quantize_global": 0.07397681474685669, "w_quantize_global_transpose": 0.08178502321243286, "cast_x": 0.065632164478302, "cast_g": 0.01268833875656128, "cast_w": 0.04057586193084717, "time_standard": 0.46691298484802246, 
"time_rowwise": 0.6448589265346527, "time_global": 0.5755424499511719} +{"repeat": 64, "batch_size": 4096, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.32291561365127563, "standard_gw": 0.2875030040740967, "standard_gx": 0.3379322588443756, "rowwise_fwd": 0.19295886158943176, "rowwise_bwd": 0.16265735030174255, "global_fwd": 0.19031018018722534, "global_bwd": 0.16187503933906555, "x_quantize_rowwise": 0.02730637788772583, "g_quantize_rowwise": 0.06797909736633301, "w_quantize_rowwise": 0.02642720937728882, "w_quantize_colwise_transpose": 0.19745901226997375, "w_quantize_global": 0.07253512740135193, "w_quantize_global_transpose": 0.08047744631767273, "cast_x": 0.022336840629577637, "cast_g": 0.1209154725074768, "cast_w": 0.039268285036087036, "time_standard": 0.9483508765697479, "time_rowwise": 0.9622909128665924, "time_global": 0.8879862725734711} +{"repeat": 64, "batch_size": 4096, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.3019683063030243, "standard_gw": 0.288400799036026, "standard_gx": 0.3154948353767395, "rowwise_fwd": 0.18264353275299072, "rowwise_bwd": 0.2075284719467163, "global_fwd": 0.17072632908821106, "global_bwd": 0.1960061490535736, "x_quantize_rowwise": 0.06893649697303772, "g_quantize_rowwise": 0.02561509609222412, "w_quantize_rowwise": 0.026594847440719604, "w_quantize_colwise_transpose": 0.18575787544250488, "w_quantize_global": 0.07266923785209656, "w_quantize_global_transpose": 0.08060410618782043, "cast_x": 0.12182071805000305, "cast_g": 0.022590160369873047, "cast_w": 0.04000961780548096, "time_standard": 0.9058639407157898, "time_rowwise": 0.9854771196842194, "time_global": 0.9029582142829895} +{"repeat": 64, "batch_size": 8192, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.6489232182502747, "standard_gw": 0.5987770855426788, "standard_gx": 0.6644465029239655, "rowwise_fwd": 0.35867467522621155, "rowwise_bwd": 0.31855329871177673, "global_fwd": 0.353105366230011, "global_bwd": 0.31349435448646545, "x_quantize_rowwise": 0.03382191061973572, "g_quantize_rowwise": 0.12668967247009277, "w_quantize_rowwise": 0.02681836485862732, "w_quantize_colwise_transpose": 0.19756704568862915, "w_quantize_global": 0.07336586713790894, "w_quantize_global_transpose": 0.08036196231842041, "cast_x": 0.0583939254283905, "cast_g": 0.23520365357398987, "cast_w": 0.03935396671295166, "time_standard": 1.912146806716919, "time_rowwise": 1.660902053117752, "time_global": 1.579616218805313} +{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.5789436399936676, "standard_gw": 0.6130896508693695, "standard_gx": 0.6558857858181, "rowwise_fwd": 0.3464221954345703, "rowwise_bwd": 0.3650560975074768, "global_fwd": 0.3174394369125366, "global_bwd": 0.35758689045906067, "x_quantize_rowwise": 0.12686848640441895, "g_quantize_rowwise": 0.034302473068237305, "w_quantize_rowwise": 0.02745911478996277, "w_quantize_colwise_transpose": 0.1847483217716217, "w_quantize_global": 0.07192790508270264, "w_quantize_global_transpose": 0.08050352334976196, "cast_x": 0.23534893989562988, "cast_g": 0.05846098065376282, "cast_w": 0.03949552774429321, "time_standard": 1.847919076681137, "time_rowwise": 1.6979463398456573, "time_global": 1.6017183661460876} +{"repeat": 64, "batch_size": 1024, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.0573769211769104, "standard_gw": 0.061042606830596924, "standard_gx": 
0.0783093273639679, "rowwise_fwd": 0.046797096729278564, "rowwise_bwd": 0.04620850086212158, "global_fwd": 0.04521384835243225, "global_bwd": 0.04425644874572754, "x_quantize_rowwise": 0.03257766366004944, "g_quantize_rowwise": 0.03449246287345886, "w_quantize_rowwise": 0.033657997846603394, "w_quantize_colwise_transpose": 0.1426301896572113, "w_quantize_global": 0.09257346391677856, "w_quantize_global_transpose": 0.10266527533531189, "cast_x": 0.011991709470748901, "cast_g": 0.020314007997512817, "cast_w": 0.027321279048919678, "time_standard": 0.19672885537147522, "time_rowwise": 0.39740651845932007, "time_global": 0.41282176971435547} +{"repeat": 64, "batch_size": 1024, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.07858872413635254, "standard_gw": 0.06122514605522156, "standard_gx": 0.05758553743362427, "rowwise_fwd": 0.04598498344421387, "rowwise_bwd": 0.04618242383003235, "global_fwd": 0.04597380757331848, "global_bwd": 0.046450644731521606, "x_quantize_rowwise": 0.03332272171974182, "g_quantize_rowwise": 0.033274292945861816, "w_quantize_rowwise": 0.0337548553943634, "w_quantize_colwise_transpose": 0.14807656407356262, "w_quantize_global": 0.09948387742042542, "w_quantize_global_transpose": 0.10120868682861328, "cast_x": 0.020120292901992798, "cast_g": 0.011488795280456543, "cast_w": 0.027466565370559692, "time_standard": 0.19739940762519836, "time_rowwise": 0.40182098746299744, "time_global": 0.420939177274704} +{"repeat": 64, "batch_size": 16384, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 1.3515166938304901, "standard_gw": 1.1536777019500732, "standard_gx": 1.224767416715622, "rowwise_fwd": 0.6912238895893097, "rowwise_bwd": 0.5562454462051392, "global_fwd": 0.67867711186409, "global_bwd": 0.5518943071365356, "x_quantize_rowwise": 0.06204098463058472, "g_quantize_rowwise": 0.24417787790298462, "w_quantize_rowwise": 0.025238841772079468, "w_quantize_colwise_transpose": 0.19756704568862915, "w_quantize_global": 0.07240846753120422, "w_quantize_global_transpose": 0.08046254515647888, "cast_x": 0.11138245463371277, "cast_g": 0.4637613892555237, "cast_w": 0.03935769200325012, "time_standard": 3.7299618124961853, "time_rowwise": 2.9301717877388, "time_global": 2.8433389961719513} +{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 1.2090615928173065, "standard_gw": 1.1396333575248718, "standard_gx": 1.2223869562149048, "rowwise_fwd": 0.5849376320838928, "rowwise_bwd": 0.6985403597354889, "global_fwd": 0.5565173923969269, "global_bwd": 0.6789751350879669, "x_quantize_rowwise": 0.2445802092552185, "g_quantize_rowwise": 0.06200745701789856, "w_quantize_rowwise": 0.027727335691452026, "w_quantize_colwise_transpose": 0.18501654267311096, "w_quantize_global": 0.07182732224464417, "w_quantize_global_transpose": 0.08069723844528198, "cast_x": 0.4638172686100006, "cast_g": 0.11136755347251892, "cast_w": 0.039517879486083984, "time_standard": 3.571081906557083, "time_rowwise": 2.9424428939819336, "time_global": 2.834238111972809} +{"repeat": 64, "batch_size": 32768, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 2.683013677597046, "standard_gw": 2.2987723350524902, "standard_gx": 2.4510622024536133, "rowwise_fwd": 1.359008252620697, "rowwise_bwd": 1.1018887162208557, "global_fwd": 1.3311207294464111, "global_bwd": 1.0954029858112335, "x_quantize_rowwise": 0.11804327368736267, "g_quantize_rowwise": 0.479232519865036, 
"w_quantize_rowwise": 0.026308000087738037, "w_quantize_colwise_transpose": 0.1975223422050476, "w_quantize_global": 0.07223710417747498, "w_quantize_global_transpose": 0.08019432425498962, "cast_x": 0.2161264419555664, "cast_g": 0.9207837283611298, "cast_w": 0.03929063677787781, "time_standard": 7.432848215103149, "time_rowwise": 5.580775439739227, "time_global": 5.475003272294998} +{"repeat": 64, "batch_size": 2048, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.11088326573371887, "standard_gw": 0.10994821786880493, "standard_gx": 0.12367218732833862, "rowwise_fwd": 0.07392093539237976, "rowwise_bwd": 0.07127970457077026, "global_fwd": 0.0730752944946289, "global_bwd": 0.07089227437973022, "x_quantize_rowwise": 0.03361701965332031, "g_quantize_rowwise": 0.03525242209434509, "w_quantize_rowwise": 0.03341585397720337, "w_quantize_colwise_transpose": 0.14318525791168213, "w_quantize_global": 0.09704753756523132, "w_quantize_global_transpose": 0.10221078991889954, "cast_x": 0.012002885341644287, "cast_g": 0.05240738391876221, "cast_w": 0.027313828468322754, "time_standard": 0.3445036709308624, "time_rowwise": 0.5006194114685059, "time_global": 0.5220435559749603} +{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 2.4625882506370544, "standard_gw": 2.421922981739044, "standard_gx": 2.380847930908203, "rowwise_fwd": 1.1231191456317902, "rowwise_bwd": 1.360483467578888, "global_fwd": 1.0947436094284058, "global_bwd": 1.3314113020896912, "x_quantize_rowwise": 0.4795975983142853, "g_quantize_rowwise": 0.11777132749557495, "w_quantize_rowwise": 0.02699345350265503, "w_quantize_colwise_transpose": 0.18484890460968018, "w_quantize_global": 0.07201358675956726, "w_quantize_global_transpose": 0.0803135335445404, "cast_x": 0.920858234167099, "cast_g": 0.21616369485855103, "cast_w": 0.03937259316444397, "time_standard": 7.265359163284302, "time_rowwise": 5.714736878871918, "time_global": 5.597773939371109} +{"repeat": 64, "batch_size": 2048, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.12437254190444946, "standard_gw": 0.11018291115760803, "standard_gx": 0.10970607399940491, "rowwise_fwd": 0.07167831063270569, "rowwise_bwd": 0.07583573460578918, "global_fwd": 0.07314234972000122, "global_bwd": 0.07501617074012756, "x_quantize_rowwise": 0.035624951124191284, "g_quantize_rowwise": 0.0333636999130249, "w_quantize_rowwise": 0.03264099359512329, "w_quantize_colwise_transpose": 0.14795735478401184, "w_quantize_global": 0.09621679782867432, "w_quantize_global_transpose": 0.10380148887634277, "cast_x": 0.05278363823890686, "cast_g": 0.01249462366104126, "cast_w": 0.02767890691757202, "time_standard": 0.3442615270614624, "time_rowwise": 0.5072839558124542, "time_global": 0.5273483693599701} +{"repeat": 64, "batch_size": 4096, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.21922588348388672, "standard_gw": 0.20731613039970398, "standard_gx": 0.23101642727851868, "rowwise_fwd": 0.1423358917236328, "rowwise_bwd": 0.1195073127746582, "global_fwd": 0.1401938498020172, "global_bwd": 0.11940300464630127, "x_quantize_rowwise": 0.03353878855705261, "g_quantize_rowwise": 0.06387382745742798, "w_quantize_rowwise": 0.03428757190704346, "w_quantize_colwise_transpose": 0.14376267790794373, "w_quantize_global": 0.09389594197273254, "w_quantize_global_transpose": 0.10196119546890259, "cast_x": 0.020060688257217407, "cast_g": 0.10236725211143494, "cast_w": 
0.02732500433921814, "time_standard": 0.6575584411621094, "time_rowwise": 0.7446222007274628, "time_global": 0.7601827383041382} +{"repeat": 64, "batch_size": 4096, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.20026043057441711, "standard_gw": 0.21172687411308289, "standard_gx": 0.2276189625263214, "rowwise_fwd": 0.12956932187080383, "rowwise_bwd": 0.15310943126678467, "global_fwd": 0.12427568435668945, "global_bwd": 0.14432892203330994, "x_quantize_rowwise": 0.06471946835517883, "g_quantize_rowwise": 0.03309175372123718, "w_quantize_rowwise": 0.03242120146751404, "w_quantize_colwise_transpose": 0.14733895659446716, "w_quantize_global": 0.09280815720558167, "w_quantize_global_transpose": 0.10265037417411804, "cast_x": 0.10267645120620728, "cast_g": 0.020150095224380493, "cast_w": 0.027399510145187378, "time_standard": 0.6396062672138214, "time_rowwise": 0.7719770073890686, "time_global": 0.773601233959198} +{"repeat": 64, "batch_size": 65536, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 5.324859172105789, "standard_gw": 4.977177828550339, "standard_gx": 4.468705505132675, "rowwise_fwd": 2.7004145085811615, "rowwise_bwd": 2.121664583683014, "global_fwd": 2.648312598466873, "global_bwd": 2.111390233039856, "x_quantize_rowwise": 0.22934377193450928, "g_quantize_rowwise": 0.9496547281742096, "w_quantize_rowwise": 0.02555176615715027, "w_quantize_colwise_transpose": 0.1977868378162384, "w_quantize_global": 0.0727437436580658, "w_quantize_global_transpose": 0.08098781108856201, "cast_x": 0.4259459674358368, "cast_g": 1.8352754414081573, "cast_w": 0.039637088775634766, "time_standard": 14.770742505788803, "time_rowwise": 11.201594024896622, "time_global": 11.069610714912415} +{"repeat": 64, "batch_size": 8192, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.49151480197906494, "standard_gw": 0.4681535065174103, "standard_gx": 0.42366236448287964, "rowwise_fwd": 0.2766512334346771, "rowwise_bwd": 0.2083033323287964, "global_fwd": 0.2709813416004181, "global_bwd": 0.20718947052955627, "x_quantize_rowwise": 0.034555792808532715, "g_quantize_rowwise": 0.11969730257987976, "w_quantize_rowwise": 0.03300607204437256, "w_quantize_colwise_transpose": 0.14345720410346985, "w_quantize_global": 0.09280070662498474, "w_quantize_global_transpose": 0.10214745998382568, "cast_x": 0.052288174629211426, "cast_g": 0.19747763872146606, "cast_w": 0.027339905500411987, "time_standard": 1.3833306729793549, "time_rowwise": 1.2838244438171387, "time_global": 1.2955255806446075} +{"repeat": 64, "batch_size": 8192, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.39635971188545227, "standard_gw": 0.44353678822517395, "standard_gx": 0.4724152386188507, "rowwise_fwd": 0.22813305258750916, "rowwise_bwd": 0.2868436276912689, "global_fwd": 0.2119205892086029, "global_bwd": 0.2749413251876831, "x_quantize_rowwise": 0.12082979083061218, "g_quantize_rowwise": 0.03444403409957886, "w_quantize_rowwise": 0.03444403409957886, "w_quantize_colwise_transpose": 0.14675036072731018, "w_quantize_global": 0.09495392441749573, "w_quantize_global_transpose": 0.1009330153465271, "cast_x": 0.19745156168937683, "cast_g": 0.05227327346801758, "cast_w": 0.027336180210113525, "time_standard": 1.312311738729477, "time_rowwise": 1.294981688261032, "time_global": 1.2815594673156738} +{"repeat": 64, "batch_size": 16384, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 1.0207034647464752, "standard_gw": 
0.897720456123352, "standard_gx": 0.8374936878681183, "rowwise_fwd": 0.5457103252410889, "rowwise_bwd": 0.4088357090950012, "global_fwd": 0.5308091640472412, "global_bwd": 0.40555745363235474, "x_quantize_rowwise": 0.05984678864479065, "g_quantize_rowwise": 0.2306811511516571, "w_quantize_rowwise": 0.0334717333316803, "w_quantize_colwise_transpose": 0.14356523752212524, "w_quantize_global": 0.09340420365333557, "w_quantize_global_transpose": 0.09996071457862854, "cast_x": 0.10207295417785645, "cast_g": 0.3880411386489868, "cast_w": 0.027671456336975098, "time_standard": 2.7559176087379456, "time_rowwise": 2.3198314011096954, "time_global": 2.31797993183136} +{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 4.502948373556137, "standard_gw": 4.418112337589264, "standard_gx": 4.748217761516571, "rowwise_fwd": 2.1329298615455627, "rowwise_bwd": 2.6968345046043396, "global_fwd": 2.102244645357132, "global_bwd": 2.6461556553840637, "x_quantize_rowwise": 0.9493157267570496, "g_quantize_rowwise": 0.2290569245815277, "w_quantize_rowwise": 0.02551451325416565, "w_quantize_colwise_transpose": 0.18491223454475403, "w_quantize_global": 0.07426366209983826, "w_quantize_global_transpose": 0.08058920502662659, "cast_x": 1.8352717161178589, "cast_g": 0.425681471824646, "cast_w": 0.039402395486831665, "time_standard": 13.669278472661972, "time_rowwise": 10.636676102876663, "time_global": 10.499738156795502} +{"repeat": 64, "batch_size": 16384, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.8179470896720886, "standard_gw": 0.8687414228916168, "standard_gx": 0.9276494383811951, "rowwise_fwd": 0.4481859505176544, "rowwise_bwd": 0.5557462573051453, "global_fwd": 0.4100687801837921, "global_bwd": 0.5317367613315582, "x_quantize_rowwise": 0.2301819622516632, "g_quantize_rowwise": 0.05963817238807678, "w_quantize_rowwise": 0.033523887395858765, "w_quantize_colwise_transpose": 0.14462321996688843, "w_quantize_global": 0.094633549451828, "w_quantize_global_transpose": 0.10088086128234863, "cast_x": 0.3879927098751068, "cast_g": 0.10205060243606567, "cast_w": 0.02714991569519043, "time_standard": 2.6143379509449005, "time_rowwise": 2.3406408727169037, "time_global": 2.295881509780884} +{"repeat": 64, "batch_size": 32768, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 2.0698904991149902, "standard_gw": 1.7200261354446411, "standard_gx": 1.663345843553543, "rowwise_fwd": 1.0664835572242737, "rowwise_bwd": 0.8059032261371613, "global_fwd": 1.0454729199409485, "global_bwd": 0.801432877779007, "x_quantize_rowwise": 0.1127384603023529, "g_quantize_rowwise": 0.4529319703578949, "w_quantize_rowwise": 0.03398582339286804, "w_quantize_colwise_transpose": 0.14343857765197754, "w_quantize_global": 0.09441003203392029, "w_quantize_global_transpose": 0.09993091225624084, "cast_x": 0.19744038581848145, "cast_g": 0.769149512052536, "cast_w": 0.02734735608100891, "time_standard": 5.453262478113174, "time_rowwise": 4.335507750511169, "time_global": 4.3269433081150055} +{"repeat": 64, "batch_size": 32768, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 2.758193761110306, "standard_gw": 1.6880109906196594, "standard_gx": 1.8163062632083893, "rowwise_fwd": 0.8343160152435303, "rowwise_bwd": 1.073598861694336, "global_fwd": 0.8045099675655365, "global_bwd": 1.0492689907550812, "x_quantize_rowwise": 0.453021377325058, "g_quantize_rowwise": 0.11304020881652832, "w_quantize_rowwise": 
0.0337064266204834, "w_quantize_colwise_transpose": 0.1452416181564331, "w_quantize_global": 0.09451434016227722, "w_quantize_global_transpose": 0.0998079776763916, "cast_x": 0.769101083278656, "cast_g": 0.19731372594833374, "cast_w": 0.027332454919815063, "time_standard": 6.2625110149383545, "time_rowwise": 4.340935498476028, "time_global": 4.302173852920532} +{"repeat": 64, "batch_size": 131072, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 10.728541761636734, "standard_gw": 9.228862822055817, "standard_gx": 8.837487548589706, "rowwise_fwd": 5.4414160549640656, "rowwise_bwd": 4.186157137155533, "global_fwd": 5.329187959432602, "global_bwd": 4.150416702032089, "x_quantize_rowwise": 0.4517659544944763, "g_quantize_rowwise": 1.890372484922409, "w_quantize_rowwise": 0.027563422918319702, "w_quantize_colwise_transpose": 0.1980513334274292, "w_quantize_global": 0.0733695924282074, "w_quantize_global_transpose": 0.08009746670722961, "cast_x": 0.8449330925941467, "cast_g": 3.6641769111156464, "cast_w": 0.03945454955101013, "time_standard": 28.794892132282257, "time_rowwise": 21.42418920993805, "time_global": 21.20407298207283} +{"repeat": 64, "batch_size": 65536, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 4.127204418182373, "standard_gw": 3.359321504831314, "standard_gx": 5.557261407375336, "rowwise_fwd": 2.1365806460380554, "rowwise_bwd": 1.6042962670326233, "global_fwd": 2.0923763513565063, "global_bwd": 1.5939176082611084, "x_quantize_rowwise": 0.21954253315925598, "g_quantize_rowwise": 0.8971206843852997, "w_quantize_rowwise": 0.03357976675033569, "w_quantize_colwise_transpose": 0.1431293785572052, "w_quantize_global": 0.10574981570243835, "w_quantize_global_transpose": 0.10281801223754883, "cast_x": 0.38795173168182373, "cast_g": 1.5318207442760468, "cast_w": 0.027142465114593506, "time_standard": 13.043787330389023, "time_rowwise": 8.39357078075409, "time_global": 8.370846509933472} +{"repeat": 64, "batch_size": 65536, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 5.576469004154205, "standard_gw": 3.361724317073822, "standard_gx": 3.6300085484981537, "rowwise_fwd": 1.6183294355869293, "rowwise_bwd": 2.1462254226207733, "global_fwd": 1.5953555703163147, "global_bwd": 2.0915642380714417, "x_quantize_rowwise": 0.8973218500614166, "g_quantize_rowwise": 0.2197064459323883, "w_quantize_rowwise": 0.03402307629585266, "w_quantize_colwise_transpose": 0.14822185039520264, "w_quantize_global": 0.09706616401672363, "w_quantize_global_transpose": 0.10339170694351196, "cast_x": 1.5312805771827698, "cast_g": 0.3879964351654053, "cast_w": 0.0269375741481781, "time_standard": 12.568201869726181, "time_rowwise": 8.425552397966385, "time_global": 8.366130292415619} +{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 8.900497108697891, "standard_gw": 9.188394993543625, "standard_gx": 9.503517299890518, "rowwise_fwd": 4.189815372228622, "rowwise_bwd": 5.426768213510513, "global_fwd": 4.155576229095459, "global_bwd": 5.329132080078125, "x_quantize_rowwise": 1.8885880708694458, "g_quantize_rowwise": 0.45193731784820557, "w_quantize_rowwise": 0.025987625122070312, "w_quantize_colwise_transpose": 0.1842118799686432, "w_quantize_global": 0.07349997758865356, "w_quantize_global_transpose": 0.08074194192886353, "cast_x": 3.6639943718910217, "cast_g": 0.8447282016277313, "cast_w": 0.03973767161369324, "time_standard": 27.592409402132034, "time_rowwise": 
21.355703473091125, "time_global": 21.167870610952377} +{"repeat": 64, "batch_size": 131072, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 8.2329623401165, "standard_gw": 6.799045950174332, "standard_gx": 6.893906742334366, "rowwise_fwd": 4.252739250659943, "rowwise_bwd": 3.2025352120399475, "global_fwd": 4.176046699285507, "global_bwd": 3.173377364873886, "x_quantize_rowwise": 0.43221935629844666, "g_quantize_rowwise": 1.7872042953968048, "w_quantize_rowwise": 0.03328174352645874, "w_quantize_colwise_transpose": 0.1431480050086975, "w_quantize_global": 0.09707733988761902, "w_quantize_global_transpose": 0.10161846876144409, "cast_x": 0.7692091166973114, "cast_g": 3.057178109884262, "cast_w": 0.027302652597427368, "time_standard": 21.9259150326252, "time_rowwise": 16.65017381310463, "time_global": 16.56658947467804} +{"repeat": 64, "batch_size": 131072, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 11.278409510850906, "standard_gw": 6.815284490585327, "standard_gx": 7.280956953763962, "rowwise_fwd": 3.206692636013031, "rowwise_bwd": 4.246953874826431, "global_fwd": 3.1801797449588776, "global_bwd": 4.169579595327377, "x_quantize_rowwise": 1.7862766981124878, "g_quantize_rowwise": 0.4329495131969452, "w_quantize_rowwise": 0.03413483500480652, "w_quantize_colwise_transpose": 0.14493241906166077, "w_quantize_global": 0.09881332516670227, "w_quantize_global_transpose": 0.10376423597335815, "cast_x": 3.057088702917099, "cast_g": 0.7693544030189514, "cast_w": 0.027261674404144287, "time_standard": 25.374650955200195, "time_rowwise": 16.66722446680069, "time_global": 16.586847603321075} +{"repeat": 64, "batch_size": 1024, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.11636316776275635, "standard_gw": 0.11816620826721191, "standard_gx": 0.11482089757919312, "rowwise_fwd": 0.08482113480567932, "rowwise_bwd": 0.06284937262535095, "global_fwd": 0.08296221494674683, "global_bwd": 0.061664730310440063, "x_quantize_rowwise": 0.026706606149673462, "g_quantize_rowwise": 0.025641173124313354, "w_quantize_rowwise": 0.03740563988685608, "w_quantize_colwise_transpose": 0.2965778112411499, "w_quantize_global": 0.11304393410682678, "w_quantize_global_transpose": 0.12390688061714172, "cast_x": 0.008635222911834717, "cast_g": 0.037532299757003784, "cast_w": 0.06856024265289307, "time_standard": 0.3493502736091614, "time_rowwise": 0.652167946100235, "time_global": 0.5520917475223541} +{"repeat": 64, "batch_size": 1024, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.11609122157096863, "standard_gw": 0.11704489588737488, "standard_gx": 0.11566653847694397, "rowwise_fwd": 0.06706640124320984, "rowwise_bwd": 0.09074807167053223, "global_fwd": 0.06621330976486206, "global_bwd": 0.0859871506690979, "x_quantize_rowwise": 0.027574598789215088, "g_quantize_rowwise": 0.02520531415939331, "w_quantize_rowwise": 0.04095584154129028, "w_quantize_colwise_transpose": 0.37036463618278503, "w_quantize_global": 0.11350959539413452, "w_quantize_global_transpose": 0.12202560901641846, "cast_x": 0.03780052065849304, "cast_g": 0.00860169529914856, "cast_w": 0.06864592432975769, "time_standard": 0.3488026559352875, "time_rowwise": 0.7389597594738007, "time_global": 0.5575604736804962} +{"repeat": 64, "batch_size": 2048, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.22610649466514587, "standard_gw": 0.2229548990726471, "standard_gx": 0.22150203585624695, "rowwise_fwd": 
0.1421608030796051, "rowwise_bwd": 0.10771304368972778, "global_fwd": 0.13930723071098328, "global_bwd": 0.10715052485466003, "x_quantize_rowwise": 0.02812594175338745, "g_quantize_rowwise": 0.04733726382255554, "w_quantize_rowwise": 0.03758445382118225, "w_quantize_colwise_transpose": 0.29515475034713745, "w_quantize_global": 0.11344626545906067, "w_quantize_global_transpose": 0.12392178177833557, "cast_x": 0.013589859008789062, "cast_g": 0.08285418152809143, "cast_w": 0.06850436329841614, "time_standard": 0.6705634295940399, "time_rowwise": 0.8810311555862427, "time_global": 0.7822439074516296} +{"repeat": 64, "batch_size": 2048, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.20173192024230957, "standard_gw": 0.2351999282836914, "standard_gx": 0.24710968136787415, "rowwise_fwd": 0.12035667896270752, "rowwise_bwd": 0.153418630361557, "global_fwd": 0.11473894119262695, "global_bwd": 0.14553219079971313, "x_quantize_rowwise": 0.04762038588523865, "g_quantize_rowwise": 0.02557411789894104, "w_quantize_rowwise": 0.04055723547935486, "w_quantize_colwise_transpose": 0.32641738653182983, "w_quantize_global": 0.1138448715209961, "w_quantize_global_transpose": 0.12255832552909851, "cast_x": 0.08405372500419617, "cast_g": 0.013835728168487549, "cast_w": 0.06961449980735779, "time_standard": 0.6840415298938751, "time_rowwise": 0.9491443634033203, "time_global": 0.8050687611103058} +{"repeat": 64, "batch_size": 4096, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.48126280307769775, "standard_gw": 0.46824291348457336, "standard_gx": 0.45252591371536255, "rowwise_fwd": 0.2749897539615631, "rowwise_bwd": 0.2111680805683136, "global_fwd": 0.2689175307750702, "global_bwd": 0.2104043960571289, "x_quantize_rowwise": 0.02676248550415039, "g_quantize_rowwise": 0.0842660665512085, "w_quantize_rowwise": 0.037495046854019165, "w_quantize_colwise_transpose": 0.2952851355075836, "w_quantize_global": 0.11366978287696838, "w_quantize_global_transpose": 0.12461841106414795, "cast_x": 0.0283755362033844, "cast_g": 0.1590624451637268, "cast_w": 0.06854161620140076, "time_standard": 1.4020316302776337, "time_rowwise": 1.3982094824314117, "time_global": 1.2968815863132477} +{"repeat": 64, "batch_size": 4096, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.4076175391674042, "standard_gw": 0.45526400208473206, "standard_gx": 0.4996545612812042, "rowwise_fwd": 0.238761305809021, "rowwise_bwd": 0.2913624048233032, "global_fwd": 0.2149641513824463, "global_bwd": 0.2717897295951843, "x_quantize_rowwise": 0.0845976173877716, "g_quantize_rowwise": 0.0266246497631073, "w_quantize_rowwise": 0.04038959741592407, "w_quantize_colwise_transpose": 0.33299997448921204, "w_quantize_global": 0.11374801397323608, "w_quantize_global_transpose": 0.12202560901641846, "cast_x": 0.15895813703536987, "cast_g": 0.028312206268310547, "cast_w": 0.06841868162155151, "time_standard": 1.3625361025333405, "time_rowwise": 1.4699995517730713, "time_global": 1.2890137732028961} +{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 1.02214515209198, "standard_gw": 0.9412020444869995, "standard_gx": 0.883936882019043, "rowwise_fwd": 0.5209781229496002, "rowwise_bwd": 0.41617080569267273, "global_fwd": 0.5089044570922852, "global_bwd": 0.4142932593822479, "x_quantize_rowwise": 0.03763660788536072, "g_quantize_rowwise": 0.15798211097717285, "w_quantize_rowwise": 0.0375211238861084, 
"w_quantize_colwise_transpose": 0.2973228693008423, "w_quantize_global": 0.11317431926727295, "w_quantize_global_transpose": 0.12396648526191711, "cast_x": 0.0685863196849823, "cast_g": 0.311531126499176, "cast_w": 0.0685080885887146, "time_standard": 2.8472840785980225, "time_rowwise": 2.4088136851787567, "time_global": 2.2971592843532562} +{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.8539073169231415, "standard_gw": 0.9352751076221466, "standard_gx": 0.9567439556121826, "rowwise_fwd": 0.4599541425704956, "rowwise_bwd": 0.531073659658432, "global_fwd": 0.42063742876052856, "global_bwd": 0.5125999450683594, "x_quantize_rowwise": 0.1581348478794098, "g_quantize_rowwise": 0.03755837678909302, "w_quantize_rowwise": 0.04056468605995178, "w_quantize_colwise_transpose": 0.3295913338661194, "w_quantize_global": 0.11314079165458679, "w_quantize_global_transpose": 0.12153387069702148, "cast_x": 0.3114752471446991, "cast_g": 0.06850063800811768, "cast_w": 0.06839632987976074, "time_standard": 2.7459263801574707, "time_rowwise": 2.492152154445648, "time_global": 2.2988803684711456} +{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 2.0550191402435303, "standard_gw": 1.7850138247013092, "standard_gx": 1.7571337521076202, "rowwise_fwd": 1.026798039674759, "rowwise_bwd": 0.8242167532444, "global_fwd": 1.0042376816272736, "global_bwd": 0.8189938962459564, "x_quantize_rowwise": 0.0688992440700531, "g_quantize_rowwise": 0.3054179251194, "w_quantize_rowwise": 0.03757700324058533, "w_quantize_colwise_transpose": 0.2973712980747223, "w_quantize_global": 0.11324509978294373, "w_quantize_global_transpose": 0.12398511171340942, "cast_x": 0.13050436973571777, "cast_g": 0.6165280938148499, "cast_w": 0.06848573684692383, "time_standard": 5.59716671705246, "time_rowwise": 4.345294088125229, "time_global": 4.2197927832603455} +{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 1.79310142993927, "standard_gw": 1.7801076173782349, "standard_gx": 1.9140169024467468, "rowwise_fwd": 0.8629709482192993, "rowwise_bwd": 1.0353922843933105, "global_fwd": 0.8200556039810181, "global_bwd": 1.002725213766098, "x_quantize_rowwise": 0.30517578125, "g_quantize_rowwise": 0.06880238652229309, "w_quantize_rowwise": 0.040318816900253296, "w_quantize_colwise_transpose": 0.3413744270801544, "w_quantize_global": 0.11326000094413757, "w_quantize_global_transpose": 0.12197345495223999, "cast_x": 0.6162337958812714, "cast_g": 0.13053417205810547, "cast_w": 0.06848946213722229, "time_standard": 5.487225949764252, "time_rowwise": 4.4341422617435455, "time_global": 4.212100058794022} +{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 4.0736086666584015, "standard_gw": 3.595758229494095, "standard_gx": 3.7020929157733917, "rowwise_fwd": 2.0306408405303955, "rowwise_bwd": 1.635722815990448, "global_fwd": 1.9890740513801575, "global_bwd": 1.627359539270401, "x_quantize_rowwise": 0.13131648302078247, "g_quantize_rowwise": 0.6001107394695282, "w_quantize_rowwise": 0.03781542181968689, "w_quantize_colwise_transpose": 0.2975836396217346, "w_quantize_global": 0.11357292532920837, "w_quantize_global_transpose": 0.12416765093803406, "cast_x": 0.2544410526752472, "cast_g": 1.2265890836715698, "cast_w": 0.06866827607154846, "time_standard": 11.371459811925888, "time_rowwise": 8.32894816994667, 
"time_global": 8.181359618902206} +{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 3.525231033563614, "standard_gw": 3.489706665277481, "standard_gx": 3.9937011897563934, "rowwise_fwd": 1.6627348959445953, "rowwise_bwd": 2.0311400294303894, "global_fwd": 1.6270726919174194, "global_bwd": 1.988884061574936, "x_quantize_rowwise": 0.5999915301799774, "g_quantize_rowwise": 0.1310594379901886, "w_quantize_rowwise": 0.04043802618980408, "w_quantize_colwise_transpose": 0.32950565218925476, "w_quantize_global": 0.11298432946205139, "w_quantize_global_transpose": 0.12201443314552307, "cast_x": 1.2257546186447144, "cast_g": 0.25444477796554565, "cast_w": 0.06848573684692383, "time_standard": 11.008638888597488, "time_rowwise": 8.28457623720169, "time_global": 8.071713149547577} +{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 8.123598992824554, "standard_gw": 8.085217326879501, "standard_gx": 7.293816655874252, "rowwise_fwd": 4.07782569527626, "rowwise_bwd": 3.196723759174347, "global_fwd": 4.001103341579437, "global_bwd": 3.1843744218349457, "x_quantize_rowwise": 0.2560615539550781, "g_quantize_rowwise": 1.1893659830093384, "w_quantize_rowwise": 0.037297606468200684, "w_quantize_colwise_transpose": 0.29668211936950684, "w_quantize_global": 0.11358782649040222, "w_quantize_global_transpose": 0.12476742267608643, "cast_x": 0.5020052194595337, "cast_g": 2.4454034864902496, "cast_w": 0.0684782862663269, "time_standard": 23.502632975578308, "time_rowwise": 17.139174044132233, "time_global": 16.95447787642479} +{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 6.932958960533142, "standard_gw": 7.0609524846076965, "standard_gx": 7.460080087184906, "rowwise_fwd": 3.1809918582439423, "rowwise_bwd": 4.078391939401627, "global_fwd": 3.185112029314041, "global_bwd": 3.99089977145195, "x_quantize_rowwise": 1.1891834437847137, "g_quantize_rowwise": 0.25588274002075195, "w_quantize_rowwise": 0.0406019389629364, "w_quantize_colwise_transpose": 0.3389529883861542, "w_quantize_global": 0.11313334107398987, "w_quantize_global_transpose": 0.12241676449775696, "cast_x": 2.4446770548820496, "cast_g": 0.5022138357162476, "cast_w": 0.06857141852378845, "time_standard": 21.453991532325745, "time_rowwise": 16.14495739340782, "time_global": 15.9175805747509} +{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 16.38999581336975, "standard_gw": 15.075922012329102, "standard_gx": 14.479495584964752, "rowwise_fwd": 8.128684014081955, "rowwise_bwd": 6.41091912984848, "global_fwd": 7.977847009897232, "global_bwd": 6.362702697515488, "x_quantize_rowwise": 0.5057230591773987, "g_quantize_rowwise": 2.3681968450546265, "w_quantize_rowwise": 0.037435442209243774, "w_quantize_colwise_transpose": 0.29555708169937134, "w_quantize_global": 0.11360272765159607, "w_quantize_global_transpose": 0.12426823377609253, "cast_x": 0.997692346572876, "cast_g": 4.8848651349544525, "cast_w": 0.0685565173625946, "time_standard": 45.945413410663605, "time_rowwise": 32.82243758440018, "time_global": 32.528262585401535} +{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 14.838922768831253, "standard_gw": 15.112213790416718, "standard_gx": 14.869242906570435, "rowwise_fwd": 6.402213126420975, "rowwise_bwd": 8.132629096508026, 
"global_fwd": 6.36359304189682, "global_bwd": 7.9823993146419525, "x_quantize_rowwise": 2.367999404668808, "g_quantize_rowwise": 0.5056969821453094, "w_quantize_rowwise": 0.04053488373756409, "w_quantize_colwise_transpose": 0.3559887409210205, "w_quantize_global": 0.1136288046836853, "w_quantize_global_transpose": 0.125102698802948, "cast_x": 4.880473017692566, "cast_g": 0.9965412318706512, "cast_w": 0.06855279207229614, "time_standard": 44.820379465818405, "time_rowwise": 32.91727602481842, "time_global": 32.57063403725624} +{"repeat": 64, "batch_size": 1024, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.15426427125930786, "standard_gw": 0.14531239867210388, "standard_gx": 0.1703128218650818, "rowwise_fwd": 0.09618699550628662, "rowwise_bwd": 0.10633841156959534, "global_fwd": 0.09483471512794495, "global_bwd": 0.10636076331138611, "x_quantize_rowwise": 0.02434849739074707, "g_quantize_rowwise": 0.026009976863861084, "w_quantize_rowwise": 0.04366040229797363, "w_quantize_colwise_transpose": 0.34148991107940674, "w_quantize_global": 0.13587623834609985, "w_quantize_global_transpose": 0.14698877930641174, "cast_x": 0.009745359420776367, "cast_g": 0.03773719072341919, "cast_w": 0.08277222514152527, "time_standard": 0.46988949179649353, "time_rowwise": 0.7833465933799744, "time_global": 0.6797313690185547} +{"repeat": 64, "batch_size": 1024, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.16738846898078918, "standard_gw": 0.14199689030647278, "standard_gx": 0.15476346015930176, "rowwise_fwd": 0.11660531163215637, "rowwise_bwd": 0.1050308346748352, "global_fwd": 0.11050701141357422, "global_bwd": 0.09868666529655457, "x_quantize_rowwise": 0.02781301736831665, "g_quantize_rowwise": 0.024966895580291748, "w_quantize_rowwise": 0.047437846660614014, "w_quantize_colwise_transpose": 0.5995631217956543, "w_quantize_global": 0.1362822949886322, "w_quantize_global_transpose": 0.14807283878326416, "cast_x": 0.0377558171749115, "cast_g": 0.00973045825958252, "cast_w": 0.0828281044960022, "time_standard": 0.4641488194465637, "time_rowwise": 1.063413918018341, "time_global": 0.6883256137371063} +{"repeat": 64, "batch_size": 2048, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.2727396786212921, "standard_gw": 0.2711080014705658, "standard_gx": 0.3120154142379761, "rowwise_fwd": 0.16424059867858887, "rowwise_bwd": 0.17686933279037476, "global_fwd": 0.161685049533844, "global_bwd": 0.17517060041427612, "x_quantize_rowwise": 0.025484710931777954, "g_quantize_rowwise": 0.047635287046432495, "w_quantize_rowwise": 0.04380941390991211, "w_quantize_colwise_transpose": 0.3401711583137512, "w_quantize_global": 0.13605505228042603, "w_quantize_global_transpose": 0.14705583453178406, "cast_x": 0.01584365963935852, "cast_g": 0.08274242281913757, "cast_w": 0.08281320333480835, "time_standard": 0.855863094329834, "time_rowwise": 1.0693185031414032, "time_global": 0.9641945362091064} +{"repeat": 64, "batch_size": 2048, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.28916075825691223, "standard_gw": 0.29472261667251587, "standard_gx": 0.30096620321273804, "rowwise_fwd": 0.19618868827819824, "rowwise_bwd": 0.17556175589561462, "global_fwd": 0.18328800797462463, "global_bwd": 0.16647577285766602, "x_quantize_rowwise": 0.047441571950912476, "g_quantize_rowwise": 0.026609748601913452, "w_quantize_rowwise": 0.04766508936882019, "w_quantize_colwise_transpose": 0.6060972809791565, "w_quantize_global": 
0.1363418996334076, "w_quantize_global_transpose": 0.14806538820266724, "cast_x": 0.08295103907585144, "cast_g": 0.015836209058761597, "cast_w": 0.08285045623779297, "time_standard": 0.8848495781421661, "time_rowwise": 1.3942867517471313, "time_global": 1.0029450058937073} +{"repeat": 64, "batch_size": 4096, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.6430819630622864, "standard_gw": 0.5622953176498413, "standard_gx": 0.5780421197414398, "rowwise_fwd": 0.318676233291626, "rowwise_bwd": 0.29438361525535583, "global_fwd": 0.31290948390960693, "global_bwd": 0.290747731924057, "x_quantize_rowwise": 0.027455389499664307, "g_quantize_rowwise": 0.08405372500419617, "w_quantize_rowwise": 0.04369765520095825, "w_quantize_colwise_transpose": 0.34110620617866516, "w_quantize_global": 0.1360774040222168, "w_quantize_global_transpose": 0.14697015285491943, "cast_x": 0.037614256143569946, "cast_g": 0.15922263264656067, "cast_w": 0.08288025856018066, "time_standard": 1.7834194004535675, "time_rowwise": 1.671668142080307, "time_global": 1.560509204864502} +{"repeat": 64, "batch_size": 4096, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.551275908946991, "standard_gw": 0.591665506362915, "standard_gx": 0.6067268550395966, "rowwise_fwd": 0.33493712544441223, "rowwise_bwd": 0.32918527722358704, "global_fwd": 0.29528141021728516, "global_bwd": 0.31659379601478577, "x_quantize_rowwise": 0.08441135287284851, "g_quantize_rowwise": 0.025656074285507202, "w_quantize_rowwise": 0.04745647311210632, "w_quantize_colwise_transpose": 0.5993843078613281, "w_quantize_global": 0.1359879970550537, "w_quantize_global_transpose": 0.14815106987953186, "cast_x": 0.15932321548461914, "cast_g": 0.037439167499542236, "cast_w": 0.08288398385047913, "time_standard": 1.7496682703495026, "time_rowwise": 2.0126961171627045, "time_global": 1.5977472066879272} +{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 1.2295916676521301, "standard_gw": 1.116037368774414, "standard_gx": 1.1164769530296326, "rowwise_fwd": 0.603698194026947, "rowwise_bwd": 0.5168020725250244, "global_fwd": 0.5922466516494751, "global_bwd": 0.5151033401489258, "x_quantize_rowwise": 0.0437907874584198, "g_quantize_rowwise": 0.157918781042099, "w_quantize_rowwise": 0.044032931327819824, "w_quantize_colwise_transpose": 0.34073740243911743, "w_quantize_global": 0.13559311628341675, "w_quantize_global_transpose": 0.14679506421089172, "cast_x": 0.08263811469078064, "cast_g": 0.3115162253379822, "cast_w": 0.08287280797958374, "time_standard": 3.4621059894561768, "time_rowwise": 2.8230175375938416, "time_global": 2.707485109567642} +{"repeat": 64, "batch_size": 8192, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 1.090865582227707, "standard_gw": 1.1468492448329926, "standard_gx": 1.1166594922542572, "rowwise_fwd": 0.5559474229812622, "rowwise_bwd": 0.6105974316596985, "global_fwd": 0.5200020968914032, "global_bwd": 0.592011958360672, "x_quantize_rowwise": 0.15802308917045593, "g_quantize_rowwise": 0.04357844591140747, "w_quantize_rowwise": 0.04709511995315552, "w_quantize_colwise_transpose": 0.5969703197479248, "w_quantize_global": 0.13620033860206604, "w_quantize_global_transpose": 0.148136168718338, "cast_x": 0.31115859746932983, "cast_g": 0.08263811469078064, "cast_w": 0.08268281817436218, "time_standard": 3.3543743193149567, "time_rowwise": 3.159061074256897, "time_global": 2.744801342487335} +{"repeat": 64, "batch_size": 
16384, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 2.4665743112564087, "standard_gw": 2.1993443369865417, "standard_gx": 2.1993033587932587, "rowwise_fwd": 1.192428171634674, "rowwise_bwd": 1.023314893245697, "global_fwd": 1.1711902916431427, "global_bwd": 1.0202191770076752, "x_quantize_rowwise": 0.08077174425125122, "g_quantize_rowwise": 0.30520185828208923, "w_quantize_rowwise": 0.043783336877822876, "w_quantize_colwise_transpose": 0.339999794960022, "w_quantize_global": 0.13628602027893066, "w_quantize_global_transpose": 0.14696642756462097, "cast_x": 0.15902891755104065, "cast_g": 0.6164535880088806, "cast_w": 0.08285418152809143, "time_standard": 6.865222007036209, "time_rowwise": 5.184844136238098, "time_global": 5.059979856014252} +{"repeat": 64, "batch_size": 16384, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 2.1861791610717773, "standard_gw": 2.157818526029587, "standard_gx": 2.321537584066391, "rowwise_fwd": 1.0536126792430878, "rowwise_bwd": 1.1971630156040192, "global_fwd": 1.02127343416214, "global_bwd": 1.1707991361618042, "x_quantize_rowwise": 0.30522048473358154, "g_quantize_rowwise": 0.08065253496170044, "w_quantize_rowwise": 0.04741176962852478, "w_quantize_colwise_transpose": 0.5979575216770172, "w_quantize_global": 0.1362040638923645, "w_quantize_global_transpose": 0.14854222536087036, "cast_x": 0.6162486970424652, "cast_g": 0.1591891050338745, "cast_w": 0.08288398385047913, "time_standard": 6.665535271167755, "time_rowwise": 5.439836531877518, "time_global": 5.020510405302048} +{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 4.891645163297653, "standard_gw": 4.233300685882568, "standard_gx": 4.2071714997291565, "rowwise_fwd": 2.3616664111614227, "rowwise_bwd": 1.9419342279434204, "global_fwd": 2.3244209587574005, "global_bwd": 1.9598640501499176, "x_quantize_rowwise": 0.15483051538467407, "g_quantize_rowwise": 0.6008371710777283, "w_quantize_rowwise": 0.043839216232299805, "w_quantize_colwise_transpose": 0.3400743007659912, "w_quantize_global": 0.1362822949886322, "w_quantize_global_transpose": 0.14691054821014404, "cast_x": 0.31141936779022217, "cast_g": 1.2254081666469574, "cast_w": 0.08280202746391296, "time_standard": 13.332117348909378, "time_rowwise": 9.676482528448105, "time_global": 9.556446224451065} +{"repeat": 64, "batch_size": 32768, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 4.267625510692596, "standard_gw": 4.237007349729538, "standard_gx": 4.666488617658615, "rowwise_fwd": 1.9670464098453522, "rowwise_bwd": 2.362079918384552, "global_fwd": 1.9469596445560455, "global_bwd": 2.32585147023201, "x_quantize_rowwise": 0.6000921130180359, "g_quantize_rowwise": 0.15481188893318176, "w_quantize_rowwise": 0.04725530743598938, "w_quantize_colwise_transpose": 0.5976222455501556, "w_quantize_global": 0.13619661331176758, "w_quantize_global_transpose": 0.14815852046012878, "cast_x": 1.2261345982551575, "cast_g": 0.3117173910140991, "cast_w": 0.08279457688331604, "time_standard": 13.17112147808075, "time_rowwise": 9.965915232896805, "time_global": 9.549077600240707} +{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 9.787477552890778, "standard_gw": 8.533861488103867, "standard_gx": 8.979786187410355, "rowwise_fwd": 4.741787910461426, "rowwise_bwd": 3.871854394674301, "global_fwd": 4.674319177865982, "global_bwd": 3.9110779762268066, "x_quantize_rowwise": 
0.3025829792022705, "g_quantize_rowwise": 1.1898204684257507, "w_quantize_rowwise": 0.043705105781555176, "w_quantize_colwise_transpose": 0.33997371792793274, "w_quantize_global": 0.13592839241027832, "w_quantize_global_transpose": 0.14724954962730408, "cast_x": 0.6160177290439606, "cast_g": 2.4440810084342957, "cast_w": 0.08280575275421143, "time_standard": 27.301125228405, "time_rowwise": 19.023586064577103, "time_global": 18.89484003186226} +{"repeat": 64, "batch_size": 65536, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 8.461769670248032, "standard_gw": 8.428700268268585, "standard_gx": 9.447630494832993, "rowwise_fwd": 3.881257027387619, "rowwise_bwd": 4.7471001744270325, "global_fwd": 3.9101652801036835, "global_bwd": 4.662122577428818, "x_quantize_rowwise": 1.1892355978488922, "g_quantize_rowwise": 0.3024376928806305, "w_quantize_rowwise": 0.04708021879196167, "w_quantize_colwise_transpose": 0.5982778966426849, "w_quantize_global": 0.13624131679534912, "w_quantize_global_transpose": 0.1484602689743042, "cast_x": 2.4463236331939697, "cast_g": 0.6163865327835083, "cast_w": 0.08278340101242065, "time_standard": 26.33810043334961, "time_rowwise": 19.194088876247406, "time_global": 18.777363002300262} +{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 19.699689000844955, "standard_gw": 16.89574122428894, "standard_gx": 17.907552421092987, "rowwise_fwd": 9.453803300857544, "rowwise_bwd": 7.8153833746910095, "global_fwd": 9.313825517892838, "global_bwd": 7.8215524554252625, "x_quantize_rowwise": 0.5986690521240234, "g_quantize_rowwise": 2.368006855249405, "w_quantize_rowwise": 0.043682754039764404, "w_quantize_colwise_transpose": 0.3406330943107605, "w_quantize_global": 0.13626739382743835, "w_quantize_global_transpose": 0.14715641736984253, "cast_x": 1.2262165546417236, "cast_g": 4.8834048211574554, "cast_w": 0.08272379636764526, "time_standard": 54.50298264622688, "time_rowwise": 37.51591965556145, "time_global": 37.28121891617775} +{"repeat": 64, "batch_size": 131072, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 18.66700127720833, "standard_gw": 18.56840029358864, "standard_gx": 18.049821257591248, "rowwise_fwd": 7.742393761873245, "rowwise_bwd": 9.479016065597534, "global_fwd": 7.806576788425446, "global_bwd": 9.328477084636688, "x_quantize_rowwise": 2.368297427892685, "g_quantize_rowwise": 0.5978643894195557, "w_quantize_rowwise": 0.047303736209869385, "w_quantize_colwise_transpose": 0.5982741713523865, "w_quantize_global": 0.13678893446922302, "w_quantize_global_transpose": 0.1488029956817627, "cast_x": 4.880513995885849, "cast_g": 1.2248307466506958, "cast_w": 0.08270144462585449, "time_standard": 55.285222828388214, "time_rowwise": 39.401549845933914, "time_global": 38.955207914114} +{"repeat": 64, "batch_size": 1024, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 0.529509037733078, "standard_gw": 0.5781911313533783, "standard_gx": 0.6095841526985168, "rowwise_fwd": 0.2811029553413391, "rowwise_bwd": 0.3345906734466553, "global_fwd": 0.27928128838539124, "global_bwd": 0.33126771450042725, "x_quantize_rowwise": 0.025760382413864136, "g_quantize_rowwise": 0.06494298577308655, "w_quantize_rowwise": 0.15570968389511108, "w_quantize_colwise_transpose": 1.6086548566818237, "w_quantize_global": 0.481434166431427, "w_quantize_global_transpose": 0.505443662405014, "cast_x": 0.01582130789756775, "cast_g": 0.08295103907585144, "cast_w": 
0.311531126499176, "time_standard": 1.7172843217849731, "time_rowwise": 3.048952668905258, "time_global": 2.2663213312625885} +{"repeat": 64, "batch_size": 1024, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 0.5729459226131439, "standard_gw": 0.5789846181869507, "standard_gx": 0.5775243043899536, "rowwise_fwd": 0.36711618304252625, "rowwise_bwd": 0.2913735806941986, "global_fwd": 0.33703818917274475, "global_bwd": 0.2821236848831177, "x_quantize_rowwise": 0.064849853515625, "g_quantize_rowwise": 0.025060027837753296, "w_quantize_rowwise": 0.22537633776664734, "w_quantize_colwise_transpose": 3.6401040852069855, "w_quantize_global": 0.4818551242351532, "w_quantize_global_transpose": 0.5101114511489868, "cast_x": 0.08286535739898682, "cast_g": 0.015828758478164673, "cast_w": 0.3114677965641022, "time_standard": 1.7294548451900482, "time_rowwise": 5.192864686250687, "time_global": 2.2800229489803314} +{"repeat": 64, "batch_size": 2048, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 1.1735819280147552, "standard_gw": 1.121576875448227, "standard_gx": 1.1242404580116272, "rowwise_fwd": 0.5535706877708435, "rowwise_bwd": 0.5567893385887146, "global_fwd": 0.5486570298671722, "global_bwd": 0.551365315914154, "x_quantize_rowwise": 0.02710893750190735, "g_quantize_rowwise": 0.11784210801124573, "w_quantize_rowwise": 0.15565752983093262, "w_quantize_colwise_transpose": 1.607745885848999, "w_quantize_global": 0.4824437201023102, "w_quantize_global_transpose": 0.5060508847236633, "cast_x": 0.03808736801147461, "cast_g": 0.15912577509880066, "cast_w": 0.31150132417678833, "time_standard": 3.4193992614746094, "time_rowwise": 4.14029136300087, "time_global": 3.35504487156868} +{"repeat": 64, "batch_size": 2048, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 1.1169910430908203, "standard_gw": 1.1065900325775146, "standard_gx": 1.1815577745437622, "rowwise_fwd": 0.5917288362979889, "rowwise_bwd": 0.5614385008811951, "global_fwd": 0.5646944046020508, "global_bwd": 0.5500949919223785, "x_quantize_rowwise": 0.118207186460495, "g_quantize_rowwise": 0.025041401386260986, "w_quantize_rowwise": 0.22566691040992737, "w_quantize_colwise_transpose": 3.635551780462265, "w_quantize_global": 0.4815608263015747, "w_quantize_global_transpose": 0.509701669216156, "cast_x": 0.15912950038909912, "cast_g": 0.03797560930252075, "cast_w": 0.3114044666290283, "time_standard": 3.405138850212097, "time_rowwise": 6.264224648475647, "time_global": 3.3558905124664307} +{"repeat": 64, "batch_size": 4096, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 2.3259930312633514, "standard_gw": 2.1472275257110596, "standard_gx": 2.213582396507263, "rowwise_fwd": 1.0509602725505829, "rowwise_bwd": 0.9888559579849243, "global_fwd": 1.0398179292678833, "global_bwd": 0.9887740015983582, "x_quantize_rowwise": 0.04647299647331238, "g_quantize_rowwise": 0.22570788860321045, "w_quantize_rowwise": 0.1554824411869049, "w_quantize_colwise_transpose": 1.610085368156433, "w_quantize_global": 0.48134103417396545, "w_quantize_global_transpose": 0.5054809153079987, "cast_x": 0.08297711610794067, "cast_g": 0.3115646541118622, "cast_w": 0.31159818172454834, "time_standard": 6.686802953481674, "time_rowwise": 6.224792450666428, "time_global": 5.434822291135788} +{"repeat": 64, "batch_size": 4096, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 2.19760462641716, "standard_gw": 2.2860951721668243, "standard_gx": 
2.290956676006317, "rowwise_fwd": 1.0311491787433624, "rowwise_bwd": 1.0555200278759003, "global_fwd": 0.9858310222625732, "global_bwd": 1.0394863784313202, "x_quantize_rowwise": 0.22591277956962585, "g_quantize_rowwise": 0.046234577894210815, "w_quantize_rowwise": 0.22603943943977356, "w_quantize_colwise_transpose": 3.628809005022049, "w_quantize_global": 0.4819147288799286, "w_quantize_global_transpose": 0.5104243755340576, "cast_x": 0.3114528954029083, "cast_g": 0.08296966552734375, "cast_w": 0.3116317093372345, "time_standard": 6.7746564745903015, "time_rowwise": 8.499760180711746, "time_global": 5.575899034738541} +{"repeat": 64, "batch_size": 8192, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 4.633370786905289, "standard_gw": 4.397690296173096, "standard_gx": 4.286538809537888, "rowwise_fwd": 2.089906483888626, "rowwise_bwd": 1.9657425582408905, "global_fwd": 2.0679645240306854, "global_bwd": 1.9629858434200287, "x_quantize_rowwise": 0.08271634578704834, "g_quantize_rowwise": 0.43905526399612427, "w_quantize_rowwise": 0.1551508903503418, "w_quantize_colwise_transpose": 1.6106180846691132, "w_quantize_global": 0.48185884952545166, "w_quantize_global_transpose": 0.506274402141571, "cast_x": 0.15918537974357605, "cast_g": 0.6163418292999268, "cast_w": 0.311531126499176, "time_standard": 13.317599892616272, "time_rowwise": 10.74087992310524, "time_global": 9.938545525074005} +{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 4.424266517162323, "standard_gw": 4.391487687826157, "standard_gx": 4.61186096072197, "rowwise_fwd": 1.9874684512615204, "rowwise_bwd": 2.093140035867691, "global_fwd": 1.9647255539894104, "global_bwd": 2.06940621137619, "x_quantize_rowwise": 0.43999403715133667, "g_quantize_rowwise": 0.08271634578704834, "w_quantize_rowwise": 0.22581592202186584, "w_quantize_colwise_transpose": 3.631964325904846, "w_quantize_global": 0.4821456968784332, "w_quantize_global_transpose": 0.5102343857288361, "cast_x": 0.6164386868476868, "cast_g": 0.1591108739376068, "cast_w": 0.31154975295066833, "time_standard": 13.42761516571045, "time_rowwise": 12.852586805820465, "time_global": 9.940709918737411} +{"repeat": 64, "batch_size": 16384, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 9.229827672243118, "standard_gw": 8.319318294525146, "standard_gx": 8.652344346046448, "rowwise_fwd": 4.163607954978943, "rowwise_bwd": 3.778301179409027, "global_fwd": 4.121184349060059, "global_bwd": 3.7708766758441925, "x_quantize_rowwise": 0.1553669571876526, "g_quantize_rowwise": 0.8715838193893433, "w_quantize_rowwise": 0.15540048480033875, "w_quantize_colwise_transpose": 1.6092769801616669, "w_quantize_global": 0.4813969135284424, "w_quantize_global_transpose": 0.5070343613624573, "cast_x": 0.31150132417678833, "cast_g": 1.2259706854820251, "cast_w": 0.311482697725296, "time_standard": 26.201490312814713, "time_rowwise": 19.052855670452118, "time_global": 18.226761370897293} +{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 8.577890694141388, "standard_gw": 9.073298424482346, "standard_gx": 9.210295975208282, "rowwise_fwd": 3.7784352898597717, "rowwise_bwd": 4.165928810834885, "global_fwd": 3.7702471017837524, "global_bwd": 4.121150821447372, "x_quantize_rowwise": 0.868629664182663, "g_quantize_rowwise": 0.1554340124130249, "w_quantize_rowwise": 0.22614002227783203, "w_quantize_colwise_transpose": 3.6367811262607574, 
"w_quantize_global": 0.4828609526157379, "w_quantize_global_transpose": 0.510137528181076, "cast_x": 1.2258104979991913, "cast_g": 0.31299516558647156, "cast_w": 0.3114677965641022, "time_standard": 26.861485093832016, "time_rowwise": 21.90464735031128, "time_global": 18.981758505105972} +{"repeat": 64, "batch_size": 32768, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 18.52763444185257, "standard_gw": 17.835520207881927, "standard_gx": 17.375655472278595, "rowwise_fwd": 8.35346058011055, "rowwise_bwd": 7.584303617477417, "global_fwd": 8.300606161355972, "global_bwd": 7.550913840532303, "x_quantize_rowwise": 0.3016740083694458, "g_quantize_rowwise": 1.7321519553661346, "w_quantize_rowwise": 0.15538185834884644, "w_quantize_colwise_transpose": 1.6110800206661224, "w_quantize_global": 0.4815198481082916, "w_quantize_global_transpose": 0.5066357553005219, "cast_x": 0.6163753569126129, "cast_g": 2.4452805519104004, "cast_w": 0.31156837940216064, "time_standard": 53.73881012201309, "time_rowwise": 37.573572248220444, "time_global": 36.7090217769146} +{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 18.073823302984238, "standard_gw": 16.71283319592476, "standard_gx": 18.46104860305786, "rowwise_fwd": 7.542364299297333, "rowwise_bwd": 8.374195545911789, "global_fwd": 7.5644850730896, "global_bwd": 8.26016440987587, "x_quantize_rowwise": 1.7326027154922485, "g_quantize_rowwise": 0.30233338475227356, "w_quantize_rowwise": 0.2259574830532074, "w_quantize_colwise_transpose": 3.634512424468994, "w_quantize_global": 0.48204511404037476, "w_quantize_global_transpose": 0.5093887448310852, "cast_x": 2.445656806230545, "cast_g": 0.6163381040096283, "cast_w": 0.31144917011260986, "time_standard": 53.24770510196686, "time_rowwise": 38.524799048900604, "time_global": 35.56385263800621} +{"repeat": 64, "batch_size": 65536, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 36.123402416706085, "standard_gw": 32.68447890877724, "standard_gx": 34.13737937808037, "rowwise_fwd": 16.65867120027542, "rowwise_bwd": 15.004873275756836, "global_fwd": 16.536589711904526, "global_bwd": 14.949381351470947, "x_quantize_rowwise": 0.5952902138233185, "g_quantize_rowwise": 3.4581348299980164, "w_quantize_rowwise": 0.15559792518615723, "w_quantize_colwise_transpose": 1.6055963933467865, "w_quantize_global": 0.48203766345977783, "w_quantize_global_transpose": 0.5048215389251709, "cast_x": 1.2256354093551636, "cast_g": 4.875503480434418, "cast_w": 0.3110244870185852, "time_standard": 102.94526070356369, "time_rowwise": 70.16264274716377, "time_global": 69.210734218359} +{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 35.0223146378994, "standard_gw": 32.84081444144249, "standard_gx": 35.984884947538376, "rowwise_fwd": 15.018381178379059, "rowwise_bwd": 16.69919490814209, "global_fwd": 14.942582696676254, "global_bwd": 16.529250890016556, "x_quantize_rowwise": 3.442291170358658, "g_quantize_rowwise": 0.5951747298240662, "w_quantize_rowwise": 0.22576376795768738, "w_quantize_colwise_transpose": 3.621157258749008, "w_quantize_global": 0.48135966062545776, "w_quantize_global_transpose": 0.5095489323139191, "cast_x": 4.875205457210541, "cast_g": 1.2237727642059326, "cast_w": 0.3110431134700775, "time_standard": 103.84801402688026, "time_rowwise": 72.44277745485306, "time_global": 69.3410225212574} +{"repeat": 64, "batch_size": 131072, "dim_out": 16384, 
"dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 72.33698666095734, "standard_gw": 71.31465151906013, "standard_gx": 69.32922825217247, "rowwise_fwd": 33.37707370519638, "rowwise_bwd": 30.1642008125782, "global_fwd": 33.002063632011414, "global_bwd": 30.003495514392853, "x_quantize_rowwise": 1.1819563806056976, "g_quantize_rowwise": 6.896954029798508, "w_quantize_rowwise": 0.15557929873466492, "w_quantize_colwise_transpose": 1.6083605587482452, "w_quantize_global": 0.48125162720680237, "w_quantize_global_transpose": 0.5055665969848633, "cast_x": 2.442535012960434, "cast_g": 9.750165045261383, "cast_w": 0.31094998121261597, "time_standard": 212.98086643218994, "time_rowwise": 144.69877630472183, "time_global": 143.38593930006027} +{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 70.24158909916878, "standard_gw": 72.03734293580055, "standard_gx": 72.01339676976204, "rowwise_fwd": 30.072908848524094, "rowwise_bwd": 33.376410603523254, "global_fwd": 29.965493828058243, "global_bwd": 33.01112726330757, "x_quantize_rowwise": 6.894122809171677, "g_quantize_rowwise": 1.1817142367362976, "w_quantize_rowwise": 0.22567808628082275, "w_quantize_colwise_transpose": 3.616899251937866, "w_quantize_global": 0.4819147288799286, "w_quantize_global_transpose": 0.5107112228870392, "cast_x": 9.750377386808395, "cast_g": 2.4411343038082123, "cast_w": 0.31099095940589905, "time_standard": 214.29232880473137, "time_rowwise": 147.40507677197456, "time_global": 144.0824270248413} +{"repeat": 64, "batch_size": 65536, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 138.23134452104568, "standard_gw": 131.48364424705505, "standard_gx": 141.09868183732033, "rowwise_fwd": 65.38830325007439, "rowwise_bwd": 58.39048698544502, "global_fwd": 65.2194656431675, "global_bwd": 58.58004465699196, "x_quantize_rowwise": 1.1899955570697784, "g_quantize_rowwise": 6.623774766921997, "w_quantize_rowwise": 0.5935952067375183, "w_quantize_colwise_transpose": 24.08137544989586, "w_quantize_global": 1.740824431180954, "w_quantize_global_transpose": 1.8664970993995667, "cast_x": 2.413548529148102, "cast_g": 9.63655486702919, "cast_w": 1.1956281960010529, "time_standard": 410.81367060542107, "time_rowwise": 287.7511754631996, "time_global": 266.7042464017868} +{"repeat": 64, "batch_size": 65536, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 141.08363911509514, "standard_gw": 133.26667994260788, "standard_gx": 136.0350362956524, "rowwise_fwd": 58.49892646074295, "rowwise_bwd": 65.34496694803238, "global_fwd": 58.73573571443558, "global_bwd": 65.30505418777466, "x_quantize_rowwise": 6.648071110248566, "g_quantize_rowwise": 1.1903978884220123, "w_quantize_rowwise": 0.8329600095748901, "w_quantize_colwise_transpose": 15.297897160053253, "w_quantize_global": 1.7403066158294678, "w_quantize_global_transpose": 1.8791332840919495, "cast_x": 9.636614471673965, "cast_g": 2.4122819304466248, "cast_w": 1.1954344809055328, "time_standard": 410.3853553533554, "time_rowwise": 281.07989951968193, "time_global": 268.7653787434101} +{"repeat": 64, "batch_size": 1024, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 2.535879611968994, "standard_gw": 2.249978482723236, "standard_gx": 2.2262558341026306, "rowwise_fwd": 1.085665076971054, "rowwise_bwd": 1.069542020559311, "global_fwd": 1.0830685496330261, "global_bwd": 1.0597631335258484, "x_quantize_rowwise": 0.02650916576385498, "g_quantize_rowwise": 
0.1200847327709198, "w_quantize_rowwise": 0.5937665700912476, "w_quantize_colwise_transpose": 23.926906287670135, "w_quantize_global": 1.7397291958332062, "w_quantize_global_transpose": 1.8652454018592834, "cast_x": 0.03688782453536987, "cast_g": 0.15725940465927124, "cast_w": 1.1969134211540222, "time_standard": 7.012113928794861, "time_rowwise": 29.07245233654976, "time_global": 8.144378662109375} +{"repeat": 64, "batch_size": 1024, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 2.245493233203888, "standard_gw": 2.2966675460338593, "standard_gx": 2.216015011072159, "rowwise_fwd": 1.1000856757164001, "rowwise_bwd": 1.0902360081672668, "global_fwd": 1.0597333312034607, "global_bwd": 1.0812543332576752, "x_quantize_rowwise": 0.11992454528808594, "g_quantize_rowwise": 0.026784837245941162, "w_quantize_rowwise": 0.8310377597808838, "w_quantize_colwise_transpose": 15.30550792813301, "w_quantize_global": 1.7401352524757385, "w_quantize_global_transpose": 1.8841177225112915, "cast_x": 0.1573599874973297, "cast_g": 0.03676116466522217, "cast_w": 1.195952296257019, "time_standard": 6.758175790309906, "time_rowwise": 20.770244300365448, "time_global": 8.208617568016052} +{"repeat": 64, "batch_size": 2048, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 4.197858273983002, "standard_gw": 4.288379102945328, "standard_gx": 4.155721515417099, "rowwise_fwd": 2.0567886531352997, "rowwise_bwd": 1.9073635339736938, "global_fwd": 2.0506344735622406, "global_bwd": 1.9086338579654694, "x_quantize_rowwise": 0.04758685827255249, "g_quantize_rowwise": 0.22284314036369324, "w_quantize_rowwise": 0.5935467779636383, "w_quantize_colwise_transpose": 23.935042321681976, "w_quantize_global": 1.7397813498973846, "w_quantize_global_transpose": 1.8662959337234497, "cast_x": 0.08194148540496826, "cast_g": 0.3077872097492218, "cast_w": 1.1968687176704407, "time_standard": 12.641958892345428, "time_rowwise": 33.05155038833618, "time_global": 12.124154716730118} +{"repeat": 64, "batch_size": 2048, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 4.126541316509247, "standard_gw": 4.309836775064468, "standard_gx": 4.117351025342941, "rowwise_fwd": 1.9266381859779358, "rowwise_bwd": 2.0577237010002136, "global_fwd": 1.908630132675171, "global_bwd": 2.0505934953689575, "x_quantize_rowwise": 0.22304058074951172, "g_quantize_rowwise": 0.04766136407852173, "w_quantize_rowwise": 0.8306317031383514, "w_quantize_colwise_transpose": 15.309855341911316, "w_quantize_global": 1.7415396869182587, "w_quantize_global_transpose": 1.8827766180038452, "cast_x": 0.30782073736190796, "cast_g": 0.08186325430870056, "cast_w": 1.1955127120018005, "time_standard": 12.553729116916656, "time_rowwise": 24.70538765192032, "time_global": 12.164078652858734} +{"repeat": 64, "batch_size": 4096, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 8.298952132463455, "standard_gw": 8.345257490873337, "standard_gx": 8.647706359624863, "rowwise_fwd": 4.106882959604263, "rowwise_bwd": 3.8046911358833313, "global_fwd": 4.09451499581337, "global_bwd": 3.8078874349594116, "x_quantize_rowwise": 0.08447840809822083, "g_quantize_rowwise": 0.4291348159313202, "w_quantize_rowwise": 0.5934201180934906, "w_quantize_colwise_transpose": 23.843105882406235, "w_quantize_global": 1.7399191856384277, "w_quantize_global_transpose": 1.8653236329555511, "cast_x": 0.1577921211719513, "cast_g": 0.6089024245738983, "cast_w": 1.1952444911003113, "time_standard": 25.291915982961655, 
"time_rowwise": 41.2069708108902, "time_global": 20.366515964269638} +{"repeat": 64, "batch_size": 4096, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 8.323360234498978, "standard_gw": 8.433796465396881, "standard_gx": 8.236430585384369, "rowwise_fwd": 3.8114115595817566, "rowwise_bwd": 4.106346517801285, "global_fwd": 3.8080140948295593, "global_bwd": 4.094675183296204, "x_quantize_rowwise": 0.4288516938686371, "g_quantize_rowwise": 0.08437782526016235, "w_quantize_rowwise": 0.8310228586196899, "w_quantize_colwise_transpose": 15.306610614061356, "w_quantize_global": 1.741155982017517, "w_quantize_global_transpose": 1.8809586763381958, "cast_x": 0.6091706454753876, "cast_g": 0.157233327627182, "cast_w": 1.1953115463256836, "time_standard": 24.993587285280228, "time_rowwise": 33.00241753458977, "time_global": 20.471829921007156} +{"repeat": 64, "batch_size": 8192, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 16.656354069709778, "standard_gw": 17.066240310668945, "standard_gx": 17.252348363399506, "rowwise_fwd": 8.220307528972626, "rowwise_bwd": 7.2372183203697205, "global_fwd": 8.2036592066288, "global_bwd": 7.236208766698837, "x_quantize_rowwise": 0.15832111239433289, "g_quantize_rowwise": 0.8406005799770355, "w_quantize_rowwise": 0.5935393273830414, "w_quantize_colwise_transpose": 23.86143058538437, "w_quantize_global": 1.7401576042175293, "w_quantize_global_transpose": 1.8653534352779388, "cast_x": 0.3079026937484741, "cast_g": 1.209162175655365, "cast_w": 1.1951625347137451, "time_standard": 50.97494274377823, "time_rowwise": 57.97765776515007, "time_global": 37.11054101586342} +{"repeat": 64, "batch_size": 8192, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 17.398890107870102, "standard_gw": 18.470749258995056, "standard_gx": 16.520217061042786, "rowwise_fwd": 7.235266268253326, "rowwise_bwd": 8.207589387893677, "global_fwd": 7.235914468765259, "global_bwd": 8.204508572816849, "x_quantize_rowwise": 0.8409880101680756, "g_quantize_rowwise": 0.15821680426597595, "w_quantize_rowwise": 0.8324198424816132, "w_quantize_colwise_transpose": 15.305522829294205, "w_quantize_global": 1.7396919429302216, "w_quantize_global_transpose": 1.8805749714374542, "cast_x": 1.2103468179702759, "cast_g": 0.30729547142982483, "cast_w": 1.1953599750995636, "time_standard": 52.389856427907944, "time_rowwise": 51.05075240135193, "time_global": 38.53064402937889} +{"repeat": 64, "batch_size": 16384, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 33.533211797475815, "standard_gw": 33.00020843744278, "standard_gx": 34.614477306604385, "rowwise_fwd": 16.364943236112595, "rowwise_bwd": 14.551006257534027, "global_fwd": 16.33496955037117, "global_bwd": 14.513172209262848, "x_quantize_rowwise": 0.3053396940231323, "g_quantize_rowwise": 1.6693994402885437, "w_quantize_rowwise": 0.5936138331890106, "w_quantize_colwise_transpose": 23.89485388994217, "w_quantize_global": 1.741711050271988, "w_quantize_global_transpose": 1.8656104803085327, "cast_x": 0.6089657545089722, "cast_g": 2.4122074246406555, "cast_w": 1.1951886117458344, "time_standard": 101.14789754152298, "time_rowwise": 90.37936478853226, "time_global": 69.430410861969} +{"repeat": 64, "batch_size": 16384, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 33.65536406636238, "standard_gw": 33.02193805575371, "standard_gx": 33.10496360063553, "rowwise_fwd": 14.54489678144455, "rowwise_bwd": 16.36252924799919, "global_fwd": 
14.50401172041893, "global_bwd": 16.33254438638687, "x_quantize_rowwise": 1.6695670783519745, "g_quantize_rowwise": 0.3054291009902954, "w_quantize_rowwise": 0.83121657371521, "w_quantize_colwise_transpose": 15.305932611227036, "w_quantize_global": 1.7382949590682983, "w_quantize_global_transpose": 1.880194991827011, "cast_x": 2.412091940641403, "cast_g": 0.6079599261283875, "cast_w": 1.1950358748435974, "time_standard": 99.78226572275162, "time_rowwise": 82.04150944948196, "time_global": 69.45198029279709} +{"repeat": 64, "batch_size": 32768, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 67.96638667583466, "standard_gw": 67.99514591693878, "standard_gx": 69.66376304626465, "rowwise_fwd": 33.51752087473869, "rowwise_bwd": 29.131878167390823, "global_fwd": 32.65715390443802, "global_bwd": 29.13403883576393, "x_quantize_rowwise": 0.6002038717269897, "g_quantize_rowwise": 3.3336542546749115, "w_quantize_rowwise": 0.5934685468673706, "w_quantize_colwise_transpose": 23.92345294356346, "w_quantize_global": 1.7405375838279724, "w_quantize_global_transpose": 1.8656738102436066, "cast_x": 1.2112446129322052, "cast_g": 4.81804832816124, "cast_w": 1.1952146887779236, "time_standard": 205.6252956390381, "time_rowwise": 159.09532457590103, "time_global": 137.3264081776142} +{"repeat": 64, "batch_size": 32768, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 68.2341456413269, "standard_gw": 65.5074268579483, "standard_gx": 67.13805347681046, "rowwise_fwd": 29.153641313314438, "rowwise_bwd": 32.71844983100891, "global_fwd": 29.124341905117035, "global_bwd": 32.65979886054993, "x_quantize_rowwise": 3.3318176865577698, "g_quantize_rowwise": 0.6004795432090759, "w_quantize_rowwise": 0.8309967815876007, "w_quantize_colwise_transpose": 15.305690467357635, "w_quantize_global": 1.7405711114406586, "w_quantize_global_transpose": 1.8802620470523834, "cast_x": 4.8183538019657135, "cast_g": 1.2096390128135681, "cast_w": 1.1951103806495667, "time_standard": 200.87962597608566, "time_rowwise": 147.44850248098373, "time_global": 134.84469801187515} +{"repeat": 64, "batch_size": 1024, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.07764250040054321, "standard_gw": 0.07398426532745361, "standard_gx": 0.08482858538627625, "rowwise_fwd": 0.05266070365905762, "rowwise_bwd": 0.04478543996810913, "global_fwd": 0.052012503147125244, "global_bwd": 0.044364482164382935, "x_quantize_rowwise": 0.02640858292579651, "g_quantize_rowwise": 0.02539902925491333, "w_quantize_rowwise": 0.026457011699676514, "w_quantize_colwise_transpose": 0.17770379781723022, "w_quantize_global": 0.07440149784088135, "w_quantize_global_transpose": 0.08142739534378052, "cast_x": 0.008150935173034668, "cast_g": 0.022415071725845337, "cast_w": 0.03479421138763428, "time_standard": 0.23645535111427307, "time_rowwise": 0.42739883065223694, "time_global": 0.3779977560043335} +{"repeat": 64, "batch_size": 1024, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.08524581789970398, "standard_gw": 0.07383152842521667, "standard_gx": 0.07564574480056763, "rowwise_fwd": 0.04478171467781067, "rowwise_bwd": 0.052671879529953, "global_fwd": 0.04452839493751526, "global_bwd": 0.05219504237174988, "x_quantize_rowwise": 0.025328248739242554, "g_quantize_rowwise": 0.027123838663101196, "w_quantize_rowwise": 0.025607645511627197, "w_quantize_colwise_transpose": 0.17121434211730957, "w_quantize_global": 0.07916614413261414, "w_quantize_global_transpose": 
0.08177384734153748, "cast_x": 0.022619962692260742, "cast_g": 0.008556991815567017, "cast_w": 0.034421682357788086, "time_standard": 0.23472309112548828, "time_rowwise": 0.42055919766426086, "time_global": 0.3839470446109772} +{"repeat": 64, "batch_size": 2048, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.13731792569160461, "standard_gw": 0.13414397835731506, "standard_gx": 0.14049187302589417, "rowwise_fwd": 0.10158121585845947, "rowwise_bwd": 0.07804110646247864, "global_fwd": 0.09908527135848999, "global_bwd": 0.07766112685203552, "x_quantize_rowwise": 0.026516616344451904, "g_quantize_rowwise": 0.03666803240776062, "w_quantize_rowwise": 0.024981796741485596, "w_quantize_colwise_transpose": 0.17706677317619324, "w_quantize_global": 0.07443130016326904, "w_quantize_global_transpose": 0.07870793342590332, "cast_x": 0.01224130392074585, "cast_g": 0.05828961730003357, "cast_w": 0.03501400351524353, "time_standard": 0.41195377707481384, "time_rowwise": 0.5789995193481445, "time_global": 0.5272142589092255} +{"repeat": 64, "batch_size": 2048, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.14651194214820862, "standard_gw": 0.14011189341545105, "standard_gx": 0.140264630317688, "rowwise_fwd": 0.081576406955719, "rowwise_bwd": 0.10671466588973999, "global_fwd": 0.08158013224601746, "global_bwd": 0.10219961404800415, "x_quantize_rowwise": 0.03775954246520996, "g_quantize_rowwise": 0.026103109121322632, "w_quantize_rowwise": 0.02656877040863037, "w_quantize_colwise_transpose": 0.17822161316871643, "w_quantize_global": 0.07506832480430603, "w_quantize_global_transpose": 0.07928535342216492, "cast_x": 0.05893409252166748, "cast_g": 0.012326985597610474, "cast_w": 0.03498047590255737, "time_standard": 0.42688846588134766, "time_rowwise": 0.5970560014247894, "time_global": 0.5421079695224762} +{"repeat": 64, "batch_size": 4096, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.2734065055847168, "standard_gw": 0.25558844208717346, "standard_gx": 0.29174983501434326, "rowwise_fwd": 0.173322856426239, "rowwise_bwd": 0.1515895128250122, "global_fwd": 0.17048418521881104, "global_bwd": 0.1506991684436798, "x_quantize_rowwise": 0.025950372219085693, "g_quantize_rowwise": 0.0653192400932312, "w_quantize_rowwise": 0.027138739824295044, "w_quantize_colwise_transpose": 0.17699971795082092, "w_quantize_global": 0.07373467087745667, "w_quantize_global_transpose": 0.07901713252067566, "cast_x": 0.02214685082435608, "cast_g": 0.11127442121505737, "cast_w": 0.03481656312942505, "time_standard": 0.8207447826862335, "time_rowwise": 0.8759088814258575, "time_global": 0.8207932114601135} +{"repeat": 64, "batch_size": 4096, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.27839839458465576, "standard_gw": 0.2537444233894348, "standard_gx": 0.28207898139953613, "rowwise_fwd": 0.16542896628379822, "rowwise_bwd": 0.18540024757385254, "global_fwd": 0.15722215175628662, "global_bwd": 0.17368420958518982, "x_quantize_rowwise": 0.06661936640739441, "g_quantize_rowwise": 0.027049332857131958, "w_quantize_rowwise": 0.025507062673568726, "w_quantize_colwise_transpose": 0.1741349697113037, "w_quantize_global": 0.07463246583938599, "w_quantize_global_transpose": 0.07879361510276794, "cast_x": 0.11301413178443909, "cast_g": 0.023346394300460815, "cast_w": 0.03505498170852661, "time_standard": 0.8142217993736267, "time_rowwise": 0.8978843688964844, "time_global": 0.8317455649375916} +{"repeat": 64, "batch_size": 8192, 
"dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.5755424499511719, "standard_gw": 0.5219094455242157, "standard_gx": 0.5992203950881958, "rowwise_fwd": 0.33193081617355347, "rowwise_bwd": 0.295441597700119, "global_fwd": 0.32791122794151306, "global_bwd": 0.2906434237957001, "x_quantize_rowwise": 0.0337548553943634, "g_quantize_rowwise": 0.1225881278514862, "w_quantize_rowwise": 0.024937093257904053, "w_quantize_colwise_transpose": 0.17729029059410095, "w_quantize_global": 0.0730752944946289, "w_quantize_global_transpose": 0.07835403084754944, "cast_x": 0.058166682720184326, "cast_g": 0.21592900156974792, "cast_w": 0.03454089164733887, "time_standard": 1.6966722905635834, "time_rowwise": 1.5078522264957428, "time_global": 1.4482364058494568} +{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.5104020237922668, "standard_gw": 0.5302242934703827, "standard_gx": 0.5842559039592743, "rowwise_fwd": 0.32220035791397095, "rowwise_bwd": 0.3576017916202545, "global_fwd": 0.2939775586128235, "global_bwd": 0.3313682973384857, "x_quantize_rowwise": 0.12369826436042786, "g_quantize_rowwise": 0.03423169255256653, "w_quantize_rowwise": 0.026501715183258057, "w_quantize_colwise_transpose": 0.16975775361061096, "w_quantize_global": 0.0768713653087616, "w_quantize_global_transpose": 0.08094683289527893, "cast_x": 0.21589547395706177, "cast_g": 0.05825608968734741, "cast_w": 0.03466010093688965, "time_standard": 1.6248822212219238, "time_rowwise": 1.5642158687114716, "time_global": 1.4713183045387268} +{"repeat": 64, "batch_size": 16384, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 1.194491982460022, "standard_gw": 1.0553859174251556, "standard_gx": 1.0726377367973328, "rowwise_fwd": 0.636763870716095, "rowwise_bwd": 0.5154944956302643, "global_fwd": 0.6281323730945587, "global_bwd": 0.5117170512676239, "x_quantize_rowwise": 0.062175095081329346, "g_quantize_rowwise": 0.23643672466278076, "w_quantize_rowwise": 0.025566667318344116, "w_quantize_colwise_transpose": 0.17768144607543945, "w_quantize_global": 0.07302314043045044, "w_quantize_global_transpose": 0.07866695523262024, "cast_x": 0.11140108108520508, "cast_g": 0.42498111724853516, "cast_w": 0.034831464290618896, "time_standard": 3.3225156366825104, "time_rowwise": 2.7095042169094086, "time_global": 2.645537257194519} +{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 1.0797791182994843, "standard_gw": 1.062549650669098, "standard_gx": 1.104947179555893, "rowwise_fwd": 0.5390122532844543, "rowwise_bwd": 0.6449781358242035, "global_fwd": 0.5145668983459473, "global_bwd": 0.6276033818721771, "x_quantize_rowwise": 0.23603439331054688, "g_quantize_rowwise": 0.062234699726104736, "w_quantize_rowwise": 0.02781301736831665, "w_quantize_colwise_transpose": 0.1703314483165741, "w_quantize_global": 0.07431954145431519, "w_quantize_global_transpose": 0.08028373122215271, "cast_x": 0.4249885678291321, "cast_g": 0.1113303005695343, "cast_w": 0.0348016619682312, "time_standard": 3.247275948524475, "time_rowwise": 2.742953598499298, "time_global": 2.657592296600342} +{"repeat": 64, "batch_size": 32768, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 2.392485737800598, "standard_gw": 2.046734094619751, "standard_gx": 2.177651971578598, "rowwise_fwd": 1.252591609954834, "rowwise_bwd": 1.0205842554569244, "global_fwd": 1.230098307132721, "global_bwd": 
1.0132193565368652, "x_quantize_rowwise": 0.11823698878288269, "g_quantize_rowwise": 0.4639141261577606, "w_quantize_rowwise": 0.02602487802505493, "w_quantize_colwise_transpose": 0.17801672220230103, "w_quantize_global": 0.07301196455955505, "w_quantize_global_transpose": 0.07893890142440796, "cast_x": 0.21591037511825562, "cast_g": 0.843394547700882, "cast_w": 0.03460049629211426, "time_standard": 6.616871803998947, "time_rowwise": 5.106102675199509, "time_global": 5.0241537392139435} +{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 2.205628901720047, "standard_gw": 1.9917488098144531, "standard_gx": 2.1518059074878693, "rowwise_fwd": 1.040138304233551, "rowwise_bwd": 1.2538731098175049, "global_fwd": 1.0131187736988068, "global_bwd": 1.2291893362998962, "x_quantize_rowwise": 0.46381354331970215, "g_quantize_rowwise": 0.11790916323661804, "w_quantize_rowwise": 0.027123838663101196, "w_quantize_colwise_transpose": 0.17021596431732178, "w_quantize_global": 0.0752471387386322, "w_quantize_global_transpose": 0.08159875869750977, "cast_x": 0.8433908224105835, "cast_g": 0.215873122215271, "cast_w": 0.03452599048614502, "time_standard": 6.349183619022369, "time_rowwise": 5.064822733402252, "time_global": 4.972625523805618} +{"repeat": 64, "batch_size": 65536, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 4.755370318889618, "standard_gw": 4.736289381980896, "standard_gx": 4.0378570556640625, "rowwise_fwd": 2.4783052504062653, "rowwise_bwd": 1.9634142518043518, "global_fwd": 2.435591071844101, "global_bwd": 1.9498206675052643, "x_quantize_rowwise": 0.22948533296585083, "g_quantize_rowwise": 0.9186491370201111, "w_quantize_rowwise": 0.028233975172042847, "w_quantize_colwise_transpose": 0.17858296632766724, "w_quantize_global": 0.07418543100357056, "w_quantize_global_transpose": 0.07958710193634033, "cast_x": 0.4257224500179291, "cast_g": 1.680031418800354, "cast_w": 0.03458559513092041, "time_standard": 13.529516756534576, "time_rowwise": 10.532960295677185, "time_global": 10.423608124256134} +{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 4.050172865390778, "standard_gw": 3.916766494512558, "standard_gx": 4.281226545572281, "rowwise_fwd": 1.9789263606071472, "rowwise_bwd": 2.477586269378662, "global_fwd": 1.9495487213134766, "global_bwd": 2.434592694044113, "x_quantize_rowwise": 0.918261706829071, "g_quantize_rowwise": 0.22961944341659546, "w_quantize_rowwise": 0.025540590286254883, "w_quantize_colwise_transpose": 0.17032772302627563, "w_quantize_global": 0.07384642958641052, "w_quantize_global_transpose": 0.08105114102363586, "cast_x": 1.679886132478714, "cast_g": 0.42508915066719055, "cast_w": 0.03442913293838501, "time_standard": 12.248165905475616, "time_rowwise": 9.717028588056564, "time_global": 9.60368663072586} +{"repeat": 64, "batch_size": 131072, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 9.53347235918045, "standard_gw": 8.138865232467651, "standard_gx": 7.9666972160339355, "rowwise_fwd": 4.984956234693527, "rowwise_bwd": 3.850068897008896, "global_fwd": 4.9025751650333405, "global_bwd": 3.820303827524185, "x_quantize_rowwise": 0.45222043991088867, "g_quantize_rowwise": 1.8290691077709198, "w_quantize_rowwise": 0.026736408472061157, "w_quantize_colwise_transpose": 0.17832592129707336, "w_quantize_global": 0.07471069693565369, "w_quantize_global_transpose": 0.08177757263183594, "cast_x": 
0.8435025811195374, "cast_g": 3.3529214560985565, "cast_w": 0.03475695848464966, "time_standard": 25.639034807682037, "time_rowwise": 19.460242241621017, "time_global": 19.299522042274475} +{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 7.996037602424622, "standard_gw": 8.2748644053936, "standard_gx": 8.523400872945786, "rowwise_fwd": 3.8556940853595734, "rowwise_bwd": 4.966288805007935, "global_fwd": 3.820043057203293, "global_bwd": 4.882067441940308, "x_quantize_rowwise": 1.8279887735843658, "g_quantize_rowwise": 0.4520900547504425, "w_quantize_rowwise": 0.02676248550415039, "w_quantize_colwise_transpose": 0.17083808779716492, "w_quantize_global": 0.07691606879234314, "w_quantize_global_transpose": 0.08223950862884521, "cast_x": 3.3530443906784058, "cast_g": 0.8434318006038666, "cast_w": 0.034671276807785034, "time_standard": 24.794302880764008, "time_rowwise": 19.574526697397232, "time_global": 19.416209310293198} +{"repeat": 64, "batch_size": 1024, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.09413063526153564, "standard_gw": 0.10038167238235474, "standard_gx": 0.09725615382194519, "rowwise_fwd": 0.05979463458061218, "rowwise_bwd": 0.0525452196598053, "global_fwd": 0.059057027101516724, "global_bwd": 0.05194917321205139, "x_quantize_rowwise": 0.02664700150489807, "g_quantize_rowwise": 0.02642720937728882, "w_quantize_rowwise": 0.030562281608581543, "w_quantize_colwise_transpose": 0.2400912344455719, "w_quantize_global": 0.09407848119735718, "w_quantize_global_transpose": 0.10256841778755188, "cast_x": 0.008724629878997803, "cast_g": 0.028502196073532104, "cast_w": 0.05552172660827637, "time_standard": 0.29176846146583557, "time_rowwise": 0.5364492535591125, "time_global": 0.4611089825630188} +{"repeat": 64, "batch_size": 1024, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.09753555059432983, "standard_gw": 0.10102242231369019, "standard_gx": 0.09121373295783997, "rowwise_fwd": 0.052150338888168335, "rowwise_bwd": 0.059779733419418335, "global_fwd": 0.05161017179489136, "global_bwd": 0.05943328142166138, "x_quantize_rowwise": 0.026702880859375, "g_quantize_rowwise": 0.02469494938850403, "w_quantize_rowwise": 0.03324449062347412, "w_quantize_colwise_transpose": 0.23468583822250366, "w_quantize_global": 0.09394437074661255, "w_quantize_global_transpose": 0.10142102837562561, "cast_x": 0.028360635042190552, "cast_g": 0.008717179298400879, "cast_w": 0.05577504634857178, "time_standard": 0.28977170586586, "time_rowwise": 0.5322806537151337, "time_global": 0.4588291049003601} +{"repeat": 64, "batch_size": 2048, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.18056854605674744, "standard_gw": 0.18374621868133545, "standard_gx": 0.19219890236854553, "rowwise_fwd": 0.1150965690612793, "rowwise_bwd": 0.0903494656085968, "global_fwd": 0.11263042688369751, "global_bwd": 0.08984282612800598, "x_quantize_rowwise": 0.027067959308624268, "g_quantize_rowwise": 0.040043145418167114, "w_quantize_rowwise": 0.03063306212425232, "w_quantize_colwise_transpose": 0.24128705263137817, "w_quantize_global": 0.09361281991004944, "w_quantize_global_transpose": 0.1024976372718811, "cast_x": 0.01381710171699524, "cast_g": 0.06845593452453613, "cast_w": 0.05572289228439331, "time_standard": 0.5565136671066284, "time_rowwise": 0.7282234728336334, "time_global": 0.6494410336017609} +{"repeat": 64, "batch_size": 2048, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": 
true, "standard_fwd": 0.16536936163902283, "standard_gw": 0.19479170441627502, "standard_gx": 0.18597766757011414, "rowwise_fwd": 0.09634345769882202, "rowwise_bwd": 0.11937320232391357, "global_fwd": 0.09264424443244934, "global_bwd": 0.11524930596351624, "x_quantize_rowwise": 0.04038214683532715, "g_quantize_rowwise": 0.025559216737747192, "w_quantize_rowwise": 0.03334507346153259, "w_quantize_colwise_transpose": 0.23956596851348877, "w_quantize_global": 0.09445473551750183, "w_quantize_global_transpose": 0.1020580530166626, "cast_x": 0.06891414523124695, "cast_g": 0.013861805200576782, "cast_w": 0.05607306957244873, "time_standard": 0.546138733625412, "time_rowwise": 0.7493607699871063, "time_global": 0.6651394069194794} +{"repeat": 64, "batch_size": 4096, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.36064907908439636, "standard_gw": 0.3711991012096405, "standard_gx": 0.3863237798213959, "rowwise_fwd": 0.22270530462265015, "rowwise_bwd": 0.1760348677635193, "global_fwd": 0.21781772375106812, "global_bwd": 0.17484650015830994, "x_quantize_rowwise": 0.02625212073326111, "g_quantize_rowwise": 0.07131323218345642, "w_quantize_rowwise": 0.030372291803359985, "w_quantize_colwise_transpose": 0.23974105715751648, "w_quantize_global": 0.09407475590705872, "w_quantize_global_transpose": 0.1024492084980011, "cast_x": 0.028584152460098267, "cast_g": 0.1303069293498993, "cast_w": 0.05582347512245178, "time_standard": 1.1181719601154327, "time_rowwise": 1.137617975473404, "time_global": 1.057952642440796} +{"repeat": 64, "batch_size": 4096, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.32703205943107605, "standard_gw": 0.3764517605304718, "standard_gx": 0.3938935697078705, "rowwise_fwd": 0.18771737813949585, "rowwise_bwd": 0.2374798059463501, "global_fwd": 0.1843757927417755, "global_bwd": 0.23005902767181396, "x_quantize_rowwise": 0.07155537605285645, "g_quantize_rowwise": 0.02625212073326111, "w_quantize_rowwise": 0.03294646739959717, "w_quantize_colwise_transpose": 0.23755058646202087, "w_quantize_global": 0.09388476610183716, "w_quantize_global_transpose": 0.10246038436889648, "cast_x": 0.13131648302078247, "cast_g": 0.028781592845916748, "cast_w": 0.05638599395751953, "time_standard": 1.0973773896694183, "time_rowwise": 1.1699534952640533, "time_global": 1.0850392282009125} +{"repeat": 64, "batch_size": 8192, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.7961541414260864, "standard_gw": 0.7424280047416687, "standard_gx": 0.8688867092132568, "rowwise_fwd": 0.432576984167099, "rowwise_bwd": 0.34543126821517944, "global_fwd": 0.4248805344104767, "global_bwd": 0.3432855010032654, "x_quantize_rowwise": 0.03750622272491455, "g_quantize_rowwise": 0.13292208313941956, "w_quantize_rowwise": 0.030599534511566162, "w_quantize_colwise_transpose": 0.24292618036270142, "w_quantize_global": 0.09351596236228943, "w_quantize_global_transpose": 0.1026056706905365, "cast_x": 0.06843730807304382, "cast_g": 0.2539418637752533, "cast_w": 0.05568563938140869, "time_standard": 2.407468855381012, "time_rowwise": 1.9643902778625488, "time_global": 1.8771439790725708} +{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.7150471210479736, "standard_gw": 0.7525831460952759, "standard_gx": 0.8075274527072906, "rowwise_fwd": 0.36595389246940613, "rowwise_bwd": 0.4404708743095398, "global_fwd": 0.3485158085823059, "global_bwd": 0.4275962710380554, "x_quantize_rowwise": 
0.1329965889453888, "g_quantize_rowwise": 0.03767386078834534, "w_quantize_rowwise": 0.03295019268989563, "w_quantize_colwise_transpose": 0.23509934544563293, "w_quantize_global": 0.09398534893989563, "w_quantize_global_transpose": 0.10186433792114258, "cast_x": 0.2537667751312256, "cast_g": 0.06839632987976074, "cast_w": 0.05571544170379639, "time_standard": 2.27515771985054, "time_rowwise": 1.9977279007434845, "time_global": 1.8952153623104095} +{"repeat": 64, "batch_size": 16384, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 1.6392990946769714, "standard_gw": 1.4941170811653137, "standard_gx": 1.4451220631599426, "rowwise_fwd": 0.8369758725166321, "rowwise_bwd": 0.6830468773841858, "global_fwd": 0.8197203278541565, "global_bwd": 0.6782263517379761, "x_quantize_rowwise": 0.06883591413497925, "g_quantize_rowwise": 0.2565309405326843, "w_quantize_rowwise": 0.03046169877052307, "w_quantize_colwise_transpose": 0.2430342137813568, "w_quantize_global": 0.09346380829811096, "w_quantize_global_transpose": 0.10301917791366577, "cast_x": 0.13044849038124084, "cast_g": 0.5010999739170074, "cast_w": 0.05590170621871948, "time_standard": 4.578538239002228, "time_rowwise": 3.613002598285675, "time_global": 3.5139136016368866} +{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 1.4654621481895447, "standard_gw": 1.5012174844741821, "standard_gx": 1.5183314681053162, "rowwise_fwd": 0.7059797644615173, "rowwise_bwd": 0.8470229804515839, "global_fwd": 0.6788894534111023, "global_bwd": 0.8200779557228088, "x_quantize_rowwise": 0.2564750611782074, "g_quantize_rowwise": 0.06899237632751465, "w_quantize_rowwise": 0.03293529152870178, "w_quantize_colwise_transpose": 0.23559853434562683, "w_quantize_global": 0.09375810623168945, "w_quantize_global_transpose": 0.10203942656517029, "cast_x": 0.5010105669498444, "cast_g": 0.13037025928497314, "cast_w": 0.05577504634857178, "time_standard": 4.485011100769043, "time_rowwise": 3.648221492767334, "time_global": 3.521449863910675} +{"repeat": 64, "batch_size": 32768, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 3.236088901758194, "standard_gw": 2.8601549565792084, "standard_gx": 2.8000958263874054, "rowwise_fwd": 1.6548968851566315, "rowwise_bwd": 1.3559646904468536, "global_fwd": 1.6249343752861023, "global_bwd": 1.3474412262439728, "x_quantize_rowwise": 0.13122707605361938, "g_quantize_rowwise": 0.5038455128669739, "w_quantize_rowwise": 0.03061816096305847, "w_quantize_colwise_transpose": 0.24301931262016296, "w_quantize_global": 0.09343400597572327, "w_quantize_global_transpose": 0.10178983211517334, "cast_x": 0.25383010506629944, "cast_g": 0.9955987334251404, "cast_w": 0.05569681525230408, "time_standard": 8.896339684724808, "time_rowwise": 6.779726594686508, "time_global": 6.662826985120773} +{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 2.8433389961719513, "standard_gw": 2.861086279153824, "standard_gx": 3.0227042734622955, "rowwise_fwd": 1.4057457447052002, "rowwise_bwd": 1.6565024852752686, "global_fwd": 1.3475008308887482, "global_bwd": 1.6247481107711792, "x_quantize_rowwise": 0.5038045346736908, "g_quantize_rowwise": 0.13130158185958862, "w_quantize_rowwise": 0.03298744559288025, "w_quantize_colwise_transpose": 0.23539364337921143, "w_quantize_global": 0.09393692016601562, "w_quantize_global_transpose": 0.10208785533905029, "cast_x": 0.9952597320079803, "cast_g": 
0.25385990738868713, "cast_w": 0.05589798092842102, "time_standard": 8.72712954878807, "time_rowwise": 6.826821714639664, "time_global": 6.664466112852097} +{"repeat": 64, "batch_size": 65536, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 6.449159234762192, "standard_gw": 6.384443491697311, "standard_gx": 5.543403327465057, "rowwise_fwd": 3.3065229654312134, "rowwise_bwd": 2.6249960064888, "global_fwd": 3.2497718930244446, "global_bwd": 2.6061534881591797, "x_quantize_rowwise": 0.25821104645729065, "g_quantize_rowwise": 0.9981803596019745, "w_quantize_rowwise": 0.030606985092163086, "w_quantize_colwise_transpose": 0.24094432592391968, "w_quantize_global": 0.09358301758766174, "w_quantize_global_transpose": 0.10264664888381958, "cast_x": 0.5018562078475952, "cast_g": 1.9840113818645477, "cast_w": 0.05584210157394409, "time_standard": 18.37700605392456, "time_rowwise": 13.843905180692673, "time_global": 13.692989945411682} +{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 5.508493632078171, "standard_gw": 5.689781159162521, "standard_gx": 6.020743399858475, "rowwise_fwd": 2.640843391418457, "rowwise_bwd": 3.3075474202632904, "global_fwd": 2.605751156806946, "global_bwd": 3.2674334943294525, "x_quantize_rowwise": 0.9983181953430176, "g_quantize_rowwise": 0.25597214698791504, "w_quantize_rowwise": 0.03277510404586792, "w_quantize_colwise_transpose": 0.23587048053741455, "w_quantize_global": 0.09367987513542175, "w_quantize_global_transpose": 0.10236725211143494, "cast_x": 1.9848868250846863, "cast_g": 0.5010329186916351, "cast_w": 0.055771321058273315, "time_standard": 17.219018191099167, "time_rowwise": 13.161107897758484, "time_global": 13.013303279876709} +{"repeat": 64, "batch_size": 131072, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 12.975204735994339, "standard_gw": 11.424731463193893, "standard_gx": 11.05477660894394, "rowwise_fwd": 6.623122841119766, "rowwise_bwd": 5.253363400697708, "global_fwd": 6.506938487291336, "global_bwd": 5.211424082517624, "x_quantize_rowwise": 0.5057789385318756, "g_quantize_rowwise": 1.9870363175868988, "w_quantize_rowwise": 0.030517578125, "w_quantize_colwise_transpose": 0.24361908435821533, "w_quantize_global": 0.09384006261825562, "w_quantize_global_transpose": 0.10285153985023499, "cast_x": 0.9967051446437836, "cast_g": 3.9620958268642426, "cast_w": 0.05599111318588257, "time_standard": 35.45471280813217, "time_rowwise": 26.068169623613358, "time_global": 25.83260089159012} +{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 11.05555146932602, "standard_gw": 11.32136583328247, "standard_gx": 12.035444378852844, "rowwise_fwd": 5.243867635726929, "rowwise_bwd": 6.622854620218277, "global_fwd": 5.209986120462418, "global_bwd": 6.507329642772675, "x_quantize_rowwise": 1.9862838089466095, "g_quantize_rowwise": 0.506080687046051, "w_quantize_rowwise": 0.03318488597869873, "w_quantize_colwise_transpose": 0.23682788014411926, "w_quantize_global": 0.09349361062049866, "w_quantize_global_transpose": 0.1023709774017334, "cast_x": 3.962486982345581, "cast_g": 0.9956248104572296, "cast_w": 0.05572289228439331, "time_standard": 34.412361681461334, "time_rowwise": 25.950465351343155, "time_global": 25.726910680532455} diff --git a/tests/triton_tests/info_mlp.jsonl b/tests/triton_tests/info_mlp.jsonl new file mode 100644 index 0000000..a2076ee --- /dev/null +++ 
b/tests/triton_tests/info_mlp.jsonl @@ -0,0 +1,20 @@ +{"repeat": 32, "batch_size": 16384, "dim": 1024, "standard": 3.807276487350464, "my_standard": 4.196919500827789, "standard_compiled": 3.771558403968811, "sb": 3.5132691264152527} +{"repeat": 32, "batch_size": 32768, "dim": 1024, "standard": 7.215872406959534, "my_standard": 7.991522550582886, "standard_compiled": 7.241688668727875, "sb": 6.581142544746399} +{"repeat": 32, "batch_size": 65536, "dim": 1024, "standard": 14.26444947719574, "my_standard": 15.685759484767914, "standard_compiled": 14.251746237277985, "sb": 12.735314667224884} +{"repeat": 32, "batch_size": 131072, "dim": 1024, "standard": 28.49559485912323, "my_standard": 31.26966953277588, "standard_compiled": 28.414390981197357, "sb": 25.319166481494904} +{"repeat": 32, "batch_size": 16384, "dim": 1280, "standard": 5.887262523174286, "my_standard": 6.132654845714569, "standard_compiled": 5.902409553527832, "sb": 4.947789013385773} +{"repeat": 32, "batch_size": 32768, "dim": 1280, "standard": 11.14131510257721, "my_standard": 12.859955430030823, "standard_compiled": 11.133037507534027, "sb": 9.303092956542969} +{"repeat": 32, "batch_size": 65536, "dim": 1280, "standard": 22.193141281604767, "my_standard": 25.66336840391159, "standard_compiled": 22.22583442926407, "sb": 18.285617232322693} +{"repeat": 32, "batch_size": 131072, "dim": 1280, "standard": 44.23898458480835, "my_standard": 51.30268633365631, "standard_compiled": 44.08355802297592, "sb": 35.999126732349396} +{"repeat": 32, "batch_size": 16384, "dim": 1408, "standard": 6.938718259334564, "my_standard": 7.269218564033508, "standard_compiled": 6.94604218006134, "sb": 5.764961242675781} +{"repeat": 32, "batch_size": 32768, "dim": 1408, "standard": 13.04878294467926, "my_standard": 13.742901384830475, "standard_compiled": 13.011425733566284, "sb": 10.774023830890656} +{"repeat": 32, "batch_size": 65536, "dim": 1408, "standard": 26.738539338111877, "my_standard": 27.739346027374268, "standard_compiled": 26.75659954547882, "sb": 21.882005035877228} +{"repeat": 32, "batch_size": 131072, "dim": 1408, "standard": 51.905401051044464, "my_standard": 53.98637801408768, "standard_compiled": 51.8316924571991, "sb": 41.67725890874863} +{"repeat": 32, "batch_size": 16384, "dim": 1664, "standard": 9.233824908733368, "my_standard": 9.619377553462982, "standard_compiled": 9.214423596858978, "sb": 7.557623088359833} +{"repeat": 32, "batch_size": 32768, "dim": 1664, "standard": 17.324909567832947, "my_standard": 17.996780574321747, "standard_compiled": 17.29544997215271, "sb": 14.035224914550781} +{"repeat": 32, "batch_size": 65536, "dim": 1664, "standard": 35.51657497882843, "my_standard": 36.674730479717255, "standard_compiled": 35.43049842119217, "sb": 28.38330715894699} +{"repeat": 32, "batch_size": 131072, "dim": 1664, "standard": 69.0087378025055, "my_standard": 71.56594842672348, "standard_compiled": 68.82885098457336, "sb": 54.01633679866791} +{"repeat": 32, "batch_size": 16384, "dim": 2048, "standard": 12.590140104293823, "my_standard": 13.106442987918854, "standard_compiled": 12.606985867023468, "sb": 10.286301374435425} +{"repeat": 32, "batch_size": 32768, "dim": 2048, "standard": 24.830535054206848, "my_standard": 25.563716888427734, "standard_compiled": 24.895809590816498, "sb": 19.559212028980255} +{"repeat": 32, "batch_size": 65536, "dim": 2048, "standard": 49.55078661441803, "my_standard": 51.16480588912964, "standard_compiled": 49.739621579647064, "sb": 38.29141706228256} +{"repeat": 32, "batch_size": 131072, "dim": 
2048, "standard": 98.36294502019882, "my_standard": 102.69322991371155, "standard_compiled": 98.76712411642075, "sb": 75.88706165552139} diff --git a/tests/triton_tests/info_mlp_autocast.jsonl b/tests/triton_tests/info_mlp_autocast.jsonl new file mode 100644 index 0000000..f2098cc --- /dev/null +++ b/tests/triton_tests/info_mlp_autocast.jsonl @@ -0,0 +1,20 @@ +{"repeat": 32, "batch_size": 16384, "dim": 1024, "standard": 4.91420179605484, "my_standard": 5.577877163887024, "standard_compiled": 4.810944199562073, "sb": 4.512995481491089} +{"repeat": 32, "batch_size": 32768, "dim": 1024, "standard": 8.876129984855652, "my_standard": 10.154612362384796, "standard_compiled": 8.820965886116028, "sb": 8.367843925952911} +{"repeat": 32, "batch_size": 65536, "dim": 1024, "standard": 17.47015118598938, "my_standard": 19.857674837112427, "standard_compiled": 17.338842153549194, "sb": 15.992552042007446} +{"repeat": 32, "batch_size": 131072, "dim": 1024, "standard": 34.824438393116, "my_standard": 39.499424397945404, "standard_compiled": 34.56207364797592, "sb": 31.573951244354248} +{"repeat": 32, "batch_size": 16384, "dim": 1280, "standard": 7.342606782913208, "my_standard": 7.9323723912239075, "standard_compiled": 7.279552519321442, "sb": 6.395488977432251} +{"repeat": 32, "batch_size": 32768, "dim": 1280, "standard": 13.69999349117279, "my_standard": 16.0503089427948, "standard_compiled": 13.603456318378448, "sb": 11.813104152679443} +{"repeat": 32, "batch_size": 65536, "dim": 1280, "standard": 29.557034373283386, "my_standard": 34.2303067445755, "standard_compiled": 29.382556676864624, "sb": 22.882774472236633} +{"repeat": 32, "batch_size": 131072, "dim": 1280, "standard": 53.629085421562195, "my_standard": 63.07622790336609, "standard_compiled": 53.33048850297928, "sb": 44.76426541805267} +{"repeat": 32, "batch_size": 16384, "dim": 1408, "standard": 8.81417840719223, "my_standard": 9.477965533733368, "standard_compiled": 8.73943418264389, "sb": 7.479414343833923} +{"repeat": 32, "batch_size": 32768, "dim": 1408, "standard": 16.242466866970062, "my_standard": 17.616644501686096, "standard_compiled": 16.14125818014145, "sb": 13.665586709976196} +{"repeat": 32, "batch_size": 65536, "dim": 1408, "standard": 32.429613173007965, "my_standard": 34.80646014213562, "standard_compiled": 32.319076359272, "sb": 27.123987674713135} +{"repeat": 32, "batch_size": 131072, "dim": 1408, "standard": 62.85770237445831, "my_standard": 67.55391508340836, "standard_compiled": 62.453076243400574, "sb": 51.53566598892212} +{"repeat": 32, "batch_size": 16384, "dim": 1664, "standard": 11.585861444473267, "my_standard": 12.565858662128448, "standard_compiled": 11.504307389259338, "sb": 9.657211601734161} +{"repeat": 32, "batch_size": 32768, "dim": 1664, "standard": 21.261662244796753, "my_standard": 22.771358489990234, "standard_compiled": 21.12410217523575, "sb": 17.64291524887085} +{"repeat": 32, "batch_size": 65536, "dim": 1664, "standard": 42.85307973623276, "my_standard": 45.70870101451874, "standard_compiled": 42.57970303297043, "sb": 34.918561577796936} +{"repeat": 32, "batch_size": 131072, "dim": 1664, "standard": 83.56057852506638, "my_standard": 89.11971747875214, "standard_compiled": 83.05662125349045, "sb": 66.32210314273834} +{"repeat": 32, "batch_size": 16384, "dim": 2048, "standard": 15.7279372215271, "my_standard": 16.854502260684967, "standard_compiled": 15.655294060707092, "sb": 13.228952884674072} +{"repeat": 32, "batch_size": 32768, "dim": 2048, "standard": 30.42648732662201, "my_standard": 
32.26502239704132, "standard_compiled": 30.239209532737732, "sb": 24.354808032512665} +{"repeat": 32, "batch_size": 65536, "dim": 2048, "standard": 60.779355466365814, "my_standard": 64.11923468112946, "standard_compiled": 60.89268624782562, "sb": 46.91776633262634} +{"repeat": 32, "batch_size": 131072, "dim": 2048, "standard": 119.93677169084549, "my_standard": 128.19699943065643, "standard_compiled": 120.20225822925568, "sb": 92.3452153801918} diff --git a/tests/triton_tests/info_mlp_autocast_ln.jsonl b/tests/triton_tests/info_mlp_autocast_ln.jsonl new file mode 100644 index 0000000..706f949 --- /dev/null +++ b/tests/triton_tests/info_mlp_autocast_ln.jsonl @@ -0,0 +1,23 @@ +{"repeat": 32, "batch_size": 16384, "dim": 1024, "standard": 5.171686410903931, "my_standard": 5.839601159095764, "standard_compiled": 5.032263696193695, "sb": 4.89344447851181} +{"repeat": 32, "batch_size": 32768, "dim": 1024, "standard": 9.605035185813904, "my_standard": 10.910414159297943, "standard_compiled": 9.230785071849823, "sb": 9.128175675868988} +{"repeat": 32, "batch_size": 65536, "dim": 1024, "standard": 18.802084028720856, "my_standard": 21.311581134796143, "standard_compiled": 18.105976283550262, "sb": 17.489850521087646} +{"repeat": 32, "batch_size": 131072, "dim": 1024, "standard": 37.49683499336243, "my_standard": 42.40527004003525, "standard_compiled": 36.13145649433136, "sb": 34.58733111619949} +{"repeat": 32, "batch_size": 16384, "dim": 1280, "standard": 7.709823548793793, "my_standard": 8.290477097034454, "standard_compiled": 7.564418017864227, "sb": 6.8823546171188354} +{"repeat": 32, "batch_size": 32768, "dim": 1280, "standard": 14.64156061410904, "my_standard": 16.996942460536957, "standard_compiled": 14.4081711769104, "sb": 12.761622667312622} +{"repeat": 32, "batch_size": 65536, "dim": 1280, "standard": 31.40200674533844, "my_standard": 36.074504256248474, "standard_compiled": 30.981406569480896, "sb": 24.76389706134796} +{"repeat": 32, "batch_size": 131072, "dim": 1280, "standard": 56.93405121564865, "my_standard": 66.35250151157379, "standard_compiled": 56.07586354017258, "sb": 48.49743843078613} +{"repeat": 32, "batch_size": 16384, "dim": 1408, "standard": 9.188003838062286, "my_standard": 9.84550267457962, "standard_compiled": 9.006097912788391, "sb": 7.9473331570625305} +{"repeat": 32, "batch_size": 32768, "dim": 1408, "standard": 17.268165946006775, "my_standard": 18.64910125732422, "standard_compiled": 16.983114182949066, "sb": 14.70106840133667} +{"repeat": 32, "batch_size": 65536, "dim": 1408, "standard": 34.39047932624817, "my_standard": 36.69705241918564, "standard_compiled": 33.8401272892952, "sb": 29.188089072704315} +{"repeat": 32, "batch_size": 131072, "dim": 1408, "standard": 66.70494377613068, "my_standard": 71.27603143453598, "standard_compiled": 65.56134670972824, "sb": 55.6538850069046} +{"repeat": 32, "batch_size": 16384, "dim": 1664, "standard": 12.10707426071167, "my_standard": 12.931793928146362, "standard_compiled": 11.76995038986206, "sb": 10.228671133518219} +{"repeat": 32, "batch_size": 32768, "dim": 1664, "standard": 22.5130096077919, "my_standard": 23.962542414665222, "standard_compiled": 21.997176110744476, "sb": 18.89890432357788} +{"repeat": 32, "batch_size": 65536, "dim": 1664, "standard": 45.210108160972595, "my_standard": 47.94136434793472, "standard_compiled": 44.2262664437294, "sb": 37.37735003232956} +{"repeat": 32, "batch_size": 131072, "dim": 1664, "standard": 88.1955549120903, "my_standard": 93.6831533908844, "standard_compiled": 86.33609116077423, 
"sb": 71.23208791017532} +{"repeat": 32, "batch_size": 16384, "dim": 2048, "standard": 16.538940370082855, "my_standard": 17.607316374778748, "standard_compiled": 16.108587384223938, "sb": 14.030493795871735} +{"repeat": 32, "batch_size": 32768, "dim": 2048, "standard": 31.795650720596313, "my_standard": 33.57230871915817, "standard_compiled": 31.04180097579956, "sb": 25.971196591854095} +{"repeat": 32, "batch_size": 65536, "dim": 2048, "standard": 63.021354377269745, "my_standard": 66.8477788567543, "standard_compiled": 61.682507395744324, "sb": 50.138771533966064} +{"repeat": 32, "batch_size": 131072, "dim": 2048, "standard": 125.17062574625015, "my_standard": 133.60925763845444, "standard_compiled": 122.21191823482513, "sb": 98.40084612369537} +{"repeat": 32, "batch_size": 16384, "dim": 4096, "standard": 57.31645971536636, "my_standard": 60.84543466567993, "standard_compiled": 55.78199774026871, "sb": 45.43223977088928} +{"repeat": 32, "batch_size": 32768, "dim": 4096, "standard": 111.80306226015091, "my_standard": 119.0284714102745, "standard_compiled": 108.91905426979065, "sb": 85.4572057723999} +{"repeat": 32, "batch_size": 65536, "dim": 4096, "standard": 220.4471081495285, "my_standard": 233.0927476286888, "standard_compiled": 214.26431089639664, "sb": 163.30372542142868} diff --git a/tests/triton_tests/make_plot_with_info.py b/tests/triton_tests/make_plot_with_info.py new file mode 100644 index 0000000..116d1d1 --- /dev/null +++ b/tests/triton_tests/make_plot_with_info.py @@ -0,0 +1,137 @@ +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import os + +import matplotlib.gridspec as gridspec + +cmap=plt.get_cmap('cool') + +if __name__ == '__main__': + + fig = plt.figure(tight_layout=True, figsize=(12,3.5)) + gs = gridspec.GridSpec(1, 2) + + + ax = fig.add_subplot(gs[0, 0]) + + rdf = pd.read_json('tests/triton_tests/info.jsonl', lines=True) + df = rdf[rdf.batch_size == 32768] + + for k, marker, ls, color, name in [ + ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (sum of parts)'), + ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (sum of parts)'), + + ('standard_fwd', '^', '--', 'C2', 'Matmul XW (standard)'), + ('standard_gw', '^', '-.', 'C2', 'Matmul GW (standard)'), + ('standard_gx', '^', ':', 'gray', 'Matmul GX (both)'), + + ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), + ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), + + #### time_global = info['x_quantize_rowwise'] + info['g_quantize_rowwise'] + info['w_quantize_global'] + info['w_quantize_global_transpose'] + info['standard_gw'] + info['global_fwd'] + info['global_bwd'] + + ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), + ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), + ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), + ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'), + #('standard_gw', '.', '--', 'C1', 'standard_gw'), + ]: + xs = [] + ys = [] + for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]: + df_ = df[df.dim_in == embed_dim] + df_ = df_[df_.dim_out == embed_dim * 4] + xs.append(embed_dim) + y_ = 0 + for k_ in k.split('+'): + y_ += df_[k_].values[0] + df_ = df[df.dim_in == embed_dim * 4] + df_ = df_[df_.dim_out == embed_dim] + for k_ in k.split('+'): + y_ += df_[k_].values[0] + ys.append(y_ * 0.5) + + 
+ ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) + + + + + ax.set_xlabel('dim', fontsize=13) + ax.set_ylabel('time (ms)', fontsize=13) + # make a legend which is below the plot + + + + ax.grid() + + ax.set_xscale('log') + #ax.set_yscale('log') + + ax.tick_params(axis='x', labelsize=11) + ax.tick_params(axis='y', labelsize=11) + + ax.set_xticks([1024, 2048, 4096]) + ax.set_xticklabels([1024, 2048, 4096]) + ax.set_xticks([], minor=True) + + leg = ax.legend(loc='upper center', bbox_to_anchor=(-0.64, 1.), ncol=1, fontsize=10) + leg.get_texts()[0].set_fontweight('bold') + leg.get_texts()[1].set_fontweight('bold') + plt.subplots_adjust(left=0.1) + ax.set_title(' Linear layer, batch * sequence length = 32k', fontsize=10, loc='left', y=1.05, pad=-20) + + + ax = fig.add_subplot(gs[0, 1]) + + # now plot the % speedup for different batch sizes + for j, batch_size in enumerate([2**14, 2**15, 2**16, 2**17]): + all_xs, all_ys = [], [] + for k, marker, ls, color, name in [ + ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), + ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), + ]: + + xs, ys = [], [] + df = rdf[rdf.batch_size == batch_size] + for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]: + df_ = df[df.dim_in == embed_dim] + df_ = df_[df_.dim_out == embed_dim * 4] + xs.append(embed_dim) + y_ = 0 + for k_ in k.split('+'): + y_ += df_[k_].values[0] + df_ = df[df.dim_in == embed_dim * 4] + df_ = df_[df_.dim_out == embed_dim] + for k_ in k.split('+'): + y_ += df_[k_].values[0] + ys.append(y_ * 0.5) + all_xs.append(xs) + all_ys.append(ys) + + color = cmap(j * 0.25) + real_ys = [-((all_ys[1][i] - all_ys[0][i]) / all_ys[0][i]) * 100 for i in range(len(all_ys[0]))] + markers = ['^', 'v', 'P', 'o'] + ax.plot(all_xs[0], real_ys, color=color, label=f'batch * sequence length = {batch_size}', marker=markers[j], markersize=5 if marker=='s' else 5) + + ax.legend() + ax.set_xlabel('dim', fontsize=13) + ax.set_xscale('log') + ax.grid() + ax.set_ylabel(r'% speedup', fontsize=13) + + + ax.tick_params(axis='x', labelsize=11) + ax.tick_params(axis='y', labelsize=11) + + ax.set_xticks([1024, 2048, 4096]) + ax.set_xticklabels([1024, 2048, 4096]) + ax.set_xticks([], minor=True) + + ax.set_title(' Linear layer summary, varying dimensions', fontsize=10, loc='left', y=1.05, pad=-20) + + + + plt.savefig('tests/triton_tests/plot1.pdf', bbox_inches='tight') + diff --git a/tests/triton_tests/mlp.py b/tests/triton_tests/mlp.py new file mode 100644 index 0000000..1ec85b8 --- /dev/null +++ b/tests/triton_tests/mlp.py @@ -0,0 +1,64 @@ + +import time +import torch +import torch.nn as nn +import bitsandbytes.nn as bnn +from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, MyLinear + +def construct_model(dim, layers, module): + modules = [] + for _ in range(layers): + modules.append(module(dim, 4*dim)) + modules.append(module(4*dim, dim)) + return nn.Sequential(*modules).cuda().train() + +def get_time(model, x, name): + for _ in range(repeat // 2): + #with torch.cuda.amp.autocast(): + out = model(x) + #(2**16 * out.pow(2).mean()).backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + # with torch.cuda.amp.autocast(): + out = model(x) + #(2**16 * out.pow(2).mean()).backward() + + torch.cuda.synchronize() + end = 
time.time() + print(f"time {name}: {(end - start) / repeat * 1000:.3f} ms") + +if __name__ == '__main__': + torch.manual_seed(0) + + # hparams + repeat = 16 + dim = 2048 + layers = 4 + batch_size = 2 + sequence_length = 2**15 + + # construct models + standard = construct_model(dim, layers, nn.Linear).half() + my_standard = construct_model(dim, layers, MyLinear).half() + switchback = construct_model(dim, layers, SwitchBackLinear).half() + switchback_global = construct_model(dim, layers, SwitchBackGlobalLinear).half() + #bnb_8bitmixed = construct_model(dim, layers, bnn.Linear8bitLt) + + # simulate forward pass + x = torch.randn(batch_size * sequence_length, dim, dtype=torch.float16).cuda() + + # get time for forward and backward + get_time(standard, x, "standard") + get_time(my_standard, x, "my_standard") + get_time(switchback, x, "switchback") + get_time(switchback_global, x, "switchback_global") + #get_time(bnb_8bitmixed, x, "bnb_8bitmixed") + + + + + + + \ No newline at end of file diff --git a/tests/triton_tests/mlp_decomp_autocast.py b/tests/triton_tests/mlp_decomp_autocast.py new file mode 100644 index 0000000..3a1fc9e --- /dev/null +++ b/tests/triton_tests/mlp_decomp_autocast.py @@ -0,0 +1,166 @@ + +import torch +import json +from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear +import time + +if __name__ == '__main__': + + print('Starting') + + + for dim in [1024, 1280, 1408, 1664, 2048]: + for batch in [2**14, 2**15, 2**16, 2**17]: + + # NOTE: dim never equals 4096 in the sweep above, so this guard currently skips + # every configuration; it looks like a leftover filter from a one-off (dim=4096, batch=2**17) re-run + if dim != 4096 or batch != 2**17: + continue + + + x1 = torch.randn(batch, dim).cuda().requires_grad_(True) + d = 2 + + standard = torch.nn.Sequential( + torch.nn.Linear(dim, 4 * dim), + torch.nn.GELU(), + torch.nn.Linear(4 * dim, dim), + ).cuda() + + my_standard = torch.nn.Sequential( + MyLinear(dim, 4 * dim), + torch.nn.GELU(), + MyLinear(4 * dim, dim), + ).cuda() + + fused_mlp = SwitchBackGlobalMLP(dim, 4 * dim).cuda() + + sb = torch.nn.Sequential( + SwitchBackGlobalLinear(dim, 4 * dim), + torch.nn.GELU(), + SwitchBackGlobalLinear(4 * dim, dim), + ).cuda() + + standard_compiled = torch.compile(standard) + + print('Model part 2') + + repeat = 32 + + + info = {'repeat' : repeat, 'batch_size' : batch, 'dim' : dim} + + # k = 'standard' + # for _ in range(repeat // 2): + # with torch.cuda.amp.autocast(): + # out_standard = standard(x1) + # ((2 ** 16) * out_standard).abs().mean().backward() + + # torch.cuda.synchronize() + # start = time.time() + # for _ in range(repeat): + # with torch.cuda.amp.autocast(): + # out_standard = standard(x1) + # ((2 ** 16) * out_standard).abs().mean().backward() + + # torch.cuda.synchronize() + # end = time.time() + # ms = (end - start) / repeat * 1000 + # print(f"time {k}: {ms:.3f} ms") + # info[k] = ms + + + # x1.grad.zero_() + + # k = 'my_standard' + # for _ in range(repeat // 2): + # with torch.cuda.amp.autocast(): + # out_my_standard = my_standard(x1) + # ((2 ** 16) * out_my_standard).abs().mean().backward() + + # torch.cuda.synchronize() + # start = time.time() + # for _ in range(repeat): + # with torch.cuda.amp.autocast(): + # out_my_standard = my_standard(x1) + # ((2 ** 16) * out_my_standard).abs().mean().backward() + + # torch.cuda.synchronize() + # end = time.time() + # ms = (end - start) / repeat * 1000 + # print(f"time {k}: {ms:.3f} ms") + # info[k] = ms + + # x1.grad.zero_() + + # k = 'standard_compiled' + # for _ in range(repeat // 2): + # with torch.cuda.amp.autocast(): + # out_standard_compiled = standard_compiled(x1) + # ((2 ** 16) *
out_standard_compiled).abs().mean().backward() + + # torch.cuda.synchronize() + # start = time.time() + # for _ in range(repeat): + # with torch.cuda.amp.autocast(): + # out_standard_compiled = standard_compiled(x1) + # ((2 ** 16) * out_standard_compiled).abs().mean().backward() + + # torch.cuda.synchronize() + # end = time.time() + # ms = (end - start) / repeat * 1000 + # print(f"time {k}: {ms:.3f} ms") + # info[k] = ms + + # x1.grad.zero_() + + k = 'sb' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + info_json = json.dumps(info) + + + with open("tests/triton_tests/info_mlp_autocast.jsonl", "a") as file: + file.write(info_json + "\n") + + + #exit() + + # err_fused = (out_standard - out_fused).abs().mean() + # err_sb = (out_standard - out_sb).abs().mean() + # print('OUT', err_fused, err_sb) + + # err_fused = (standard[d].weight.grad - fused_mlp.linear2.weight.grad).abs().mean() + # err_sb = (standard[d].weight.grad - sb[d].weight.grad).abs().mean() + + # print('GW2', err_fused, err_sb) + + # err_fused = (standard[0].weight.grad - fused_mlp.linear1.weight.grad).abs().mean() + # err_sb = (standard[0].weight.grad - sb[0].weight.grad).abs().mean() + + # print('GW1', err_fused, err_sb) + + # err_fused = (x1.grad - x2.grad).abs().mean() + # err_sb = (x1.grad - x3.grad).abs().mean() + + # print('GX1', err_fused, err_sb) + + # import pdb; pdb.set_trace() + + + # # NO GELU, ST GRADIENTS, EVERYTHING FINE. 
\ No newline at end of file diff --git a/tests/triton_tests/mlp_decomp_autocast_ln.py b/tests/triton_tests/mlp_decomp_autocast_ln.py new file mode 100644 index 0000000..2596278 --- /dev/null +++ b/tests/triton_tests/mlp_decomp_autocast_ln.py @@ -0,0 +1,165 @@ + +import torch +import json +from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear +import time + +if __name__ == '__main__': + + print('Starting') + + + for dim in [1024, 1280, 1408, 1664, 2048]: + for batch in [2**14, 2**15, 2**16, 2**17]: + + x1 = torch.randn(batch, dim).cuda().requires_grad_(True) + d = 2 + + standard = torch.nn.Sequential( + torch.nn.LayerNorm(dim), + torch.nn.Linear(dim, 4 * dim), + torch.nn.GELU(), + torch.nn.Linear(4 * dim, dim), + ).cuda() + + my_standard = torch.nn.Sequential( + torch.nn.LayerNorm(dim), + MyLinear(dim, 4 * dim), + torch.nn.GELU(), + MyLinear(4 * dim, dim), + ).cuda() + + fused_mlp = SwitchBackGlobalMLP(dim, 4 * dim).cuda() + + sb = torch.nn.Sequential( + torch.nn.LayerNorm(dim), + SwitchBackGlobalLinear(dim, 4 * dim), + torch.nn.GELU(), + SwitchBackGlobalLinear(4 * dim, dim), + ).cuda() + + standard_compiled = torch.compile(standard) + + print('Model part 2') + + repeat = 32 + + + info = {'repeat' : repeat, 'batch_size' : batch, 'dim' : dim} + + k = 'standard' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_standard = standard(x1) + ((2 ** 16) * out_standard).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_standard = standard(x1) + ((2 ** 16) * out_standard).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + x1.grad.zero_() + + k = 'my_standard' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_my_standard = my_standard(x1) + ((2 ** 16) * out_my_standard).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_my_standard = my_standard(x1) + ((2 ** 16) * out_my_standard).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + k = 'standard_compiled' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_standard_compiled = standard_compiled(x1) + ((2 ** 16) * out_standard_compiled).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_standard_compiled = standard_compiled(x1) + ((2 ** 16) * out_standard_compiled).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + x1.grad.zero_() + + k = 'sb' + for _ in range(repeat // 2): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + with torch.cuda.amp.autocast(): + out_sb = sb(x1) + ((2 ** 16) * out_sb).abs().mean().backward() + + torch.cuda.synchronize() + end = time.time() + ms = (end - start) / repeat * 1000 + print(f"time {k}: {ms:.3f} ms") + info[k] = ms + + + info_json = json.dumps(info) + + + with open("tests/triton_tests/info_mlp_autocast_ln.jsonl", "a") as file: + file.write(info_json + "\n") + + +
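+            # (protocol for the timing blocks above: repeat // 2 warm-up iterations, then
+            # repeat timed iterations bracketed by torch.cuda.synchronize(); the 2**16
+            # multiplier acts as a static loss scale, presumably to keep the fp16 gradients
+            # produced under autocast from underflowing. Each (dim, batch) configuration
+            # appends one JSON record to tests/triton_tests/info_mlp_autocast_ln.jsonl,
+            # the data file added earlier in this patch.)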
+            #exit()
+
+            # err_fused = (out_standard - out_fused).abs().mean()
+            # err_sb = (out_standard - out_sb).abs().mean()
+            # print('OUT', err_fused, err_sb)
+
+            # err_fused = (standard[d].weight.grad - fused_mlp.linear2.weight.grad).abs().mean()
+            # err_sb = (standard[d].weight.grad - sb[d].weight.grad).abs().mean()
+
+            # print('GW2', err_fused, err_sb)
+
+            # err_fused = (standard[0].weight.grad - fused_mlp.linear1.weight.grad).abs().mean()
+            # err_sb = (standard[0].weight.grad - sb[0].weight.grad).abs().mean()
+
+            # print('GW1', err_fused, err_sb)
+
+            # err_fused = (x1.grad - x2.grad).abs().mean()
+            # err_sb = (x1.grad - x3.grad).abs().mean()
+
+            # print('GX1', err_fused, err_sb)
+
+            # import pdb; pdb.set_trace()
+
+            # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
\ No newline at end of file
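Note: the two benchmark scripts above share one measurement pattern: warm up for repeat // 2 iterations, call torch.cuda.synchronize() before starting and before stopping the clock (CUDA kernel launches are asynchronous, so an unsynchronized timer would mostly measure launch overhead), and scale the surrogate loss by 2 ** 16, presumably to mimic the constant loss scale that AMP grad scaling would apply in real fp16 training. A minimal standalone sketch of that pattern follows; benchmark_ms and its arguments are illustrative names, not part of this patch.

import time

import torch


def benchmark_ms(module, x, repeat=32):
    # Warmup: half as many iterations as the timed loop, mirroring the
    # scripts above; this absorbs one-time costs (kernel autotuning,
    # torch.compile graph capture) before anything is measured.
    for _ in range(repeat // 2):
        with torch.cuda.amp.autocast():
            out = module(x)
        ((2 ** 16) * out).abs().mean().backward()

    torch.cuda.synchronize()  # drain queued kernels before starting the clock
    start = time.time()
    for _ in range(repeat):
        with torch.cuda.amp.autocast():
            out = module(x)
        ((2 ** 16) * out).abs().mean().backward()
    torch.cuda.synchronize()  # wait for the timed kernels to finish
    return (time.time() - start) / repeat * 1000  # mean ms per fwd+bwd step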
diff --git a/tests/triton_tests/plot1.pdf b/tests/triton_tests/plot1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1fe71682174766b2d551d9aa055a72e6eb837737
Binary files /dev/null and b/tests/triton_tests/plot1.pdf differ
diff --git a/tests/triton_tests/plot1.png b/tests/triton_tests/plot1.png
new file mode 100644
index 0000000000000000000000000000000000000000..794c86900835cfd60103bb4999b23ac33bf3d7b7
Binary files /dev/null and b/tests/triton_tests/plot1.png differ
zR{T11YUYrVK&2LyzP8eZ*n?vYzWQL)R50inXSji5;8#%KVqv*S9eA0vC4hGjk6R!RPL!p%FhpYZD=3iK z6!t{q@UBJFD~eST{h@yJ=<%8KM&?!wh>5n+!ZO+*VXk%Y;!n?dMs&)+oHp;fB!-89 zgG(GDyl7OSnB#ueQC-K0B3doTqDBgW)8=_)%l%v$3O5`~2~# zEMa-@mP#=>2$EI2dyJ?k5;;iupX$yT7vDaySMJ`=16r$y)v@&j1 zUfDGo2tbJl4naOHq6%S8(#UK$qNNq4_v%FEbuFhiA4wn%!dTdwaRQO_elw$*ED z8^{N=Y_E~Z5`Eloy_XsLXl1Y0SsB@)0a_)CgGoufOV_RzA559tCfli>LHo_8wxq9} z!;!<`?(0flo6?Jh`jHd*M91EB`hg@6sj|+paTH2+?;mCi^`_!k0xhfVm@|r!l=M^d z?!pi)M#Q?P>CZmb)17Q(H2q?*M)5&t(+!3m>&m3VQpfj_{6V_<#EGYmw?dLl5MqN)NuTKt4lZld+Qi}sA&hDt6ot^`7e-ws+5FI{ z;_nV#km(2agaxY#`aN8>7=;}9ai~#{2~dmTyaEzf@E;5#hB-h*YSGIte#t9>02O3@ z1nI3SCfcoWL=b8X^+{R^U(y&BQW`khB>-3<#RK(75r86wbj<+KC>|Vq0NB7=9@E!9 z4fPGyvxveiY>vYi#|kB#uE43epFh*Fc1d7zr&>_3DTNMs}cHUT&2y@EzFi`}q*_C1mu2*yUrT=b5bHZ3xEnX+5(_g{O* z9=>$(_Jib2s-^|ym%69E-MDv;(O7ur%s(+3z3<)$E1bmPYaR!K#SYLvI2OIB^oM(7Tr;s0(a`5_HM7AMhdIZuwywTSHPZhRb=G^VTjxeAw zo&$+DZZHy2CELL=QkIgGq>NA_QT}Kbw_&OTy2l^r3b^QK zQNo*7dujY#Std?lqD=nUqdn+9_~lCo+H3;eL4l1gDSA9#9SgC9TLRJ_2yvw#l?{xD z@(@GQ+qPS*5x@-U1v2PGb+ZGk_C&#pQW^z zAG$PK2f79wjU+k`(BZ%*8!*}L$V%e8l+Thow&e>f1)xDzQ2hagr1!&OCf=bqlnt2cb$wE!>5y4zWME@EH>owfhU}Cq~Fkl4<|DO)wBA z6V1#>FOaC>CB>Pj^+Nw+R1)@A;LV#ZYb#OzW7U%F*RJ?S>vc5%1Ckk%m2e#GvC?@h zN&TZom1B<4Szha7RB+rM>iAbeWk{}ch5pCu8859b_Z)UjS5VN(JO)M6Er`O zn9l>f>MdklawDuZzs6(sOpcDR#a-)|3@0n94T5Izn$*%)Uu9BE^nb1fw=$`2&7+$p z|DTzO5oZPOb=G355S0lwP8uXexqH-fsfnxvIyHV_VRb%*RWohagv5dj97WfC8wWis zyU0`w!u>BLwGP#Fy^E7Y99!^6)nJhGERGW*Gr-NG3_#^hPsP)xX-I@7oPUka@?CjX zp}#x*VEugdU)O|NA zswkM~bN%-ZVuusoQEDAH&eZ+?QT86-T=)IoFiD9zLqn#0#-|_yu->>!N ztlP#j``LcI6xoy)K0e+l+L5*@dN3@%<-=M`!UZOa^UNcz^TtrXOmrk~Lp*~sF$=GV4K%cy$ST;|&a0Zv8J;$2-wnyTN_y>hv5`~sZ>MeUbI!>5gc zX#Cgd{JG+0f4d1yYu%?$=NUc)PrPBLw-}zBYzB@^2!60hu!VHZ|N3zm#G+=#I+g1o zQNE(@KBVw45}yDQ`v*70sPh`=9k{TH2XA5_T3jseZUy)^e>S{-uPgUHTp-Z9g?hWZ`?aGUYfDc#ne{{FJUED}bNx#4jZbXMk01ql z+Q0tPrt$Co8}5+Eo2@!A>g=DkaPj-sRsdw)TQq>Pj9jTigo-Td_g-_S@=*Ijai~k! 
z;$hFu#iI+l42PssI8(2?DYFOV$n*6-J(5?GY+`xjh}xe(pcd}y>N)aOec-SJ^ zrdga@+BDsSZEpvm%+$jlLmN@D^4hEc{=xm*Ab)hw5Z71I zn)PQ|`;Q(ju7BHm;HqNl!t25pSe$pmFXL?w#ia?Gx26`V9}RWY%f{={8(-B{8Vh|rAM_q2nW18cI0*>OB z-wF=(kM8+=@WMK`WJ4{y^t|Jto8`$Wj+!w{K^%&)C)FR+rqv2h*77iRGE!W!8vJKw z@)Iss%wILJ;wdT3%s*qFQl9NWCibhYr(a*kuQw>08{V+`&yUw9-%+%1Dz|F3QXEn~ z{N;V&x!6sTS^_mj*E`^nkSmq|3yC@#_WdU40b|FODrXyVu`-mU~M>yGNCjFh0k9L*4;fHj+tDDYbAIrbl*dr$u6nr&5 zMF3dmQEIGKMerw4ra<#pcj?K=*@^$YVZ?_*z3v&U!>)EgO`$Nd6z1l@I6y)F658%J zmn2J1^t;G-2_7^HG4FNt6bdrid-!>Wk&S-W1287KkzbG&RF|BI(ZiL%LCZWDyX`;Unhgxo8MGpoKVvAbG zPkl!%$WgE7`ma7~3Y8*NeIybaKTK+Q`)GT7O>q1bMwhklq&L|*)H`oxqZcV<;Q43z z%mDV`Ry)iNo$9olDB^z?*yESm(bL_q^KwyAG1K`&6gC;A5oYzFzT7EFS{<5eMU%Z4 zLP74RA+;8dgw@|aE108Ix-ar7DBVA83IO2K+?^(-3uJK#2#@BgC6r~W~FFEt=jq*cN;MfOm& zY3?*cZt{f-C(-Q5m1*2g*na$f^m^F(+j<78eG$XacU@@o)kd40j1^TGGT|GBb)RKG z--FmFM$K)@^RtoIH78u_Nb|krxmC;7bES43|86_^m?h>kGsq>pPq<~fpw_vhOkI14 zaE|@^VTj@BbO)+Qok?Y}j{J^@Z3f8iuRO=*}j`gD^} zy9}&;UXG*r*>F(0QJ)itd}cMqlB7de!Nmz zYgdpeQO`cWbU%;Kd0tr2K%zlof!p=qxU9xa6h@p=^iCe#O=hGJ> zs%Ot${IfiV6_`HQk%2DzD-aoI1?peEOnCcK6u5EfD0+3xN*70BwLlAL|Lr+kNdoSa zfyBfccm9~YkxiGqJPfK}( zCOzrt7Djqj)^wX~>-X*xd&8fpt%Rkem|Fp|@n$P4C^Ys|dDCdEBguZ4OMt|KLDjZ9 zo1f=6r3-C?R?@Kh;3%P2(10T9;`{2Z@!K`Wm$vSBjkr+EwD}+~+-AnziSCimO#rlQ zd-jUidlOF*8bHmNV<*sf>ck+1qf?e!PNd)Ih@bWRxr8=m@dfAb_5ceL+(l(9V9i@o z9lI}l^Ip-eG5_0{H-o=vQS_J?jpL15xH?;XN$JSMh5p;z?Q!beiG1PNe4&dHexS&4 zA>SVAxzImht&%Oxti`6KBB9dDS}N4?*g;}EL)(k;F_eE0GMM6nVXU_QQojZ9>0a7= zmY7N9syzRn%Xx2N68ayO^B#p1$hqGNrIU1RFk%^mydU%rpB*_r8m{`MW10Ee8PV(nYUq+WkvO91M3A8FME`A2p~Yxx9MJJ|49H`N70?o*AA`+}*sy`oMuA(viCeZ*>9KkO1b`VR zCZ8)kFB+r}ql7`3)?J$GHVCey%*-Y{h}w4kK*wP|efRjQbgTn&MvfDLo#qL zR#fMa>a9AFf8KR>r^1*dcgzDZ@aXu5kx@uqA>8G8`~VV{iP~uZ zM`F%uPxd)^u#o|^s-+o0l_^7T*n>mVvAaiZ$p56zr!%4V3s5p^pKn=@eP%Acz)hgV%NDhK|KkS|`G%y!`?<{g;o!XmfQV=bB>C z$AOT{^fa9F4cZN1(kE6=IWA1%EZgMC7P#3mU+&(6*`A>krGWA1bU-ASX%7T^g?X$I zLuYtDQ2KjS9&=$wJ)wL0uU)lu1=SyYmF|TKjm*4YGUiK{*++RReD2fJ$4e6sUdeLs z9#fkG&;#{*fThWu8WKT_z9yr8GX;ryCW0y$;&T>$q9cZnVlSM?$b1O@=D9M@g|06Z zUQVKx$AW3#Fon0G36qLp6aJz5uv+0PZ0;yCLyn08X9%N1+^4i&*~kA;mvBg`{w;zy zM+s&}IlkHl+Yhh9bVcec6#e8;g;R+D%s@dP5xG%U0w`U%_nx2MCc;DtAZcd@ut{u~ zh8Elc8I?+A^|70S?qTmHIfp?`!g(CTQPqmIVxFF!c*T=5N)X5!J4)B{?wwZt-|u=C zGe4qw&OzE?YHf{g$$*HC@n36w;xl=Cfw`2`k)N`7611t;iyzBN)VRCzp`Z=&G#=DP zn9cZ8AN7+gacdH~@J%E6@DgL*g{v6~8td~DGT18zSN@ioeqpB1OT0KZUbYp=h}I}< zFm-B+W!!1_%^B;HEs|OGXTP! 
zOCYHOWD?mqIM%5fL~q}@lh_M@+4)69MTMPPg|~u-i56TfJKGv0Dzi=8@f3Vc(cH$l zGN(^lcRytY2}$U{Y7cklFZjPp!$yE_DsN)47Zt4vWD%0p0@xP7{t$-&F;A0i5@?p7 z2_+G`gxCdk<1cwkS~?zkqTiwcE}lm+0J4+r#KrBw=N`sqU3V)ne?%RPbMlV!f3G~T` zVf;!R%Kcic@8GzF!f>adfK4&Y<=2tYT;;B-hhxU$hK90_P!#<*vU1^)orF7^Md8oL z#yal+w_S^Z-~V1}NNZFr617p&eKu+Qw(b)(LT*5$+=~72I9v32=Sq8Jo z^$WZHF3)Z~a_qel+i#`8s%)b9IRhu|?bu>&{KC<8vefS(n`l)@t3vha(k?;_ zlUW8Rm*xr~Uf@1LLdFi%p)n|+N08=df!KS?>HYin-^KOLLwo>CV2Vmg{za?sQ{x?J z2KB3O@+1XVPO>@;>Uo?xlFW|qD++oPw`Zf9IPLcIt0{$vY-_YaMvUZ?S%+bBunh@ zp~c*m58-1`5oy6KS62~aH>P>_&tOfyiWeczEHkdA_aSy%&uCI!S{r@(r;E`1vU zf(HJKkN}K=|4Jc$V2wWgqH87sYLrH8br?$k670@gJQHj}(nPlffW=v>2i%-l5Z--T$WcWj= zZ3bKv0udF}Q;SF7{z*X51{4!WbN2| zCn@ipZt>aU)cLG@hD0_lBH4s`bb9UgG${$Kr0{$&j<> zMN53Z!ADU~4fPM_caA4t-*WBQs!d_WtrY4o}6wYbVN>EP3n|8%H7 z-?w#GJBA>&98XTp4Q1t*S0aZu#d|_t*PTaR$Ne`nwFjm`>b& z;UivRSWe#TB=RNa)(?j3+?k zs#bTh-2f00d2$eKia`31lGimgWQzzl_kFmsho+`lAf3ejF$AFch|S{U9sS6A5?p_N z1c~125&cMefR5)@)xwa)$gsWx!k$!8#ZoyhZt%|h7XH~Bm=*;nRWu*$&H~HD!?HeW6)%Gt$TbP zaJYXuc3r#zjMl~SGN6EhPrpk*-+M!Mk}X6&Sj#J5ukpU4BN(wJv#=_l_k93s-P%vj zwe({VAuO!QJCzl2XTJ0Xjf0iH~<#{jm|r^OqdHJFv#AWAS?(q3atjoFX{f( z(%znq%7Yhz%F1e7Pvk4BT-cq~R+a0hKEY8@2A2ji**eVnBJanoh?P~2oTzix+@|9` zy)qGUym){4kzv|>-p8HiQz&ZD+xvg42J=nn3vnJ@V!1fW)61A_@ND^iCwd-w9J`Sv z5Lg@p<{*oTuYC;cF3Cf+{?O1MPtHJxIrzY3DW81d<6#2khi|xS#4b9Bt>b{3P{4k#9ectpaFN}DC@C5t9&5kYg z`!S(&p|9lfrYFX*B;1A0d^`ceG*^F z%k4|6u9r1L^3pPP=xZ&rd5Zmf#NE{1!g^oy%+pFITaz2-Ez+8LezmZVU4QV{$tG^r zaGY!GNi?@NJ%^Z}0U>A6uISS2llAbZLU*ChnZix8&%2kt>eP+i{!@QkR`lk03cp&l z7v2i~-5v0eJAMqGyJld(xU{s?w;G(e&jj7WUn{pj zdJb)x>t|@L55<4)#CD(kVB%|WjA4x}M=j&^+Cy)4q%=DxHxF$fwK(IW)R3_NSy`<6Fvn6WIz3~$yjh^Jww zlgJPJSmCTiv#%ECkT-e&v>Z(6@XSEB)pT@dxXj>nWJO(M!lzxD787DgZ>zs(I0Iluk0IE{B?4e zHsw%{W!>B=e1n2m`}+FY-38GepN-zn`~!IrJWIXYyjRYWjVYAAIOugqG$;7z07&i> z7ak^|kOx9g$bc6lycyuZ0bp`TU*bX22+f&t9 zUw@|aT4Xr3ZXH4#%imw%HH?iZ!#|ugG~5Y{(Z&&8M&%nnZ*>-sq!QSyJCT)Jd;)r@ zaHAObi~M|isBy2)$jjeHjq1Qu0RQVhOeNDid9g$)ib7)vsmLG59MCa6KuNQ;vxCB! 
zSF?nu@i83b!Cru`e;NWNNR*x)8N+qRq;_JKNzmrQllE$0U^VK?!3!`m!vRnTX#*7d znzohDs>nDGLqia7_bwHvO)|CMvUe0;*5kGO67Zms2mH1IDp1NyB@YUBio<$2lgrdn zq10$Pp+T(2lZs_lpYh58z{Ee||Q;X}7#FduiJ09peMXqL(#)H!8JT4z}x+vd!hL zxPRBxx%5lp=#e9X)3#SeRb|cA2ebO*Xg)L6xi?GzAlz(k}AtfDfRe4Mum-wUJ~pZc>da{uwlN0<;`0r@#UL0 z4^gBGFLiJ<84n{ODsu7!va)c2vC=>Rw;xSl=WktH_teW=klVlvng`bmO2WpI*olKt zFv@TcevP5KTcs$pbG{z6>H3U+iM7U+k7AKYUJsk9S0@g-#u&r1R*O{(JnL|wz8r3oWF zXuP`2dHue9vjsr$8b=U9{3H6wlr6owxD61nrZKPyYga`0xXr%WXklrI^Ra>ClVGGG zV<82ae{++iwa1tq;aE@ESIGl!5+5YRM52Q762 zWi+>I{Kp)oVruB5hc>G^<`;;1EH$)wSBX?!5_x&|&SPttlQEQg+=N?pH5fAcOX_MV zb0uYld*u5^CuQhn@1*s)192C}&-?f9n_F6P4SZ&zmD}g@CUovRIa{cGKOkd;ICjAv z*~}eHn3a2~&f|Y$fX5Ff$2`)Hh}Hg+=bC$_grMLStS-gtm9gaa-~ka2wc8dxqr?xS zMnz*keq3L(l%zc18s`|5)xg5jedPdVs}Z81y>}1$nN@K#SVL=d<4>q#d_6Sc2riRa z{!B&GYS2yq-HbUh1}TcBT?MdNqV`bTG{*^M+o$yxQWo)b`bC&&DbSL$@85s=`-`KH z`k+;??aiaaC&SE5R#kKjFH6ky1`J!XmM8H{8uBA4`Uo-JA<~f0{dnUQ#~xFg3q9Pe ztE)>CT9_@EV$uYmDd5jy3@avB#oFvZ)XeetYkR!>e=!^M+7Br|oC>}r`sB?*n-cBz zEvnSnQrTx0&gT@Y8xvD(J9|J_kw0i@?Zj3=F4mg^&xOexv9``OvONYVqw zUmTyx7zq814i)C*B>-H8;;9oT6DIp4mb*yQ34`v(485E?t&9)=3z)CMh z$dU{H8d>vN_&I1855WS})1zk?CO9bpSPWLG)CKjC!WL~>-C0XQ4X!(88AxcuNFQwQGKFQ^?P+X}N9?-dnv#spa*-fw2UioWSuA=^z_ zh7SZ6^BfPK)>6LvM}jNbaA$A%V^iBiz4=$|w~>2+LEjPu5&9=hJ7H(QJ9t+cpt?;J z#+IYvUCrnwx;$)=G~Mv()hWTpSFdN%#QWgAzp?*dAWlhg{SXS?23WRx$=Sul9zOw0 zO^Qy+_$w4MOao|#x|r4@wZul+9x$KB-xVvcA($QIf%u@L5j*~)Cr`33Dr(X1vc%Wb zjKPa#fvpahJ+LM0t5x#|6iDO9Fwd&T{U{hDnyd=2N1u#=pL^h zC>voBk<_U*gr7s&YHDTG08b+ZpApa#5qESE!}9FWZn8}ihj^V-9_&IS#R)>8mpwg^ zP*i0Z+_ki{L~m<6HgoFCnX&4o_Ziu?{@SYA#Yc+_KH41C-5{jqx65^6-*3+?`$d%< z2fIbzr>#a7=Ve}8>io4V#1OT;>7MVFZ{_)P^4sK(d6w?bbI{u%2KCOTb zS|7R5=E>VZae>dZB4)Clg%y7Xq$@3QGa3C6`rURxwc8wxuV0_$1LsoL*H`vvU1xv) zPKYegEMpM}HSlLE;xF#)Jp|Sj)&#yCdr-x9p;;%kL8v^+0JWf!*fi0k^(5ynlzIIj-NO|ytZcM=H0FT#HWCy-oAZ1 z{OJuq(^Bm5+M##6i<*b1)ggER8=%2QmrDLLc5FT!v)5)M@>4v}#8*RrLwfp`&^&d2 z^uUs>CcKFlV}e`vklZIwhoGqvqa7at#lfq55j+4^CT;e@i${`_>>V5eiZ?X1wvr>y zZLvch-|9MS1?WylKpbT_Zz68v{xc6gBT@4u{D(L>?O`;`j8ESoX3vR9B8l*IXm0=x zn?r97B?(YX4Z3T;kKhObvT6UV_psuoLICoxpqzpn5Y}?CClb&MsL1n|z9p0&Ff%jb zWKFpf9!x$JSPWw9hR_^A2qzQO32bck!6h2!4fLChc+Wv@kZ(`89L(sLsq?W(VD0bB z8bQMj9y}|Uye%Wl0Ew81=(}5=h_jNT9U0@$rS4N~66P2KO{IE~R zA={-9E%5Ww$Jc`U(l081(5g(+YyKE1%WtvAuf)4MuhIM3P>=?a%$=K4LuL z@yoq_FI#)r@9~TblVA^>U%`$C10W;?do_V(h*pw%*ZyM#Vm2ZgiW=I-p0Js1R0$KKlZ5A!aK+em{yMWh{v`y&y5aH4Uyyz?RkYooH%0Ahw2hgazqRwM9 zWweGe+-k^ydgLfYXI%1N&db4*QVE+XHcCvrk!16gaddrq`lzDt`q<{TM;QeZu?e9vW{?U zn`46yW{g6(*D!3&E-nK5E$FS&g!~@wwj74ZBRdePn_~E6-Q@`k@3<$Fv}K{V{fsz6 z(Fa4bW25xAQ}aJZhY8d=sOnMMiR0^TNV{4@|NdMr8$XE3&{#$bff+SG(ckM@Svb#WaI_u<6;(X-9d15i zoUOsIzkTOUe~f>uw0wV#7#J9s;S?piwq)RFg?`Z=r(yu$4i?uOS{w&Hz%WBfFr@N< zB*g0BGW7kdC=x6%V!I6Gvo`kOT{OLT@c_j{wCm#6;O&Plc;yupF+sDO1O$)pt#T#% zy!jGlL!~F~oYY1?5}NY?#7ri&qtvy}UI@Npg-4UDK*4T~QuRlZXj3}K&jcJnfJXoY zwV>9#>3wonpzt7KePTVvO3ed627&4kyRX~W%0arT3#%Ct!Bh*)71k2&#sg~ALWWmt zc&UeTgK#NmAOFMa;liC_89v-B#f}zLKKaGn>j*%FJ3z4x4>X zEau>rmPfM~0CYg%=VK&Jc3iVNtzqTJHuobjx31%Zyj)em<4ezkg6B84o&!|_*N>6) zZ)9A9WjJjFmzVkgx)WUd595>Lj&lWZyJEg*Q#x;qWf}I1-6%IN96$HdoBq2ueZ<Oee>fn8{m=~Rvz)A~w|=!pC3*5NyfMnq1V4Jj4!E-xr-|H+ z8%4{QU40NF+AsbKN@yRMM0!AI9Q*d2g5VD*FEl&^6f?t{`7fRYJK8W%WCj_g)wuYu zFir}r+gsedUt=8DycwpxVMJzgU{?@mE;T&cuy6wi^u{LGTqG?pA^~q0ZUTaenPJj2 z!?Ywr(`|&D)zdK1=o(=M_iizJSrTvf-Ccy6TOKjreo)=|!d`=L=d)ys^|3X;rOW`? 
zN%As19*k{hs?0Fcn<3UNYl*CmlSRcP2`{LN-pDXd z$Ux|tf{F@&?tGlK{~~Bahl4!-f4}JT^g+@@6f@A%v!lta#f6l`-NK~(_|n^i+n2z8 z&hFDOc%ws-q0{$d5ct==rjE4|uCvR@(v6$qy*t;x+ZY zwKQyIT|a+BO>I70DHXZE`AFT{1_H09xmgKQpg#^-x_wIN-V8F1Y}y&>r^tN+*=t1F zS{u4rCSlgrh~pry6;2O2^d#hfK(qDT%+ggW{1ELvrNEJ>@NhX5mCXo(PhI-% z8j2Wkb}V_ux&Nz7rO)E;F657dMn^v#)8o-P2APF2WRrlJU99b@F8@sKk%JlxI5%hn z*5*lFMpSg-j=-^Ak*E0dfeIycj}HF2DEEuUKICNWrncQ|*V}u2KZ{HIKS}3zP7f$t zY=7Efk&-bi2nBU#N%h9IA0sD+Qcv-QZSEEKTG%w6X{PP7r%kf7`ll2|r)21LM{p*G z@Aj#RkZQZw_N^*EMl#=3|7#(=-JAI`y^6-wNxA7z3Nm#d(ZK?v0Fo5R>WTUJd91K@ zP205?)7*0FiuU4PFGaoLLtWl)J|&pQuzSzns!yk)ww8FhZY(!dZ-@ZiN(tHny~Ux$?=_qhdKZ#jE38m$_mVmyjIgw(A^m362ar3)+`e z>myv;4m~>-5&rOe>363n*NLCYU;6iM&Jd$;7j%#QDw^nbbK8cSJA5P3SR#)Inc7*s zA@g2fze;cNA(!$ci>E)ePaJnjxa)9#eveF9KE_j24HC)&e2lbHx8{bQYf>&0EQ_2! znrMAht6*K4gnE+FsVx_7Kl2K97C|$50S_@AP$JdC1c4lJb_lL`u69_QXT19PyE*_( z{2DXnloE^!{2(3@x+Z(z(JaI@?-$N}6dW+g*qW+&Z8&JkNaB_jyjYasKFMkds%6UFI#MnQ*6aYiKNivzTo`3Ph4+?1jn3u z*Q{#LZ^Qd0>xj_Iz6u6(8U=1E_JGFEe@d!g;N;(Y{7Fi4YiHR5>r4|?Cymsk_Zz+` zXYlV|C`Tp0b4c`2pbhcjnPawuT_~+WF4S4EQ3>xF`4!b8qj@u=eBBuDq>f9z_EO)D zvTti9+pnAYX&Ss?3AcXVLaq8{En>pO;4!1H4_l9udiOKFZ9C6?9va-LSQ<;wO?+D3 z4D~NhQo_GM2;WI6LxXMe>}7G4&&jvsV*@yo7WkC$cCpyk?0yJe~N+bjKs$900N z?`m?W`*+&g@>@ zV;@-0t82YHvivf5$>jYnHU10pVb0%z2HL$UslHYvi;pL@1v8}i2b*8M(&8#Oc>3|C zD(MBuy?=gCh6bsfId?i{ubao`&+;30=y-e48144^atPbyf*N_vot4PDyr z8>Gp~DnJF6sArmeaN2A?af1iA5f%-b@XbI_56lAOZc*4HAH)$|jM59zt7 zYNC->+$QIE!yoc(i)T^OyvgY7?CiapF44L$4gO%$oXdd^gH`?9xjw3x{%meZk#5^? zQu*B7_p6c_y4Az{J3f3kd5U5nD=0^JXl7Ga@w1k#QqmXVE!%@X@3**kF$s>3Qq$uL zcgM}sE}!YKF`k(2WRurwm}fB=V@rnjE_2#DFJ zAYf<}v5=Vf7Q8zXCCR&9Z{4D&^-}+fcJDLWGHg?ng3Zh=?%q$`IL$KM zYtuJ;Whn4|s(dEXA&vBzxrRRrA?xZJjUum@Ppw%oABx;-bg{ET5lsSxmm#xFhfwm~ zz5ZLR$9n@ObGn;s9@qU9|4n)6e5%;s(X~J*s1BFEx`bDLL20`Qo5Y*-F3q<1?=M-A z&t)gCDCl}M@qbN*^P5=ya}!3bv_hy0TeDID*TejhT2V(aa(T#Z_=#=}GP3Zxur19D zLS45NW^#h(08z9@)Y<{HIDu}#<}I}_)FRWz8WPbt%o}DGE_eZr08hdtD$1%})=kh! 
zV3MGD0A-%gO;n%7#7WR)jHl?^OmR&$U#Pu!3{Q)u&6g%M$WAC=EEywcSaUNi9F;)* zBrzMtJ0A?(D}5I5TF1M0#Pn%9SXEiO7qQdlGO^$aV=z!ZY+%15;|Lm_Ktu3ZYv&k6 zK_r>=-2_}3057Df(E~Ujn~n_)`$a6>Lm-M}*s-Ij(seC5cCVD%V*mshycnO&N}U%D zKwQpOk`0Y#CzxDFlv4`uRd5LUK*fYU9RpiD9Kamj_@w?AS{3m6Wd}TbQyuD!kt;DP zOZm*b=IN!T%Z*J_D-Daik8Py(>`ff1wl;ny)}Rlcz;u6OH}iZhMrXSzypCQ3vu;-gO(O7p9md(C*nHvH%#f3pIyi9 zrIGn#vcd1_J$XdJJqJXkSpgjGv64O@CccBhO3Mme+DZ9^E&d^V3vH%}R1P*HBAkvw_(wIvDd($H zVihY_6mrF7Pubc&i`FS?{qQ;_<l#Fup}FNO(%4W z#Rd2p|8K?XI!t>wY2gsx$LoUCm%!H`u+%Eg+>O@?Pc2G=dPXk%4OAo+9rE23kSG+S zS;ymV1uTkW<6+n&A+g2g$`08Nurzv6DDJ86ZpI1CVp{|x8b_b>Yj!6 z*7WZYB?2d&`+B#ow>B}(Di^p?{8;+MAH}Ubd%oIK?QVG)+bXg2tfF?sT{KnrXWFqF ze@pgkIB@mlx9Ow7(V?^%MnMs5Cig7waYVU!OaC$}7Aq5+bzaM~dKBMM)<0HjFfLuf5B-gsUeD2Ez~8sGZS zB4mj+eCAbclO@&G8QQo zto@No$Fu%7I?2vjrvERUB#>CGj6LiGUA9Y4P2F@7a%xn^@}>vZ#*Fh0TEl!yVjYR6H{vfR zh&EYE0!f|QuZi>|IN9258-T8otz0&i(6dAOme=?T)Ce>5Es$Fhu_2Luk`hMt)Z>SM zFl9jV7BVUAyak$+j3nAq9j6Mm)|R8!rWfIw~0y3-oAiB*52Me zDhSzk4A4HaiiwHko)nIHe!H<&;qb&)P7NnVW1@as%w*PAc60xF`TEdbAK~W)(NlaA z9_9L?tKRxcmaksC9^_b@Ii9G^XL8}l)vZdYnz4b>f@d~7WZL3!nSDAv^OHfz-!dIM z$wn6AmI15zzv~j*@5;V;8C~Q0al3?m?}+F3 zreT(eWsv#k#w*Bzej^BpRi)%VhoG&1s>}?|UYnm;*mxp9i>1e-5-vfZu(e6wh8h<4G0~GX~_>KlXW%R2D3kZvcj`L z54-^=X3pC`U(wqPqe!vIEPKy4G>PZK;hJRCr%Fe5Tmm~fCpjzys;3*E?u99gobc$NVftqOBg+3_`c1)~d^08T(*%%G4d3m{@6D(Szs}8!i~} zY2>Co-@8vZ&x8u?4(0pS$kz7FUx(kpPTQ{}zK30qH)e1=mUsSmqPMH4ChhmDA}^_) z*S|NCj(i#XP{Zc;PLIzD$!FUvL)5}9Oe7_1{n9DIOPCLV8DyD?%F6qNgtl%A?Q{-* zpR;=PiWDdVXhjqh6mHcsWvV2d;u)l&AAf z3J;-xQ7kCKqb}aQOd4aTL^p3g{wDXzr`z_7^DUP^6(+ zQ}O*wBw~C})`;?8!|UAG))vv`1Tza{rd-V(Nb5x;Wev`CWiB*M&HHkf{es zCo$)FPNW1U5*}~blKKy{VP9aK+>g*^Na7$=gu>2TOhI75Z$s><}s*+J0tQz5QMn*=i zVWkys#3;)0%D2LHvVsBVI5%cQ&?8ibV0R2t;zA3Tj7}-`grh(H#JK+x?AcX?Y4|bX zQb9T)D-y1fGl#%5!ab1U0(~Y~3IqJ#4oeLDFcs# zJv0Rz?RI$h@g&IY=G9_B93e?o$3S}~Q=4VQ80+nVF-afB<4>k+@GG(q^ob$86cIQ&m@6wiI@Bkx{yuw8KU*8E{~ z?D?^T*LnSqbp!?+zSxcGHVZ!6lHKvXYs{ic(TLG_`|rQ{6}A<}-9`>M-{bQMyM3F| zv-Z!c-~Y@k1W(DA8kB|?jsS=Z6yUmX>(tD6Bj@rr?Y_Xif!&KOUtfSzZh$ioP&ZLC zAk!LB7EV=0%w7(eIe;9{Zl#*OzK1XdVir+V!hU<6%AWMJv;vDJMj|XikP`*TvqYDq zHRTFif=G3}nu{x;{UD2DC%ft7&qyKg8h14nh8YsM1j7x4uCK5*npRLhEhXg<;x>R- zr1~g9m|?y4e=J0<<8${nhol|1`aY0K6r?*?Z-I}Dbx-#(=quSf;hy2+Q$D>FhfxwC zRy3%|WV;gjPi&cba3d#vGo5;;z7mJHcnl5!GDBIv>caLad|cv$$>cRf2~YhF00ZA^ z@1yW=P-A;YxFXJZifVh3j!4Ss+4{y zL?+>@+{Y$4Xf9K2+M#gA4+wekM9^|Z%pvU(UFtd{3Ub-mQbJ(~gzIUllhnUL92_4? 
zqGLFS(F_~pU7$oX$?WOswZE>J79JtE3<=4}ufboZ+~THyNO%O-;14beSe=MV16pt= z!JKHA@e!>e7L4KBD1uoA8<_y45>qr$&ykc~1XAF6#riTY2%|lIO>o03?Uk5l8tl&e z@kU(59{35NSsX!0YTwLmv=gL0Z=Dq!#vFWnN+)LUiAjDu!9vujhaUR-gBzuyz*aQC zBE<2D21PF4eKXsMFf?`ZL-+R*30(2UjW2OCAx!#2_we)vTr5t5CqIB93zq6r;s72; zM|u$_l$X+B=ZQ`(G=WXHfF@ZL!ZQbVlrVOK zUdqz)9vSBqE7%v3wQo%-h|R`e{_ptFw;i6M+nV`&LLQ0l1N-rE?p#Au->n0CUECV2 z0^_gG{yH`JxLZSzFUqHItaay|*t>_`iF{EHI(^ByB4%9n%$OZQAe99_WJ1nI4zXvA z7?JNI!W%FVnSbeoAP?+x?SB{c{?3Y`^5h*^MzmORXDf1zUz~yS8k#M%@6a2(#JwUd zUWzwTcmA7p8L%I*nE+%-l>!EdKdGTS`v?<;LDhXo;;pQ>c`wkmDI1X3Ou#!f+Z^_- zS4zbW*zFUN%S{Uj>O458BLtceey#v4$iMg{kF@}oAUkOLCl4NM?k};+(v1rZ)%yCR ztaPRYS%8>t6@#_OwxeYnLGgW_igv8&-kC^e$n1A#fut@KIwYV*BCzQ2O7snduSCa4-1dQ+&qHf}?R6 zqZovrqomOMXMYL{tpwd$tY0Jz4ZQ17NRPzLZH<@So6uX-3uGz{6_jiO25){HiwE%G z>v31RCVjlUXIFm?zJu)%y4jaF;JLCd0uF9`^JWw{TngZ7_^2iHZrFgLg#L~M`dY)D zp0d6ZV->%UqVg2XzqqWa-{BU;;>i}c;k?lRAz%>~SHy<*8PS5m&u5Lzq?q3*$d&@2 z!H3c7nEigLpU;7EOVqo3I!}?0d>__epmiiA0iTcTAqQlQhieGQ;(YJKqL4R;iB}s7 zLa+o#3maM{&Ws`k8{Y@cwd0ruk-_x{##803P8cLHl(hrKd)v{Ghzd`J3*dWDa%Q}5 z$67c!+!ml}7I=njvJL_&2oGB)+^%-}PQh`a7loV@OFEwzuHZVfqW|v>%MuoDfBm}k z{c4++KxjJ{79>VAFw;ifS~SFSU4riD*1)vKXk z9ca)yarc_}QUyM)XE}fSE2nx=v#e(O?u*?5=cP1O~ zxmeTH(?$4tZJ|K3@`*^H6A|l_!^w%KX)Cdsqx)Yf>Byl#3*;8!HbU2kdEWHOl{O-v z7hPN4sJWF>bQN(< zlQ@j&C^&*hoLP67lCJ7LZDU#zLvsg!9X{e7Iod(?-psFM5 zb8&=@7PdFXr6vMOBSJXAZjlV55Ma?tmIj>|yiiEw(bHiFzGP#Al-T`5$coJuYDi`P zMzTxfVypUPB&wTa58@S>M}NT*GO9?FAQ;GyqhNtbb>!-&ShVFZo5}0yvY^ExUR8AC zw}Yg9Wjss23uosX(#*-Spvm{yEdUX?1~LFf)}UF29=#4HFm5QRAdrAO=>^cvjF=np zoe^P-n@Scy$SwTf1Hvvl-3LEJn$<#r{z7XRyTc%|<=}MQO>1n8s{()w;F2}YJ z`kBWg+jZ|}k0IcsQRAr^MYQgf=JBIa>4}c7&BA{Og>HPV*;hHtXsy^samnWL?5FhZ z>vr0`t-tKXT{<0a9CgeW$=@%=yQBR@oXYnj;q^-o#r9s0`{LJN{k)j%r^qx3M|hIo zoL&`T%Eh;XWi>W3*f~;zr+JKJak~PQ%Zt}-G^KItx>YZ@{U~_B8Xm5d-sK*LKH`5C zG}krX>i<>DqdeJZcu0CHx+NdscoumaYed5g4PGrGhtLjUsjm;*;ecCw3fx)9LQ-Ip zV5uXY3CESB2wl4h%p_8HfT$j`1MwRH#US*>+{{dKm_t{Cc)%cM@~N$@<;XTwLc{># z?-D?zVX)mtH(&4>>!0vebd9N~xgM?%*t+?a*tCKghxbVuJn$vjQ2f<{td&Qz-zD%c zZVWyf^!Al+YYt@$hoUe)4VS~b=`e<23d(#s<6Suag;Zbt=;Tpd{N%*Ds|k-ak%k;c zAg^f?w4reGhI5|@qOMo2#dM#MQHk$~FTAa3Wxc^a7NS7q~O%6%P%GQSFc+nY$ zUv1Whhd$hWPDxVf_QBn)oa=B z-NkaRi?`HuC2Q5yxw5txPd}eq;P{z^LA|*zIPSAuYhW0^cPWo{-=iy%dUWl$UqJ*f zgtC(fAfg#?R`96m*l(f)d|pQ84o)5hOd-r@_=lnCV?qG9P2UlCF9n%seTeHDM;iy0 zhCvbwbv9T@R+t=Eu;C^+Q>%F_8W0|V_t8`F zL+!x>$1lfZPi}4m8}I*GQHy_p74HB?K{YSSkn#yA!VgGC0Fr~13r>I~z?72NWP*xC zlBWVOCtx**Krlc1EdQ3ftrP&$VFc9UM_w-}TxFmB-y@-Fvhef=e4E0sDFX@=xtReN zD*VunYyyx?;?Xh#aZ6!|xQC5Rj0i}khfV~+PHN@n1BSC(0~bq(rJX2cfEW_I^q}D6 zhsLYbtM@QxV>CDbrydNNWR13F%<(FILPG&Yf)Hv=WN@|R0P<3B$}s!uaV}gW^6sNq zIqM)IFqur~RdWE;$;zyp7OPIn#Zbnu`;Am{_MdY$zi)J;CSEyR;aGWCRd}?2?y}LB ze*RFd)e;uP$R8&uX#|Id*S0*)F^>+)wBoRO9Vj!wc(`KjV`Js3_lwz_0T$nizfHvU z@5o*+c++wQP<5WMlennRK<$}#_X1tmGPAXIRm!{6O8Ij|^v<)aWsf>f>)F>=KaW4o z&ia-%uaYFof7BZ7Xhg{o1R)NJXCC(E{ROi28JN~#QCo+vTABU_KY#2 z&wvM@)MfC(D*-l_ir8+zMx>gv?cU89d0gF6r60jm$`Jy*1-alA3h=e@4I0rtU&r>Q zx#2o&NY{HxVz}{tD}a_$R8gsY^@`HdbFF8|l~)k*-NvEK@K%FJxr>=8)BU%_CxI#q z+=%Ud1+f$^H}NCkd6)iap>igz;#TC0JoRnocohx&+bhH?3<1G4hu@-9&B(mDg&hm< z$q03_@xu7=X7dp)F026&-y`Mb>*uE!HPVdX0QPMt^h;U~WD1We-}E>pZho`in46%e zR`=bea}(z}8nU+zJav4n-%$FP<@~{>O;NVd?kxV!3(J?1xLHc%HcEKL(6}1B(Sp%3@-y9!h8Sw8H6;~{ zFC^$<)J*6M2QM!#(p#l^Rxq_#0=@vsm#8xMIzZyfw%fbb{>=PSvFa4!wCg`oURr7s zcNR>dVPRq8H|1SotUr_p0!u`H22rT)S^rQ4nyBSI-_9^-_AMM~={=aB@4`{Cc z|8H1BX_8POS~63FvP&U*RU#o<*)uB?GK%a?c2=^tG?103L`2ET3>7809iZ4v&*$?R&*x)3osLZ&><*8Kd{|ZxrzQCN{kY1;tnFsk>weE_ za8NEOFH zJFW|9b#5MZ4tDVHG}eWQ%SDHud!^8Ly3bok*;@HQy>q}AxWjFWh5}35|`rJb!wvIx9Wv?r%z?${+Uro|#gxGvVLH 
zbwb~$G_+sLKK%KK?F**;$?L_bo^ka0J301UePCH{J?+VD9cnzF{$C;6Jr>xf_qH+$ z_@gmgAbv}VE*NZP(qvLxaN@;Dfw7(A2#i2co`K&4`9m)u7yH)eZ~P%+_Q;1sEcvbO z`SSOI&SXaVYc=XFVXQ_u5xmi?cI*>;Eq`ch6W^(M6^``n$Wo+w`pah_AyBg6an~lW*QN^z=3%sGrtRJ~)L^BPH0+R9QJ1?PfZ0>DluC||wb^IV) z_Jyg#`k{f&NaV{0ZJKtoGW>lZYGAr0-~;M|V-=Y)S?Ac-~A;9iBeXAjmdN)PF(oVt9mX}?3VA`$p(nePAlrTM3& z8D?L+O}v{puk>!8<+nZ{MTN-llkUsMo|fOLT>5lCUae=_XSoe;nZI?>AJh6;`QnPl z*{KZK7rx>LL=P6cbZumCJ=y)p%kYST#g+EgUEL8sOMkyjd?5JwfoemLni`nmFI)YKIhv)eX%LwD!B)xU)Hv{v{5|KTappCPIdu=dc>&T-P2r z(E4X14iX9uy2l?eb`b(xVBdjTNB;^P@Va!~>f8n|$4YskIo5Ff*T#b&oavm^OsGL1SE+U9<1?M!t%Js4VduT8+S}UR z;$moD0~3?`Z<>JAq6u%VH)>W^oI@hwr%$hp2`f{nS=7X{%bn*qcacMt%KN!w(4Gzn zsZq-;{mz_EjQHU8L^xdGjFk7Gfwu#;gY2?`yv2&O|Lf&HOKX$G>bv{AoZ9}+jBS_w zo1nZgv!QcJt#MO#dw+EqzwFSkTl`>%q{Tx&=MSp0(Cy{Xl-1j^%*b%EJIBjo+m`S5 zc19#CtKX@e>vS$ZXVwgwU9YBm-jg!s+eJa z+q5KG5rfbsG(AF>V>DehQ`$B*K2X(9ij=l;66!cXALha&r=)bm zdWX@#>A>noxY|&14a2ZEJd%Bx!qkXvI`9f@5$)(U1cJnX3=2iNrP;`kWmJlQC8ixvoc7F-_vnbY2iz{-&VSfqt9(RBq4jM)DZv@Wr~;v$ zLdCBy+h4IC6ueVhZSb;HE#F{tghTkr;^A!~CnJj;3ZmBrMZ>t9m%r-xn)C5}67K%9 zhv?Qoo0yCKtNI{eDNr=D9Fn!N;sxzOVj_*zi%g7-Z82dNHxr^}xyuF|eb;74d3)2G zoS1kfN;f8xdBcYW&P4;LXD{z?+_9I05UNTms{kfJgbjU#zHPylRX0G}bwB>W_tTQ}QU;jFlcfOW&6c zOvZkV>#VV}cb{@MGw54yxYw|BMMy?yYN{waeJWk5DQ+^((D1zb{mc8;hs>n!L>faO z?gU98-vbVkFnBz+ZRyf&IVrz9q_DyiY z4fnNfJKanY6Qmt33OIdm+FEcT;nf9E!|c`HgL{qMZW4966Cce=l2uBYdWXjtrlV}E zW|hSHg#TpaUEMrX3(q0Yg2R!rXU~F*xm|2O$Eyv+j`W7vPZFu z6at{r#;D8xS55(fG^vp?;yxN!KS}&{mZeg_9Arj64xMgX7u+xL3{{@+9?!70n$#=q zIk6X>X9ZFpnbua%mCC2RqGNH_A}QO#C9EdF-AeYb?Ltw@5Ze*X>X)vrejiT0EqRd= z#rBhhVVto>_>t7np!B>$Wx9PWvzRNL%%XDV#rA&rTFO_S)Ia)ihHAX5N|5GWzIfMK zAPJ#RY?HfU-=8tLwtbT==OK0@+eK~K@`GVZW>V|gxEO6rOt*dwv48!xyZ)u(E_XYm7s`-wR7Bei%;E>tC|}I($jyA7+>8B zcOKwyDDED>u3k`Tz2`#C5(HL=aQ!*YAYI2WaA4r;i9j^gX<)@L#S}yx8GQsFgFT2$ zFr&|bJ7!dE_Dy{q()7QRlwa7=_yuDSfdpf?@HPNYsE0|)kiuVMWP^+W>p`IFdS5Ag zR{M}!3%C#X!*CeU*_f|}CxgDHV3EAp1_!=3xIzdBYrtM0Tu5R;ha^;kqetAeEYuxv zkBB1r2xu&WTibsZn6E8eA)+4e=1>8HKUGSW@Bj_N@m3krX#oJqsHjjQV;Ct$^NL!@ zqtYFsWN-lq2F6lAQU)C(L6h2A&vHiih;KH8(EIy`-k?RHCwB%nN#3%e%;wZD5iBvk z))x_Q?{HE1vTm8vFcuahZk#HCrmrM+qi1&H^=B?O!O9hnb;o8 zEGA``+h~Dc;|)pMDl*-+QHW}Y%D99KoeEjUV^K+Cec4)T&BEs94o$no--gOF{wA}l zR}Pz5u7?t|wZJf>aG^=T$D+0|P4ugo*|qzjc~w@`!9R^&r;cv@RmhgyHlMk2U%Nk; z!(zt>P7Z|rYrD&d^NC+>N~qq#=s_u{My=%SX(TXB{ue?w7>EK0O+ zn@FzI@rA*~y|S`RQucj7c;%K67a5eWnk43;oPe=_@-6A7| z0q!XMAD)-{Q9(pBgg)vm-l(CWK}cY9BA6(1J~T=kO(6-ifCH5bK1RB9GtyI++X_jJ z6H?^K)KDM;gf+rJ1^`!#;7VLXH?IoEWSv{ya?U-3}!q;j0w0}oXmbq`{jZ3cI z9!?zOe`I_=G~V00Y>l{%TSR%!EVC4q`p)8)${h_Yn1t6#-MjkavDDlhp4{DPDMdRN zg#6BDTPeT$O~0jqg}+pNwSCx_vzfb*jq|9coNg5D-rLS+LZj~AKNZ3<@!s^Q6SCj{ zW{^lEc(aZ{UXFI*4)hRc6)!Jx1;5e zcSPjDA@ptyxM-j!hx(mLL*tD>A5UOW(jxTLM9mALX;I2fY~{+!nqM{d7Sm36qoP{l z4~B)GH9kE2V43wGFRDsiZzF4JXN}p-9fC=zCi?AC$=cqfXKUa62<+E{#(BIj?atEN zpO7K-XKbMbZJwO-hdwIoYQ4iu+giu-aqNAeY(vW6qrmYkn+}K`U(9&D}4pI7taO<^{_uqK@q-5RJ2-ZAq^~P0FKd1@WO8a;;r`N zriU$%dG(MHbMj(EoM(6}$B>r_Xq^f#m1G%{S4QICoOsvRq97q(LS8V6Jc7wVQvULyJye8&OFC*!8Ci$__Ar{46))y*`i{^xG1s4@)p{CC-NfojPOJgi`T z(!G01p}Zm)SHb9$IWFL3w?INf0euQOuYU=ikz_Z1pbtTd5&hnq5$@1=e0e)HwdMlb z2D1+(oTzIKOG>h0Vi^?V{}-*yXdUDfK#b2diC4~JO0+kQI#4PZC)aW5ey_m~$A@+X z_|ul={iI`Ow41lDm>-u9jrFHke@kbsAU^$3CaadkTrGoA@7;1>W|>m!<>B(A7fTKXQmi&WV*=*B!c%%W&4{&+#-z6E zua8Oj{p3*B8~rim3MT3@q1S~6cQZ!T=~-HGTyGnMALckTo*2R0j{SgPg2!Nl?3!3D z&U-mMHAQAE;O=AxJ%*#WInEQBOA?<{OAJQ5(hkpXL@BXvbl-Rg% z8$nCl4k^qUG9wjmFN$w6{s4hQ(l`YJ7=B43EARi5*tm;1Grcgv5AzL*z#*YK_eJIv z1O%VMRV1Fms|j^-_5jLwB8J0WA?g;?3ke{tP8nwp&(3z>1|!RlBvoL-{7K>Lq09IF z9~8%9c&QD+%}<|tBf9IrA|}+^Ak^*m&uIl>dkzlHgpeIz{~IKhlvd&VFa*5a)7J-4 
z0xR@%TIVt$kG-Fl$Ac&nj4xouXostzE1UNdVHI}9FyOUy=hrwnPk(5DdIlW-2ttU^ z%ke=vbfU#^%fMAAV6H;m1D{MQo9%Z31s1v9g z3}6yj*_Sl@bz_}W$_-w<;r~qxDj1b5dly9BT+aH>X@Ku^vUMZgdkh#G84Vms8j<+! zaH_AZM$sy3;4rUX@MX?&e}#MDe?wF-{=%_DJKlAa;1DYGtc<+^QhJjOI_~p!3obXf z?=ACj_J>^XAPCXxa+=LHc^AB?{odW&zh{7fe}TW(!|+7>Goh65yyw;{H=mc@;)DW9i3!7K3q>1?|%j~9A$Qu1q@ zN51vmZFk!ovxiKA&j5`3z-VBR&7Kt&>9T*w{%UK&uIp%*FIAOTG$|&mH@;_R@t>5u zoL43Ovy#`Bo@54n*G-O#BhyMRm09dTqpV&y_q<}6}2>LFW5x4!~Uv3f|xp`zkq{CzuN9T-xOflB=25Q~S$(Kg~f`;Y`} z8&U5dVhpaHIY=(~aR@>2Ls1WfLJvSU&~X&F@^DM3K!zXMe5J+i2*n>*X0Od%P8!wP zJdJHek#p%h>U1)Mn3&UuDhaMhjQd$z|6b4Y9_~HpB}nUw&WVkB$^3+zss8ZQufqXE(r58nCqtr1L!V8$3!Iel;s$DFWc0U11sEoBH9E8^eo zfuAg0&>N^JmfsBy8l-$HcK>cDZnbq{AdM-+$Zb-pMB)YRU|qG2dyNei8aiif^Bm%& zFP}a=U}#&!`DPrYAS-(ZOoVB-%hI>~h~iDfSO1fxnvBN>4Mrxqp}|xC@fFGt6xz+O z#Hdpz;cky@A|LOEp0&v4#1MLE4UKSoo+vV3paFxmf88gvRMKUR^TTrB-q*msL{ziW zqboeRjV=XHio?(f*GA5LX6vpHHNZF{vLrJaB$D45 ztl_>~kS^oP$fn7&Hib@1`K2o7Wdrqd{z{)ePYXo+VvDJwyDhZ_El7vsRNq;U1Kejc z=VjlrrjKUM*Zo}HJ;*<5+hWM0c%071 z(ZhR`lqjjKvuvlQJR)3juB*|fU3!|iP>gb2xCpx8TTZLK2sH zJzYX{5iKKQ3uF)^knQ|opHtcAaaSH}KawOyjf)z|3(X>bu|vr;oSvx)wtEfrpL+TF zVOmm_k?~8X6KVzp`ubMeKo# z3To4stBm(1y=u00kp(yhnN0_A3S^oEnLq4pZIPfEjDDvJ^E+TeZ9s$8!V`|(2KnbC zLr&?-${~~#P4)E@(6mAQ${u+c_V-LBCRC$jJHp;2j#8P}#2`#0;q?%Ot3IrSrGyxn zh>{V<8JYY8cN0a!BjtPWq=RuMT|eH(25fe6ilR%UsK;b5Xa;WnnUnR~y5||*!x+Kq z7eg)E6NGnS+d{4S0vCF=YKm00(lhLe6BtE~&J#x~1Nq+<^IqbA2eqv`O6=4$;j|ig z%c^jUcyX_ho^fLYFwA^+@3Bt7&r2V_G<0*fUh@RhzGxVR_13S*#(sP@wh59pePdJn zWha+#>;voc_F%SMK~8eCp9V@CvU3N8BBbPQ7@QR3x&9zGyEXB?sKc?zQ%|f`UbeQU zjg=+cU~~C=eWJBybU{ARc;d-9U1bSvk+}ZqxtJ%OLBwp>KjpbAu%txr@ngYsDf)wX ziHdV(CPC)X9wjBK;#sd8^IFb0#I91T^{1r!@U8RELz&)Nb5P`CtB<>XeShXBV5P+@Piw=k>m$L&4`C47+FtjL*#cZM%CG z%jhuj-hnC_oH;9C)gJ?|$HPc-GqZEu3TVsNV28rm??U?qC4pB|)Wy&u2$%9MX|>!2 zjQxL9VOpTbPl`GCQqeM;My4eMj=mF+bFNpt@=XOhp2YSbW!D6q7=bfz^XUUTg%UG; zN=^mxzh~H~4KOlfUoPi`HroKK1QbkJJxpYT6MPk-utEP*;vsAp)Gz1pFo3u)&=D{S zrwf_m2}fMe_dXEWEF`@fR;Dy^^TDf~5!|p)^lBLe;VB%DxJ8~}n}ajQ_*|V#A%Hsr zIzBWn(qGto{!7Vw=%Ivdc7}?qS&FH5ZK1eRqf<|8&sA%Sg5NPchB?Z0TX&GpHOenvOvw64m3>Bw> zdalYxzTBlhTG0b%yX?A_j{jCE)9JnS^NMBN^025Kop8(rp@WCH{a~8_anG}=FZ_cm z=OVK)<=xFGbp5q|W5yS#Rn1NfeRLQqkZgRd7_DM@;&e*N>6Yh;TNNE1KgtQrA5wiz z%e~Q1U|{Z~6u&j^rc;a$vc0Z}Jh$Lm)!e%e9Nt~i@NPP0de~GK@D2>rZmOL4KfYlMC zLv<9^2&^4%Ojk!Adeq1f@(rT(Bm%4SqO988@?Wl~Y3sh&+YG%S!dWqu=3Pr#YHHF~ zJ10!u5D*YRr8snal1}G%13Wn>1U8%>ONS*ux@G=*O;wFcR^&0&rBQxp`G^7;Ylg(L zwm^&3TW+2Fp%9Lti+z|TF#8L?-?b!d$Sb5i0^PJYOSs^@Ypo$6yRJ zL}+G97yrp}A#i38M4G#SnSfYSA?^!A6buvPs8k$K4)qA(4@&q0zNf{PW+73)tVT@0 zucPTi8|L_5Rl|)1MnSEWO%iqi)1TZpOh=I*q=1k+xzBcdpAA9_^^zLOlLot9lx@Ot z-2F9zIloiqJu?{0?{5@s70mA1mQiL`1)RMoCY`Cqk0~J>k+;&9960@2S7PXBXEP5* zbkr3v#f+b0;HBTI^l)7N)}x!vA!dt*g?i2(XA?V=^j0=@_vMK%18+wYHR{_Wk01in0zlOie+0qQ&f4yh#$?ZJLMXcE^MK0 zC&J_cRb(|3&a^*kc&!sHzy0vZw>hVLTezZ2<%17z)#}a>la3BmJIGYA(y{6kab?Xq zeWUdrk}fX%k$#9lhYzoTjOs$h2|-FTPWojO9&om?K}7Xqq@ap8jC)~Kf6#Cj0=m4pLcXnBT^6O)aC|H-1%ZCqexRh415+A zZ2$&@Z9Y-pg6DpBiMWRWXA*ZX%7li-Mn7br>O(kOUhW)V$VtXjp+k8NrWPGf|(YG z|5j74(s8-rJN0MEVDkEPGmUp8Ts!1;G<46E{b>AH^_0Xp=ONp;jh{Fiqv;W*iWSMA@u7(H6V#rWxC>4g2><;<{xwlNUa z0%hiRQ_A*Ney%LJ`8rOlKy3R*k+W5nBDzt!Ja$)nLK3MzFs|0}oD`0A33F5kx4O`I zqWg2XT$7x_??va7jI`5ivO9B56h3_`&${LE8IYwfz3q$#%+_;J-lb6V+C0Lo9Qz+- zqwoBZmCPj_o^e;1*WX0-Oy9dqd{MNhsji zC@`RskVZHG=RZ0Rru=IH3R~`8BrBn2B-1+0Q3HbpC0GzDNjVQG>l+~Khex?LaqWKWXaqm%vsondp_4hldcd<@ zH8yU5rx_!jHy{=UCpf=(SGsyRiB^Sw=qd8=VcSZ@2TW)rU@r=eNI^J@35r-w`!GA| z!Gj0tO$G!SveIKMLf z3lS?%-@c6iuOP)apm5;CiD;DMZ?N;T%Hn#IsZhsJq&L`KdAEqYk{GEI$ zx(%*xb1zYM$R~lGqFFf^rz;xdDctwgb|*`e^hPOn?WDH1I##}tL;9y~#5Sg?dimym 
zmfXBcgG(e%|Ei&v->9eInonAEj#tDH2HAc6axZNiW$xvR4i+CmlXS(kJPZVL&uE5Q zN2zE=KXP!M4o)n*A{_GR$Nl8!j@y^bjWR~&$CVj1w-4?=A(}jO*VB-ybftgYdztQp zLFe$i?>bXrJQ#x!7}f3Bv3F8>M)Rm$vimO15Tp`cWY}eB@Y>E!N<$@tv7Y!i4I+?xf@&Fb-y{QAWQ$CvZ`Egw9IdJ?1X`Sj$QY+ z?;geO0OML(9QW;ce0Id|PzSBi-yoEYkCC_le;NTq&_x0sfg2h&+HpU)w3xC~ zR`L7NxAB^=akuw>)k@OCe{9ENbsPlzghdE>_WtfHCeS+_a zq`K+tnynuAzEX6Qx0{zo;F5pRqiYS#jW0Jar8%?rX;@ukvHct{Y%~&jgFSJ4Vbgg0 zc$cBnI>e>gl_5{t1Rz@^}?%%F<_T zwexP{22`^U);;Z-;HP{-6;I|B|B4mZ;2TRgfGTe?Qeyi%+p zQeI70Ezq-QPxHfrKKu3`T!X%MYst1H?%zk9d}crC{t;(15~8R-*&1Wp*Jyiqd8?Ig&X(-GlBvG-Cb=BcqYB{ULH2=#0{bzkPFxqs!|}zs2&wRXgTV4xMuM z1*JVD>69W>MU;_ko6?(hWT5CN1@9Da_OPlo4q>U3ME{PWhhx)d-W`u>O>x~coc74o@AoeF`TuL`pU~iw_k1Zg<9n-}kHY-IE+N5!9iH9a*th{hXi3~r z5ri{1IY~4!oLsy`wLnj8R{we+E8|Nv?<8)3rHR~v^yx6VJ=VuzS{DXcHU#V*Hkq25 z2JHHsl-a3A|9Pk7<>{}Wh{RTBTRd>W{zQpDpr;sTbz3CDyhep&GIH`-zc|^!f zNUmmyzvAU%>U?81dCkDe(n~ADQ*yL>L(D~iQOtqjIM5h*;d%!Q)5n^e9kgYnLAo*`WPLVsR_ARc6jRzOuJzf$_q@rQ8op zA4-@0rsq4RR{rhX>({4b7oX5io{|}*{d7nEmqGB9-y!T=tA&3;4L+(@Q`Jg7K78k* z9@~DQ$iU!6oZ(YXK5gPM%shPDyP?9k}wrjIq% zKJ0#J85zEnk1B^5)$emv)punGWT)oFT_i%H8yD3%b#r!KP_r+jkw2~7^+Y}@EAohW zKrct@LnF@3=pf@9a{XM`)@@aEly_J-mTPnhy({z^kN7=yNR^)Fe()}9W>!ageCTCc z>+q-c+b<3^*&O_}p`7L8fz{IzOcH}$b&n**LXu=-bJDM+<-j@oy#30Ffa;waf7HKg zj$FCAJa_KziG4+DDH((rDmXt5Y^J=+uP1lSKnU2To$$C2W=t5F4u{Sk+d{ETX^4gpCrZ<{6*8#)AV;|!JHYP zV}P849<45EPYBX>o*$abP5_63Y>FtcAOtxBG?kfI5w9V`?)gW^tN00&%hS>*i# zb(?m?uMz|loVua8c{6P5WUQwa;6F46WGDp%VpJ0mnNx&ZI?fSY3?@NSOlAgOdxSx@ zH}eouA(Hh#own4XHw-)g3DE;1A`5dd-X#)P2#u41UA|VaA4)pdP0pfO`GpKpJ_zuB zj*X0v2`XqMZkLq_fg3S~Ggw`Z49FtlCnPhOnZ1NvvkCBT_NBSO-SlYWNOuRTA>7V5 z%3+q1&;F1!46zJJwSh0~7?Rp3NP-8d6O2`gc;nw1u>}j?p?@GW;g6@`&HSF~wG>CT z9dWJrzHh?ze73su`#e>R42SG-IfOgLOvHtShMk#6<553YbaHh(Q@(fp8*7Gw0Ygj8 zm`2e!*&zWLEZK5ehx&OMJz3-m$_HFk*HRn_8eadF)C7+1k#CI9vnwRpe z`-k3&4;8S7-#BrvB;8`(E~4wEuix5(_Iq=>bNz7@#c4C1?7k>nWcEYu$2{$2nbCZM z?KV3epEBMSelh7Et4Ga-0j-4)#6;52(@SINkeC4q5Vc`J?ZAs;Bt0bnbwe{@52Z0_ zw&9!sMox$$A~A#V0W3*|(=MJ%^)O*zE+;L~@xg?IkSXHI#k%S(G~7s#Ixyr!X$&1k zcC(=4#CZ@oO@zCHYLnIT4<9yNBg_E*_V)k$Ng@>BZ~unDuK&bfty|aJ+M0$Fj~HDb zB)K0R{v1dk+erPtgSHNB3B)`Txzhru0SCS&;IfJzXP-Kj_?TkI3ig_u-R4{zo;BD(>XZukU^2I;+tvmLYb0F+8W zXX@hOYpDMRT@2yckVvWf`w<0LfKwQk4jO?NTpE&`_rz*SBad77mvD@UHhl|EMBb{` zwos-(u~%oCd|WU7cxX5j5PwlAZ=NeWHbVBA5d&w_@!l1VTM@Ea!(&`QVs(ccz6oG{#jV|NzWdnf;DASytXlpy=GFGHDJxeMuT!*C zDRMjSAEsKnX8j1)GkSWp(+2}7)gIrbx^(Ki#Zd+t%EA=Qr)#M=4(#hX>$++ExwV_V z_F9y+FE@qxvTgDS^O<*-WSwn)r7T?R)^mKJO}L}8*MY7ta8B@|r`6v0cc*nt_f#T5Xz2T>Y8?K|I(yk1 zqB`hP@#jj}=@=w7Zr!lXq_~%RP%ayy<4}-guDW-8d|qP&Q^zD|n1B2X_z)LfpP_b% zTAc2SM)SM&7|vK?cY=--7RW;gD)aGK57E-yRF0$pPWI{_-Iww}6n$Fs^R*BC;1++GpfjV5MteC?b%iiY)ea+(W(}t+Ie883zMd zz0+)znx6h*vN{pIRwPg5T`muXR)j3$Po?>-BjgfFN;>L1;KB}xQ?4cr1lxW%2GTe- zV`7a(Z;>zl;&$s8`^Gec3A;?@mz%-li11tF`OLi%ouxrQGYEms$&VV@LKc^V7CRx0 zZ3V&b3-QR%@jQO-W>OrB4+)Vls-I1H&t6avbKQ1j@$W8YP1?nZ_$c%D3zz?lO;P{a zwHiia5V@Y#LCN2t;gcTo-Gh2(p2pEyuql_RQca2F8cw*E%up!w%u%$;J!M!`Jt#BS zqN3u?wvE+w?7eeOxfAQ$`xgtJeo(vCnigI@dEc{suUWIgRhuFGXYHTwu9}O9{+!Y` zeXYEeaoxx5&R?Rv^kQo5h4%FB5pWh5OKMy^R`bS&VfIp6bYI0F_kiI=yV>w@YbB}?to3NF8?OAgYq`iXY44}c|UE6-n9S4$V^~uDn@(W zdq4T?nG}hbLHHU*UN(KUQPt5=yq4oEg%nH`MadkF&6q?*;+1Wvy(fB$+412Ja~+7F zPKA$mk}g4+{bDYCP{XZWou7UL$=$tm160>!)*%NkZzhPYnJw?y6cJlUMsOYu z4?u;Yx#YqCPGe7w0~W@7Ba<~r%qTR|W+}>k)ftl8T2LMr=*YJXDOX+r$HdgMYevW*`!B_J>h1sv69p+LrYu;9{;Z zb{Q#O5CRFQ+?TKB7$qbR)d^_jh`) zlPEQxecPY)ra8&cjjCv4;c~%8p1>tT=BOK#-pU044LzcI$dF`c`;``Or zgGyfFUQ5qzI8n|#8!al7G>}a3PC4Xvs4GpEdcrANVZp{Z{(Y|6QCC5c&ZN>{3lC*u 
zL9=b{t_xX4my2yrGUxp^q)!xamdbjw!(i*v2emd|Pj0c#-@USR+=20CQl;Tux04X2k9-p0%4zM*;l=X*Fc)&5uQXZ5$t3>=+3P`rQB z{zuH37Br*7r#4pHsep&W2el*#u2MX_6Nk+UlVX>A=ZSlzIRE0m;T8{vO;ts5g;cUo zP}5-1d}ExkufxI90NB3`r8Dt(JT2~!Xhqe5glBP_3PdW2xqp>yPV^&oeDUpM_B%!c zgeN8{L15i^t<5KLZ75Xg+Of`=@Z4{_rDDBEDr|qeACTP+VwTUCew~c7n*=;q#1%b_Ox8 z=LWdhT}xw*{nG2%r(moz@%)ZmekC)P*x5|BaGvW7S`Aun7Ybwtm+I1oGrvt`b#y&` z@v)D$hoNo$SF6^Co17biw$r+DMC3-~?!LKTGeP7npcoie?G#RPOVO@XWB{Kfm2|+xx22ozK-%Pm)Hi2<%8Q4ytl~)1RY#n;A}aP z^4{O%kj`H3+7|&&A0ABAr3jt#erw6Bcz&fmh->vOjb@l*eQVR|O={D3_iYoOc6$)M%*0D&CPQyl6H2d(gn1vr132= zv}Xfiv)6A-KIdT0A4@7U93^8&TO`Z}xsk9q-$0MS;_+Rr?)L~O0~ru&-hQ6v^%PI+RM??-%gJ5VF+Dvn=h)c zFN#^Wl_M@_)@5{}wEm(%pAxIu0E3gNN$Y}jj2h4_}1o4{Tuq#iyL6wM_-K+ zBNP&3RaMnUR%^7p7qS2yGXpHmswhq#WrUWU{Ua1$SWkeOwKi zOJbxcsF~jkb-~*DYo%cc32kVsdtvs%{^MTk$DQ&NxK%GSy?FD*mZyr7m@_}I{7}^u zxr4%N9z_M_Xt!@UT2qgP6-HK~uZXv*3P4**LqlV;Uoz@VRo?Amh=W3IP!!e%>}t** zm=;jKL3wowPpn4gJC@`&ljnMlq6SbT0$$!=-i!d_Ha(9Y^teZLBflX7E)~yA0{sY` z0g8@LA8TyDpDaAva@6Mi6=#xGj+E&=P$!Y{2*^6d1`tt%Rn43XD7*&r3>Yn1&i-%# zTI4qcdXgm53e{MrnE(O}VxgOgj@EU}t~s!7W?vgRoMy~HgPkO41cIWG()YS)4<39f z`K4dK*Jl$}95&DQ$@kPYV`jD@i8}V}f-&tTB!Wxva5%fBGFCH*uttZ2R5}`)jie;K zoyw7wxLp(&0LgMn6m^CKOkQ7qU-?HTCL)CflBF2S?~5i*$g2MwB#|v!*NG;=D-P_TQ|_=B;79#ge6l){ot1>p{riH~ zyD`jIv_M}QL@0<^;soe->a#>d;NbC~cYr$rLPCx@3$g9bIinhLbPrwRa#CPWh0|3B zrneRSi${gnf@w4Vibs05EZ?wy9$yk@=BwbC@_wR|v#NL|HZ6zic+{&0Pg6Fuy-Ys6 zEc)@xZcF8OQGfNRnS8svrQM;U=c*bVMpU2fPg?n<6)X}GW~I_(A-Gd-cZVBw&7I`V zkP(IGFI-NA`LBdjQ(BXFYr+?{_pOxeytXaFH=i+aEvna1>#@C)E_s z^w?8@A!y)NQ*kgJ^~CEtCg@TazNnn?5B0K%RZ{j3`naXuX&-Hj+H>Pd^>)~}Q7oms zySB!;`sYmbEWgo9nu0soQs?~|3{v2GLM7LCocmigIT8^vtAufg@PXm8;3IL|2;bdl zL!B=at0?y8+jDn7Tq83F5tzgbD62C+8-c4iUHc+rP>BLo;ND z(roShs3^?hVT9R?#HBHxwDH1MA`bBvaEGJAO?KMRj*|V@U=QA11MwJO+H+1?*K@^@ zZ0DZBgVV9)!wn7TZxv9Y?$v_)AbE_Br4R4?vn7$-534IHHaSm6EL|w*c@BJ{vuu@Ab?L+e|NdeCW$zFt- znUqOh2$TeiZxF{#A> z4t67}J1KxMDL@z)#|RvU`vl&Kdt{|@=(tUnas*1D9$Dk8e;5}XYR`y0{7yv(2)`c_ za}MYngq5b)kJu0lM|Bmk%t3yP~)plS|koo&Zd&xtGh|T+(j;12i z+oU6@oy0++9K8X7WGsZO?|(uI!E?E+?fB(47(~#H2UE2wh(SvSu%QvZE6_%YOELn} zuVmlxm_{(g^8I=N`CqF}0)bIw@7d1V2XI}~=hip^eYuUtM| z=j@ltv7|E>RQ||#VypWEATAj#q(Au5%V1-MZm8~BV# z+b}w!;GLha{PtvZB?MKBp#UR}N>hwB)-ApTy+S7 zMDvra^pCakabqJSSM#dX4qKDJRH8Av7~}s#JsRu}ZapLtJ>RoVGSvV#7@1-~5^(G5 z>*IJ|Nz2|0c}22zA$=Sk8M#nZJ@QlXO-KjN`WyvE>f<63#j0O|$AoGZdhN1#i}HKZ z=`%wwM#wzmizrjG>`g9D?tF6ZaN4okYu(mR$mEAUv0B|Ctrqjxb*)oYqS>HX-Qco# zFwe###g~hOjgH^!ym{A5XziVe`q84R_8r=~|K+fd)pJ4A>E2%1&~-oj zTi&co&h(+xUHNmN=y*{;iHvK@4UTU#$5~^WHLuqhd?~ue6Qu1YZJMI2w%0vDf_b}y zqL1*c-GY9~KSXU!TSYlb=!=9rj##FCpc%KfT76}Iq_T#8n*Q#iqSftrjNFE-tP1yq z#}j^f@J?OaD3lvMB*nC6dA`$({`-a_!h58d2ND%Jt}1=o!Z*JX&m>&Ky({|7^8E8A zP1#faJF+y`Qk2%0(%#w>AY}Nx+NRmHVWh*Vd;!fC!bJy*UkN(!J_(O5Uic-}vdqGB zid%n|4g0aIgz1*+UoOz=n7&%p^*gfBN$K{pE1%xeFiLFqD;^(C;j3cr&dr;AR2_`S zOM582=j+!|f5SGjN1W~)@P->Kc^;_vQQ?Kh#JmJ>O%ygTg#1CE&&fpPzbIR5TaT0z zWgGD1W3h^$I{1P7k(e@|f6BN7;WHYZ81fE5hehGB8<67~*>N(EM8jFH?Z=R(Q{T!YZUMxlMM z!!z>%h0(q7nG!pX80Y!>rWqm6Y=6qwxa2 zIEZK=aPZSiwg7zd10IW4KoGq^3l}S7$p|ee$*#f=ovB^aonKfGA7i1VmBRc#;dj1d z3G?OoGF_sq^+~u__PLRWgBV;+s2NP^T-%L{6(qeKn7G%bK_|t zVbRf=CXzz&_s|;hA?!8@*8I(_$5DpkPv*fa;@Vb^D&Bl_OPHJ;HWp`=J70jCRpR&f zK~y(vRQ4=gZ{9%L6}(>m>^<&8Ewjm%GgE&~_`Wc)6{szHMVYAea;7BwYV*D{j^{hg zX+1VtgX1*yi{;);$9>fv zmw4T(qD;S&mBylx{p5+n70GG&&`17Nl#b>m*3dID3Oszcq=ohSn$^L4z?VmZK_w=6 zUl+3+wlc>c$J(uHjQzxZ?fFwtH}GTZM=xdU?TW8I&HvCdGC@gk1JiP5u@Pn_VrM(r0BA@Ka_~h`gB7s+V z`Q+GA_L#92HhAj0>n-c`OI&@c^`;%V#drDALFW&%_thNxjZKLo-SM0N@Hzu^6-3@Po&wg z<8x)e5e|+|U~GqRE{{H{o>|+!Q!L`=fun*5fVHZ4Dt>$bH6<}KVwP+QcBXMlB|7<} 
z8$+9bA^8Ilx_4rL+`wWh-`|W37>$mMbfOFEIQFaYp2}8P;_kwAvk-AaLL!Oq00`}z zM{oBSbB+l5h`xTPc@<+hhBx1qpeH;vw1S_#H?r(QP59!)*#XzdBP8_=)ae+fGr)O_ zGB`v*fOT9-gp|dk@lYNfK5!k@gENlBDxg5CYlR_VtLmV~-j zSJ&LCTVHwCtV^fQ2-S^I)>dPTjZ!@9n1m{%f9}m*3(>On;+y__K8^eD1yBcO8J|^P zss7EU;?MeUp@W>|uxl>%+z65dkhshqk& zar(&Z-IlwR9_+tQebL?PS5Lsch+9(k#r+qfYMc5~?DzEA4O1??|Ijevi#k6;hikoBO2MCBWezpnnFEX-pXyw{W|3T<>Fdv=GEnOpX}O2 zNGg57U^ObVW00%@g4fQE!c})1&;zs(iU<~FUquFb4!ii%Y$&Xhv9thH{I2%^9w6>1 zD_iZB3#(m8KhrFflgD8_D$zayDX=Ln2SQj~;)&uv9Rt1xoxaS!NYZo&Re@@~w7-i+ zjSR6b_^hU!wKMl!V1vO`Ky-+NzOnXLMsjQKhZ>jTtA7@#@uL=aG%gUX1Hx_gV-E9U zs7P_4B<xLflvj}la3y;ssenD{#x3l!XcTp0q(SQ-sN=Nq_I8 z@`3iyn)v~4W;x-teD{mkeZ_+ozp330dFeRXHDSixw*B}M>D3MON*k>Vt`A-OAYgXp z5nJ^4w6DyZC#BBo(=keJ`*{GNS&5m#N_-66jph5Iq*m!Bb{A>UKJiie(I0S~b$XRU zyFI|(^+%Lu>k3bv6St)CAD_U$?8n>dC=2O+&1Ut?((Ewb5ph!f*hGh=zm9Fnz5cX@ zOIs{1ZJEAw#EDM&DYJgPw$a$9pB8Q25=*-shmGmO9UpsLPPPi_%rpJjbK#`L%odkB z?F}2LS9NFj=qRzgWh57O9$`(}&KkabWo~tmZ<6o#w!3x@#w!OFD-VGG<%9c)nR(qxfew*|XwTb8bLz zv~GrAK2-&is<)e0ooQulRZUD;m`2D=k$c~7l7o&P|qVc~p)TmP2qhY$Uc&hsyH z{t>&LF_xrIeEzd(WsfSo&~P{Ih`+j;0?jY?I`^;n)Lfp*u^HX9zVC7phOxpoa12Pu zaWd|$Lt>c+4H7x_=olFIv1F{!4Dk0|aO%17b@p%bqI^cBlsr8H11Y&|0WAO_9D8^1 zZT{u=3yaycLUht|?%Pd&^K>lg$h+krMc|>L{+GAzqNWQTiTTAJ7kwFn9S%bzW_=gy>5Fe*Dvw%Um2B4*=*ug;0zk z$#4EL;tDq`JLvfnzbpPM4(-G?#px>@EA%d; z=<%4UD4E&fTk*kms_NRVGedqf`kPN_ran&>tmLJe8t) z`7uvH3y_OSlW*RSDK-iHd*|rVnXeujidi$5gtsWbw7=^hv zkivR(QFa|@1d>xy>@jW(jnh7~J`+*dg8&eUPF@pISkO*J#50D^LL4{*ZwNs8UY=H< z>{Qvtp$}9dzoXdM6#{Pwk7wkkN$ds<01n{8EHA*=l!i|Ne|vlT;QV!Ik}@DI`2OPu zzqojd?YD+@Rdw~VP&vhKaVV7^tADqCZ8?ov$>~$&_cRPRS%h})PMej0il7(PTC*mv z&%V3w=r08}C~TL@4(=H{i|LS<$@8_pzZ`{9XoJmzt+$vIgI|)A2aK1+nR*2bR3 zCjWypbEI0Oy_4nM7Z&tl=Az%ETyyV^&nnJ~d@S3Sxu3K18ruewpN`RY=b!oaG`T)r zXt*C{ru2Ki@QU|q9xA8!r7(aPB9S4ZI@+fa8iaCVg>t^XP1X6tWOuPPp~2gMO>o?J zYH~Wmrt_$~7$=SHRce3jpa#uQjZFUhBTO>>*WWj<8r_kLB~9YwmU1Pv_A|p$Z|m356Q+#p zzI#N%U9w3=bnvfp$|D{#$IW+V%P<-*>r6FCh%H4q zF!Pxi%n-Y-{BRt?Z))pzqxMA2J{G0G8+g^p(A6uks%{)SayXS35F=4KU;m zF|pY~N0s;~hbJ&8hMv9+&XQE%+x>X2!3uH>)I7#890RZ-0Hah?@Y9`6^~AI&ld2Zl zpcn0SP;sFE@1?5<|41K>JC1bx1AJQE$uD%4zqG0&B6yJD?iqTZ-Y0x*)Hp(6x1E7R_!+T?A&_sflzacZW;u6Qp zJX_}S1F3cE-G;eObcEe})^@FFl2KJ^T;z)Xl~1E$EN5B69$nliSH%~pJBXucRryJj z`VvU`e=be%nrgTmE+5p>)(@}AJ#4BGe#_fKkTbCN#6iZGo3q||uSU4knm*OzeHg>aTtCSW2l`Ux{{K-}6Y>KK8Q%{ri0%EMI!L4WE$C z@VMWqF<+q+Ug62!l+jgs;j={^He&RZfv$);-{>l7TJS$h zUim{;=6n){M+}SikRfq>eU{jGK?doA_m1;^#gOIIokj8lR{%2ffz;3_Hi0V$Np`y@ zSuf*9z3!Kz8Pi2x3z&8Wd-QSBh&$jHVi7DO6AgLQ1?gSc?PQ|qA|81HeZ%oiZn4+D z6BrQ{4vQ>=1~ZJQGxHc^o%NK*G>YljFMu(LoU9W;fzUY6jUp8>G?)|8gpGl7WEjPA z0vsP9Sc(ABc>`D1l1-pW!tKzMpJM^gsjRP$mTf(bNMUag#3mHCGic5=LD|O5NDCe2 z*EqNCwB|g$DugOHW>{ka^?1jdP*|J5fYJL)yN8Agt|I$V&mjVNwp_3RbqVe-w?J2` z%l?R`oY0X78ujAET~JM!xw!rsIb(VPbj?G|0_xujC!rSYoxFVjoj3IC8QAvp2C|za zAQJ^HN(7}Ku#AbTr(=+cneL;=p@!W>^PzBcXh>8h(WMYwH`v?+5tk#Ru9?$B1_%f> zQ>iUk!mh1%B+K_0vxJY1-NC~+ch2!JIidD`NDC*>Q$AU{Eq;*LwaPs*xWrrvGKw7JjH+lawe7UCm^y{1F zC(^yQi1tMvjIT~Z3Kw_L_erF2_Y)_vu1Ooyk8JwY=PrLQ{Wd*QOm)k>s7|>(`-yW~ z4Gt==2=NJ>n2LGr@VDBoQl?TFjNGn~1(A1G?PQaUxFRc{z>{#-_kGB*)pOB-UtdVK z6^@H)6fmb8by>)F$X}P)PaC56xpb9XC1cqgT5-mS_X2ON)_llsKDY39QpU=81+?91 zB}_$}Wj2&H_@G;LKxcY?w9CWQ8y*;O5AfzVl-CTQytXK9o;y2UN2{+)qje-*SBr(l zTKe5xR+?jPhZzTomP=kX)nh<#$vS4Y!nSL_{(5ON`%%GcS&4c@i`nFvqP7_w=dt z>fzXwy?|RfO`r5v=i!J zKK3+q-u11n-hu)wSNi&bX?|1x6wc}zdFsa+$*!y|r>kceQ8x&e!Ijvc?WbzV>_Uv5 zdxS>mN%p~)PES4c&*TF3Y};>=r0H?I(o#yb&Nh0JzU;%RqRfm1VSQJ) z_;arO*guvPDb+Bl@oLl26lW!CtxrNGu_6m$9?vxV6A!Co$*$`^(^yd4eF9vaHpPF20{U)}!meb*jX)d>rOi_WE6pzsHjRWCaSE;26-v%p` 
zyE30r;7~K83zXhz>XfuRQeBmthWdDy{Z;*Sw?xnEei4BmZJR$Xb9ryM)b~pgsoXsV zG_2eDS80jSS2MErIZ^^zeyr}4=jr?F7*ng%l>s44oNoS}TXEu(pL_eD1mpVC8% zLD7P{UxvBl-mRFoN=cS$f7CO1B)!#P+=2gb&M(o2&I3Gqt1O%v7pNCRGpC+jDzSIr z-N=;d#T8h&h&1+|TLsxt+H?Mg^`$)%uGZvI<#$Kad_5(-Cnme-Rcb`8@qU&hC;TFDcR+L#Eyi4#)Bu(b?eY&3;Ew&?ZVHxcBzJKh~NKRV_&jS|OT;D1e zZ~+gwkGt7bhEH9z5FO+A;mK}iz;64t!>ej;!M;{0N&kg$?6+ByXqU8;t|t8L=K~XR zS98+Sn-r~$3uRMIDjjs_bI$i?^z>(oGaFNRFy5b1&7FE&`$2!{HrDLojn}>k*cs5` z*bHc%F5g#E9rrNro09kK48Gu^wh_9-t6r}U@MYie;}(vP<8q0)o`B4D{KHPE+AjYp zg^Cj=_MXn{-jm9GddTNl%lUO7V*Sp!HL@n^BRk`ozLuHw%{J3$WwFT!ioED|%onWP zTYCS}_FYBm(kqXY?iZFwXHGJ%f2}{l!kryIKhv^gqfHgm^=qkn;i;E<&&Gqlr6*id zLMVJCdo^EKEaY#`_&q<{GZrqxa-Oi6M=okg0OB!&=Y^2`{HETuXsKPEZ2$OFCy(kD zQ`ra0EP^${6gdEAEKV_NG-{D}K6!dmK;*8f!ZzVJ|3WdQ?Ey^U{DOitGhWWgnAmtc zN+cY(5`d;`L~|kObC<6x4rIAnMF`u7c5G|UIS|$rtqz8fNb8SXe^+19(T`kU{w1yX z%fq00e&d|QwVfq3CpA$-0A_x0hvKM&{Pk3mv-r zsb6^Xw#<5gIO}_fxlWwQm9&ZZdW;h_Rr_jad(7RY*upY-fBl$xWtOewcpyZJ#Z6>~ zaypYq2kCi7{M`JK5&8b$&F<7>!ORL~Q5u`e-nW+BO)8rhcIEpptxdYqnZ%|VGQK7t z1_30ZaI;^MH}wfSK4tR&Ivlgd(!bBFT>P=7glfIO1nZWo((NF}j)w~;$~;XhT5aT8 zsIi+;8x&HuSKmjEUxW`hG*D3p{C8v+2!qE>7(FPSlK$C$v=#tRkhKABKcIV^j$g^; z)7YG$-o}SIgcOOrJAzh8OoUjUx?$lU$}Pam&;tgL(~1Hd!4L->%daTeNQJtmBc0l)V&i?1$Xk z!hYKq9TObhF4pVIVkxf*IdA*Ttd|z3WoEiOe`@$7wdc62?9f%m&3l9=x7;e>Sw|&T zbd73Nx#pt>#%@1{+D07waPLRCHQ7u(pNAM+3%fMSiXi!IHoCt^Al4>l(oD@HZf!9W zAAZem{AG+k~vBp#$(R#4Osdr?btl#zz1AN(i?Bo8s zOAg;q*3{OZqx4nSzhO3LedKq>0P>WETrc-U8m_E?$h%<{{CleSib5m&&Qwmu8@tMe zXgs!W>h14%@S7I6^rvN;cN+W35TZ5LGDwfGUD&zC@=eE$OFSWo+FjJEo~^x|m>_w9 zE^};n@~d+ylfO#PyVId{X*xOsTV+c53c55xy|}bZ&5wJ@oj@0YSF_k>ch^xl3~UxjO0&G#`(-=o3&54n#^aMtcz#ud2;%wU0{bR zxBGnmH?{h?XzIZ!_N`AemYUi@iCBMbgG)?YO|o-S|K25^zs~`N_e{F;JM^_wy6xw> zAAiq-j3TgSPlico9=+T(d3|jow=X%B;iIbWe;67ADgtf#v-5r>R*Iem2L|bhAJ1$( zh!Klumk;L@0#7A;4ixqg?dRYqY^Ktv3J73hcKem8Gi=ruq~_mjsr|5$xcN(!-|nB> z`8V~l?{ja6vi*<{@5~#bx+uOTA`xjBRT*AtnI(F*=5F1#UFstaPhbh} z#8?(<``o{~WV3!{vY?HAczj5iW_YK8V_9%h-hHRCV2V@2(4YRP6R2p=s|jbz?b=l> ztoi<3yyxg{@E9jN;vPNHM0Xa*GR1j&)?WTNp7aAXs(WYmFr<$i@7HJ$n{*#pPuJ11 z)t~jz#C+k~$@%MIg+rZn#pw-!T|G++jzSUU+4^0pGVB!5z=75x>x!sy-Q6 zSSSbF8O9;nc^we$xu0A%RLyKP{JZLja%;X)#Tw6@Iup}96>1{YicKH)2QkE$CEl!((fxIE-?W=VvCFPYPI+)6K^h>eqfSvmn&rHG9UTk(k*uWt z)p;XXL-8?#YG2nD%QgJC#{O!Y-P3n?%u-m>N~wyrub}+Mz)k~uei0e~p{OlA%oavl z7{cCFfWzkj9z*b*LoP$IFmP7QD>q`_XJ(pakufOU(Qc~hq;<~Px_@e)lhBQ6Ru-06 zkDdGrf=saC`(a^e zkhFyJowkn%Wa?{_gz)dFMqDkrv9%k@OoJ^ll0RA-wSj949b>hPn59HaS5pDQwSp8y zZS759Na^gWpwZPX($R++M)AlIY81=xCVct+^{~t>eLqpfb(ilkFftASh_k{9!)A>N z95VQ9ete=5b|K$J5tqiyZpkqg*!CLCFAx(kCFbME!@lWcicn=`B|6M>Wx)(X>T^P9 zE2XBUI`+u6_8g-tTgP9qGXa)M2Tnak!`4XxQ`Z3Jq5Vu5NsKiZx%&iXLY?Cmhs%?Q zpoFXeWldg&qtuOPmweCU*0J;*QF&`DVCs$$pt-kj>l-kQ_<8=Q5EYr2r`0<-CoRXIDSjEKDwg{EEw4i0=)b_v|J(l$8xS5+J18 zy?+B*Yx+&f%!@+cAemDEm27rU z31I5$GtkmHY;JCTl0B(Pr2uah?(}-%dZKDBJoiPsL?Um#dL{Pr7Z~7!8LvN}vj#OA z?Ef}1F=;99WHIDSGfWhU)x^?vVc%*tmLV4Y@{Da*ZwR_9q_VjsN|uiJX6~_Vng+wzIOSAuei(>GFt>hxeDw4k_*Z#ZD|MOMf zfj8Fmyx8dHF`r2$?hZsb;RW5$ohzXiyr1(nk+NbKfoD-M9AGE&hIt|PyV`Ciz|Vgz zCEhcHm~Dl6fBVD_KOpe-Fn^Q5(=2i5%3OXim_wMgcv!G9L{1DMj^~kjHB&iy1_oj~gd_(+$*agP$}X8XEFO6^8t+^h~1u7vrKrs|o$kv3Yrj*)eHPM?Zao?Fy{&#yT{ zKwU~8no{UfaV5h}oPFDLy4|F)Na9e)Q0AKbad{c zn`oG6l|1(J>C^b(_TZZz0CwqPE6hPHU`XvkFsTJ0-AqjEPK;I;5Z}JNJR7iqi%^1y z(TkF7I{<4GjTBv95MG4jV$?!kZNW}2;HNYuf3`oNNe;;O=fzT1(t6%p;sXv~iB6tb z-O6&$$~1^`5EiaF?C$O!b@4Cgir^p_-RfA-_3`LN`3kUr)@|4z|ETZgO)VoMqbMuH zL^L@bVIQlurs-gIZ{V2NAt2znH`A`@3t)tQ zhV1O(a#;RshbkW=+LJV*L!2+%AS^({} zj_Bubf}F(p0`{pDhTjq?F%$p>T`D&&EWlqY+p3N&-uL&?fhqmhW>_##kZbLx0PA5< z73VpgRZXaf=Rt@HT;_vCkg~EeTvC6VZ1DWjY$eZozatO^#YU{XsC= 
z4)ru`+I3IjXsr5|4VNFAQYNm?lI%*@bfoDj5=3lXxjMB+ya53L?$0tQLJOTv8-0Jx zvBNujhm6dZnQr8qM8IlX;JO!y*wy9TG`R=sV?LaFf6hri{!Z#Kd=uL>2=)qu39Tt| zqxN$e_VPpg;mB?uPutf!$rc+ELuls|3Xx%v1Qv&Z+!$u3H|055V+xTX<`z-@)>CGc zg~=iq(fvTMcEX*`K~vJ??-d9-!=D?H5_PuWxZHHj_AB7bQ+1+X#S~$JSk=2CqQjWF zvCzY*{v~)u#PkKxI{mxdzKf0Lf@k=#2em{7U5|Pqt*Bsq?i_13lH0BWbIuZTQHX`7 za;{+A$n0hUHQ?AP8!H0dhRTdc1@X(*(9+KI`fpsN1=tPO9xo)vtQ`mG1!lqXVe;}% z`mLa$pjyT_{|o(t9YdpK~0j+4S13y$t`mI23cKt-vj_^?M*500Sf5^cGKk z&8e5$E?$>1#4*J%qGK=lWET*2)2|T76E{GTFH3eghgmNXeJmd2&{Fg7d%TDo1oIjaErMUF`EdTKpyyF2qGNiO`HCEAytl}rK zihq*dsBEJ>gkOENVC)^dJdv!aC|=v51z5e104$OE^8`m!n`9OfVt0 z&daojw1g@#V@4#oa7h;us!cE0RRZ+|HPC3?JYBsIwR+fi8s|FjV^?lhQ0Pe)A?X4i z-y_wQG!Vj{cmBd?`<}5Pu-rl+&kKT=Iw5P~Ddd2OC_YQL+~&v|@I|B_D~IgnBv3PI zA^psdkP!0k$ec@U#9l(ginFW?&~(w$eLSM(RJci2u-=B-F&!$F2fKB_%CNxbOnuL{ z><9lnp-pWfoeLh*Bcf&`KY5aRIB#X|>9?KP-FWU&C2T$&7Ux($ipZS&8+1*xOHL~x`WisP_KWmjcU1|zw2c56NwAW zC&Z)FFQ}{E!g;SXhqQtH@8$~SvrE&1LJ80Rdh^kCDQYyl(o!2r?Pf%Qmg?%`7Xe!Z z7Heb6kAg(?7_)&W0%0l>E$NY1kIxjDWHrD44(0OXMtebu(8y!t8vmu5(L07%|N9~! zbuCH1{>yIvUw&0V!w%_+3z!2{R(QiT$Qhn<|L-@?j~2N~uJieN0QXAT528h`qefB- zc*WJ{xP#Q*Op1C8*d5U#e&qYXf4*OSk7Ur7KQi*l)xh|5-TL)AK<&jY!)VTK$5#0+ z_Z4(Nnv#xhL{5m}su!m=tQX94Ou(R6k12yyHe#T5u@i5*4b}nyOn=gu=gxl)pa7=f z9l-4&*>S)uBa(|lwp09To(l<45|Y`*WoaBi#FhsBGeq}BkWW1MNlhbUo(&2?VoHVH z0;x?g;l2$DSU()m7Rc zJ7-yVU@DDbqF;%rvBM;_iYUbRyF-aGs-cVO9rE(sv&|Jd0Aq)#uuf((j+L z=<*%={1JgW+|p%8)C(hur)!19^|oO@;CYn{xZD2E7Pu3Sfl$OCiQvVElZX#i@T7q8 zRx~_@qDA8Ib~kU4yH3(n3{6YIfJB4mrth#Y4>3UmJ^xC}tQQE^AZggNPkXuL{ xBD5uhY_flAwODe*|NnSo{&~;;TMy;R#+%a%lO=SXK2-QmMOj1XxuUt>{{aU*`a%Ez literal 0 HcmV?d00001 diff --git a/tests/triton_tests/plot2.pdf b/tests/triton_tests/plot2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..56b835edb943c428073df1f2a1ea9cc52a593485 GIT binary patch literal 16044 zcmb_@2|QI@)OV(EO__=c*OZKRzFbr0S(%ctOxKW^ONJCeWR^JG!?g_1pG7>+E6ewbt3|zxP>3*g#D~5+#L&370*Gm)62ia3tK* z?kG%N9*!^x^diC$su8NBqAKUPB5?~k%&GXa2#|ALl`^TI=FZ^!SO%t`gqzK6G?DO zFszyeV1*b!f+Muu0ShWWa@8NXHr)C*c2Iv#0OA|s-iHXdTW23(MD+Fa^RWl>f%pgG z>k}PZY*jo1z=%ljheczhWZ-Bl^xqoD089+D_JtGH*;DrL@B~*VsOs;+fNuYko*vP| ziR27N{TNZh#SP3Cj?i!eLQo^xdpZyywtY!HL|b=QKu(L9uIFxP#)j_eJ%vh^jcDyV zEDFu~u5cw5ZN?>or6f0e0@ZK~pG(j8&nL1kV=wRoQ}y4f_#~B~n#{NFOe*t5&o$X$ zwQ+i9jo6F-Ep$}G;LJk)jQ%iY5VYAiZw!DSMvL3 zA5FhqlX~_N)1ez1L{ZI3ne%xl=~GOu4!=dSvF2=E6Q0^wmOxLdncd%|>xUZP-Y9MxadqR|!ya-y<0Kz-df|j_xmeGos=ld~l>H89ok)J;T|vkE->2ptE4uUq zAMK0wK%AWj3q^1VxQq1)2o?q>+!Bi6jlF(o_KAH{tMelDr}#0B{5w4*`hE*ftNdR! 
zvH8VKxGla4Wa3}m2j|EmsJ2(?`dTXYMiP1v_wb$n6_KK zg*8v*<_?uxCCqt4eNqwB?6ic8h#a-bsHDqRH4ozV+=`!nIycy2 z8ZzY8a9nY?!Ps-n^w5z}tI-SJs9sK;Ytx}{?N_0ueOfv`xYv}KUx{vF;r2r2dH;cH zE%)UqWMFuzwyn7BnTAJZ)y}Co_!|0XA=&Np_V&%fo5b7O(jVR~EfPxqu!8%T=}QG#n_dNUH$mClBAp~^*NOzwz_ zMm^GQMQ3H(N#z~%Ni{HXtNy~-()ZNg*p1F8sh&B&+@5Qv5Z#u(6(NQh-za`$jA#Xq zwxDZpCTrrh=IojYoi1eVndfre)7cC|$GEnJQo{WO?@hd?i~72UbaBjD%5PWs^~t~~ z#c=N3aTJ%cmKArqQa>B-%CVvLOo^%3W2QG0m`Hz9$V^E>QH{@unyTG#x4Z0#xli9e z-<;~pe&FH3^gd5_Nb#Fvyw$?vF_&2DJ(9)U?UAGVTLtwUnk9<6J6`vehn=UdpU~02 zR$O8IMGM7vexxiVw&K8)LnbmcGHc*qlH<$JvSFmN@$(@p%8ZBD8w%^*h`pwa&j6Iiw&Q|VR#*CWtn^Ub0E-8kdQBf-n zbeEIjM|bqKZJW2N?5IAj*)`)Pe>YBGI5@&MOz?$(#Boi!Y6sp_mTOJ!7h}9t&Klfk%{FKvXzR-BzPYqNsO3JzuJ_ZAwqPIIXdG+JNqu~J z{J@ET8A~=Bp_LkOieN?)drGg-9(nr@d&L_O4vDf9wg)d6AE}`Ka&IhF)hRc#`igSU zVO`0|l&ZMhev@ebbV=1bSg4cqxTh4w#R{U9m7$jlVH zBk@z_xjnosIm4?BW?H&ZX^%C}9XoSSHFzK=)4{u-)$3sUhZaS3&q|->Nt+J;>Tiz- zF6T6#X6jywy}cT$#dT8Dj;{H58^L(6S?Rj`hde0)#|_bDedXrp80y9;=0gJ!*yf4w zQu$~bCazSGj3TccujEG!jED|N3CZz$6L09pGTA>rbxWsztc-i4v#hsxAc-OGlQK!s z>Be=1^vD4RRvPn0*Bj;na)Aa@wxnvkArAAb7j;$xdbe&_9J-fy;J9uqR!{lLm)glA z%XFV_LrCF=oBM$o$UYJBlPGB%{wI(iJ5z{g3Ag@9L;vp0e>hYOQW|*C|L~>?DY_n1 zC`Q#~#Ym}bjh_cEd??skn>l)NSVZ{Encaii)sLpv>X{q!CSlJ;oP4w3RdIe%&aKOCEOC<_Nqym`x_gt|2#JaD( zCztmrPKsWdbm$UM6l;Iw%gM*VU*phYNA;}Tj7o#~k(9@d)BL>ci^U7CFAcb=bgz_) z3omI6L}7eH-Z{5UK5c)M^nEm*S~u~UGyjLN>mGelHuc7`>UIwMeac*75AI(!;COPo z=?>$zk8bL!qQ1$`W{xyHVsK4&+a93vnTGe$$R1s0)4}25{qtAl2YZ(JEc!`&e<}$L z0hf(n1}XF3ve!>gW^07gQO!399N5CqyGD{#7+5D* z%(Etp94in%&$4DODzypI5c=G}G(d6x&9r*5z7GvQUBvQ*e8j?)i9-tA1xDt2@9r{n z)eDJRL{3sYp=8%wU{jrqNn__jA4g21k5{AR7GwvrybjcOSTzxwFt(Osa$*nV4yzuk zGTL`O@5=m6!l;xHjzo007?hXI(3uo{jn$kap5dqTB=w!q%bjkP;T#`z%0$lRVy9Uj zEF4R9J?_VT;CjV`Nf*CG*H!kNXUg3Il%IA8vEE{e6-oEyH(b54t5jCX=B)6mrO@ZO zX^zK(jI>#ir;e9(ICoOwd(PH5?P$%`y2L0c{!x?IOG$R@7zHokR+|u{1UZf@W^ry9cqX!&x zO2Sn&!glt8lnB%cA^Q1B3hvUL6HY#dzDgX3)VzOnOq(MtGWI%+ z#oW8Mw}*N5_MFqJgZ$3cBbVLni#2rGp4?^9*H58aRik&R!oJ)7{bcu@hW;-s`}s{+ zHxU@LdHF+NsJ{@{0YVBjKjXu|NJZp9u3@FSmoHc02Ubtkrey6;THx<^!p@TwkR>Qs z?$|bL;Q)6WGuCUl`{^bN`<3M}UV+qD2HU7)ufDQzj|&Mq*8J1D&1mQGD`(6PZyUI; z*LP^UlhiPcy-Q9rp~EB2n!O_4@9Je^GfiY3V+5(HDHZN(Qn1vKfS#O|Nfx;fJO61| z!k=)0`7HxJo}Y7SMJy;Be=2`AaGr9fkMf%J$O^A|-Lx=#7yb4=L%Vl5uOp7!@h9=# z+RNsvlC3Po&foP$tY}-dL0RRcTq8rT62;5YWfxDrK4K-giM*iQ-9~v~3I8oG1p~PR zHGamogh<4K(4wFhuSTh)*K;#bFYQnja7N?@#^iV z`|wg#Blx2rmT1y!5k8q+mL~_Z&sN$#p-~;RHRlbfNP%__;mdtkgzqlmtf`hr%oTxz z8BBuc1+n0Z0ox_!+VPemDGC$tmqtc09fcQDAMA5~kP4G^?4_1qyFKQ>t}pYjPTMGs zI~6^{TjVbgQYPPiNp{*n|52G>*5H!^!&xi4y{eB4IqXksBk^{w`g`-Q{WD(&~v76O|{5ZX>}kRXPD`wIyg z>3Xy%(G5>=Smi!mIM*F$ba15pi>IrqQHfZ645}3w@*wHqdlO9ixYxzMDjN~?z#))1ScW(yPP9VcDrq+dT&r%<2F%C51ws{E_qZOSFhHrctHR2-gG@l z(^d~-?jv7Qn^X-ZDj|aY!hAq1M>V4?)m0s&BFVrz#)2L`} zMk}6=7Y)jnDBqb~uYtK|yN$JPxv8CM=Qp4FZP?4M0yoVB&fT(8i$CM$t8j<9Pj9}3 z(AUI%-);VUU)~K2MvOmvs5a{8TP6R>h7Vp<7x!VN^W8Vm>F-A+}k(dXp?gc^xr(E`i9zPsDLAw;{2?{fB0dOgq^1=$xa+2b=f}3YN!anbN zXteOHV|j(=I!o7AtDHY2h=ERqHaw?IM{~{>A374C}YwTb6FLkh*sG;bF?;3lBX}!!y`f&3Kgsv^m={@Tj zqqlJjcxYVY@Syy*u%J+X>VXge)v}_mSRWyQq7`j=8vnG!Hdo!U_}S!Qj*CO@$19dJ ziUv6?KECTM?8-OndeBa2-tQ|7^iASyEyb7FZ_>1L552>SI&V2x z8>@Z)(qfQ@f4MEqNMxN4KPhSWm0RpZO-PWN5DRwf&j@y-`$$ zme=;ltqFI$l?u~r-Cnt3ujku?=GP3+B7LN(xuO8>}T8vvsSG;l;thjyX#XG*2IS8vy zZ+GafGXgHIdmnbIg%6oL^P=qzB z&ak4?W%T}$J5&}g@1&I~49)N3^_N}TWU`w8cj>=?o)~mBSOm+l6l_u3_rN9{x57lD ztdtr`Vs5KQ?a*Ruen3&l7m$r4wZ6_v?O@`eJQR4n$)1{uQ>q8Bv zQ(L|eE*ScCLjJ4{5p{bF&-BOv@1IfpEkZ}Ih)rUOBM=$VvDoEM_2wACRBjgyhx*>k zj)T(`l%;aU%~`H7N8j2LCu*L^ZEe~0$eK{%;C(61&%{s6ORqcaqv6P^zj zQ0BYaZR~-26PJj_xvTG(mt)`uM2t0C=HP=1v?B2#X{F*`=iNT_>LQt`zp1ciPQ)s` 
zWejWWGG3Zsn;jqbp9-__3}n?B+}57V&&3n&f8J{CRp08JHSZ|7?pxf~cF(^$mf7du zcEm1))0ZOlxoGi}tCU4Qx9sRyOxr}!m#9a#Z8GXk>CLb1>+Amfor+h~@$)94+yrc* z@RmIgN0xN|DWe1IKw*k79`^=j%EiEenPlw9IlmIee#^#P>I%) zdcK2)(S=r0ejQ@wwu^T&RZXI*qTQO| z+om5l8oDK=ee8Y1{NQ;myB)8mL!RZ0sxQ(kl_6^sN4SF>Hc`bUXbz3~izisY;C4JU zit(X!$Cqh5PB;DyE1 z5ogPVmwRCELt-X5mPekYSE^z&oCzlwFk)AB@jTJRXM2gU*XBNN!hB|OCsCQ6=B==V z_Y~OlSHXSSShyJPWwFN!FzV=UUAyL!c=2%TF2$zzsvOBF%#*k^vu}&OOxKMQMUp){ z8(tmGDPg8!-JOzMFKBI6A;re(f>YM-ZnTgLt*bu#eb-IFT1KT$G0ooDMhSU$UA+ps z5wAsz2R0GWCh$_`ZywCJI;KV>omy~_4fTmVpA-#K?~mSP+9vE#BQm03#Y3MZ>Vaue z-FkshBFfr3Y$+YdV5Mo0aDMiZ?X!j%>iCaEMjz%`QZLMwy^4BEL8-FS@lV#Uo8UGy z<}Y@r`UV~DKzm>6U^rys^zJ%d+F1D;-%hGNpijwg)_s+?m};5+fVeD3@a(94t$KQY zn#-a`PuKN|@0S=U{nrdkwp|sj(CM^#>`{T$MhrH|#AmqPN|-7MjxfIzurNOGL^SM; z0{Wg~oZT}(uAmDYLh;w6vE!}J{8VS6d%qsI=^4?HCM*P>EPoD zTNGMGgfh!oS5)y@vEr8kk=CP)&C#Q((tWTbmT z5Fp8A(V9k+tVERJfuh5A)F;0WCzl5&=TZw%J9hJ{PNwdgF$#VFJA3!DgypNE#rcjq zyI$+OuMqZ)ewNyOmenpM?lRX5_pNYl0lEqmP27|aoKmlAuPu8-cco3}-F`cN_kGVI z{R=X#rN0e8D>;gmmFreKGHAM&C**$j)bR^1FeZd3i_)EGb~QQ9ZT!+{J2;jHBkw7W z&uKff=i2e;#C@WKn@lhdo=UE~aJ5Kok?=LO`*lIL`PIIPk1-NS$`m|F-tvZFQYBjb z^Sg68a+7h$v-kOu!!O3@>$_ph3d1zRbt4~@$)g4;d$-VXdK0{54CsAwcb*#llyI@F z=)y@;QskUBLGi{xt4jyv{4Z*>P-V|GA3fsfKVp>j?nR)CrQ?#qt2+x_-OH$AuTrH; zQbBJUj`5vVJ*jrB<|xDa#yQUJuE0q4tax^w-uIuW&!}wWe0h4jjsQ!IDtW&(K=_mG zMA6OU&(pBi6J6$))IZn6zr<459CC08f8Dm7#yK?{`#s_Q!LMIwS>If5-9){cKv6XI zFCJNa!@8^B9R96Bh{Y)!3u)*WIPGHE@>M|W?R$VxE9gI(5#H<8^Y$QFb73m7kH zr0dfHXB_s0S?6L_XeZ>~kZJ?&tW~9)Ro7s)Swg*ziqSc44zIq{*IXUnsTBG0dTD25 zx7KjGZ1lsV*qRi{$Y3>z2~@LubNGDedtTnvvd)f5+E&EiF&6PpPa}x@<1O4nww!IY zoD*i{e*9O01B?rrN`wZ+W7ia0d?l$Rk8AXV9+|j!Jz;D$WxLCAru&m?LFIaKR|?8^ zyB+X7yyu98(`{MaZrYAd*9KUP>L@uPZ(|<$zTEr7RO9A$f4tUFD>u)CZ)Q3@ksekW zX9SZjmI%n_7Ibv(fxSx1;q?)tvYpAkM)KKp(EIX*PTaZJYK*qy+Xk(Qtowo1GL_#M zjU8_Vigf$rKP{n~uW-`s=^>%#%Ul!lJmpk#DLc;gB?b&31v@yf6Q=>|XSZmLQsKgZM<1?SiERMqdhB;N^^i8>q6qWHx(!c-hYOn~Ecup& zD^x}k=dwGET9&t0a&${x*X(AkmV{9ovMZQnAvgt|b}voK7JKPj3_C63q8d}=6*W;Ep|kiH_k5Z?Q@n-3oy{DwNxixa`Yk6*e>s7InC2aJ7V~|*WaWp_-Q~*qB^o8lxL89F}Q0{DGao+!$?tK3js(-g(dEpoEyl zo(q+QwykZv79T&bn7lcDdJ{!#wrltc^oGITG(d(naj0IyZsD1vGTNl)O)xqN^A~HV zdZI2fH9y$e<>V(cw((7Qqu)gOuFZv>_h7eIh%^bi9iFEkV0)ddV;#S zqDhdEDND!L>9W>`{LgbG7Q(f~qWBGBGj}FS5Bh-> zB;QiNFJ<4UW--L`@mikt!oIrE{Md$*1r#XyO=hzR%*6oo^xr!UBZ3D2N^ifxLg1ku z!}Eh+$MN+pQ^_C)l{d*KRnEf1|KaIKlUuW-)thNUY|<0Qh>^|o9i?n@Mp0VD^``vK z=Cjb(t8q@$?M8+7CkiIKJHq!h6<%g&Tdk_N{<@tfk=Kkn(-l@xJ=C8y>q*@ zz2agFl0*H&hO`(7!4m$4d`plKxN9!8RRaAc$ZU!9p|9-}19m?D>(nzwHY_ndrIq&T5Fsi*%g{8STJ1@*H;)h1q#h+g~s<<()jjdnqmlKT0{7zt5eMmibj>)8X4;oH{#PQZgpvtd+ui_|=l?!viFd#|lfLt8 zUpb>=8Ozr=(dW}GFHzi1)&0b$+h(n=D`Y8mlvy39fmGti%MFJ-#l18`rbJEB0zWQ(e@|f!Cf7T_f2eC&h>jJ z;h(*FQ6D}2CGPsv)aqBNBQ}=Ne;PP$lbrzSZ;s{lleC#t!STUz&ix(55tzm)`MXzY zo#*dfRi*U{`#J&ll9%uC7Lp=4OvfdNpW8j&SP8d()hn~sQKQ(5QXa{dG|TXB8|-BB zy^lswR%vt$%I7z-v{})yIb`rPG{Oa}6IV??n4ewhu9h1~ti9=YL?OR1W|pp{oZ?XE zx7xNa-TSMDpG;(k3EQ)uUJRQNFv82_ZM$VM>vQEy<;Xd4FV3Z}y{hl#Xl@D0zu08j zn*e##-y8`mB;I3gLeiw>y<3_+CkagjYeyRKw6McJGi zZ+R?i;=01oq9iQco@K*aSn!xfGdajkhRcfg=B|WS71N(t>rxy;dRkR#g`Go#%Zy!~ z7kRTc^WbP^L}zj?)o;6fvBYTrn=*o}O!2K@x*ObW+`o3YwEp2L&7EUX-#3}nCiqqQ zFLoyiadOO|NU|<9olgkSszis4o-7o-8T)NXXt<@<#v+-ry0K$IvE+$Qw~*ZNZRhwW z4(+nIb?w39Y{uB^xVua*M)dm`iez>)Zo<3 zzqjlA-LAM^Q{H=4Q8!chvlflc7W2P0Rt(CL%Bg+c-+2@}m3~vwH?*dz9&t6LF~%|a z)IFEX8j5Ur3f2DJVnfv(ygg3~jrW#6r&c^u?ZGl)e=97bihn8wZnu|PLb){KzA0JjXXv;9#<<8Uwp)JBcy zYwzRYMe_84gEJW@co$*pXGbCjIzwXscf8-$9fS+3Z0vfyy~=usup|lzLX#x1;5ZQ_ z4c_aZ(7=&{IALfAeOr=`3ptKh3W7_GmsC61b2obT)+hw$PJEghao^0U=S}j!W)i)EEwnk 
zbms@e0y+x>?0|?a5bKOM68b%CnyiYSZ_9qE`Zr~fCHp^U28i9w*2x#h z=C5?5>`Rsz0S!)dpb;PeN}@4fHBhzn(pnb<5{5AQSp;F`AOPOQUfILRjR+V=7?X(Z zrcgoOHh?@MI0^$a@GB&qJU?hUmT){0`Trxq{$KmS0MTNB6=Ly7xC}-Hj>q6(1mN1F zWsq>RG!Bj-Ajt^__=N65^)bK%$T=`EQ2+myU_b9;KwT(7ef&%~3Gyw+#3696%;W!-V1Uy#Kz>KA300Y3}$T@T$0}=@Emj>;j zJ_#}ekgwCAoIDwfGz`iiCZI_{36H{%+v32)*ZTl=*SqM-LjZan@BtXCe5GV{969?Lm zh4Pa*C@0$wG!hoNKtdvd*oLMCiU@jm4Wlh-+I7GgGwMg6Jj5-gMYCK5y{2`dV*U0(+k8D5(y*U`_TabPk{eS zkRhzw0AvUMPGngFyMS!sw?wuF$R>U!vaS5E3CJFPPY_?^YtRQ|8$S~?b376UbO?Bb zY+#c_kO4Y@>;kM?zo(xz@iT{n0`v|e=TP~#MAr5CeW=bq39<`Ft5E(cr1f`PX$uD$ zBYUBLmTz*{C2&i2aG*o-Eqm}C%6iEG4)jT;5#eZPqY9kI`htaCUs8Xp+~^;!4O;1e z1lOC87k_eG0_fcdn8Z5S8S*LQo?L+OuG8G$U>4+Y+`)sgUh)9;@S_B|3Gl3wD|o_z z@YhQal{^*-)mN0pfaz?1ey-=K2+IOjz=uz)6u^9rnk- zz|s8}b={u=$wC!~R;Yq3fUCM+4aW zdRM>h|9(=WjQZV6|MO83LRA@bg0Qy*$2^dyQ%3)f)qqRkD!WBuBiB% zclqt4=ogESgnmrn7ZDQH&w1%Xs|>gXD$xg1{2^<7G#uFdIu$^fbvgg)fZXC|59@yU z=Vx8NV3UTYk2}CiKY)?66b>#4{ri~!D*73?4PmhVh}*^(w7)_dEKu4BDi}vvNm*~B z+2d1_l*w&nwcl#X&3W3TO>pAx>iOR@N%(6fUp>Yd^xz(24z|ChOorz)6~@{)dV!;T ziY+(mHf8iT&Hebc?tdXoY1H2{3OMblL~f;^v0fLe$q8mQzDEyUQflXYNUrhPgGWVI z)XJ=C1(A9?pQ%?L+EMxzT~m`&vt@}{Y?JqQWIOr)W){Ht{r61VlN6Xk;f#?Wyn#X9 zNcn^n&z-1P)cfzHa0_)UEf?Jvn#0QAg-M=+%}rz4ich2HHa3)%KsbX^=Ev=00a^lXtoAoK$;le{+k_yz*^ zyVb?@Top$2__xmQgVjYMOR7&IP<2m2Ul3sDH1|6nV? zhv*0cfB**r$l}Kj05cqL1CH>2XpjX!Ke)#a8VZYtmeqgLFgOswvyp~Jf{AaaCk?Ie z8)iy9l3M2F9+yK_u*cKd< z0ATX({!m!VA3TFy(Vy}|5s<*`K)+uyg*x0w!=M1L`gc7f8V%mGZlvMM4m#<1N!~LGZKphZ}K%q4=YP|1C}`)jfd+xc zjWlWeM!6ty1OUJ`)B}5mKiUG!`e!{H0H7P&68@a8H2P0jNn;_*1O0x<0ASTW#+3n2 z+{St`xIgS3DMR?f){ww7{*W^P`-feDT3{2iffs`GA3URwD6qBvcRdsm+T3iU{V8V@ z5(3N{>Y-47%nb!L62I92$;Z~kjp#%Es@&KmhzQmfIKs%&6Iv3<=tJAX(GyNy63OQR Yz9d^85_u^?VP){(<8)#50~)aZ1EOsslmGw# literal 0 HcmV?d00001 diff --git a/tests/triton_tests/plot2.png b/tests/triton_tests/plot2.png new file mode 100644 index 0000000000000000000000000000000000000000..94659c0a41e63112a0a69da5cfc3aa655e9219e9 GIT binary patch literal 51996 zcmc$`2T;{Z_bo_JR6r3$FrX+%5|F5Xl2i~0f`ST2l#HTC&OtFCAX$)Xbno7K?X}kKduP-XH`4B;rJ$hL zsB}tRgMxxeje>&m+bzzBEjijlxAt52qrsQ&?mv zr?=6uZ;^lD$3w+lKZf@@fOOuS^)huP6Q zQHvU3Kfe&grKKg8<>{s!(r#iD_*w5)@vh#VAAQfI>BTS$Tr{$FK43D#3SodBQG#jr6VF+xF0yRYj&K z2JcyW;MspY@}=pO zPwnC1tH$La`{E>QOW2E)9~|(QX|c$@*0FHowkDPfZ^WbAd*%5xe0H`?*Pf$CkG^*v zQ~UAaJf(!&6i)z)AQvz1y|-7IzP&jAl8WWsBcT{oz4s0qSXfv(WK{kwPH7PX<31Ug zJvKHrfnVH@vX)16GqJK(mjf^_cb8~ZVt~90PA6_nhho8vE z$lMmV^Pg|RUNS#9z^j`xEIov$8U6d~Q~qqHyG{g`Po`Pbw<2%vr)u%BiBH*f96Wf? 
diff --git a/tests/triton_tests/plot2.py b/tests/triton_tests/plot2.py
new file mode 100644
index 0000000..d433548
--- /dev/null
+++ b/tests/triton_tests/plot2.py
@@ -0,0 +1,69 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+import os
+
+import matplotlib.gridspec as gridspec
+
+cmap=plt.get_cmap('cool')
+
+if __name__ == '__main__':
+
+    fig = plt.figure(tight_layout=True, figsize=(6,3.5))
+    gs = gridspec.GridSpec(1, 1)
+
+
+    rdf = pd.read_json('tests/triton_tests/info.jsonl', lines=True)
+
+    ax = fig.add_subplot(gs[0, 0])
+
+    # plot the % of time spent in quantize ops for different batch sizes
+    for j, batch_size in enumerate([2**14, 2**15, 2**16, 2**17]):
+        all_xs, all_ys = [], []
+        for k, marker, ls, color, name in [
+            ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'),
+            ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose', 'o', '-', 'C4', 'SwitchBack int8 (total time)'),
+        ]:
+
+            xs, ys = [], []
+            df = rdf[rdf.batch_size == batch_size]
+            for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]:
+                df_ = df[df.dim_in == embed_dim]
+                df_ = df_[df_.dim_out == embed_dim * 4]
+                xs.append(embed_dim)
+                y_ = 0
+                for k_ in k.split('+'):
+                    y_ += df_[k_].values[0]
+                df_ = df[df.dim_in == embed_dim * 4]
+                df_ = df_[df_.dim_out == embed_dim]
+                for k_ in k.split('+'):
+                    y_ += df_[k_].values[0]
+                ys.append(y_ * 0.5)
+            all_xs.append(xs)
+            all_ys.append(ys)
+
+        color = cmap(j * 0.25)
+        real_ys = [100 * all_ys[1][i] / all_ys[0][i] for i in range(len(all_ys[0]))]
+        markers = ['^', 'v', 'P', 'o']
+        ax.plot(all_xs[0], real_ys, color=color, label=f'batch * sequence length = {batch_size}', marker=markers[j], markersize=5)
+
+    ax.legend()
+    ax.set_xlabel('dim', fontsize=13)
+    ax.set_xscale('log')
+    ax.grid()
+    ax.set_ylabel(r'% time occupied by quantize ops', fontsize=12)
+
+
+    ax.tick_params(axis='x', labelsize=11)
+    ax.tick_params(axis='y', labelsize=11)
+
+    ax.set_xticks([1024, 2048, 4096])
+    ax.set_xticklabels([1024, 2048, 4096])
+    ax.set_xticks([], minor=True)
+
+    #ax.set_title(' Linear layer summary, varying dimensions', fontsize=10, loc='left', y=1.05, pad=-20)
+
+
+
+    plt.savefig('tests/triton_tests/plot2.pdf', bbox_inches='tight')
+
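[Note: plot2.py above expects tests/triton_tests/info.jsonl to hold one JSON row per (batch_size, dim_in, dim_out) with per-op timings under the keys named in the loop. A minimal sketch of the reduction it performs for one data point follows; only the key names come from the script, while the variable names and timing numbers are invented for illustration.]

# Sketch (not part of the patch): how plot2.py turns per-op timings into
# "% time occupied by quantize ops" for one (batch_size, dim) point.
quant_ops = ['x_quantize_rowwise', 'g_quantize_rowwise',
             'w_quantize_global', 'w_quantize_global_transpose']
matmul_ops = ['standard_gw', 'global_fwd', 'global_bwd']

# Hypothetical timings (arbitrary units) for the two linears the script
# pairs up: dim -> 4*dim and 4*dim -> dim.
up_proj = {'x_quantize_rowwise': 0.02, 'g_quantize_rowwise': 0.02,
           'w_quantize_global': 0.01, 'w_quantize_global_transpose': 0.01,
           'standard_gw': 0.30, 'global_fwd': 0.28, 'global_bwd': 0.29}
down_proj = dict(up_proj)

# Same reduction as the script: sum the ops over both layers, average (* 0.5),
# then report the quantize-only sum as a percentage of the total.
total = 0.5 * sum(r[k] for r in (up_proj, down_proj) for k in quant_ops + matmul_ops)
quant = 0.5 * sum(r[k] for r in (up_proj, down_proj) for k in quant_ops)
print(f'{100 * quant / total:.1f}% of time in quantize ops')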
diff --git a/tests/triton_tests/plot3.pdf b/tests/triton_tests/plot3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..19e93a24eb4a38dcc82cce0729c3e8995a096054
GIT binary patch
literal 20122
[... binary patch data omitted ...]
zl(?^|%}rVu3M!P`#&HqGw)Bry_Lb+`e?gH4HEIcdnH#5ba+h}`>Vt}n-^WSG;ctiC zU_JO~scs0pFYYLZ`lqWR4d;h39sN_R1syz)HercM)C9skULZt6vM6KNxmVt8NxXd1 zu@$)RG3U_}y@2J%{r_ngms<)nLKzf#Xy*|^2%5kPgu-9c9*8E%R zjKk?P#~-dL-lD$$s()anCD+1N*5I=5B7+Bc5uzj|zvUYXc4_jzBnNG{j}}$H zWKo3Utdm_(5;d;CQKZQ|@=f(ML_|d-=LcJb{O0XB(pk=$X#zWL+lhZ-^G6OD6EcD(&MEwowe8)_g}C6TT^>VP$oHgtl0 zc(`3WN}c;qn@$Av`ds>GWzUf zBFpSUj~`;m#F@)R5CJ#${W zA&s@)P`LqDQ=QX<1R3@eKO^?IB}wLD`GtAY@^$0s1F>h>r1V}s_r0$x&_aLY9(lbg zfA>Dv+pzPLuWk_CY;>i1z;lz-V(}O`hE2?kJpRJDkG`ydr+KK#rs!IgMz_qGa5NF) zQB!v85_a52gt;;AG@CG^_im^AY&$Zdl}vJ8N@-;9m;}EyKU#5{n%8l(dz7A~XI>3{ zb>wd|U zi=QpNEQOX*9ZB}OaK_Hj$9Z(b$t!tiIOu)iC~GRI0vWZE@J%q1(oaKI=id14Oy8)kEY0fj(Eis}P2YF7lFt}+(^A<)I z2Mn!pj+ltO-(+VyL~u~|QI0Ff&~C1CD`${M`d}0=7m#=3$p_S6(VSoI$Kb-JIcit9 zR_pnMn{wWqRG{EtsUPg(x-Xo=`EpjEU&tJV!JLDumz3{g$jeiNOerQo1cMDvNQWkNS=rjrT2K1v zv9z5aKgpJ6wlAkK)!o#`trA)JWM3Eaf$l$%K!L&Z?}`Wi3!z4Jfr=-$sM1LJR*U$n zwhmp#)==<8pGG=^$CXqp)Q7TP1tpR=`L2&3T!(&0yB&+Ya=Jvqw=S;(o@Lue1P>fM#XogbDt)rZFTDT8;%Zb}`{a_@H! zjosukakDq;Pnlw66)9l5XQeVQXLi>?g!)6Ij92ICQ=g(`IQ`->$*7F9)1J_f~;$?rTwR z+My#i4L;K!ukFv^eu@#L5+j_;wh}~J-2HmxiSK3va_qc{k%MM|?;%NJd7YUV3Z*b! zi)7a6aT%)jr<-i`LleCeNMsg1m3R75EeyXo|M)&d{`Fo7LF% zCbM7I%l3UKImW+o@7C+Hsg?1M>j?|%Cv2Nv(s8)V$sua79xo_#rdM{V(*&Nn@%-;r{6;botuOj$o}z~b zTrBLc?Iy$Y1y+4Cv!G`(A|?P;r9in?nfl{RlfsB%%qR~xvb$B9>bp0H zL+`8K!yB?0+U;}AVzAi*7ceLM8!kxHUl=6J8GIZm4|UgtAZfVH!Qm72w{KTqU#wrL zj!RLCS>ov!WaLWmPT}J#v2GvMw}e`cX{of zZO_C$U5Z8Qmuq^)2VOL(JU_=`BRWiGVVBl|?{JDRVl0jHxOZDiR|%d;9Z0BXPJp^= z=d5%@A*W_=u>v>47e5aRd*UzC&hE!X@-R<-5%TfJ24u}$SR`h5li4bRMhMO1?Vf;{a#Ay10p2>xpm zB+#A1YVPbwfv~&0Q6ErIf;WYHZ+Wu_&p*Z*2*gQEK;LR=hIQoLO6WZ9*qHzkw;muB zW~dsoWK_j<)tu6dU`s%L;Lh{py;LaqI9q(iQnj~`FXiRnncOHC+1XmpvE~Umu)4$dGJvg>BSwd!1q{M4xvL z4({PnqVSl%;Gm|mQ|k%J;b|trj3-Ms`YvedysrJ~Y%iyoFH{?bXoFwsjH#MbHezrM z>Ud~eWFVP$QPe;`v*z-_``{P@^cB=_hO=K4l~v>zqT>wG@Tx}PgKOFEuZ8&*B+YRdnck+ zzD4?s>c-I|6_K-TPFidxn+Xw5FA~Uv^3k6XKK1LA_3zgWyTI@nzj^Y@^m7NEU~#AC zpTZ^&p<0m)y{5`|)6C>?rLkc>5Begms>)u%)kBHTT~x|vFF#&Avqc!z@A0U0qs%%n zsVcKnLtCU++R}oQPLCAy$r3JMPpwr%HJTluf4kjvU z{aNh?iWnOGA6vWR67AEdV`KRfBlI3F?q5Bpsv4)RkpM<%nt9^{k8ynMZMhV=>FuFT z+8{hBRdHN=(skYabcn!1!ct#idECXmaa_)NS)=C@*YR@u`ZTYW#uv>yqUN$jp&Wgg z^h;8i_6&NZk3Qmsvs2vbC=LZxlAx%gGS$UD4#15J z3){3dM`G?=7_dlR+i&rePjU||_wbq#e-of8#I(DT0bcXsU1EN?A7r(#eVJ}{t2lUN zP{!w=EVcQWrV_(*u4aVh=!YUGnk!0d3vC-ni`r+pP=Of|>e#S?ft(G`R<^q<6VJ?x zva<|f;=ByqapMIi_%4dyZ;!92Kz$<^ADZJ?WZV`KnszcH61}@{VqNpl$EVRRH9n+f zMhhLY=RW`HT%ZyC>0xQn+eo$7k4W_2K1wW*dbN0*+f)409>nh9ri%TAqlH2~1Rk>z zG!a8E%Mr+wHTh2PSeGAGB`eB$lDegnsEpdLoIlM!)W+{R)bT(oltsQYC5%m;D~%|& zYwR@1KK@~rjy>?%!}G(T{$eKY#xwZADarb8YHjibKIUC>0?c&tKnjMy$5Q0 z_*m$_2&VRARnmd8qO2;y6y3xFNiGoHn~jO&`>Jec1ojwqFLw&_7rvcBtg;&^lu~}} zrazzG=7ePYX)8q4HkPK>66=|K=Ud=uzB~ruX{HPCOC6tjEGyo7;m=BNkTEG7z22cS zQ%YPQq1BRNA9jA$!fK*&P=dUbqt^(ZZ|RyH;i2szIyNfwgC+VNuoZDGzVo{N+_T-p zJdW9&D11cl&a}O#{t%n^n|M_FMBdln-YV1NngEr>_2bX`HnvE(1+6#sVCB!A*6wPF zAIlzKS{~+vo%Iun&TfuSnC$M_0uclpiGYfru_!1)44A4ANYu`fg75hQ6dv-gskLz= zXp68IoyU_%F)VSJTZ1s4J0Eoa;T;16BvhD}As6?+bq}8g0if}(<0-^QFp%(2%lWEh z@_peWk?ylb3S!rsEnYZiHVo=Q18NxkpT6=BoAD2+;D1fFc#qV^V)H%L3Bxc!>fv`b zXWTYH^HNo-T_WM5E!O?m{(FpE*0eqe*=!jH#F^jBXHB@}E%dI-xvZM3)|_u-botnvj+vdFau?OZ^cw3ER3lpWR|QhX4?)B~??LMx zwjPQ23l*1AuZkoEQi4W7?79s;^Z7O`;mUd2D<{`K^A4~mXbNFc;sjwH3H*9mZ^SPq zwa{J&o^-Or@(U@z;!F6~`XL`Lg-tQ7y?&NdCWlV8#b4f!61vO5HK>eDbrE8$&KPM% zZO}LpNX~|Eml{C(b4-USplG7jgEZvH686dOl7h1u5q z_Q{>5SSwl}*2%f^YJ(HqQZX?yQZX(!Mc_s7B3$zRw<~g;RB_3+%I`8iB^V@iTCMTnoja;0 z}4}9GTHt?)O5aYpkB83()f!( z!K?43kd4-5kMQsUsb1O`waT{Q4}%hkhe^V!W0qxeA+y|1`Lm*r4PVzxR}+~$n0#x- 
z_hQsOsn$1)%*nl{SxaX{G(&9kN_-rB+COSv)9BHN)eZ8yo5o^$@NEx216aNE?@u^J zQ~5q0f^v9Gu(ClrB^n`pCeQeh!qnDqY>97d2I*l^>pmX2sf6PnG<`cE@%0}T%Ha=LtfIZQ?GR?M~M=X(*Y3*fD zqxAUvDa*$h=3J*EJ`+Q=Cum;=#FpK>mnZQFznRcCnbW6t?|JEJm~e~?5m$_>q{daz z{F6hAoM{~yu^4!K)1g@ZTVblI4k+E+t4jXLLA`~Nh!!Z-`6qB){*eMrPxZ6I^S~M#M$$;X5zx=1)PEPiqyMDOFeySh&wI? zC$dF-X6w8Tg~(lzFRDDh|6{{Eb6?MeAjXtPMy`R68>H7{$(i4VjMv~H3Bmav$-VhM zn@!}E$8O9(CMSCIvK2NeBj2KlOwU=``A@d9kl7~qqqm}(bT&6B=-(H&?IGShJS!yn zFD7tRjhcHv@^w}!hy@4aUh6K1dV9B8ZDVuNNb}|8m`-&xYgJoznq4}rcrr?q&XHZ& zp;}T*Rx9M1f4+U#hlGNZylyRF)&Au-dPP$Q$9#3fo{p}B8WYEA`0JQmrHzrOH2w0@ z{)mIdeqKbdI?B(k3F2PKfFNaycwlodL5N!IS)b6kudv%zFN8|G;px1`@2A?{HmTmZp7>hnmJXmAug9dE|9yDF)V}%iY#Qc^#yp-)Tm9xSTvcRy?{Filoror87kRe?AGL4GA z*z6vJ?%`9SP=65;iD)XjwL+5{7ReU>#e$FA-0Owi9c2`4u$NwR z$38y|wBi|WWqW1D+-}A^p8s7Tz+DdBWD<8FM!MBk=LoV$;r<8#pqdd(VQrl2ZRulw7hgJ%`XSv;{P z&l@^8M}5;h-5=y+sCbPp=2kwhWJXR$_Yugu=rnFOAri9>sYL`g4jtFqH@h)6!XKhe zSX+LaM|?jm9hI_H(JK%@DPAww}r`I<=mW za{aV?T=MLtQrVT@hB!%SB}9kc95&1`i|oiWcP@W-3zK+BJ8`Xb>NA}g(&h`@irVFlRp82+f?CF%bmH)!zIHtHMbY3IgsM%#9smF_Qz;=s+ zrE|=JInGb%b}0jm9j+m?;ifUJ)p(f^Pumb`k)A3{ZDD)lwog=8L;uaPT(h=zZvE9| zI_>utL-r8F&mQ}amv(>Q81Ce?{hFENiqQMGrbsmR=y@;y8gTCa-M@}jrX}S8!m6~a zsD}1K)2_((LGIi0e(6q(7E(dlSF8Lpr9|yX(wPtOuvL6~n4c7-peL>Eqj{FDV=Sbw zt&3+QLwL#mlu$5_diZtrSP?6yhknCYn)~+?wQ{3M!n5JgpUWMXxlOVRcs=5d2eb&m zM#ihT9_K!gP1P_!4?DcC(%) z|0NTND?u|sLSNgnDu@-bI5w=OeZrhEgkwPl7qsdu5o+Te1MT|=~YVA`?g zBJH|lnjEij^VilgFNoYH4}Fx9bPd9FHjL%6%*PC+$V;COYU@G=<~dKfM1-lw`gvY` zb&?wIE9|LpXa#i?!8r4ZLt#YbDu3K{d}}pfW%NsbX~x!z2(fx`q4x&*dKOQlIp!1E zgpu!kbXN$=&)Wm`qZhL_r=L+Xpsga~+9>Y5!Ut!iXq@$aC>KxDgJ|bjroGwBMs<2a zlBqtLpOGu!@hB}#<`om}?1(h%8{(;~TO-O>nYsFYyZGaS zx}l*%T=RZ=cmjao`tyYd<}Y4|h-jR0Baxx3={vR(XnlLsBKcUz#P$$nCj~i8_z8mk zyNrliP~mg6ld%Q1&7&xU|0r3=gj&`F-4yEk%rNo5!!iWDQUZcJZuVbO$Ooya?gId>69lRj0PegB+p;iriNZX;Jw z0ajQc=gKOUgjK_?@Y9Kt4d0yTR^9K%k9sRl_p%I*`}TBvYSK1sy)oo;0{iUUXl>~D z*NEcj>Ge$#6H|lGKNTEU(C`Nj5r2~pR*gADD+h$bYiUiacLGrj(~|Xft8EwS@5xbk zT-}_2x=2d)yB-!LSk6R59lOCf-cSa$csGFC?x>V*LCCz$646ceY=7C!;NFBp5LYO6 zyp+sppldgzWUx#=RM!CIHHuz8yQ~+#()Uo}b#!&PwTV<#L)aW;YYEXgzi-v;SCyOA zjRz-^h4?KPLq1*oz^jRs$UN{sd(Q3dwX)YYj=3^))xvA=k+ zfqH<}1hU{*U~PxBA|&s~)K6C6Uw zO`P?m{x;caep^4^LM^+IJXgjRE)3ZR!4GNKwFjzh<=ebK$Gt|E#kp6~)cf{n4Q<~p zsO?%Od*mg$wFgyuc$`SYUvL_ufu{x1EpuzkTPF@jz`W079qrkw?}->V%iU-gT%N#_ z@<}uP4$q{Pv`>m?TJ^|K_j&YmQn`q`Uu8ut>|R_$n007CquupNqEtyDxuJnO8gi`M z{ZDhXj+Ts&N?&{EMEBa_!PVpnp6M{C`B64unVx(FMi$3DfgDUdka&2LJpcN!R&HC; z!K>s%XK|cEGeb*>QfH0~5SWoq3ou;=sTdV_CD)_!o-xkqqn{udp0?2!o+qi5_y$+o zw+~n{N^E)SbVAugi*xBp8Tq-5EuvdW+DU)%2l=z3x|=%N**yf4HFLN6UQ^Z3kUx7& z-s-&BSr08UCwI{LxVaPLY|Y$&hZm+|_Pvh8V1Tq4uqj90%H6`v&V}IY21SDW*M$C) zh6Bd{tg)JzBd{4r_V=!L@Bf!19XN!l8NtnNXP1yD91hMQ!0$gbBM1ycP95011ZfQ?DuP9y^;xFW&b#mvGAiURV$E>^$>C1A@E zxX8o8+0oGqiUvFtA1gO!D9qUjpn~89)Cjh2RzTev*v6y^Yh0a;8ZU}OL>D9jZK1BL@6LV&_NpfFD$kqHcQ0a!Q)>|%nQv$G`F0vrIJ z?4S3F?Fi+^oBV%-{*%~t#P~nR2oS%6nTlz0K>^uKG@y@Yz&F65a8N7?3&8_9J~2STNMM3T;o&<41K2iv0n&Dc2t z$_BLm-xbJ@>nNZtSinAh6b!Hy0|5safI|ZX=m?L4iUBL(uz)^efWgJUf=Tc}1IJw$c-$cdB?bYHAQB)^ zV8J3#JNIG$!gu=s+5+S5)PW!vDTdoA7%^Z+A5g~w_8tqsu?QqU;O@9+U|bBK%P2f} z9~z1QjyQl>;Blt_6Boz~kO|-s9Dhe>AOis$0ER-qF)_eBJ52e39C+N(A8;fzcmfAm z1fmU+1|EU3BM)%6-2%Y?B<|b^^7o$#fZ%QyfC_@QVL+?^%)#41HTWT}A4dRNASe1$ z4`?x<5wHpvL?6_HUr+^sJIVy~1m5+ZUO=SaaLCTJ?;YTQ-H|^Es0h0{0P4Z7Wrx>* zUVu9BQ`yl6P$zzr9bNgZ6QDl)TtK{b&VfEa-S|;J%CT?^AVUCGPzUxXcpM-TP%i)* z{B!xC6F-h1QvkU`c8*~Er?MmK-RoeR{}fO!K&gVqe^yTa9P-Nm`nEF{0+a8~dS$>e zsQ@(Z!c?IFmcb}X1Vple_a0?f|9pgXW#a{vsvb7l#MaJRN&ICfyW z^Bcgh-LswfAIPBtXOJB$0SI{a(vIN(+^wBfz;wo%^oyBJ2eL=`um8E 
zz(fPWAb>>xgtc?k849d1+N}Y`Y3Ka{K#>ce%Dd1X<_vf^c3QbYf!(w_eYim}-$!r< zWVCzryA9iExoZ-E31jDs00mxF?$&lpALct%pf%iyjDS%+0J-h9@q}W(k9Pr((JpMq zgaR+%!A}0!tqWfKVVe{Y&>aWm=j&kmAJ=6NKU?$vY;A_g$pDRD7G_Wc*fC57`9C%T ztmQw=Q2%2yxF}%qfrj8wTKXq!e{!GxA>k*L`v>@Q#zPgHgLdeKsR9rEcdn}7w zUxnM=R`5Dz`!M7dli7|`L9j|O8xOWr25jkCm2oma})xFK*Kd3Pd>qDz||iUQGIAqaIaLqa0F6K zZYp!Qlv-3&Ky+US8J*!CnK96N`adQFh{Ay1KaLPq?BD&MA1DJjKShy~kQp%s8H!pO z(=yQYcD_Z*jPtmTsW_NJNjzMaVxG>XW_*ls9BloNwTrxbr9jLc$p3-+?pK3g%667u zh6{+)K=*IwBRU>HLT1NL{=O*{NKgGkxE$~y959b}6iB$aI5-m=?98E_C{Zy{gfP^W zKyYyvhrt~Gb4k?M%|;Lc0}^kR9u~lNb$-6f#nKvTZf0Q*jPkt)K_CSLc5mnGBoBU( z=%~Cn5{^W{QAj)-iNRvT^aX$VijTLOl{Ewi9I+4}2LJvCq%bf*^kohG4-B-`;2+fK zI}CxtfW-a^!(hRz;O{Unp7{-i1A6)c2CDe)?Z9~L7Z`8?GV(VV3XIZzg`qG&*!>#} zj)d>g4oH*y+74Jc0p>n`=??*r_6Kdi2b2D6Cx-u{ofsU9*M98}i0=PvhXFPp|K1J< zq?3Mw;lOm{?=Tb?mwm@hW!D2EMGFoORLh6j?-zrk>DFzWxc9q_fT zKlqHp!2fKA#r>`ua2y^Cb${&-zsIwHM}awn-`Zh;^yaTHz@UH`jbGXUDRm&<_d5(= z*6%O`;t!bv+OkJG%pdxP0M-=$PA>w91X5PNjf=#98KYnE12SNL;DW@1nWk9r&Mi1UEA~2P?OoZ^&uc`B(vSClsdX>+(S=0@0= SW)V0sBoe~UuW&{Y^1lFs0*>PV literal 0 HcmV?d00001 diff --git a/tests/triton_tests/plot3.png b/tests/triton_tests/plot3.png new file mode 100644 index 0000000000000000000000000000000000000000..e83178d7a65f7f2c78c9b9ad369b13a6c1a3a917 GIT binary patch literal 58335 zcmb4rby$?`w=SK6G}57fBHb;GAT=;_N;gP%iHdZ0=P)24(v5_4i!_qb2qF#VneW?s z|MoxUT-Wis6c~q@XP$RGYu)Q!_x*lUS5?5np~OK#Lc&vgDXW2mgjRrrge;DQ0Ui-@ z7;FJwgx%$I-8G%9+`V4AS|X{uc6V`bc6YEfqxZCQb+dJL;^h$J;AW$@ad&rd6XE1^ z{GTUqIJ;VNF5qB&055{=@>0(Y3F+Z$#9w6Ci&9%8q;+LQSt+P@*8Y;0zvkIh=kGDa zjC%FvykY$c2aX7OR@SG`_iUVNrm)mV62tR|2$WEIlmfZ;;ka_0zcr_jx zq-f9R)#LLk@bmaoGEcA9{U(_mC*zqN=f#{&wv)4N9m|AS#(dhxbf!Fv6tEb=2&Di0 z=LN5p2kQU)#31VuCzlphE?vRmS7(I$AM=IwXn2>#gmK@Ef2Ra0u-ZJ;eabXl?=Wv zi-c{onP@|uFoKYmmFzQ_h6dofKJwY*JUm$%Vb?J52k!KqK$_%qJ2%L6*=JkJgI88~ zz(Wp;@Y}|`hWJ1o_zuSEYJ@#62?g(v7w7|lKrH9`t!itmMksyH52Jg{ep{c05zJ%> z=925_={akaQG3M6jiuuQN6sJT=Jew zo9Ds2teo7yhh~BTS<=haxasTNf&ckc(%XtS`&VbsW&2l$ABp4QrM}EWFo|a$r#+Hh zU%OdSW{~CA(pu3A6(G=Bih9Hy8Wr_G;^z2kuviX(bog$m-}#~Yuknz}(`_9O%?%Nj zFapvfzcn<`{#b>EIUBT_b|;#QUg?(7+!WI!0>(VFfZlGu!zN6XsmANWmdH=nV?lRs zOKial7nsh~j!6uu2rg~>_Wt$U*?0^5WmXltJRI5`)j(4lKA-e zpPz5uK9n%+i=vQtk;~gOip?Oq`AyUfW68=*uVletBU@Ngjva*2d8h;ddR$yw=V!|y zA+JNjuFqLn)XRa_ju`@u`iJW(xf0oh3b7}rV=vt1EK|8Hz7e$F{#_tCzdRY@6!!h| z6l{v+{1@BdEa60^S1C!?i!-Cfri@&}-b)5`BZSgeCVH*j+~-SPbSwU+cnUGpWxHFA zS`AK1O663nYc~dMo1%+ave6-CBoRqWm7^Ti`W-U=o;@O*A-?QBb807Kji+g{kmIHxi z_nmIF7q=U4{K{tr)40uT#`BPW{`~2FFrQ2pc+q8&SP*4ya6}7QjUS+k? 
z9^e#v{`KJt^^U**tC0+?!`xZ>S0MtO4y#7%b$G}6oBUE(3^zZA(|OrU+Oxpb{Q$1Q z>AZavX@%cG-E5kGBjrN9`VCGr><0B{rsH{1HW~6}W@cRybOC<|Qw?3Q!6NwV zzem|lmBgo}>Qoy~@fio4X<^_|${x(u+0K1tq6@m=&f`e{e~`kXqdf-x94xnqJ`_8a z?0PdfHANxeFH}=gGhJzv*&cBH*~|H-;PR}oUvyg9qxOq6-1&OPSJ@&SzotqbD#X&V zpLr1_A-1`z(S&D&cf!J)Sou`hwc`oSMvjn_r#|JJt?dT2#J*={*4GcX^#kH>3K9&S zh7t5C(hqXEZoEw4Fw*DhuoxgZY+R4Qp%Z-a1TC^x+R4c&{rvKGKiyoj$8)fae;Y?G zey{u7-=6#2p3PqFl;+w@6rj%YbC` z-k;@F?FBkj#)#;_CT32$|L~{J*p$Q+^CYsB8MQ-4l{bu_n{GMa+z5o2%-;7;O(6F4 zK3gbC34URad4D`E^KhclNHm64e6Z^+=FWVbz0}tJWbrEym^M=5mS5lFJ)o1wx<2a2 zT?xF7PBjTkM{HC&uXS^l8+gSBv|?!o3k_3MCNB^Hqna%uCoPR~)NwC?)Z%rt^=qO~ z?0Ub}X+hkq2P^RAw}Qmog?an+p{To2sYyrRkE~{23u|j$>rv`u|5KIr%PlH=TCr~+ zdJHR9)YJ{P!;ar&lPq%DUQ#kIg{4#t+$JF5*h?3yrc3W8Ju6~2#FC4|w5PPiB`0&J zb-VG#md9p*XOTP?TnxNEP|gyHe{pw#8my8jP~$Mib9dfx?-MY*xM;XFn*GYkiWz(m z$Bzrf9y3~L%FM2AZZ&fbi$2@g;PWmX)LW0`Y!;^)uhq>O7do%BBeCg!39qxC{p|hQ ztS_3XAD6RL(%qeJD24qwEN}&`Rc_X=-{i{l^~1w$aD6Pl#W5J)S8Ud!9YZZlP{~l$ zw2a-tj?3Ci^(_jfwp7yu(oOlsKf|e*Sf|!{BN3Mb84+#i>FNI9>boB;XY1G5K}uC= zq@|^8reTBvnbFnNT)VSX4<(hAA4F5~vkHCxVmsx%5wCCoJ{F#2tiyl({SR;= zcXoEZILsw*oA)&g=;0m@a_Jce|l_}upIuJZKHparj@?n49!EUw^PxAWj0q%DNSy=*7(q8ZZ z2mN!5T09Afh}IH{p;gA>AoCfyI3mNqgHi-Id2iPHEJf7DjkaZ7#vxq=lPF3t=PXeH z{vL;XbTqHfk_(U8BMW8fnpA1JDf#FtUVrT#NPGA(0zQ7cFOBu{BU(XL6t!@|J&1p# z*5htmV>;`_#y5!KEaI^{Uj6zjPe3rslPBW^a=1t!^9_LwPUEq}iG6V=1k&uLlguam zI;Eu&pVO`Ba-CREp~gFc?uneY0{SqDknELy$YTzgJT?x8Z%=PRdGDTMH8P_bg^+1| zZ_eX$UKS%FCQcXi;yBxzwFap# z!U0qX8ww9+@Yp5TwPbupkgS3NQAM08Cnt`Kk~d2pvpxY?jg5_=nU2osBUUFTC*MX! z*gfAaxB9qA%gD@qu_XqeP9m>a2|%KU=yKKz4bEeGRUI)SR;S+UH00#{h$?-4xI`i5 z&2`j%T^Zy&*-*$M9mruRN^?P4jq>`=K88S|;n9vOrK!Z_265{Ccf{T$vZ(QZtu9ec z4;`>D1Lyw%jYtv&uN6{UQc};~gN1a70FiQ?DvopERCa^D#U{6%rIv=F6Dp_15U*vQ z4=qRSnd9T?C#R7?wO@4EdQ(hX2blJk+ghY&lfmOi4cqQE3-#bqOP@89H3xqf&$DeDw?@r;h}J{a(|b)o#Ph+KqAkZHQ9o~!nsx-UQi2# zKY?cpX}`t5y8zhdrIM1_kLuTK2K7;K1a$X$Kh#aybGeph<(8m&kh#Z{yH-jvq$_;L zxyY!&$&xfH0t#5U)rAo~PUwMrCC>32rO16oucE6!tEelQAbgUJ4}4n&Hw+r{<5nk>qXw9ww%xeG5Uwyx2@Nu>s4F;y&)_ zk^TP3Kp~aG2n!eYv4B8IKdoOBGAhRS7h6rI_Dd^eW@3<{%==?##D4b>TiQ)sZs*39 z*jDJ*MFienkxrMne*bLw3Im_|8>r(xmz${{0mzG2pkWJ z=7X|~vAjkkiLw~yr!ktH;KNgyopv4g zjAxhs=FJzstDVq_GytBox?HWBRo9lhh-v8_HiC-w^=nzc-Q~0Z%H8OGQxkw&5CRYf z+InhPk_~g;g#CZqz6`M9ef{PQ2Qpl}|N82To8qBD*!kiN=9Htr$!J!*WD<;QL$w0S zORz^S08v@@e@s~Fxw`UHwO_$!w-4rOS@KX!qp1W-02=9)mvv1^NC;hW+BUhrA}>|T z-I{#Gxf#jXPA?!prXErQeklSZa0bXvu+I`;abu{YOu++EU z++k!!UujsM|2Z#^dkYf<7g>R3xf~*>VGE<0vM_z&^7xp^g}s_GZcG!B>~2a|DOH*q zT;WPB)2xmb1(rfYN2dlFJ1PMOq&yIb91C9@7g7Lf!KW5dcQCRA?{ipxy#zX+o+-5# zBj9*9?UZC*tiF8+;*at1H&jbc;u{Epk7^cSU7NMk0b%)vZudPD9 z_5$uL(Gj9BqF@~PWHN88*w|7}Gqa``dmKEbRz>3O?B;aQUze8~Ibu5!FFp1jHIHa> zja79tA6D4v?tVYhOq({W^wJuBR?e73KSYH4XnNJFDyWo4Bfs+QM`USZsp0peBB zF36TzPn|bd3`#bA+Nmz)W#n3q{_8{YoWooAv(13+bOOs#R7~|QW`)%qmc>{@5`1?g z4s?wVGgd*7p*+_q0TQ6CC4QVBOAKI?zd{q#NbvZ8{Gh=@oCq=%-vTL%Fu``OB30Lxt+3His2v6`Nu zjU4*^o~0pe*+ZcqF)*QADk}c~eUm3Rp5OkP5|a<9t-aECefs%C(!t&1V4rKf_~c|c zL&GdX19;iVB9%pHalDA7Qq88L)$mg<`ay%{r5N_dgl(|Yci;wJB9#C zTL?4{m-UfWdk-3eQVD9%Dy928PEpI8z=OuL?!2jDNrP!kPwUBdjHV#u!$H%4C@TPM zvii4{=J-Z{CXCde9Yj=U{QjUhDDtFEi3A~xe7s3l3wu*1w<)r7kt;=ZF|asIPfe$6 z!-@%Q*TU&?{=QuiI!7i!W^((aNjxYd+ZU-)qffiF`6A18e~n8+Vsv1+ESJxJgfElOCnk?%-HRy0Ef_6PMGo735HWgSRBF8MfZ&g{A9H?-<^z;9kj#-mp@sg)E zdWxbhFOLy${kPb-&A0Xb?m8_hK9G8;m4}zth^Jt3N{jEa(^4~StT2cTa3Br1j=qKO zf&y$I`*Af9Z(bhn;5puu2=RRVBM`0^FP%gXZjEyyI3b@n>l=4L2c zkxeDGx&?(bvI?4|mcJR__*jo6`0YOuR+XEg>0&kT6}bctv;}os6W(=qFJi#DDFu2o z8l2wAUzG3rD*FaX)JHvV*xA`ADK!`Mh^WQHC$UGB6BYOe=;OEl#;D8SV|XG$`3g$H 
z0D?WYaZrh{W;Ddwhz66be;jW-Ct!Yt7Ni#{Ks)`B|A72s(O&>8QUEQ<==`FIz-typt zhHGpH(+MQ^lL_Ti0}NARbiVMjV#8Yf(B~3cn^c>$4DA(8X|4R$T+)-KwIE)rVR}x^ zsvjqoLn+*#Ev60l1F3BLtDnMYGMXp#7Svi)9m6;8Oy-Fw$v=?ML_2P)+GPg?)?J-H z{1{HZ5sC{7=tA%PzPY^-=XcY%pbTG3WT`Ruly!ZoHhOz$T@z3jN1(;lX5-ZXqPAS; z;U^-@>ThEMT~=tR)Kbjb!HMkRGzL~yzv&ZGKL_5WSLBL6(5ti!n=aFQR1^`z{Dc+0 z+BRelg?5srr-mNP7)YWQ4Dt*MfKD-LMWR(scoRhpg%_fyeg<}0=BXwbG#z8&BkFgE zZfEO@ZESc`7P z#vjSxF9da@Lbs;d(xMRn_o+gKhtjC~C9lF+3kx~sV>+jFB(35diPRrP_!iJo^Yi`v zom5>}i1CcDcpjIHC;i7_8F8jEv;6vP-rMd54NXm-pYO=iK$qaP63{TzT&7vd$bxiG6}IF@5$#p|P|w_#~zHZ%-98p}d9VmbO_d!m|KvN32BQKrqF zthh;1aYE9_GXvjjs7xrZ!h%UcZNEAw$jfU>m4_*GtY)BlJU(W$91Uf4B%mRW?_^!| zTos3UbtHC*8`m2_O@bg|zuc!sJguN!BHMYIesQ#PU7O2(N0u{it>J9ZR8Ud`oRKu@H5l7y{tP0mk{YrUL4C6EjNOhQnnC9RXntZ=CrXBZ1M}C?*_?w_`F*L$-{)5{!mS){ucxrq%|5@FV7;ZUtU`{kV}^u4ni|NZUJ*WzCIdnQ52SG;y)k@h;0>urzYHYc^Jsg2Mg z$sD};eVV4Da14=2!(ID*(i}g444x4lj&*W+y0h5C3l8gQhJ9sOw>Cz)Arxae5fr<& z;6Zg(^Ho32-IBqjRkHz@t9tLD%GY4a&Y+NYoo$`irmk~i&*ACjBuB>EhZ*K5cOmxU zJ7#&>n|zvRmHhVKkq|T)se#vOCotVhmV!Fr za1XMJ_uZXSe;M~=Aa6eip{S***NYAi!f>z6a5VE06CFZ)ssRbY-qh4YE#xA(yP%CSV!>Du7|^{t`vI5J zU9WV{w+kyMW*+*z<9=(g>#c5Vv)`azbKEZ*wwfCBkYcl=z3;t-l)exL;%y z%*V#8j}q0Cy8co!=AlBU1f48E^9~}k!Bh{>(2oF_3V7O$Z}E>onfj0;?ssxFYm$0V z+_l8DkcabPK7uUpw6n%Yuc0bF@O>PF!KMTLE%p{e^m1&ooDH^Sv@Wi1!Bh6FAu~xu zW!9(H$+~38C=JJ(OxZ&vEtl`BdEC91#NQ@Q(kOzN**D{Xlha?cWUlpOu4^w=K2+|E zWBgGSxngSeb*z#J%QV;ic@JOqH%2?Nj?678E>5;>+1Rj1NG#8b9XIA;=b5l|V^Pjj zei=jsDLYY@x~qeAPI-~sFYZgeUfr?FyFry^WmiYd2uIO_J*j=Gns|D{pwu2^3;U6C zqhBuX=_vrt;^qKLuDht@JM-~KKp3ZP#z^r1-c!q+T(Z3uD)0%=FniMs;E+<^M<80* zm8GZ%gAA)!2_@OnEhZ;(1rMe_@3%;VTy+v3&=N$OM7)_PYLdS)ipr^;UI0k@qVcK2r7?En^YZE@fym7}SKm$D?uu zy~xF%#*^&gUA@UETj-TluYZdVZ~anBz%s3uSRLrYqJMZERwM}|MGl~GbKALEHJE?a z=QjA1=IY8t?qhOnS7#>SHCnF9B?>zIR@l=M`x)2t6T6viTG~SaZpKGif4x$IAISPp zS}kOQ{Bd=DGzJ($oi*+N?qSby{F+X?o~|BSc6P zdiXQQx20sF@{Re`*`Cfx$d#1wtk`hj@Du+;N&ipYC#jn)V^M@!EK4dSQ^@D{e|=vq z@lTb#RrR(IijKO66FdLAO>5IH^my+Q?*zIz?+Un%|1kBo2==dm-4UK#Y9kSWD;Kw{ zCYaxa*BIHFK~T|H9#yGC4tE4csBO2*>ByY9k|ZAiqR#Yiu?Y~63B$^y8e-<|=^}1h z53EPCo_+Hl93C!$6n6VPQYe7}?stjS0K{k;03dUxBZ8Zi{nMY+=%n{%%0CNeds49) zfN92ci5lm5;A;wsFi_l0YwPJjYFR*Xf5$nnwIz_Z-oKx$(Bp0BPdQ@K8k>a0 ztL<@X$y1dV;3a?3c3=YxR6A1sobt_h{*Tj>YKlVgEy~A_=6QHJoj4y&R2PkTxE;*X z;tq?VXd|GnQ&Hz4G0V51Z=DG_s>|_@!+Ls>cAUSF=ROBji`6mB(F;NEfT=;_{^udq z{C83tEYR6FljzG#NbDx1H{hFBkyky;+woj*;affvQtqRe_<0!Lt_ieKTYDMw0Rn=}urY3kqa_O9?FqOR zVAFNGIzv!#>k(wWxbGi&KE5xXGua0j_&6%3IX8a({ynX#T#<#*1W6J2{-mAbN9Dr7 z4<24E`a!}X9%Eb-?NZg(4f+FPqs%WvGqYqqh}Em%Vn4%DB4t;1elu9j^A1fOi}}0< z+cSnP)~R*^>mp#OS&99S_E87A48UPQfRrE35zmx*ixCf)vG~Ud2!t7Qdqxgc2J23l zrPl^LfhGbTghx>N?BGu0O~0PLJ(lTopANSA`b;8RwcyRwPN!d|ym=;(=&ljEgQO+l zzU-1s6|%n1%3&UqOP=rDL{}+9>KmeEKAA?BhzBj;LgjXYg2S7Wpnl&j#YlYK#ngW-ljP`- zEYkRpLwGs&1Lbf7Pn0B6O`BOm?d2~>rOjfR=!g|I=ubfPL9judMqotPxRLM*$@}Ja zf73zzg3WTPFqoDg2CSemACCyGK+Wvm!G!%O=%~D^%W~Yemkm;^+vi*Jb+90cS_+=S+iB2f3vKp~(wpDRlZ)HmENpsk^?#a08OD zOmDc0{+831)MudjcBr2(+=hrkJcz_fYD};kP<#t{+?r8x^HCTJ+*SGQBbxfZN~k~( ztW6nlaZwdE15&c>g*)~h`KrW>lmdEQ1O_Y%RV(IP4|XS0Y^f8nn=wOh4pxq&{TLeb z7}6AjwD**Hsqzodse2^B2cIF>yzc!(ASiB2mxpnswW)<_zPBdrn0WrAIA3czI*RG z3}a1;jgc?!n+*-dkxjNtI~aR$zj%*_Eud|R5f?fb3XjE+)S}1TeLk&HvK~~$4b#xv zEPK3+O-kiQ;*H^r?s0#MQQ5^t{t-q3Z-+UVD|RaMl|LPgx$>+=Pm8^#Pm+1E8yVv^ zt|+fb)Gexd>*)Tex+S{AKIzAbyo{D$jGX1OlqXbUV7qpNc5Cm$xsyw1Er&iQT)Q1v z=F6jcScuKR$5Xo~dr&{gtw( zg_vWPM;ad<>5W(ADO3gLduoz{DA8Hm4jSvPB+d8h?;x(fkI*cR4H?3Y&<(^(b^(Wp z#jo=C2~PqzbVB$3u?usbv|V2fRStd&zPSmt+&|B{a;h{kKrdos6l(Eu{@A-T!I#=> za92UW6aq8d{&K>FxCIo8EiPhO+y_-|Bj@X|H 
z?e5ehLe{fCdhm{0X?w@H^t|PFmP2`A-=jC<>i2(gnuq!2l3Q?!INX5n3a+a1fFPs0 z8_W4e-0^!nai_wCgAGf|e(s^j62RE-AY30k4kw?S1`3bZ*?L=KDSabdoI8E+<4e^2y*7SDTOqggK2yS4B0;Kj2FOFOEvr1@zolCx7!XB2U2v#2T5N*5i-KY)Ig2kJ=SW!M*90(1zf z1N`=31O>2{=B$nz8qZnbKcA^@Vp-fqH#^brmN*)J8Hm|k*2Wx@D)Nu@yZ-Qgw?*RN znR=(%Y_b2e+aoWES8}C)TxnbIS@k}}f7lUUy?41U=g$wkaoQ9sMab-wtShx5^+AP? zh-E2KrNNf{$g0kVYxne++W6s4)?4@lAC-sm$|KUF9g}?d6eAfZ-bpd+(3dJC5+XN3 z^Ck&}($1#+3N~Hl0~0;bfuu`@aF8c2ZDbv6RF=PbG)_3%Ifb6Yd2NrzYHrH?&OFY@g4Xi_ij3iOGeR8w|#=hf>I@+ zD%R*F`F4pHcU0drrSx&mnTWk(04BWy)V;S;_lcBq7_IMoGH# znvL~XfQbLk@-ygcn||`dJkKP=L_h1T>Wjiqh+JI{U^cnicc#WsNE?Zb7ydUtrg`R( z@DBEU7zR_YmC983C+;rEcX#CUP+pD>sd%px`!W8oemkI&xlfr!WRz;D>H9swyseIU z#O05JJGnAzGaK2;nQ5wp0Z=R!?2@t{=D1x{zk2P8OhqK^q4-4QmLLl(V5G zEX1*r0#qTAn#Q9kUy8FA0^+~6v8emFC7E(&?Uyr@73 zl$Y+FdWZZ7ljdxkTUZahiUzr4YEvwbFQ=tf7&Bv6(?c-eaooR4h9?&{`g!|{)e#fl zE|X!)&d6#*JDN%%O&1d1Lg(MZ(KnwCaX<1#t}@q`u$Dh-Zoxn6ye5b4ln3R1t~1LL zY^T*VZ*`I-Exn+g=@`#-fZ^F&o4w|Y%CJL8Zu%3tJJyHD&XZSAlCM<~pR_O3H7)$dabbub^;9;WhuHHjR+Y5l1YzJ7{qx>cm#lwVDj z^cdHU!7f~w^FY>8%8`G|nG1hz{8~mkLfw>^3u%V<$ZPuL4|Dadvd{!|a@5RVV_37Y zQ7~uJ=4P_uZw}9Xka1ptuOSKA%+<)4zo~7m%2E0~dzjx5+|`jhnnC$e8o?qPEosED zgq{+Ckv*EUp7lmDkx{et_Z70;bYlR!sYd+EFBCH}V(N5-iX2MvNP$2wL6KpU81iRfTYAC@)o)+pmNrfurOkFP`p5I0^qh&`8w_ZbmPEJvE zTE;|Yu>$Um8AFS%^9tMeo&I;_D2>u@klytt;}qC9+TRpC%vS zi@?m743621Ek9tRVd&!@<3-8J#f zTBa{5?JhV!+t^meG?=~UMn)@7-wPg{ywjYW@Sstd8s|r>2@>j- zL~Ah!Ulmq{A+s+_2W;qU)j6W^*kvJ#O4wVp0(d61DJvIduBFt0Oz#(gN2hJUIuMiZ z_6g0MJkwRXw5tlx$w{`=j^G_fXcxZMEu2dr5Q#L{PVV*hq~kI+G0Y5nb>Ag2Vi(#R zR23EV`{6qtl(ToIq%GMa3bC$o>wSy10z>4ggkKzB95r&NuQ0QhQ=z|B^d7{m^6|6t zbP#xBj4hdS$|vuZ~G^By-JZo4N`!_zw?tA@^a+h*9trkG@Y^ z=8`?~@RtR>Hjn2(HTTu*ul282;qb)RU3>!#X3$kS^i9gHQbJt?lfP>ns+#6}@R$_; zCkHWdy`+bWd_$&{k^8t0%aF$n30bD&!Sj&0OXgmo(G9&~q^hg@W^FAGwHd#J-aDzCgnupRl)SO@8TDW4>C{mfZ(5&2u43yK zvOj$`nodJr(%k^P`j|WY+l05uAJjuOC({rUjZo#YvR35O%UDTdvUstO5LLG2R%=q@Geq-P@LZuN4^w?npj4q1wtONlc21Dl^hp$0HhX-XZ zQihU$Ooufug#6vihll08KfUW9tgb1?dC62FrC)%fe1LmmlO4kf*&>J-G-Rc;ac$MM z`umn(MoS8#;6H<$jF4Y3q@+R|&oax{>b^_jebLv5dq#V9-}29yA!rBTDd`IK!;V&B zDD(|BmEgZ2;xWkvn z%SA)Sqt4E2Ymx3ik^|-|!m_GU|`XeZifmM@|iTo^9($aJpM;R#xQvxMWI@%%? 
zZ(l0<;*ye!<`Lc>Oxx!ne(EMr z>ELHDW%9`L4-Y82B=)_k?{B(;{C%=v&xn3!5n5s#7Ft=G;{0QB z8+12k=>tSwfDy_^$eJa#SaSWP4Gj&uZV+z{)O_iJ&a|2(%3UCuJ$9i8u^AX8AR+W@ zK&h$>{0lSXI)2yd&`P&X3+SvUO)~l!4p-;3Opa{88=wBP4yRdW;s3i<_TE#`9$;1f8h0%aegD z+BvXjuon9auO}Y;%o!>PN6}Ny&BIB!f9IQUhXvniigMq`Hl_RL(*C5nYgU~m;h+d_ zj;Y)Orjbh9gb<+B2VrZ+SA_xrO3JNnTuuuhdzY1$Z?4GXvn2wfZ_AU70g21dnm6n6 zYwPPk1&4S7(ihuxku3wDpc@z&*#t~y?{{I{PudmQ_Fewk+GI!qnRuBa+9mZu_0RYI za(wRk(}+D)%gkpa<6Mw@FDHaO=+W8tM2u@lqp93V=h*%${vd@uq~Y@&K`(%Gy0_gB33zwn zv$BSJJ`z7)Yy={Z&mh)-;2l@e*H;7y$i6uidi&t+1)8O*K4RZB)x>*&uq1-rq}B?r z&OX%{X%C3pQ|zysTXrXZH^oc~YhVVyou#$t|3>qpIwZ9XJIHQ4m|VntwWv3=WbeVv zt(TLE%4XWq;Q_-t0tuowPr#!YlRLUB_-#JnZ^iLl>j&mE&WA=vE5p8P#SAv9LcQ37 z(>+DY8=Q)Pd>WCSI0hL+EDon} zlZudaf=@;v>gn*$CSzk`|2bUBguy;Xb6V8S>9%;X1HEUr*)y5TBL^-O{@41eN$6)9 z64BQJCEU?%SHhozv8{L`quF3&&7*5K{*}tgVLh@{G0(z|X%0N_PJ>4BLY!tNlquPD z>Yv;v`~UgL6rHG@R;EV`?eq9Iw_!nUy-CA;uXai)z>pjv0!78dch_rh-BdCTytD+O zkVB6`9v+@wAehf~XV!1eSNbi`jHuJqfV!X0ZW?Cl_wOu5oR*i3ZP@YlX=JZPeV&HT z`CK?09T;Hfkmq?R8Mv1pLK{bgHl)n&0#_AP3O!%G!+b#^ebH^a%F$VA#n1Om=Sb%8 zL5Ne5?oDZHu3X6Q%ItsH)MU>#w9wX@wsQ)BDsItxJ(|yUGK`FUTCjX2=sx?gLd-Z| z=gHV~J44ZNasE#5MaN2jR@*#R)GG#|^-E7506Gi>AhhlhQMfIgo`xdy=+$EU{0KF0 z2EYB*NbI`7TxVftEi6BF4f1&H?DQ_qIvBe%vhN1HQ)AjopFx&%2(unVG~^xZ{%C1J zeWvVf5mD=lD&l{)Fno8YTyh#H7r}c288=E?N=gYn8Id&bCjr**=d4MP$DG2{7me7= z)YN_;ZDCDPo`~oEOI_VZ2qk^Pk|!1LQLsFF_7MnXYJo!mVRq^GQl|VlM6nYEr#6oaIpDeH z%k3U`3CtoQht$4|prB$IxjvsK+Y<#d>$^uD7Xc~2_`5Nw*f+{jvtr~xyPjKu!5C!3 z(puxTttxT*M^nV>Flx!E--+|?A#gAJze-G~Hts*w3&M*IeUtzHC2^Pw9`v8L9*#xZ z(#U6n9RX31U(}@q)J$tm369G!_yqH1G1qv~2PHxGG>mB+6{6Yk1 zM@tLutjUrOf!KlbTLBHX85;2DnAbR#sAdm>tey@`EJ&bGIjpp2?@X1}`qo9svHp{v z!>Yis6Y)Ma-Jz0_N{#x=x<~xypV9vFORA;+HQINff~zM!#@z)fP$=DzT0E*U`c@th zYX(x4Mt%<~Ls;!)=PVdINEq^_GKT2EhM9sHI^RH72e@ZAH!s*BWsw7U_9D%dg|?I# zm1UbuB&=1oHy+u?@69`L)&3r>i~)T^8ozzY&M2mAvhk zry_N~!W<9pe(==94*74oXMN?eQb*t|m%(tLkfLe1>g>lqf83FPYtJyHzYnf}3}#fW zEx9vxy>@kE!`7CT$fmV(z{q1M(P6%Dvi~>?O9Ej_wT=XRgd}3wJ>i73vnl|;jAn}_ zrl;>XW}*xMU;smcnjxOqE*27&R#uZRNMYz^ zjB;7C$6oW2^IFfxC>qgvQwiW%2n$047G4!7bnZhc*qsWamK2~WeO;rgv?7H?nACb( z;bLKrz<43uoP}F+2cCCVdz?Yk%_mH2myw^yzt$Z6%h+Hd<38ILej*#4{E#}&`@JTs zsT`@2Ta-L`B#!iqyhr}?jLi+hmF70?k9JIPmBcYuYG4QsWTefeJ|LceXBOBihYtc@ zzkcl$nP$L7R)wOpe8>`p#jyztz+|QTYCz z0sfa_u&a!f$fZI>y88U&FK0i3@t}5;@NGna-@m(V!;!a|#Nkf_!o^p+HBv_!Zbw&OrVqN*s^{J~E%p^VdCfO{j zf6OKDYfXVk9subG(G!4{K|w86X*iIqW32bgkpy_l{6N$28mJXp|4hDm!o<|% z-huz{;dY9F(@)UZBp@93P~fycM|^wou^BlKG^I zXdXGX<|@9X-(@ss&G2NGe>|cRmUt6WOYdjR0x1fZ08~T zD(1l1x4B#;@=E5+^lC6oO`0kFi8jP9>}ekuLvTD);*HAmF_TFRQGy?q%DghPMzc?LiKMNyc zck00d8eV`ESkR|Ra|0uK&0bFyDFW0T z1-c*ClMO}SK-}xej zc7c6{X)kq~uZq!*#uM!Il$v2OY%th0kbNC1p3!>%0h?v>5CPn@3omcn%CHAmUV4g; z*8f-vaTJZbmgI6J0=z+#mp7+_1-iFJBrgI4`sZLF6(I&|Z0zR#zP`4WChW!>aP{XW zun(W=Jx1*^UWFunY|0k-4IAj8UFBbuUd5e;{W(<^&)&FhhVhMtlm;QIK!Sn`gDHY% zY!^syP}QiS&cxN9`QtwhUg{Jwt^VD-pvy5>@oT$x8SbP`iq*FVu2y&_5hdjy@ac^L z7qsp73JL%PyX@*F+XMW7FNPmsyBpp*dAQx;du9jnw>+TcuyJtB%(}mRwFOZ1RVv5XOe{0FBSVHx@1UUb@-Pr&?7s;B857uzy;_~1d3q5LheeC- z4$YC@}jnFLZ(CENM5<* zZjhR8Laq7FgQO0KAUH1nUnb+}m-X!B&5#*z&jf@c8u3>`bPF&LzkpKKye}#r)Rz4p zHE;Xrg0euXT@7v^fh6Fx_zkGqAMx9PTTqN`g8*d|7Eb^6?Hf^5EVw!kK>nHeWRN;0 zaUBKB`~3%3fEyqtCf1DiM2~8NaVC2rsi3MnUa`HEfyok61gc-WYgt=cLEfg=cG zcjM`(sIcn>7Px`NYj8UT_rt~Xcevyz2p+**N%a^3uLHK>1(B^pGcaKJ;7q~-+y{c0kjP-A}j7G8%Ml@ z1mX1QXgP30i5&a3w^!79%EwPK(WekWeABms{lV&n{EM1U>leTTq_F9AgIgCUs;Mc= zdZeCZl+*yRE>3PK*i)p9tu3=UeZ7uAakUo*zSdko&JJ$nK{Giy`BGWA;#l|{K|%8g z>w6wxe6G-b4tNbKkVPvoA7K9k_PW;yw!zj^>y->sabqFX-n-)rd}>Mb1WJmb_AZYw z5ibfgd`O^kzRNoGF__{XNB(YJ(mhnH4lqXhccCRkECpSE!@lBzF2o#oeZxL{m;$+b 
z_xq>4srBvcZS&f>3!r{W1Gdono9!2h%E}uJD}i6FY9~sf?SID9J1(HaWn{2|8=c6i zpNgbggDqbKaw-HEd5ifF#pUjDdkXjxQ@g*+7%?)L+!H_l#8e>fGCOlfU>DZ%u}@$r zm_w3ia)tvb3(Yg{PLJA2n&}fMDawYAkSbT8G1|~S+sR0!`;dd5Aq#zh7@rfN)Z#E| zJ_gcS0veiw({;dFT~bT&Tj5%&Y!B}6v zQ@`5kVh|ma+S=NB!hN`cNOL@Ux!JrO zRgei;Bf`1bV~RqXudD)v{^wZDL*(EBWp6g1DHd~S#%dcWqh*WN2A}8MTv}6m^86~e zzBCu8=CDDs!lIrt6oK2@3q}ZYK&Vp!ED@CUv-t^<0Bi~`ji8zHZT}?duvM=gLjwb_-~<{rT8Q7eM9ZI$#0)#qXkW03cxwz z?Bvv2Y|M*z7eFfa4z#c}01u)Np8z<#?6+5^RCj(noK{|j5@82e^9cX{W*v9kxU~(@ zztu+K>n<9LjJQkp%7>TS)q_&RI2C5jy;PcOFr!jX^?OZx)yST&1FcFfcN%S+`coVq z`p~DSzdW2}`%%k-%h$2IaZ6sGyldH6^n5Ko>;0zqv5wyJ&w?nyIepxYaUtKQxbi2f z9k=w+YSsYaGi26c!A*JdoLW1uE+9Kmf?WPnrjif4lK0b|YdT8TCK}XO6QU6q1{Mbo z*nmK?4IUrX%_kxKMhU5_(t}V}T>Sa4H-JLRxepZnQ1C~Z0;M|mSB=Bkm0~@fz$oCZ zt^u}P5q22*xbHX!Q57bFW&g^%Z-5jc=wNT%7(^s$k~Ke=?vA#;fA7`Ga-rG~r2Jp^ zM4SiFX|7f4y@RFDSemZ2eF)0kYn)f=mnG~+j}ds=Ma-!7`OQ-Dsl8;&{J57}>qo{wA@e zLe1lL{L(os&x^m{CQXeAS-TW=3d8MRfP+r4S$T8#x;UGV{+TnAWp&Hn;Lyv1G#UJR zz!70uW(Pm|74l=Ke-M)wKzKp!5&-$M)tiezC52NqReKte`dM>bLM(46a?I zw3$Q3?R$0c^1!-;%D%tG#|7k4&7jf!UbTLQuab==%6mxA_gVj+9FvIS;m1vkPYA94 zP&mm)o;pIZ%H7NM!S`ip3`TL;i`0vmC5|bz1do7oj9%Gf7VImdxsk^2h&Xfzo~)8q zXFwOU19k-=yG?%(vyGj#X{ZJ4szxDfIQ^dMQw1*rsK?N~AU0Iro+K7F&h7$F6W1@V zL_sf&0Uwcso-dHH52Rj*zyZ3p4mo9@RU_c|kpdwNpiftbq31mZMk+{v_nGNJS`>&0 zidEorD$mx#JdoAOUspGdn3%P9!q{Ct0l`TjIrn!E1H!cF9r!g)`FGDnfteMs(& zD`z8nzh{$TUDh+*Pt|RjMXlPnvVOX`+(oPBi@v>nlArwqME7Q;i8gTQVcB?2DtQyL z>Awp$h0a~eS;I(kAd#e+Twc2^3+E8ItXt2Zo|=(i0N+34>mss7k>D)qR~L4$p0l@} zbNG9{2I9EzDuzMcjUXayn1)z4%=ZajGcS<$>^$s=Qni*=g&$GN-Y+P7M8Z8 zv}FLY!8QQtv0{k4XKVO(gkVxHnq4dUwCnZ8GQwC_G8*=(Mg`JxT5L+!s zkVa~Jjzfn-P*|8O96VuCp)hy=P%M(FdwWkO9tXb!EqE*mEX3maG1@FYiTYiHU-ihn zqUB64m7eUIcRF5;e&s1Yg#J!u)PFVj^k0p{9ojOEpH1o_AH2Qmh3Az$f0WqXRA0@x z$KY!u`1OOAx08}zuOAbaoMy?D#y@}G4$0@giGr@)KzuN10NcRhqh?dh%{azYozy4F zei-mtnSzr}2+~tPksVI5m6uQV^!6#HIJ(TjPlYR!pPVIp;ST;&(3ekd z8r?Wq1+H*pXs2jR0tyR2DgL@{~-pzzv33q+#`KXW|0@8aiHP2 zv(wb78Is$0;-$>yGWU%*ha*V+_sjsik!_?N`K~|Qv z$?QDf+k9eTCApo}pawf*WaPa0bA*OdE)FVpu)%8qM0^LLon@uN>9et{@~Paz@l>iI zZuS1SB>nsJ3Nuyc!u_+!nk)>~hW5vVpX5?%OPr=XI(Ibo7U#wyyS=*)FJ`c6;?u#D zCQmf4ICaET!lWk4ZUy7OqjKcT!pA%Q1g zGQyjC3+&QbW90n0%5597p$NGPA`)Sv^!Ly+y%QHVi7zR0m}-f(3f-B8;F@2{MG|b~ zG~x^-^JrxWsl}~v)+SRH&r5!5XvO6NG!Bbj|CdDXJJ*BrW_+IB zV<~{;1ELO%CK2I>ci>+h*MiZA;S-rcrb_^kv@}CM2#hKaDWS3Y&M~lR5q|@f-DdFS zhzpBHCT>c9f&3)RHYZE)F0=g4^mns3fn3xf5zl-1^=FYCmjpjP{;A2@O00Vk$7K}z zq^VY5RFMtcQz%{in@))Qv5bk(OU=|?JaOi)iqhoW5R}!$jvfJ$4{*9*lZLTuLxN3> zNzT?L&BdMJeYjV)8XvWtxehNn#p{@JteQ=I*x|o#!V|W4L(a$%9yS6#KD(Apk`J`B zHR!$Wf-MmOZK#&rjf=x!-`%6u@Ctl?;r*FOo5{?68EQQ1sj#NNic;l<>lPQ?j-|G) zG}*_y2xg&QL=0uV;ijp-`^T2F$&2c?feMz=Zds?m;|ljheKnKDY|GrTUkiKcZLYTM zMnBz~#;Z?oVnmMXTtqMx#?|C}$-Qzkxd~f4KQ1zsb5)H+QZ+m2+P#m-wV!TOvkx!Z zPXtW5jrl(zABxq{t{hzw>V}>V1&s}9KBwH(CRh^lH})a^LZp5+l*FWEN15%pY>4AC zbIj$%Y!i+&7PmZR<}%QA=E-^*U?<9*dgn*xD|s>z(IH31=w_jWex++z)vHc^Up74m zB7LW;?i6hwc73eAVw!s87-nuJ*{0i(k{=)m9nB8%!F)H35W}hd$5&x>zhtKKKCjQv z5g#;{a_3qLAq&Son1lO6;_|DSEI&VKh@G!Fty|XS2$nuqzavgxyar$^xXK!a!S8#L z!V9k{ZQVq6E#FtQQOYs8*3h^vf4j*E@Y$BN*y&FXbwA$S-p$u-ku|rtO(4b^%h{&5$2VA-W>QCE$Ai63h(@u|2bDcx?#bM9lAsB_5aS31t2Lf_LFi?})q$)ZK*Yan2v zKO5TN`}fRoANnM4peGfHeA6lSOO^j-q+4zz+!$XWFQ0u+&46pB*Bd@hkoAsJX5JGG_x!XHS5^K={5 zpEB|OGHTyw#vSK$vum&uq4*Lxxvdl(_V-3vel~r)ZFd=cA|Y>+fB2@X>TB(d-9sD; z{-hZ$sQUNo*zZ@XuPD=>rX9R*;)K`x#n=B@eg6&>FVp`PM5W#@)-(m=-8T3a>5UsV zQpW@!?SzMoavQ+60!I8qHSmShsG!@6BbvN+B_}3`%>(Nm>_$xIc4+MpSNB>DQl}Ok z8h_tctL}+^@Y|oi!?+CM3%PCLnUVrGNhKGlLgKsVA4EAV^yN*3Ik!iu({ga+Ir6d7 z%2(4Ftz>k+ynR3X3H$Kb+>hXg!Ny9X3)lV^)#3aWr3Li39l^~YoJxEAUx=JX)qr0X 
z7|oeo0n=7(lL*a}cYs8nF)$$JJ8|MZcv_nMzL#I*{oA=Dww7!ExLbgd#wKxpgeO*X zqZ3PddIinOxmb=mkVz+Hh$oz3Y8m>W41Tv_opG;}7dZEiuPV4DGR4M*8=F|qWReqF z6$cghUeQ<3DRXry2sO_ZOl@iPidYRe8aBOFomS1+^A+et;u%tm%++>7qvQ^WbUI*yl?*O^t8fn1iLVS-3PQW!_s^03ci93sr@Vs> zw}SZN^>^?i44%)oOf@Rx%vR@Vt@K`AqmuJU9}=O_jzI4EG2C^Oh^r2GPDE&*%a1g4 zNu1`yB#fM~`j&R%Xsqx__D#yYKyJA>lY*gWI@Ftp4La$hNXgRN1l~{{#_iqBB?GV? zB~rY#DGqx`0g7ZvXDN99sUIPB0rcNgfL7 z_1$(R`Cb5*OKK_MU>if~2Pj=7>sPhG{Z7-#>1k)}*@wb?P)|WEC2lJnnaZ8&qd{sO zJmq7?7(4vn=Gr;+RhCtrRaHC=N47^jOhZHNp+i~#UsexRtQq}HK4KO*4)4Ui<~(BZI^B(JT!c+IltLWc;CtySgZb)5tu}Rp zj)SL#4CBw%%5fo~!r{l%;W(TK^k1GR-f)QKI*!jPz=`xKJBCWH$}oH%?ca;39og5I ztf+KHdA&&yXZr$ayB}$vZ%DjRqm&P5zhP!!8hYQ+$&`I) zgf4h$M%5mdSZLUU-B{8k^qBF+$=Xcl2uZT2k>hBlj}eB2Je-M2O-77O6r+~1aMTU0 z??T}QqbRPz)uV5!tUS)@Zxj&zcAUQe0x zEr5UA6@bH#tweKlE~vBeKlnip$>QC<$>Ayb&rUknarCxT6>lH$qN(6Q|CT*BlGEZr z1WgDN?(!pm2_cP;U~rxSHku4Lb7cp7qXZ{GgNch2^!j3hArvewO373LN*7+$aBSyB zIcn(JUpUDXttE2AE&S5#3{Z)-5w ze>{^m{5c>lC4M1jtM=Eaqa^9<k99Qilb>Bs1-VmiX{cNqjs%&Qi4fS3kWtP+Ta1o$l@R<}qe1=^|EfB|D%AL2M3 zO)42&&`UM--BgYCqb*1p^0In)|LDqk=H@M$G`r^sg+C&clv7IQ)ny$-_iix{XPs|p z)1bX1gQ0lc?D~B@d_t5)N4{+A4bnJoIfMXqeWC$K=T`xvKml!Y!H%C_?)sM$ux$?A z-?;D?zK62GfQa%!<@KY{=0=Biu0{8suU^DUQ_EL;#X4quX*z;aWW!xL0`KX}^@6P~ z-UCWB67F)zjbtfifiuo1f8tK;ckvaS(|0@ao|aMRMVL$qLoV(hI0uQ0jbGh4B#(w8 z`6V9(sWeb%mK*+f3_id3E{;(UUv?uuxL)GJ567A&(zKVfX)#Gpu`1cJ!bvFa1B)XfPP ztcQ@=^ck|+Z^|#CbM_MB1FtL0sI}V1vn-Z%`K^ls*U&-Z)i zxo2sFsjt+p(*bWRr1=FLoxi$dd;WU3W1ax!1T72i)k&?|n@g7}+<_zm#wBcEXAQ=u z)l)HC!2`OC6l)R8cKBZn91D~)(_gtIlOFKido+DuPQh4>AN3PZQ_D(AOLORSbaYhm zTSyrVftHO$$!mnKL5<4z6!Ynd*m@23=pz^2schH#ln<|H`>f+mf!AR9aAl7#Tj3h5 z*y_`d@B4+1?4;|eS+JU-JH_AveZXivP|A9q)XMha^@i9W#Aov$1&pI_;%~7V{1GM!VFZu{v0P@VfHEJ zkYl|HEtMBzzS+V<*e@yWY=}wh7T57(qZzvMpC)gdZbOn`#owTD3US{GW^fl8hW|YqU zTzJ`ZPC z_U-{i`bGR@acI-DLjnPZT@TdMw$g`CU3)E$L62s*nUu{+6FS#vvM%RdA4KwJMiG@p z@P@NRgkggQ%;?q0WAgR&Qsm-IlIHPzI`{A%LwAOUEy1QNLBiPB*#8;VwEO?f{1My$ zyJvX9pGz)CiL)B)a7poz8`3|KUJ+!WwQgch2J@%`0W;c&KJNM*VnA=>E?;0Iz za=T0D*F@2tD4WZ%gKaDL*U4XX+X-KE8pD2z)#q;cgw^i#26y?V_sNe735f$Eww88f zW+mrM+v?`amiwg~;z&$DtW)4{fwG={|IZc}k0?RFP(%Ea!7=R@;Jtkn_NUZ$=L8Y| zjR7N8!0v(Jg|Ktvu9{e~Lvm@3m@NW0OuT@WcK!U+n|BMIIC3HPuk}x$Gu>Nc#08HW z>s|+Dr?OBhflc@sa9(->oB(1nn$;OS#_S&`7m=eaEH+m^Wd83{Ol_KfGKGLnT@ z5>3kPU^9(v>!zG0wR4Le#nPQM6kx#f;2QSS29YW`fow^Y*06eM(-QBL%e8>ciP67x zF6JI9(dsRHOvj&#jEOm7z`8QbR(+zfTzwlXV4-5*;tAchM*!v?0U`$^sVxu4_TwOw z+vfZ=IENt%U=-HnH)MUGLLKaF0n;+ByY=c{q|i&Rt~P5N{GzWKMVx8j8_U830`)&X zWy|W<^r5_X56}JW4mL`qG0#ndR%`r2Bxh}RC!g6#FD`(}$L-`+S@1>k9(G4DBJ$m7 zyi?cfq8W(Z*$z>LA$)y+H-$JjI7mAIWR3Xf^dRpou{roEP~xU$ZO8}nbl*vHbY|Le z8!M&f#o%v|`n^>l3QptQ$0=>XN}k37e}4%scZ_n99WD`mQq#1)-qH6nT3&ThrL9Eu z22(2VgtK8sCjhH;71FTJ(E0}GGV7nHsaDQ&dgJHFL&#YW_ib>+*7$j|&Y*7uvA2ie z8?f}Cujw+`6oEo4vVqK1IeZex^3IUV0Rf#j9L~`_IzK4-8JB2WGP>Qjf zv@0l<+KZ|bW8s`yA|#gKLF`M_J^o{lsrcPkfu*}Q4fcdHK+FCf!{uRzP;LNU1skIq z;N9B~CK{jV`7lFtW){Z&b`6!!pPY!`Yo%E+SUi|2Voe zIZDuDBa?9VR}XYpu7)s~*FVd5-Nwa!HA{a?kt-NK)y=ZR`JQQJffXh;pD*~mU>%|o9r#hzwb@<>)= zN|g!j8unLMJ%YJMvdXr=x#Xo@25R?ni!tmE8MHhA)Eo<5|5OiEJ*s|V@G>&-0i4M1 zZEpvFp)kAwIFAvHUBHe`b>UVQy~Dp%Xz@pdW2@|6NcDa5NCx$IFe15<#T)w!?$K1gKW_ZI+RJoU!Rc4MAH!sM-##qMM$JC1 z^*>S5^iKFl)q?VcIs7GedPi&8-BL;{2K=(z2rdbEFUpVPlYPcs)S@>Te(SD|i8DR> zM6EIZ0SFNa3wPxuUNjdrxiGV8PDC!~Z{<~f?Z3^`E2ZK9yuOFg{VBqDot*8fM%V5xUAPKX1pCOc)Zd$TdAODw5W=L|VamVfp_{A6@uh4dBcn;dlu1CFgVKJ{S89W^qphGrEANSR$n zd~NSaFy^N#Cu49SWU2PQm7`zh?#;M(4%H?toFUdT zQ0ybceY=w^UQC@ixc@FKiotmh-~mL#5zS@;lp-2P^qvBH_tmwb?xjrt&P#u=La~Fh zh!D}6F@=sz5+@6*l(&;UP`8G@Hyk;B&IvX<>H0t8fuN@F1RwMc-_VRtZD1rf50tLg 
zJX9AhKg-)MwQn31J@cFQYUCDaV3ix^p?rqYQ&*iuWdm{6-OYK^-DGWo2^j7v%E(7E zoa01MW5?UuzctLlbdAK0#gj@8iA;C=lt5KaVWMQ(UxYKhDCGQ zNqtJQC8a5u)6&bQdRSM$bsU5?h7ZBO02^(``$D0+T)*-Z)a1BfSY^`bxTj=Q%Jp1Cq20^wVKSE0~Y`ZYnH@GQ{*j!K@aK@^HyrF<`x>+;LC33Z2&079Wt=F0)g!re17VL!RE>Rm z(RBjk06~Z?I+R);y1RvigrcEH1ToPk1djvCP{^+Wg1GX+jb(4g2B`&6AL5ClU!(9G zDok@L8K8i36S1{Eu0#8ZCG}EWXIkTtv;+*whSElhG*42q_0EGmzkFz~9Vk~Lc$Llu zc8cT?GM)xv5aLROs6P>+<3ytd7z9gfRf2J6pWpZr0q!z#39zh@I)5q955n*oS-Yn2 z_yczNhh)BF;<)0j5}X&q!YRT-$MLlC+a!5W3_|Nb6Cg<6G9i8IPn2ZB(qh}TqJ>i} z-jYInH;MkZ&l-g^<)Ro!J2Dylc7MAH2qqlI_vC2gAk!6~gm17Lf%qOU^TZ`!F3&sZ zgkUgGcny6cPlRdUhp%0`dn@G*cN&qZN0&djN69b<>Esiwj_cIC)^=XbCbmI;+q+fk zOLYt1=qP?IZuZi*zn|5z1s^s5GdozNY^;i|%Nw2Qmn-KIs{c*iOFMUmGmMka=tX+l zq&NZ$FSN(WY0Cay_a0pL*-u<;beNLFEtUYb9J0Jk^bzcdvO8^i$2C>V=r^XD${SRm zrw89hBv!^n=PgjX5k#NOHuyf|OuaxwLjfP~9iS2&@S3$@mq0B)4X)HbfV=nmazQkg zm`TWtOoRx5`we}^l_g>_(1Do{wqjK&ep{^ z_Mt4MN)2u;QiePq(xuZdD+k=wreLZ|E-)jIbQ2^FFs$L$=Fh1PvGS+zm&ha=xS5@o zA+I5e7-7$4u9bIP*t&0SX?ugljQG`Gxm{=a{guYwR4I3&%W8 zTk_q__a$r?QSf8j00E&ns(24+>Vx|yVrKY2$e`D6NM#O6%zjXh0DHS;`&+`r*KVm{ zoKEAk^F7~oj`Y$M&y*~I!T^<`#2^og0GXo!w61n&q#}d60%4OY25(EFAbmxMnvl8u zf>O30=4dp^E6)v^5aF_$9^&rD1kai0s;nyD*;+>f$|_;CWq4gO3^Zx+0$Mb#)fMp7RW z6ekhIFfBB6hwn9UHMjW@Dyu*Ke?Gp%Ek34;LT}||554DHtSdkpyW*AuKAvIFrpPBT3!Ayt)J39~sxr$AIu`0M7nn9urmN z!A<7FGG<}7B{b#mlZ#@SI<1hP{LYuAkSDf`Ko0q@4;OXreI%bjWx)mu0^3LJ!<=2t z?|#x1omX|A_wPciX@UY1`O$_SN29#wpwR{C6CzCq&;yaClhO11{3}gI${3510skbW z*Wuo&`}=>iK76Ivbf0(){F9DnT{)K$SS9kI@A)hX3@42D+$Iu2m%-mtiq|0IW* zEirGh+*2?bXElo)xRG1xNhc9S;q9G8A#gn02q*W&#Ux~6Z&l8bcA=)L9Je^gK=m>s z{m+3f6iTW8LzQ03`L=A%Px_?Av`4nGU&r@K5L6npnftpT0TKnj32!!xO3$G(0^w_4sy;&3q#YS~1hxRyY3a zobOO)!7i`rLZcdZuq^78BHHAe*nn6y)bQ<|ct08uH_3itpm3M|@ zUus@^d0cM}b@AqUnB5=$+ehMTkL&3x-K2Atm|aIIesmWs41imAzH(c*)4( zO~X#)uItzAGv@k7gd&x4j7qUX;DEZiOCc7zHjFeD`IyLZvM*enF@Fx^+Oc<<60qDx z;!R<&v}+qQAiSHM`Cl_D4Dkzh1lL7@aIOorg8Ryc6#V6(_2XauPqn?s-9PikH}hR2 zHGTSxjN7Ald^E;`k~XUwt*h*N_E}83x2`Hp6kM#z9oam8{I`go_T2{z8?TigvBvOx zX_mi+Id-3^z)m|Z-1$Ex`J3vNrBN0aIeEE4r~< z7{GMld}b3j{`Wu#)#3*+(b0^otQPJytq!6)1`fBMdF>ii(a0+r(O1UNgA&oBPZm#=it*LU5*BEt9!DtEZi|96 zt%T5%CNOG|0%k?}k_(J0{cXW21d%|t-DT#Se3`r#CY7bBxaK6JAUGcq-7&a-M839_ z#18Lie4jp;(#oYvfy9PHQgdS$QUlGD|6c_>h$16MA>Fq(56iQ)DNd`?;#<@SaRD0a>;A+(M7(t38#Jp7_}_#99OG-Q?d4U-4&CrRO~>pS zPE63GG7hv>74I;a#|3j?l&Xs>_iP)~k{GnN$Dq-Gm>eU%yKqthlv`tG4dfP(OMQpI zm$Pq1Jqm!23BtijsG;7tkF)@j{d;4KQlwEj+0})C$JR4)wAz>PqPsU=cV!a~<|IuL zuCQdjDAZc^bnCEk`+51Yv;Kg$laU{pL4CU^CBNHAaMg;Q#cJU!&G-|w8Z=JJnlUW! z5d0}MRQ(a|%8|EP_6!S>c64sOs=On2FggcMPVgozWPyBppcR}AP>`R;#+RyiO`>3? z?EmHY`arC_{>6)oMcX1ToL^!3(X|7~t~BK;Qtt-xKdy>fuhWlecZ<>J57eStR3bjc z8qYvNXMo0d8h~ z|G4Cdc_n{U_--+iwwW>PB^nL16j3?qN-pcdxv|)T*xEc>+DuRJhky6d`6Z&Ff--~G zCPUAr8`Pqc@x-q`r8P2|$JG~l8bb&WU~s)+E{J)~Eqz|%Wllt9DMrN4BLNea?a%zY z5qBrrO~q6$%61t(^H`Z%C>?)Pk6GR78jPfaVZ5zHAXa#a&#=hk9w?mBCwTew>>M!H z-Wbkq@@)FvpaCFKT>2Su89-B8fkUN0BCG1ol{m{5651{jq;i03+V`RmwIKD^Ymj*PF81ggm z*L4ZN@LZNLbnENG;Kyun9<NA?A>uY_=!E^{-y`HMYxOsnen39RVDxVP9xIMPhVj z8eNoJy`Fay2)hc$xdC;w6{lBa4iL1she5v5@Gu5E>>zCrK zf-9?jj_@}LsgX4q!WCnU<%1dczmO)9TX3wen{0F&;T9iS0k1pW$MM^upP{GY{=-f_|6ZEq|nmYI#TixR7@kAY@$&I2~_8@_RWmw@~&IIENYP?DNQMVx5H6UPC9JNYU_e0 z|FjjzJIN#vhdcyGy+HUw0`EW_$wr|xOZxDTMGDvLBv5$+A~5=#V! 
zA-#F|zEa$m=)==2*Vme@o4n0Fuedsf@?i{y$bU(|f6jMGC(HXZz#LL?=}HyeE~S8C zw?+bEnObLH^rW2Q&tS~zxw3-a7rTAb4(@Q+8T*HM+)~cWB%+#fdfQZ z_WBn-Z4H5lx&kwyzLR%vg&=hj#c%nW$V%KD=7zeQ!f9_=?JM;CFJsinl7H%)YA zuPAceVzb)5aJ&XOVaGP zSz<}u3S>y|CUZo)*qzH|rFA7bRXm&wuZwnNzi?@;p>^}7a79n)`ZwHbyLTY&woU8* z)5}d==WdL2;GTd^pjn+J0UA8*enTubbr#3#6p*3KPh7I6Kr&tkxunO|t%YaBpl`fX zIsK2$?8~&FD%nwBK0ek8{;_%v-zIwI!&p1fe)VKSBYw;=wluP* zxjf&p$#u;3D(bM&A2f8GXEsUGxfW~9_K+Z^Gu>y%Flb?vXwv!9g@5+AxywRVtk?3W z%ux~g?ohH~jhGK%yX9{I`;ix5FlU9n9L(Wix$FUU42KV@nb}Eei~8SxCAnS@Q^X~B zER+6=3g?X2r?g+!lkoK6Q@CsvHWUDmv#GzROBn>DQ|4vcTrW>uhBDR}#@E6y&1k3` zKuEIJeqCW1nB~Y|DTI}FE=TD^_grgt8l=$l-nWw=>+qGLjcVUV#!ip-Q-`xaGcUu<@cEZ*QjDB+Q+XP2AzZIaF)}-{OQE_m}$No)4&% zbb;8f2-Ju6B?S6`G3RMu820PeuPzum&7&4n0iDMDf|8F{6{QOF?SJG;vP%&w1cp@~ zHKSQP+jPIgUk%R?I7%gF3aAQ2fZs~&PpOW|+htTwFNZ~R6}HJhkmt~PW{}8dt|$Fx z@^V($Et_K3>>)}1)C5q7t`tgS^eRkct{I;Jhnf1_g_3%x>j6n%3P;8j zK)+@Ua9-Kay>0z#p3!mu$&!JdfXaqR)`VWC8_f+1r@Rz_&tQUm-Z|U2IbxSa9a^sG zvlN77(j?o+%pCKfhRmGiUD+Sp7JfKy|9A7w=j!An9w_r2{De@YZ$ za{K68n!VAgST=}>4l=1W>bJnbWO(VO2hbWLok1d62o?}9Axqhd7^}QWe(!1*J$jIM ze02|1$lQo#<5=M$GA?k-n22E(fL0Zw6K7W}@sj??nONQRX77e{0Z$RDRR1mteU9d5 z@I^LNesD+0l23K%MUSR0mQ0!r3q@;T3}nqa@wg+hP1(y0N}IllO2nL9q7ywB9ib99 zSdnq6@axUmxwN=huArnADHMw@9nQJFk$8i!6FhPB5^Oh0a6xc|>C-TKU9uCXIGrFA zE5Vr;Zga8I1}DVxD7DScEssxaqVB@M>c@OVef)BSFcKIBE?HiSd&~QI_Irax3grTf z(4D9`@)@;YRV*C4XISCk(~-GL`Mi$rP%+cGpeXX4p0;Idi${q4W8Cv+zg^T`k-y#} z1jq4|ev#4%N5X)<0@KEsVQ%g-Wy4c`c9N@16&yHE@xN%hSu*(6K%pBiaW^SRqOvUI zUzRm0x@xaR+=t!Y$mC2^Ucb22r%sTbxTdR;mlwP6!?R1TF)!*8qmUwzs;f+8uakgr zzx2HF<)1gp-uww=2Hdk1W*X+t0drC6y-`Hhkecu9gyX~pY(GAhC5 zZCueJIB`8m`#4Dc##{NfT*|tQ?2wbg1}WRB$Ks$<#z0(o*1>K9W8=PV%Y6T|8uV?M zQa3MR&HgfL1augrk-{T;=uf_M^)psNt)UTJE$#DXx^^AdTMOj9J*iXSfnJ-k+kg0R z{q)-1#0K?=lgH8VB>5ck)YRX^-i{+oELRv(K8$&eSvJD)(#* zFiR9H-_hrfz`giiE=e}LZh~$)OOBdH-6r2ItGhmhbKD&|DGmf2R$c81e7|krw1T^U z|Fa{h>FJjoq{YO=K@3~r&?&)cP}6Yik!vG;_-#$u-mksV{wExZf`|CCS{}su6V*jL zX=;?C^@$s!qrh6HY{|A|^a%3npzwcCPUWJ0FVXhgT0IfIJOXVdv@VzolIkT)PHsUr()9Osa%!n8s5dwV-!6oHM3RpgNyTaZc#JX#kHcS2hKG1HZ0{Gj#@;G%?!wcb zqg8kRcFJH%9!EHW^+M1%0j^yjn}+y>$j5c`^n5^q9jN+d;s=p2QHF`Jlp$`iziW+q zhFO|8fM{d&`(*;|c^VCnde*OVHO(*`<-@q!IXc zt+y?9!ftyY)j^>6Mdz4%!X9Yz?iH>Q$+)kICy0N}^MoeM$ZoXBI?aLUkgC&MMMcNcvltnU@;^S)Cm8k zuP-z-{SBL8f}!u1F-qVW{w66xO>iw2aT=8{?^f|}|-_MZvgm6KNR<$dm5 z=Il)6KmMtf>vK-4PpP;{W;%Ue!N7_E$uUZuoe>MO3-x!J-nP9UGarQHfkIMHH_;AU zyb3+oV0JsG;L~J_nR2ovNOdF5%Jv~?)5D_LNuu|;`Dgb#{XZNMeQXMKJP@#>>m4Qk z#DXjpR9pu7N%^h0N37HvdzKcO)X}{<@`3rYT3;u!>xgE!(WCZFWIJKc?(nMd_8gu4 z!m|2P*=)L#iHoY8$9E}P1UX&y=B^XiB9BgRLJ!&OoN5vjRi{*}MC z=h@Plak$@QCC>W@5dHf!!+!agK^;Z^sLBAenJlj>H_$%5J+ zsr(FnDDyw_Cmj#YPpVBScUbpZA$_KS)8^zE7=2l-GsN=Iw%XRAcVL}qC|j;|N=@_3kW zUAyV?tAOy1Uth;^cgYXX_ek=CMaVat?G#L?%?}*Uq`o-=??hAoy^n6nmFNC~Yyp&M z$($_oxonst5Od)2x+VYG+SqqAx0uk3Zzt=G&8yT$iH1bMfsww+LXj$0<Zz5?TuVYWEPO%6y(s4wa zBOjxc-yE#iTb7dKdsWfaStgor!L+n<-2G9xOkq4JE>UzvdT!)iovD$_cxH#$P)K-( zH<4xc<6Scz*lLkKamAl+wC%lX$_mdR-vg`^oAsK6mH2;n{z+IVXb|mj`5pE%3{3x_ z`yqhj*l~PE200y8Vt?=8JiniQlDkaI@CEbOslWVdo-+76{j<;3v(n2)l*wvtDENdrF@{ULU8L8a)cim7SkD`@iLH9dW$Dx8z{DQZE)H2gR<(PApg<*Jx!0e#}jM2RGPeSHxx z&T;FUIkvslD96Q|I67s1y#_9BafBsFBJAujOG?e;L*7U}=0jV?R(9l557zOGAJ2(> zH+|b{7mRU{zk8rW6hud=RvX`uOnQD$Yu_|jIxz}LX>0`A3o@S!pM-}5ddYiVX&ipZV8@pq9#Ak;Dx^Q~b@8`8YrQt6UD=*x! 
zXe+L@UHzzP-JLW`Bt=I&A4_z4S}wu+v>68z3QLb`nQQcgAAPRMWa8~k6#gF5L5_Sm z_L2yUjJo+3o(Ti56T6PT>GIvlFb#q zVzPQ_o=HpZnfUfr<9u1_3v=W_$Re+bu z(j`t%%s}Z*g!<6gi8wi~?#$*C0KxIn#Ec0u&6$R?&vcy5GhG_}^6iMVbpsZbqJ{mb z=E`W~s)^?OuKSUC%jXBbHEt=eGt3PT zV(X8ViG{c73#qGw@g$_`|GnPkXT>-Fyd;eIwUfDyW$H%lM=LU|=P%-JuwaMEZS}yD zQ>VYHtIO`%wFG#S7fj3kKJUN3+1#kT5`J06#P*KWtot4(S3LY9x8a3dZJBGKOJ18= z`r`)=(6X{G9r{0g`t%Drz)X+9a2b62-mI!gNYJA0Wn?4(x{&myG~Kv=hr`jcEIx(J z>Yp1r(T2&=IX`E6wz=`agRk??L*&Z@@cEAbsrXW23uJSWsYOs{#DE`qpXz~T)& z@0zs~HX!_>Q28&&7@W!l25y+j`(DK;C?J0es>Rz44WZ6XNhdZraaS+qVIC^JAsDjU zkvMvR-7y`1CNM9l`uulHe%Gf@_aUiu57O6z9|Nc05C;Ig#Sg?Q{U1qHzvhz6l8$3w|r5r{Q|l7#>MnF{ae4hL7{!&KGO?46uQ zR8&+*$jH6{>HX5UR8a_Wilt9hw{+ZwgUIlRywV%whQZAOu|u0&ScnHV6nvHoD4Hl} zXmr5g+gG=)0VH@Sm=%_h66~(CoRoD%_`0?)AzJqr(mXm8K!FM;xej-qmUPEB4Fae^!s|M_jf!PWJf+{>fq zP17+g&T;JpHYTInH4OC)Z!(uSY{Vj}s&--*L7r~_^wKg43LN^zLD@n!$6n?~{&{^H z4w?J?c}5vqTCXT(3FU1=B&DOsyL76_zj^p@#m=2OqY@INy$THbJJZx-{qkYVRUjDJ(=yS|6WCb!PzQwSh~kg+;eNdN6A{(%JD^xe3af??c&G) zYFz|^{#tGs3I+Ph)rJF!V#dUy$r}Zva!Ct!&m~w+rHB0fMT9IZ37hQhI-`jkgHDYj zwtHtjtvL13mZ*nPR0DQE3RN8R!}oQqAKR~Azkbp;^F`qe6e;4%E8`7(_-xkW$oMtt zdqSQWc~3ib>}c{fmCT5YjC2VWYQkCf`trfzy0Q;Ld3m||00iZ8(3A*>i!)%VUpaI| zH+N8wE%!{w`JHp~nu)4r?#bovKHS{g4KH6>$OpA2iV;2GdS2cbM5>r>OeI`I{)8^B z;q~hcvUC=vf}L1TX_r~f|a+}J2( zw;+h8UvNi5#@fcFt)U^5OgTd(wZ3qxfI!-#&(J-VJ(}@&Q=LRHIC=7<$c&(qp4Em8 z8+25jjgF2+erp$(?xj&fsK|7&gm6g`OK4!Rq9%70J7Yh%K!)f2b=JoHBX5t~pGc0U zI7aYgR&?qHE8*Yn5&x1=b(+JIw*O21{kH_`sloowY@E+({h#&8pQN9+^8YnA|IZ&a z=h?u|-$i`vv&knQnUUK`@0C*dz0?SajDG?N^Q6xF*|joeY|Sk#>OvPyPVK;-)omeo zh`VP;yyOQ{R1D`$LrugBXSd;h3y8xv1jcaRVAY==v*~ZA4|~v_h{PGx)7J;+qD2&a z)ZCnVqoVZaJ!!l8^KkxDOE+j37~loN#e@0j3rR_lV#Zl%;B;l=<*A5s>tQjwDgd7n z5$7Mjd^$9MD!ckEW{zC??()wL*2MxY$sg}u4+9}8lqMIiUd^5N2lhJ$v-b=V>oBnf zUOWZ^4*P(fkohsH^9TTHZfV}Bz`BYRcxpe!DZITa52VPp*ls1T^}H)zE9aL1ffjCf zIbNo8LeD9kx*OV0S~aN0x%5wYdK&CvqAbIKGGkKGvQ!|Nst&uc-C{{P4&PxBOi$t8 zd6Js{0}r?0yVpNId|19b&x(6M*wAtE!OVV+|e$QP&k4$99!!?cQJ7z30X zOq~7Y?pBlrWKxNgV|WZj;_(tdUZrtPi~%;})lyCj)iWWtOaR7Yy&4#3LJMH@Gj12h zwIe4^+})wCungC#u_uzGROD~v!0#AnDUPg@suYTml@#Y?m)G8$(cZoef6N;{%79bm zKHA@u$A>eu8ljN~zQxqw;CdH-Oc((5nyFrOM4{yLuipn}W(6|bIe5i#y{how(zmM* z^ivp%l5H{9NAQ(xNht?}C}OOedP;FB3Im7|6c%22=FAzTkY6gi3T_c|bDrP{SEIF^ zH~RrcGT-5!XUwNjHG_L!qd*|E6@-B%2??u-_)&EbmBIP*=ks<2;EA|#|I35Xiqho& z=Y4T9CFccS{68S;|3I(*^M?G_UzKh7zrI}mD?gfk=nap3fl3s;pP%33XU~de3rJb% zA#Y=2Gch^2m)uP=?f^-gym#;3{kf-l_V)G^{u{|3pFQF*ENFS$*0vWf5c_HJc9HYL zy#f*6;n*?86)VE0o-#NUC8796P~4YmVuoMkq?dO4_6m%@B**_?6%vvhi!Ro=jXHN> zTp#DGslA;Z^Z7bAld>!HU#Ot1fza5~^y4rg(^F>>0<>BX_U%5>8|3{f*cG-)>WYhZ?3+}vx<)H zVNVYm9uode3vBE1-2n6l)J;BRuO3-le#z8y^Ei@bXMx52^K}J3k2dH_2$*Fr zz0cyfaLZEc+8el3BEQ<7I1!J?7Tb;I^>%hPzO=N|sL!xle;$=|G0ei;kA5q&KYH{W z-Vch=%If<@VqxLo4o8ld9k0H9do$+v@;}ccnebryh4M-|x?@2B9^!YS91B>NQ3ttn z-&uqAv`tcyFg?RrYNq+SLw*ICTUazcdbD!y-n|(y?3qT<_|goj9I&&G0SM42je{L; z1}ZTFQT800njO2b;YKC+r@2>6%y%>H;mvXcf@CkxqxD1O z>C%(Mw3dmftJ2HqIt;6Nmk(-dH(~=8zUI0rYa}oKw;0Z#gqD`dKRzr+0B5P5i;L*_ zxHw3?X(*K#!U7qEueTbLD@)(LMf*GC1h=B9B9vDSc5X#~+X;Q+VDJxjZeCt8cQMhE zov`0;)vrDD8&yuP#~VIpEv1QbkKcC&^9zk#T`UCmfEjb=A|6>+r;3tm+W?SkMOU$tCU8GoTG(L= zklPz{1En{7RN=8{m9LL0v8(*Vn;5`_Mx-i|d|{>*hGfpLaD7k6#&6%gnU}h-;I?&Q zVhXl{t1$Jcj;z??CswGpG8AL24Ja6Kd0e1QARA4(KEZ}o1ODlRvcP8-l9xB}F3y>C z^udE^ZrQ%{xz7%Z=@{_{oA58fDPPmp_T~jCa1hw-1P+ewWYvZfCr-Ko5Mm5&)c{5!dteCpg7L>22KQ6I3fXs>4hto zE`?#ifCId?$rWp}wj!D!MaUgf}_xGzEJXlnH|C8l*1qD+p!4~ne z5*g-k$(Ju1k7UoqX;>_9)?GDHJm?3pt^-8J#HbPV#QKdJ<1rSj=6#=!lXJ_R<)zVt zevH+PKf5J;N7|;E9U!0;kt=p$!bLAtehq>&r+6s9!&w}~bU<^ZQ?naLkxc*t3d<&C 
z!g#@Kb_8r=Ge{e(ag2w#PU0~;hN5YVB|XG39v&N81^lSu_vy}~Z|BzW$DsUi09B*( z*ZA5fdewk0RD>5i=}x$3V4gqf3Nf>V4>uDo4+`R?rKJX(LjoeG)RpWEs}S3ylsAGO zj*5$GMv$U%sYWg%sNq&HW5O#smP=Y_+qSjvmF^ZT5(nB?!qSd=$NHo;>^!*=(*)%2 z5&aILG1(Z_k9%9(-X#6tCo6W@HRA)r+HEcNFw)=;dnOc%rksAt(Kp1peNX(lHyqcC zuy{CY&4`Hiuu=l5Fb%C|fB)Uw6v^oI?UOTM2v9AlJ72zWgCEAKD=7G!8c5+d_;eJq z6XTDh>;BPEh&kxjtl2v=P^R(f7lLSn&W2N`PGQFBp1Fl-N*-&;or7rPv{!v~*45WX zX&yPZj3K_T`1p0?>VQ|moQNYnBs@`KTZ(2S^Y!4|x)nRo!to`D6)@rBaig*3Ur9XabYlody7wm$FUjFHrAB;>8(AR@Qy1_jxmtOiY-*b zXkk-AVKN3kV_y_=I6ZqjfBp2s__?)Gyz@+~GX6!e&q@;P47e3;wYPF}SE+7f*raJX z2!I#~DW?-YoqFaT3YM3w1$y}Uap>H+b996v3<(K&T^Dd@bVjZv&%$@xg_3olB@sM{ z4LxP67F@d!P#oOcVlXN?3gT}+zBCIP+X~7ORO#tOMRrG|2fBIvV|MQtpO}c!h~-?v z3PeEUxxL`R?`_fu_)N%6gdVnfU13%% zFQ8HNUkugT5`O>w{RvBB8|Its#~)#tg9)a+30)$bo24v+ulH9BRPLQ1h9w%Jh1a9KV$iy!Rd}6McmbR38&Y0i&Xb z-8tAcw#=crfMkjRI2_fGU)R^JT|0qAIjr5cRQ%LY)5C~GKyQT_mSkmoPDxE=c%Y3r zE1EQT-K?1eQ)yCUt7=&vST(R*<2AaPCJnFIVF0#++FLHK`G%!BB9ZEXKUc%>>adH8 z);Z=)TZA%jJ`Au&MmyJpVVX=zU)9?WcD!b87r@Ci7=R z361c>sE)?Ts_ST6Na-!i&l9F&%@F^F4LP9pGG;!*l?%%ev0dutkNn_i1Txl(U*f$? zwYCbG^lYu$@C{4$RQ*n$j{0>s>)@>e8#itgky=1;%SIVCzoq0AMlryFlsRZ=$*Y}d zG*Ihzf50aveg@$t!W9W^WhfGW&g(BovJ(I#BEdT1PBBKwM}5GLYlDHoTG1_Ci>T;I z%FDMP$Iz9Gk!xl80JnmJ*g4(B$kZOPh!ATBq%5- z&+fr)6zmajAKC|cEen`8*}UZn=Pe#54&x8CM&i+*dK7-Vo>XVQt;rs4wA-J0s@k=@ zUcIsd6a(TYkF%Enq1)7(nhMdup=zI4TxSR?8?i5%Z(^z(pAVuig04Vmo(Q^e4bXC#w8qsjG47u2XmC_$d~a zmfiRV#dhqt0t%l+G>m#^BgWn?E%N`9PC& zUb5I+eb1YQkVPp9Z)*K6qps3Hug)9vuaJ;3j6-SCa7~SoTggvFG0g5HwCiUw^$pG( z`JF!lK!$;yUI6_!hv4U!75-Se4-kNzpI;yGHWP6!amgwt-WfY+Cm?iB$LHz70ch** zPuaL9c$;O>6z>Nl)jPLu@52X5!%H1No3HJ#TvuHo*fA`F>B1`%lah*$-?ZldSTFJN zdRQj7p(Ma5)%Q-YOer7P{dLHx;1WT?IGy`3fB$xLG#o)89X%Ksv{(M#oZJ!l5tmKt z;K5dOIWOM484rgjye8AqK~(oJ+1Nnw1hfEMOv@0N(fUw57$4g|=FzM@+bfqqMK zg7|DK>(_^#SwS~~t~dPL*1;E+^bcGY#fQ4vAE8hckdj)5vVFa~qO>$KA`^un*zP%H zKwRUQq`q~1APzQV8BAQ2-#^ZN@){=VKtAgl;Yp=BO-Mb+3le;psE4* zjwE0mWgO|2At4Db7Qj~UA^Cu))12@F8A}-@Fgsn#ugs1uJv0?ZQ6EjaM(@soHE#aNX_?hmP!(X|K-3pkVU%ES-#JrYbT*^eb*E{T2N5TfyG_( zgPDT-yPqseqHI6%q=9@IV34v94{_2C#Tpy>v^34Zi<>pcUw0sb#N#$~{t9GJ2}xoF z1>JKm1lquFW67ja29a?Fqxow9UD;-f8Ggs*sRS;qCw-i2*Z9=c)!S<;{4GFexz(cE zb|oe2_U7R4rKPP(F@svzsXCnbDKkHxobX{(?@g$*iec3fIN!__6%)f+9e8E`bCJ&| z+$4NA=kfL{12fjw*Q2TAS3KtK?(T(q6!%H|q82q_Ev;x@r8I}Ly?rR``h~#e!i`T& zON$;(f6^+WHxYW^#Ks5)Z(DhRxE5uVVzl$cw!_jlhB6dyutrm7f7ZwLV-r0~I}QszU4 z4y8NR;BSzYML}Vqom)U8-&&J|MnxDO_Lm}4k|TzML>W7pZC2MLIOBPi3G5U*lQM~)so%ziDaT1)I;^D$RvNARtl zKfbub+(T=o1o$1a@3|a~;)yC}+#7UnM^PFzX?S!oen$Tq-;ER5llOg%D2i7&;WD9E zX%7C41e{zYJ%tu}(tzVyiVu3kvT;$wIT556JKznk!s(n~zUPs}Q(?+@J~@y4zb}db z&#a}Rqto{dmo?)N{uFjo+u6^Hz0xke;_3^gRp+a(>`u7jsILl01DmzA@a{Z=Uc8{R z^ubCguT!V+w%7xMf+R{$aQHCo%2fDr2bT|72P9))u}_j|o86@n-t=~zzB@FTwR!hh zde9m~2qK~=fND>ua!&o45UMa5P$-1v$8TTo_UU^t=*TdTGYlPkXSZF zz6ZOpdF@kI2Zu8Ry8(%NdU`g%MC{`A>)W>FPhI+jHBb(yh|KTVpP6$Rgv4MS&gQV; za7m)+7D7A$bezl06(fBNunj0W)?r6Mii)2cait}B4|=ahh(QQMZVI1LP^!7-f4qj+Ki`OX9hgO`F3MuP-e zgg#P1;)d0Jxb26OtAX@LY{tRAGKM#&rlzWIxhx2@CnzY0dT=XR4PImI9E2|gmB(gz z?9H1u;GE8ZP_qXM13*cX6J2S0S2$1{TFezLXvzDltZNCP+@5WS9i`|5uUVaT$biQA zwUsX1dBIUnfE*_g6Zj*>wjE*?3Ae{$dN%tK%1d3kXo0#V4(-EjqQePZzka=dUTjp< zdR}qBey@*%syOod+>1m__qhxwvv8P|$|^zSfytTXaa5rMT2hB}V;&^Ej6$&*1@Rg^ z(`JN2+!B`0S-d+>3S$jx6mDo^bF)V8v78*lUUU{6f#sXZrVs8}7EWz#Xi%FO3a_5z z*x54mf-&)zVh(TZZXqFcmz*{LUugU!{d@0aw7*)~#KIz&%E`h)16C8cB?I>l=onl7 z?##K|R3?E{JzWj@7?8$ylUkOH>U!yo&zoLMRpOq4neyY-RsgO7E%BsFfSrYQ4{Ws# zF`ZU~muX39sc@wFUT}4ZNl7A4(B5>#gEx5)9Ms*{w^c$y;uDw^kkSkbco^YG=+OHd z5x(4(gDWPF#g-2CEeN$e&@^gGB^%>MU4dqsR|nRH{N9a{BM#L%GG`n(;yb?gK3%x5 
z9^@ql4>z{~uyW)XUfoZi2ficEwDtBTqSf?mX{i%Zx|qKe5j<$saOA|UTuDr%LLfkj z;EN{WB1exNWmvoR4)jJ{A9JFS2@R0dA49f+->k7IaPCZtpQ$4KCtUMf^d2)@H5NwS zzE#7s&qmdD(OPONumAwS4Huq7IrE(tedA?+ga9aKgC9gMaXnt;1xW9Lu;w+Rv16aQ z`s1#yi%7nx!n#12#_}h_2<`$@-rB;fq#Di$+pEhZX-P%?zEf0m5Osc2L;qB~JJj4X zX(&}p27Up`B^%5+?(Nd_3C>?Va_Fnr2kaca10vxvx=1JzyHJxz^B%S-3lW*owQv?< z>B5gh{?+^X^>ySYO++h>?q?umkf=pgtXSdpqTs4bj{ESCyu5rOG&|td3~@2hJJWCe zGKP}77k)>V+72IfbW{gaVP$%1yL8<|%z6tei%ostRkF7kycsBnTog*|TwK}n^)6Q0 ztp-W&fi~R$6fDfmFW^Z7tfeaZ=ys=~>SX1j zlDllp6IW!-tE3El3%+dsbqh=Z>Z03=i(!YNA#l5pnJGf_MZ~unR0}$<*}gCFtu9i9 z(t*Q3(_NgqwB9EidZu?ZE-!85XKnF0aU9~X3W4x5V6BL*stYk#_jPh-rvVOUwgh)x0Wf!rUh(0{& zS&D2p8cY-D>)rtsW`G`=cDeyOUqLNef{Vy%hM+(HhJ~58x3{RWvgJ(h;DeA`xI#JL z8N8wA!$agGJQ*aKIyf3sDmyznIRosRocl4g#$1Q?)Exs+dESCr)W47`ijYlNZ6I~9 zcTgZzde5F)V6bs))fO{0`#=IjrMTes;B9E)<7ea7$E+TWM=)B=eDJ^l&edJBK$7H_ zdLu3kf9-w8ovff1h-MG?6T~+b)xaeKZkY!Y3cn)Ou;rk`Fht74mA)Juy#`3l(9`qYK*e|v zeC&Fm6uqRA`EFuD)I=z;^j1<=RIXd-GIlVujpLqg9BVd^fL;v63Rd> zyj)mV*txzIf9}Ykgbw*(S66LH#1K;A_IHz$+D5r1y>Crk$i8@|9|@SW1rqJD z2W*D(0Toa!eTR@LvgITo0Z6%m3%}$z3^NsezTZuaJ7t|8hJvIy{n-aXDAOm~1SFt@Ub9b`9tS$RA! zxX3B+NI`YoqF>VQwNZbqN<0{oJ!wuW5oa_t47ZeMEUNEk6a(`K?A*x% z_(C*0l09ax-^>0{*nR1)|M{t7>M1QmNt8~76|XPp9+3W&<nKR%vU5_6A$mi&p;y%t~eQ2?0?;dx2-T)Z}X9G>G>Fv&T(xiSQCF>*uNG5Z+R*UR^U}7pYPgoj3h)4<0m_6|jVPpWGpJ z3|M%l5{yD=o&KItFtnuq-~>o&Q8aLE-n=~R{zsx^dKub4 z0jx=e)<#_5ziDj?K4HqEbmBb#XvQ~Eh&6`OJ?!pg1#*QaUL(qZyps?r zCkectccLD=c6R#yQN~*nu07&qa&7=2q+~@7B@3K%bhWfXfjOXX*0Islp7%kd1+$f2 ze*dQ>{@On9%?R7pZ!B`zHTX`N;?NN*lR}T8J27>@4}wZc=F_hNj3-S`t%e@!8uX_C z%m!~>c1EeEhpMONK%}Fn22C*BpalYxJI0#FdC;AQ)YcQ4e@EAl<$Tj5Ku{wD1CYIfp_dcxjl9FlvMmf|8aNq#%r656F8U51^kO@e$Qg({Iq@ zDB0skS&DQ>WWH##IGi|P>kNHk{szCv$+XwjUGnXJYuNfLuXTIg{rw9{#q%Jxgp`!H zpFVv`7CCq38gCWckD6sQU>jSK<`~M~^uIsUo1s#6aB?!no1VFvnre26(^lg3X4Ghl z(2J)5@R2TSX(<(FhiKs`zlZAn0%e0w5rznc0h$Y!FK^yf1c8$Gq!FZ#d$(Lp2Lgfv zw;lCWrT=GfPE*+70xPoJ*CFGQ~##=0EcSo!Wj z$O!0Z!cf8tRCpwywL(8hjldOa930$el*^8qy%AMP|6Q{~RsW6`+zHcCeEf%?qc#OCvOsyXGVN_3 z85f|CFO55l%eKnE^C`Nnv|tv{y}ega(TX;X#w$F&9bkLu(fuU@KZyNF^$+l^z^684 zPsBA4z8Ksnv@;enpU9SEv|gjoKOw?;wk>*OlX5XM!>koo@svp|3r!rZ9#Pa(`{ZA+ zeNlDe)~y7{zQ_Z`kN}L2cxh>Asc%M}mz~}5U;Xnv2=9>Cu(!WN(i%q7K)rfO<#n+W z9WEIW0H9QRi;Ul-;>_Py@(4E%-QHq*u)<^LEM61%&jg~`LuZ$Yz69kh;4W#EU_*8| z)M=K1lHdDNpgsdHBWyjYtfXWQ2@6I8$cPg}DY&q)pXV?-A`i<>Mg%2fN%Wy?D|bk@ zq1izMKtRD*0J3S9?}zo7Feg_=H=L{~sS%FASg-467^$;3pU3awE zxOsRQaW+Bf?}hp!G;HHZR51J8-10i_pZfntML_7e(V_1?$Y=#ELPA92`Yn6zZ#P%; zQe`~fszEuj0SK3#nYk7F3j{28Z`-D)^#kosoF!XW>)2`8p=*Kl9q}FymN`N1rC>)F zY+JXPP*6p!1qY)J-GvwS`w(#tpr$mWbdf%KHsoe!B3q=`!afg88jsg521^pH*kxIi{AVfMWlJ{Ym1Z=>E zD*+Y`Y&T(*a7ZJe77%4;0K~O1ae!~aS@Z~@ZqZ|3iY0hE`1|D64N9IA^sgHM0A-In z;TBxR38tr~C(IWM56>DXZ^5t_R|>wtCJr(pf{-$TLaV>RV-xytLfiJh{(`8iK@>HC z>*z#phx~Infj#j0+`68N>m1&^J>ES1x^)DpV$eteuPLaq$_CI=vQ4tm*P$9|j@bh>E_I$sHciaou}4G<9eGoB;Q zu%@=}F42Ai<#M|Jz%r%=OecAwM8H6s6ZB=6)RmAe%42N1v^$PhLHw@}h-9Nt{iY-O zrtJJH%b@Fr90omg-P6sMSA!=antjrEvHV@ubnN7bB8-VNHZ~IXYSLZ9Jz@ulAA5Qf zj`XA-2-PGvvYsZ!dOEBAI!Z>gQnrGE0Tg_QJnsd#4DBT=Nlx9myY8l5G1f@()#G5$ zC~n=_Vbnbj?Ew!|2xLVgl}e53CJj*NA~K+^Bnn-ep<)E*lkQL_Pl>8*F?q54!|i=r zb;tvwmh-3z_d!yNnl38);H?p870+g8Zza?o#)N);4EZhb7k_5+As@WD6@b7<4=l)- z8;OQ=EsxwvT<(@gMlM7El6>g<20_ddhG+NBxqa4ECs$!es0Ak>_Or0$Hqf_-r48ha zqNh)~dZ4Vi4_sHGNlQy7`c(A(8z6xQ#oKVe&=f-MX+=sINLJh&;*SMm)u+biwb6)Qf4uUL%pjoVoJ$Y*L| z;%$CWy*dWyaCorPd}#%$^c55kf2;BDMj~_&oR^yuXGwB(x= zs^c|8{zSa8%n=}6ddgd*IxQU?@!~Fb)H4_lVo#f(2TM#RB3~D^lde1O*f)sSR)#`B+?z$WawSn>)SaGJ zPbtUb97@APtUU)wA)MSXi5S2tsR)LdSIJX=g+QKy~YfQ^3C}g6m&B|iLxv3x2Oj-5sP1eNkfN 
z6VQ12lnDeXD%E@B(HdGc(63D@fJ(wiI2_%x}VT1V=3Oj~Tsn3%=nlrT;zqja?V4tg;j7%Di`4y#vq9SRO ziF)p-Ti%x!7D^VH7prR>O4^AYRg*kEx#j&MaV~GbBxM4{v$AxdjrjP{qll7{66ykm z)jH8Yiq5gWHFk1mW@hG^Nrd@KvdAH-efQne53$Ob{5_~!yCKlB8Y$7xpi(cO&FK<= z=2A4Nojd4LM(+I^hQN|Eyko)V(izv*+0PIKI^JAZ#FHSaI zgsvzlG5M;+)Yj2K)oK}U(HomomVWQq6kW66Djl3mS1GtcLnu0;qb8uc*~-t)4@r?}%}*3c zZLeO{JX)5Nh-cxj&Zb+nDta92wNANU`hmLmyN?TU31Oj`H=Ecq+sYOJx-|NXkp~39 zu})5=+T|>xZ3<36EKu5ixgcY7k$Kv}*^9p;2x25WbQhS3~5cwH&o_j)?i zQRqLC4iNTA#>>2EYRY)h+NU!zHYW1Z4tiba#kUnJK;-s+rHA1J->5PD6fA_b_4TjLzsK5UT5oX0C}4I&=WYOL*S0*l`EwK8ns3La!zg23nsq?284Y!3x5s3*(-o zk%G|N+0*k74U%x&YGlvY@~by)FzM=c$Sq`b=NN8I=U}2BRm}s}Q9@soH-?f%VzpPm zzba~^&>~4d!XgOykE|wlB?XGx42VSC`gqvb!ckVBK~&Z29^miK@Vla{Y!yYtf03Sv zSKeS($nb-gF-z|*AB^~o4)#h+_)cK0*vQC;_#*ua016pg1Y-T%J7^B<)I!n+LUkHq z$^=|Y5x`fZefVn^xDIV??Tk-Ym-(b9jnD+IUIWV-vmx$Dq?NI)c zjjO7w-D)>wR-vSWMhhb})@a}9r`SFS4l=YDLZycBLedaYKUQXxOGN@HKHSDiS(PA& zZWISLy-A{0D800&rN6`Y$k9<0)Ze(aS=w@bJK*ORbyfCuuc`c-TAu2&XL;T4f<7bz z5_8M{y@&r3j6J4g_98)bBJp*l_;461-OTgP<($pEoB9PH1ETu3c%bO}$4of${f{=j zXgqBwy6K(9(^v&f{342vZBHUXA%F;u*AIUmTH>|-Tz9A9o$2`UUW>kN)LPBxMXu-J ziN-73jA9xUBJ5Vgy>7H7Y-+_XAw6bH(4)!9jAVroDT7^E&aP0cME>>_W?e(32`Y=C zakepz)%5C>RZ13yt3dX0)F_HiRzDjYthfI0rFu9<)XZsedYWjfpO@H>Ij<4Y_`@C> z&&3JD0=owr$%1w83lT>XbZ`6b`2HV45%DCQNI&#+;pP3wu>%8$BlwuSW2QnE*=wpg zB;^{Mon>TX)G$85!pa(k%1Z$Fj|f(7+V1Vb;xHl<(Vcl!1%^4|cQWVe_RRtUl!G@_ z8gV<`zI{u9Hdb0%dU&1}r4UH%5r69Q_pdfMV^}G)My+bKu=Mw@ zA{VtHbk@7O-|l`)bLY@#dD%N)W`ea75}18wU#~%L2gkgrwKcm}v+%Io)&0}bf#*$= z9P*qBh@m>BrE^(((70UB&tKS8PD9Ui6W$i$r!;y|cMeZalLdqJ5LsnDeRW%tnKV@) z5o|(LOE4wr0)tlXU3x8rly)H3`4DO*oZ;v3x6PKAR5EXzLd_X7t84N81Io;K$k4AS zKLZ>lKO2aR9x31;a~EjUaK1bNVTWr93*C!jOhHZ!cX4AfG6#2(5Gbhs8UQCS{B{n= z`;Sft%}Pv23?&botv+pOd0rnfK;#ALXh5DxsZm-1x1pK8g;#-4sZWRj3pky*&!3y| zs)paai@->Gvho$z|AVf&( zPG`=HbtGdDS8QeOHArI~04V=>Q%zgM)X26)~9#JuiJ*ynNnBm{MxI)qE0n zgF0mD6gG*g16l`F;Tlv2q=u)6?�W`yi&$5)NGmkRkSt67QD6F&_jesP(qV+Vr%0 zahNjZ<(VEZCgUZT8@eKGXItP?DGX^53OzM7H40dPSilxXFnqi>ULoe0l16C|1$pe;eS zuzcnfp8{w{3m7IVNpo^>ki%)qC!mcx@PN*zN8;r9=Y?FN%N_;_r-P`}i1phc+9j)a zVpl&uDl!UgTU;9=6g+=O=@mijZ-yNO>=`mnJOCcx#z+9<&5C z40obNh@?=s+E#tai?68($n+T1zX**0LMIXWR00P>EFoPh+X4y`-5c`smbpk@62RMott^O^$tx6St5o>e_5lKb`&5CT}&~1q>M-j0V!=n!o zC!h|1gg8E@!X7mBTL@G!8Z}6J0LX3$WEBIAhpD5EdeE{%9n@3}A|3n(UYdgbIg55!O4K$Q&8yR;mZH z<9X9P(Nv@HQ7dm{sSw1h`m=VJnsC#Ltgx#rH_K9!1{=)Ark2jjqC(NYlD zC_wpvSilNk(}>b;9%FF%HsyDg@|Ymi(Zs%GVx3Biv!QIR#Lt2xHXE^@B4--%b@J}f z#sz(|f=mHI!oH3|d^qs=&w*uFkMnjGW>0tR9;`!i42>}YG01ol$>C5631)qn?qdee zExtWI_a~|ZJR%V?QUuLB8~JKvE@;QIf$BX5*#*a3DI}AFJphFo0YO4y^=@?Q0c{*H zNrRk*!i-ux*nm+c=-@TY%~NCwqOGOQfpdj!^C0oXK?WjD1D&!`?U0v&T?VVR7t(!u z7Z-YzyIm^ymfV=bz)&Is<|WA(6ka26)#==0^@{Df31-da968b9>9#e#ae;w>kbcKN zCQ$`XCPe*6@8_YXpxzMF)zt-sz|PGr%4>2c*Q6VdBNw$5kxC$-zZ)Ih2kgQ=UWH*H z6tw5io7IAv0fulcSP$Ox_7dEOvQfWXwwB7qkM_iHNW?HLo&NnR2*bC0C}N=K;X{i| zYsP*I5~5y^(xei``R>LsF`V+4`~RdbG3R_P@MI`(f$4d!@j22?M)#i#v^d-(10G0P zZJnJLVBR$W(G@3C1Ue)-sF~Ig4GB6BWhdTP%2OQR3qX_v$nd054HT4%3*ZfvlTi2Z zd!I6X$l+GG4b|7Rys~#bh7&_p*CQB^svPX~BnnzCrpLUjtm=51U<_br6AQ}C?)cmG ze^;W=%>H*JN-1!^kkOtc=YNP%+Urrb8G-yHV`1cVRabXo6M5u9_MAsp*eT|i-H8kvRcJQpbi5)K^sD0C!TA+gLHe-A5VK`4-jI0{Ov zf%>2tQQ|B@2oEBq!bf#`T`9}2a@s%(&n+8G9ipT`uO2!omeo#u!tpd|V9`kM(Sa5L zhr$*3A#*l+$1HP9_d82VXN3K1bou*@$91{l6FKo=st!#)VTZHytY zQidxqqp;S`-~W<{cxKacUT1zM{nb%JNWtLrF5;|s78)-qt74}WdGvx`0s;bh@U0@f zeO7$516J`4GG%j>nAMGHmz~<`^jsLx74P2j62$LM!IqAg zNg)XwI%Oum1I!_Wj!x+IJioz_5B}+w;kFV3a}+fMN`?1-T#Ur?DhpzIz-VRX=QC2^Hj+DjU8W@^ zCC}O@$(W;o^7P^a3k4F(eK0Ngi@04DE3&Z&qE(8@E9~mkm%4Va5BYWL 
z=FN@hVPWnTgSpK7{Ik0nxbuU7z5vzW6w=ufP?aYfd~9RBY!p57rrnRwnD{Sa6MI-7 zK 0.7 +# 256 * 128 * 8192 -> 10 +if __name__ == '__main__': + torch.manual_seed(0) + + # hparams + repeat = 16 + dim=8192 + layers = 4 + + batch_size = 256 * 128 + + # simulate forward pass + x = torch.randn(batch_size, dim, dtype=torch.float16).cuda() + + for _ in range(repeat // 2): + quantize_rowwise_nogroup(x) + + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + quantize_rowwise_nogroup(x) + torch.cuda.synchronize() + end = time.time() + + print(f"time: {(end - start) / repeat * 1000:.3f} ms") + + + + + + \ No newline at end of file From b373034e31c0d0796363e389f4b6f02acfd77a71 Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Wed, 29 Mar 2023 19:04:53 +0000 Subject: [PATCH 24/97] test --- tests/triton_tests/attn_decomp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/triton_tests/attn_decomp.py b/tests/triton_tests/attn_decomp.py index 9e8ed28..fa86995 100644 --- a/tests/triton_tests/attn_decomp.py +++ b/tests/triton_tests/attn_decomp.py @@ -97,7 +97,7 @@ class Attention(torch.nn.Module): def forward(self, x, attn_mask = None): q, k, v = self.in_proj_linear(self.ln(x)).chunk(3, dim=-1) - x = torch.compile(torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask)) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask) x = self.out_proj(x) return x From 8645d1f71cc78155887bc3ba082b1a610a05e31f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 29 Mar 2023 18:41:37 -0700 Subject: [PATCH 25/97] Added normal quant. --- bitsandbytes/functional.py | 76 +++++++++++++++++++++++++++++++++++--- csrc/kernels.cu | 4 +- csrc/ops.cu | 4 +- tests/test_functional.py | 10 ++--- 4 files changed, 80 insertions(+), 14 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index b38ba1d..969250a 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -9,6 +9,8 @@ import random import torch import itertools import math +import scipy.stats +import numpy as np from functools import reduce # Required in Python 3 from typing import Tuple @@ -152,6 +154,70 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True): #return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist()) return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) +def custom_map(seed=0, scale=0.01): + v = [12, 10, 8, 6, 3, 2, 1] + # 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45 + # 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48 + + # 13B 100 steps: + # - 4-bit evo: 86.02 + # - 4-bit norm: 78.73 + # - 4-bit FP4: + # - 16-bit: + + # interval search on normal distribution + #v = [3.090232306167813, 1.4589770349449647, 1.064410327932115, 0.7896806653244509, 0.5646884166925807, 0.3653406435875121, 0.17964844284441311] # 0.999 26.5 + #v = [2.3263478740408408, 1.4050715603096329, 1.0364333894937898, 0.7721932141886848, 0.5533847195556727, 0.3584587932511938, 0.1763741647808615] # 0.99 24.99 + #v = [1.6448536269514722, 1.2040469600267016, 0.9208229763683788, 0.6971414348463417, 0.5039653672113453, 0.3280721075316511, 0.16184416680396213] # 0.95 24.53 22.97 + #v = [1.4050715603096329, 1.0803193408149558, 0.8416212335729143, 0.643345405392917, 0.4676987991145084, 0.3054807880993974, 0.1509692154967774] # 0.92 24.81 + #v = [1.2815515655446004, 1.0062699858608395, 0.7916386077433746, 0.6084981344998837, 
---
 bitsandbytes/functional.py | 76 +++++++++++++++++++++++++++++++++++---
 csrc/kernels.cu            |  4 +-
 csrc/ops.cu                |  4 +-
 tests/test_functional.py   | 10 ++---
 4 files changed, 80 insertions(+), 14 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b38ba1d..969250a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -9,6 +9,8 @@ import random
 import torch
 import itertools
 import math
+import scipy.stats
+import numpy as np
 
 from functools import reduce  # Required in Python 3
 from typing import Tuple
@@ -152,6 +154,70 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True):
     #return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist())
     return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist())
 
+def custom_map(seed=0, scale=0.01):
+    v = [12, 10, 8, 6, 3, 2, 1]
+    # 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45
+    # 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48
+
+    # 13B 100 steps:
+    # - 4-bit evo: 86.02
+    # - 4-bit norm: 78.73
+    # - 4-bit FP4:
+    # - 16-bit:
+
+    # interval search on normal distribution
+    #v = [3.090232306167813, 1.4589770349449647, 1.064410327932115, 0.7896806653244509, 0.5646884166925807, 0.3653406435875121, 0.17964844284441311] # 0.999 26.5
+    #v = [2.3263478740408408, 1.4050715603096329, 1.0364333894937898, 0.7721932141886848, 0.5533847195556727, 0.3584587932511938, 0.1763741647808615] # 0.99 24.99
+    #v = [1.6448536269514722, 1.2040469600267016, 0.9208229763683788, 0.6971414348463417, 0.5039653672113453, 0.3280721075316511, 0.16184416680396213] # 0.95 24.53 22.97
+    #v = [1.4050715603096329, 1.0803193408149558, 0.8416212335729143, 0.643345405392917, 0.4676987991145084, 0.3054807880993974, 0.1509692154967774] # 0.92 24.81
+    #v = [1.2815515655446004, 1.0062699858608395, 0.7916386077433746, 0.6084981344998837, 0.4438613119262478, 0.29050677112339396, 0.14372923370582416] # 0.9 24.68
+    #v = [1.8807936081512509, 1.2980047163986055, 0.9769954022693226, 0.7341502955472268, 0.5285136765472481, 0.343225833559403, 0.16910470304375366] # 0.97 25.03
+    #v = [1.7506860712521692, 1.2496468758017434, 0.9485350408266378, 0.7155233557034365, 0.5162006366043174, 0.3356393360829622, 0.16547334454641704] # 0.96 24.85 23.01
+    #v = [1.5547735945968535, 1.1608220210715001, 0.893800631179489, 0.6789921163940618, 0.4918050830048072, 0.3205236191093902, 0.15821711945563585] # 0.94 24.47
+    #v = [1.475791028179171, 1.1196635980209986, 0.8674156943957149, 0.6610637542614526, 0.4797170937629045, 0.31299335020578195, 0.15459215234139795] # 0.93 24.85
+    #v = [1.5981931399228175, 1.1821583959486879, 0.9072289939325966, 0.6880384454306778, 0.49787602226482025, 0.3242955535308664, 0.160030379970179] # 0.945 24.287
+    ##v = [1.6164363711150211, 1.1908453913294612, 0.9126463450304729, 0.6916727602238111, 0.5003095327012462, 0.3258056171348078, 0.1607558311941979] # 0.947 24.293
+    #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207
+    #v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30
+    #v = [1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293
+    #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88
+
+    # 7B evo start
+    #v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06
+    #v = [1.6143079205628337, 1.1888081407660314, 0.8990131955745421, 0.694373759813679, 0.5083033257326773, 0.3452499746844963, 0.1148939728228951]
+    #v = [1.614442766030303, 1.189401918639665, 0.8998038168964273, 0.6953094818279475, 0.5073264599048384, 0.3449003790823619, 0.11428378427205564]
+
+    # 13B evo start
+    #v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 0.4949093928311768, 0.30920472033044544, 0.15391602735952042]
+    #v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283]
+    v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908]
+
+    # mean evo 7B + 13B
+    #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237]
+
+    # theoretically optimal (0.93333)
+    # v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333
+
+
+
+    if seed > 0:
+        v = np.array(v)
+        np.random.seed(seed)
+        v += np.random.randn(7)*scale
+        print(v.tolist())
+        #v[0] += (np.random.randn(1)*0.001)[0]
+        #v[-1] += (np.random.randn(1)*0.001)[0]
+        #print(v[0], v[-1])
+        v = v.tolist()
+    values = v + [0]*(256-14) + \
+             v[::-1]
+
+    values = torch.Tensor(values)
+    values[0:7] *= -1
+    values = values.sort().values
+    values /= values.max()
+    assert values.numel() == 256
+    return values
 
 def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8):
     e = exponent_bits
@@ -168,7 +234,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8)
     values = []
     lst = list(itertools.product([0, 1], repeat=precision_bits))
     #for ev in evalues:
-    bias = 2**(exponent_bits-1)+1
+    bias = 2**(exponent_bits-1)-1
     for evalue in range(2**(exponent_bits)):
         for bit_pattern in lst:
             value = (1 if evalue != 0 else 0)
@@ -176,10 +242,10 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8)
                 value += pval*(2**-(i+1))
             if evalue == 0:
                 # subnormals
-                value = value*2**-(bias)
+                value = value*2**-(bias-1)
             else:
                 # normals
-                value = value*2**-(evalue-bias-1)
+                value = value*2**-(evalue-bias-2)
             values.append(value)
             if signed:
                 values.append(-value)
@@ -502,7 +568,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra
         out = torch.zeros_like(A, dtype=torch.uint8)
 
     if A.device.type != 'cpu':
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32]
         cblocksize = ct.c_int32(blocksize)
         prev_device = pre_call(A.device)
         code = code.to(A.device)
@@ -585,7 +651,7 @@ def dequantize_blockwise(
     if A.device.type != 'cpu':
         device = pre_call(A.device)
         code = code.to(A.device)
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64, 32]:
             raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
         is_on_gpu([A, absmax, out])
         if out.dtype == torch.float32:
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index a2691be..8f33161 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -2953,6 +2953,8 @@ template __global__ void kQuantizeBlockwise(float * code, ha
 template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 
@@ -2968,8 +2970,6 @@ template __global__ void kQuantizeBlockwise(float * code, ha
 template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
-//template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
-//template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
 
 template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
diff --git a/csrc/ops.cu b/csrc/ops.cu
index 07ef850..8044c66 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -71,8 +71,8 @@ template void quantizeBlockwise(float * co
       kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 64)
       kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n);
-   //else if(blocksize == 32)
-      //kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n);
+   else if(blocksize == 32 and FP4 == 0)
+      kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n);
 
 
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 54cecca..cd4728e 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -152,7 +152,7 @@ def test_dynamic_quantization():
 
 def test_dynamic_blockwise_quantization():
     #print('')
-    for blocksize in [4096, 2048, 1024, 512, 256, 128, 64]:
+    for blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32]:
         diffs = []
         reldiffs = []
         for i in range(100):
@@ -167,8 +167,8 @@ def test_dynamic_blockwise_quantization():
         relerr = sum(reldiffs)/len(reldiffs)
         assert abserr < 0.011
         assert relerr < 0.018
-        #print('randn', blocksize, sum(diffs)/len(diffs))
-        #print('randn', blocksize, sum(reldiffs)/len(reldiffs))
+        print('randn', blocksize, sum(diffs)/len(diffs))
+        print('randn', blocksize, sum(reldiffs)/len(reldiffs))
 
         diffs = []
         for i in range(100):
@@ -184,8 +184,8 @@ def test_dynamic_blockwise_quantization():
         relerr = sum(reldiffs)/len(reldiffs)
         assert abserr < 0.0035
         assert relerr < 0.015
-        #print('rand', blocksize, sum(diffs)/len(diffs))
-        #print('rand', blocksize, sum(reldiffs)/len(reldiffs))
+        print('rand', blocksize, sum(diffs)/len(diffs))
+        print('rand', blocksize, sum(reldiffs)/len(reldiffs))
 
 
 def test_dynamic_blockwise_stochastic_quantization():
 
From a13a522c4c3dfc5cc90e5b88ee29c45df15c6b75 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Fri, 31 Mar 2023 11:20:54 -0700
Subject: [PATCH 26/97] Added first triton test.
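
`MyLinear` becomes `StandardLinear`: a plain linear layer routed through
an explicit `autograd.Function`, so the pure-PyTorch path can be timed
and tested with exactly the same call pattern as the Triton-backed
SwitchBack layers. A usage sketch (a rough outline, not part of this
diff; it assumes a CUDA build with the Triton modules available):

    import torch
    from bitsandbytes.nn.triton_based_modules import StandardLinear, SwitchBackGlobalLinear

    dim = 256
    x = torch.randn(32, dim, device='cuda', dtype=torch.float16, requires_grad=True)
    baseline = StandardLinear(dim, 4 * dim).cuda().half()
    switchback = SwitchBackGlobalLinear(dim, 4 * dim).cuda().half()
    # both are drop-in nn.Linear replacements with the same forward/backward API
    baseline(x).abs().mean().backward()
    switchback(x).abs().mean().backward()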
---
 bitsandbytes/nn/triton_based_modules.py      | 85 +--------------------
 tests/triton_tests/attn_decomp.py            |  6 +-
 tests/triton_tests/full_matrix_decomp.py     |  4 +-
 tests/triton_tests/mlp.py                    |  6 +-
 tests/triton_tests/mlp_decomp_autocast.py    |  8 +-
 tests/triton_tests/mlp_decomp_autocast_ln.py |  8 +-
 6 files changed, 19 insertions(+), 98 deletions(-)

diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py
index 9fe0b69..0344464 100644
--- a/bitsandbytes/nn/triton_based_modules.py
+++ b/bitsandbytes/nn/triton_based_modules.py
@@ -133,7 +133,7 @@ class SwitchBackGlobalLinear(nn.Linear):
 
 
 
-class LinearFunction(torch.autograd.Function):
+class StandardLinearFunction(torch.autograd.Function):
     @staticmethod
     def forward(ctx, input, weight, bias=None):
         X = input.view(-1, input.size(-1))
@@ -161,87 +161,8 @@ class LinearFunction(torch.autograd.Function):
 
         return grad_input, grad_weight, grad_bias
 
-class MyLinear(nn.Linear):
+class StandardLinear(nn.Linear):
 
     def forward(self, x):
-        return LinearFunction.apply(x, self.weight, self.bias)
+        return StandardLinearFunction.apply(x, self.weight, self.bias)
 
-
-
-
-class _switchback_mlp(torch.autograd.Function):
-
-
-    @staticmethod
-    def forward(ctx, X_3D, W1, B1, W2, B2):
-
-        X1 = X_3D.view(-1, X_3D.size(-1))
-
-        X1_int8, state_X1 = quantize_rowwise_nogroup(X1)
-        W1_int8, state_W1 = quantize_global(W1)
-
-        X2_pre = int8_matmul_mixed_dequanitze_bias(
-            X1_int8, W1_int8.t(), state_X1, state_W1, B1
-        )
-
-        # X2_v1 = torch.nn.functional.gelu(X2)
-        # X2_int8, state_X2, = quantize_rowwise_nogroup(X2_v1)
-        X2_int8, state_X2, X2 = quantize_rowwise_nogroup_gelu(X2_pre)
-
-        W2_int8, state_W2 = quantize_global(W2)
-
-        out = int8_matmul_mixed_dequanitze_bias(
-            X2_int8, W2_int8.t(), state_X2, state_W2, B2
-        )
-
-        ctx.save_for_backward = X1, W1, X2, X2_pre, W2
-
-        return out.view(*X_3D.size()[:-1], -1)
-
-    @staticmethod
-    def backward(ctx, G_3D):
-
-        G2 = G_3D.reshape(-1, G_3D.size(-1))
-
-        grad_X1 = grad_W1 = grad_B1 = grad_W2 = grad_B2 = None
-
-        X1, W1, X2, X2_pre, W2 = ctx.save_for_backward
-
-        G2_int8, state_G2 = quantize_rowwise_nogroup(G2)
-        W2_int8, state_W2 = quantize_global_transpose(W2)
-
-        G1 = int8_matmul_mixed_dequanitze(G2_int8, W2_int8.t(), state_G2, state_W2).view(
-            *G_3D.size()[:-1], -1
-        )
-
-        grad_W2 = torch.matmul(G2.t(), X2.to(G2.dtype))
-        grad_B2 = G2.sum(dim=0)
-
-        G1_int8, state_G1, G1 = quantize_rowwise_nogroup_back_gelu(G1, X2_pre)
-
-        if ctx.needs_input_grad[0]:
-
-            W1_int8, state_W1 = quantize_global_transpose(W1)
-            grad_X1 = int8_matmul_mixed_dequanitze(G1_int8, W1_int8.t(), state_G1, state_W1).view(
-                *G_3D.size()[:-1], -1
-            )
-        if ctx.needs_input_grad[1]:
-            grad_W1 = torch.matmul(G1.t(), X1.to(G1.dtype))
-        if ctx.needs_input_grad[2]:
-            grad_B1 = G1.sum(dim=0)
-
-        return grad_X1, grad_W1, grad_B1, grad_W2, grad_B2
-
-
-class SwitchBackGlobalMLP(nn.Module):
-
-
-    def __init__(self, dim_in, dim_hidden):
-        super().__init__()
-        self.linear1 = nn.Linear(dim_in, dim_hidden)
-        self.linear2 = nn.Linear(dim_hidden, dim_in)
-
-
-    def forward(self, x):
-        return _switchback_mlp.apply(x, self.linear1.weight, self.linear1.bias, self.linear2.weight, self.linear2.bias)
-    
\ No newline at end of file
diff --git a/tests/triton_tests/attn_decomp.py b/tests/triton_tests/attn_decomp.py
index fa86995..b70bceb 100644
--- a/tests/triton_tests/attn_decomp.py
+++ b/tests/triton_tests/attn_decomp.py
@@ -1,7 +1,7 @@
 import torch
 import json
 
-from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear
+from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, StandardLinear
 import time
 
 # class AttentionOld(torch.nn.Module):
@@ -116,7 +116,7 @@ if __name__ == '__main__':
     va = torch.randn( batch // 256, 256, dim ).cuda().requires_grad_(True)
 
     standard = Attention(dim).cuda()
-    my_standard = Attention(dim, linear_module=MyLinear).cuda()
+    my_standard = Attention(dim, linear_module=StandardLinear).cuda()
     sb = Attention(dim, linear_module=SwitchBackGlobalLinear).cuda()
     standard_compiled = torch.compile(standard)
     ln_model = torch.nn.Sequential(
@@ -360,4 +360,4 @@ if __name__ == '__main__':
 
     # import pdb; pdb.set_trace()
 
-    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
\ No newline at end of file
+    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
diff --git a/tests/triton_tests/full_matrix_decomp.py b/tests/triton_tests/full_matrix_decomp.py
index de37b95..e2932d4 100644
--- a/tests/triton_tests/full_matrix_decomp.py
+++ b/tests/triton_tests/full_matrix_decomp.py
@@ -4,7 +4,7 @@ import time
 import torch
 import torch.nn as nn
 import bitsandbytes.nn as bnn
-from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, MyLinear
+from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, StandardLinear
 
 from bitsandbytes.nn.triton_utils.v0.quantize_rowwise_nogroup import quantize_rowwise_nogroup
 from bitsandbytes.nn.triton_utils.v0.quantize_columnwise_nogroup_transpose import quantize_columnwise_nogroup_transpose
@@ -350,4 +350,4 @@ if __name__ == '__main__':
 
 
     with open("tests/triton_tests/info.jsonl", "a") as file:
-        file.write(info_json + "\n")
\ No newline at end of file
+        file.write(info_json + "\n")
diff --git a/tests/triton_tests/mlp.py b/tests/triton_tests/mlp.py
index 1ec85b8..8aef105 100644
--- a/tests/triton_tests/mlp.py
+++ b/tests/triton_tests/mlp.py
@@ -3,7 +3,7 @@ import time
 import torch
 import torch.nn as nn
 import bitsandbytes.nn as bnn
-from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, MyLinear
+from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear, StandardLinear
 
 def construct_model(dim, layers, module):
     modules = []
@@ -41,7 +41,7 @@ if __name__ == '__main__':
 
     # construct models
     standard = construct_model(dim, layers, nn.Linear).half()
-    my_standard = construct_model(dim, layers, MyLinear).half()
+    my_standard = construct_model(dim, layers, StandardLinear).half()
     switchback = construct_model(dim, layers, SwitchBackLinear).half()
     switchback_global = construct_model(dim, layers, SwitchBackGlobalLinear).half()
     #bnb_8bitmixed = construct_model(dim, layers, bnn.Linear8bitLt)
@@ -61,4 +61,4 @@ if __name__ == '__main__':
 
 
 
-    
\ No newline at end of file
+    
diff --git a/tests/triton_tests/mlp_decomp_autocast.py b/tests/triton_tests/mlp_decomp_autocast.py
index 3a1fc9e..54bd5f5 100644
--- a/tests/triton_tests/mlp_decomp_autocast.py
+++ b/tests/triton_tests/mlp_decomp_autocast.py
@@ -1,7 +1,7 @@
 import torch
 import json
 
-from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear
+from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, StandardLinear
 import time
 
 if __name__ == '__main__':
@@ -26,9 +26,9 @@ if __name__ == '__main__':
     ).cuda()
 
     my_standard = torch.nn.Sequential(
-        MyLinear(dim, 4 * dim),
+        StandardLinear(dim, 4 * dim),
         torch.nn.GELU(),
-        MyLinear(4 * dim, dim),
+        StandardLinear(4 * dim, dim),
     ).cuda()
 
     fused_mlp = SwitchBackGlobalMLP(dim, 4 * dim).cuda()
@@ -163,4 +163,4 @@ if __name__ == '__main__':
 
     # import pdb; pdb.set_trace()
 
-    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
\ No newline at end of file
+    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
diff --git a/tests/triton_tests/mlp_decomp_autocast_ln.py b/tests/triton_tests/mlp_decomp_autocast_ln.py
index 2596278..0a50cab 100644
--- a/tests/triton_tests/mlp_decomp_autocast_ln.py
+++ b/tests/triton_tests/mlp_decomp_autocast_ln.py
@@ -1,7 +1,7 @@
 import torch
 import json
 
-from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, MyLinear
+from bitsandbytes.nn.triton_based_modules import SwitchBackGlobalMLP, SwitchBackGlobalLinear, StandardLinear
 import time
 
 if __name__ == '__main__':
@@ -24,9 +24,9 @@ if __name__ == '__main__':
 
     my_standard = torch.nn.Sequential(
         torch.nn.LayerNorm(dim),
-        MyLinear(dim, 4 * dim),
+        StandardLinear(dim, 4 * dim),
         torch.nn.GELU(),
-        MyLinear(4 * dim, dim),
+        StandardLinear(4 * dim, dim),
     ).cuda()
 
     fused_mlp = SwitchBackGlobalMLP(dim, 4 * dim).cuda()
@@ -162,4 +162,4 @@ if __name__ == '__main__':
 
     # import pdb; pdb.set_trace()
 
-    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
\ No newline at end of file
+    # # NO GELU, ST GRADIENTS, EVERYTHING FINE.
 
From 30d21d585c7b8d962cefbd938c6aa006d162fb58 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Fri, 31 Mar 2023 11:33:26 -0700
Subject: [PATCH 27/97] Added triton test.

---
 tests/test_triton.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 tests/test_triton.py

diff --git a/tests/test_triton.py b/tests/test_triton.py
new file mode 100644
index 0000000..acbe32c
--- /dev/null
+++ b/tests/test_triton.py
@@ -0,0 +1,44 @@
+import pytest
+import torch
+
+from bitsandbytes.nn.triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear
+
+
+
+@pytest.mark.parametrize("triton_module", [SwitchBackGlobalLinear, SwitchBackLinear])
+def test_switchbatch(triton_module):
+    for dim in [83, 17, 128]:
+        for batch in [13, 128, 256]:
+
+            standard = torch.nn.Linear(dim, 4 * dim).cuda().half()
+            switchback = triton_module(dim, 4 * dim).cuda().half()
+            switchback.weight.data.copy_(standard.weight)
+            switchback.bias.data.copy_(standard.bias)
+
+
+            for i in range(100):
+                x1 = torch.randn(batch, dim).cuda().half().requires_grad_(True)
+                x2 = x1.clone().detach().requires_grad_(True)
+                print('standard')
+                out_standard = standard(x1)
+                print('switchback')
+                out_sb = switchback(x1)
+
+                (out_standard.abs().mean()).backward()
+                (out_sb.abs().mean()).backward()
+
+                err_sb = (out_standard - out_sb).abs().mean()
+                print('OUT', err_sb)
+
+                err_sb = (standard.bias.grad - switchback.bias.grad).abs().mean()
+
+                print('GW2', err_sb)
+
+                err_sb = (standard.weight.grad - switchback.weight.grad).abs().mean()
+
+                print('GW1', err_sb)
+
+                #err_sb = (x1.grad - x2.grad).abs().mean()
+
+                #print('GX1', err_sb)
+
From c4cfe4fbdd70088c2ff0db1ae81bfe01c35fd2ae Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Sat, 1 Apr 2023 10:33:03 -0700
Subject: [PATCH 28/97] Added bf16 Adam.
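
This wires bfloat16 gradients and parameters into the blockwise 8-bit
Adam path: a third `cadam_8bit_blockwise_bf16` entry point is registered
and `optimizer_update_8bit_blockwise` now dispatches on `g.dtype`. A
usage sketch (a rough outline, assuming the library was built with the
bf16 kernels enabled; not part of this diff):

    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(128, 128).cuda().to(torch.bfloat16)
    opt = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)

    out = model(torch.randn(8, 128, device='cuda', dtype=torch.bfloat16))
    out.sum().backward()
    opt.step()  # should now reach the bf16 blockwise kernel for the weight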
---
 Makefile                   |  7 ++--
 bitsandbytes/functional.py | 68 +++++++++++++++++---------------
 csrc/kernels.cu            |  2 ++
 csrc/ops.cu                |  2 ++
 csrc/pythonInterface.c     | 41 ++++++++++++-----------
 tests/test_optim.py        | 43 ++++++++++--------------
 6 files changed, 77 insertions(+), 86 deletions(-)

diff --git a/Makefile b/Makefile
index 7bee7ef..e114160 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,7 @@ CUDA_VERSION:=
 endif
 
 
+
 NVCC := $(CUDA_HOME)/bin/nvcc
 
 ###########################################
@@ -59,9 +60,9 @@ CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
 CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
 
 
-all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+all: $(BUILD_DIR) env
+	$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
 cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 969250a..8bfd668 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -73,6 +73,7 @@ if COMPILED_WITH_CUDA:
     str2optimizer8bit_blockwise["adam"] = (
         lib.cadam_8bit_blockwise_fp32,
         lib.cadam_8bit_blockwise_fp16,
+        lib.cadam_8bit_blockwise_bf16,
     )
     str2optimizer8bit_blockwise["momentum"] = (
         lib.cmomentum_8bit_blockwise_fp32,
@@ -1125,51 +1126,42 @@ def optimizer_update_8bit_blockwise(
     skip_zeros=False,
 ) -> None:
 
+    optim_func = None
     if g.dtype == torch.float32 and state1.dtype == torch.uint8:
-        str2optimizer8bit_blockwise[optimizer_name][0](
-            get_ptr(p),
-            get_ptr(g),
-            get_ptr(state1),
-            get_ptr(state2),
-            ct.c_float(beta1),
-            ct.c_float(beta2),
-            ct.c_float(eps),
-            ct.c_int32(step),
-            ct.c_float(lr),
-            get_ptr(qmap1),
-            get_ptr(qmap2),
-            get_ptr(absmax1),
-            get_ptr(absmax2),
-            ct.c_float(weight_decay),
-            ct.c_float(gnorm_scale),
-            ct.c_bool(skip_zeros),
-            ct.c_int32(g.numel()),
-        )
+        optimizer_func = str2optimizer8bit_blockwise[optimizer_name][0]
     elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
-        str2optimizer8bit_blockwise[optimizer_name][1](
-            get_ptr(p),
-            get_ptr(g),
-            get_ptr(state1),
-            get_ptr(state2),
-            ct.c_float(beta1),
-            ct.c_float(beta2),
-            ct.c_float(eps),
-            ct.c_int32(step),
-            ct.c_float(lr),
-            get_ptr(qmap1),
-            get_ptr(qmap2),
-            get_ptr(absmax1),
-            get_ptr(absmax2),
-            ct.c_float(weight_decay),
-            ct.c_float(gnorm_scale),
-            ct.c_bool(skip_zeros),
-            ct.c_int32(g.numel()),
-        )
+        optimizer_func = str2optimizer8bit_blockwise[optimizer_name][1]
+    elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and
+          len(str2optimizer8bit_blockwise[optimizer_name])==3):
+        optimizer_func = str2optimizer8bit_blockwise[optimizer_name][2]
     else:
         raise ValueError(
             f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
         )
 
+    is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
+
+    prev_device = pre_call(g.device)
+    optimizer_func(
+        get_ptr(p),
+        get_ptr(g),
+        get_ptr(state1),
+        get_ptr(state2),
+        ct.c_float(beta1),
+        ct.c_float(beta2),
+        ct.c_float(eps),
+        ct.c_int32(step),
+        ct.c_float(lr),
+        get_ptr(qmap1),
+        get_ptr(qmap2),
+        get_ptr(absmax1),
+        get_ptr(absmax2),
+        ct.c_float(weight_decay),
+        ct.c_float(gnorm_scale),
+        ct.c_bool(skip_zeros),
+        ct.c_int32(g.numel()),
+    )
+    post_call(prev_device)
 
 def percentile_clipping(
     grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 8f33161..e7e57d7 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -2988,6 +2988,8 @@ template __global__ void kOptimizerStatic8bit2StateBlockwise( \
diff --git a/csrc/ops.cu b/csrc/ops.cu
index 8044c66..a5a23b5 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -741,3 +741,5 @@ MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
 
 template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
 template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
+
+MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, ADAM);
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 6a4bb0d..a485a09 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -57,19 +57,20 @@ MAKE_FUNC8(rmsprop, RMSPROP, float, 32)
 MAKE_FUNC8(rmsprop, RMSPROP, half, 16)
 
 #define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \
-void fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
+void fname##_8bit_blockwise_##gbits(gtype* p, gtype* g, \
 unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
 float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)\
 { optimizerStatic8bitBlockwise(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); }\
 
-MAKE_BLOCKWISE8(adam, ADAM, half, 16)
-MAKE_BLOCKWISE8(adam, ADAM, float, 32)
-MAKE_BLOCKWISE8(momentum, MOMENTUM, half, 16)
-MAKE_BLOCKWISE8(momentum, MOMENTUM, float, 32)
-MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, 16)
-MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, 32)
-MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, 16)
-MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32)
+MAKE_BLOCKWISE8(adam, ADAM, half, fp16)
+MAKE_BLOCKWISE8(adam, ADAM, float, fp32)
+MAKE_BLOCKWISE8(momentum, MOMENTUM, half, fp16)
+MAKE_BLOCKWISE8(momentum, MOMENTUM, float, fp32)
+MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
+MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
+MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
+MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
+MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
 
 
 void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); }
@@ -194,20 +195,20 @@ extern "C"
     MAKE_CFUNC8(rmsprop, half, 16)
 
     #define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
-    void c##fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
+    void c##fname##_8bit_blockwise_##gbits(gtype* p, gtype* g, \
     unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
     float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n) \
-    { fname##_8bit_blockwise_fp##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \
-
-    MAKE_CBLOCKWISE8(adam, ADAM, half, 16)
-    MAKE_CBLOCKWISE8(adam, ADAM, float, 32)
-    MAKE_CBLOCKWISE8(momentum, MOMENTUM, 
half, 16) - MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, 32) - MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, 16) - MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, 32) - MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, 16) - MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, 32) + { fname##_8bit_blockwise_##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \ + MAKE_CBLOCKWISE8(adam, ADAM, half, fp16) + MAKE_CBLOCKWISE8(adam, ADAM, float, fp32) + MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, fp16) + MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, fp32) + MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, fp16) + MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32) + MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16) + MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32) + MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); } void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); } diff --git a/tests/test_optim.py b/tests/test_optim.py index 3df2dad..92e3ed2 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -26,6 +26,8 @@ def get_temp_dir(): def rm_path(path): shutil.rmtree(path) +str2bf16support = {} +str2bf16support['adam8bit_blockwise'] = True str2optimizers = {} str2optimizers["adam_pytorch"] = (None, torch.optim.Adam, bnb.optim.Adam) @@ -238,7 +240,7 @@ def test_global_config(dim1, dim2, gtype): dim1 = [1024] dim2 = [32, 1024, 4097] -gtype = [torch.float32, torch.float16] +gtype = [torch.float32, torch.float16, torch.bfloat16] optimizer_names = [ "adam8bit", "momentum8bit", @@ -256,6 +258,7 @@ names = [ @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer8bit(dim1, dim2, gtype, optim_name): + if gtype == torch.bfloat16 and optim_name not in str2bf16support: return if dim1 == 1 and dim2 == 1: return p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 @@ -269,7 +272,9 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): if gtype == torch.float32: atol, rtol = 3e-3, 1e-3 patol, prtol = 1e-5, 1e-3 - + elif gtype == torch.bfloat16: + atol, rtol = 3e-3, 1e-3 + patol, prtol = 1e-4, 1e-2 else: atol, rtol = 3e-3, 1e-3 patol, prtol = 1e-5, 1e-3 @@ -314,8 +319,12 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): err = torch.abs(p1 - p2) relerr = err / torch.abs(p1) - assert err.mean() < 0.0001 - assert relerr.mean() < 0.001 + if g.dtype == torch.bfloat16: + assert err.mean() < 0.00015 + assert relerr.mean() < 0.0015 + else: + assert err.mean() < 0.0001 + assert relerr.mean() < 0.001 errors.append(err.mean().item()) relerrors.append(relerr.mean().item()) @@ -335,12 +344,8 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): bnb_optimizer = str2optimizers[optim_name][1]([p2]) bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt"))) rm_path(path) - torch.testing.assert_allclose( - raws1cpy, bnb_optimizer.state[p2][name2] - ) - torch.testing.assert_allclose( - qmap1, bnb_optimizer.state[p2][qmap] - ) + torch.testing.assert_allclose(raws1cpy, bnb_optimizer.state[p2][name2]) + torch.testing.assert_allclose(qmap1, bnb_optimizer.state[p2][qmap]) if "blockwise" in optim_name: s1 = F.dequantize_blockwise( @@ -357,28 +362,16 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): ) torch.testing.assert_allclose(s1cpy, s1) - num_not_close = ( - torch.isclose( - 
torch_optimizer.state[p1][name1], - s1, - atol=atol, - rtol=rtol, - ) - == 0 - ) + num_not_close = (torch.isclose(torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol) == 0) assert num_not_close.sum().item() < 20 - torch.testing.assert_allclose( - p1, p2.float(), atol=patol, rtol=prtol - ) + torch.testing.assert_allclose(p1, p2.float(), atol=patol, rtol=prtol) # the parameters diverge quickly. Here we keep them close # together so we can test against the Adam error p1.data = p1.data.to(gtype).float() p2.copy_(p1.data) torch.testing.assert_allclose(p1.to(gtype), p2) - for (name1, name2, qmap, max_val), s in zip( - str2statenames[optim_name], dequant_states - ): + for (name1, name2, qmap, max_val), s in zip(str2statenames[optim_name], dequant_states): torch_optimizer.state[p1][name1].copy_(s.data) # print(sum(errors)/len(errors)) From 7f87ba83eeae47bfb6d092007dd6ca42fff6c57a Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Sat, 1 Apr 2023 18:46:04 +0000 Subject: [PATCH 29/97] cleaning and refactor --- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/triton_based_modules.py | 196 ++++++---- .../nn/triton_utils/v0/fused_gelu_quantize.py | 190 --------- .../v0/int8_matmul_mixed_dequanitze.py | 138 +------ .../v0/int8_matmul_rowwise_dequantize.py | 18 +- .../v0/int8_matmul_rowwise_dequantize_bias.py | 160 -------- ...y => quantize_columnwise_and_transpose.py} | 64 +-- .../nn/triton_utils/v0/quantize_global.py | 34 +- .../nn/triton_utils/v0/quantize_rowwise.py | 61 +++ .../v0/quantize_rowwise_nogroup.py | 174 --------- speed_benchmark/info_a100_py2.jsonl | 60 +++ .../make_plot_with_jsonl.py | 38 +- .../plot_with_info.pdf | Bin 34302 -> 34876 bytes speed_benchmark/speed_benchmark.py | 101 +++++ tests/test_triton.py | 57 +-- tests/triton_tests/attn_decomp.py | 363 ------------------ tests/triton_tests/attn_info_ln.jsonl | 20 - tests/triton_tests/full_matrix_decomp.py | 353 ----------------- tests/triton_tests/info.jsonl | 142 ------- tests/triton_tests/info_mlp.jsonl | 20 - tests/triton_tests/info_mlp_autocast.jsonl | 20 - tests/triton_tests/info_mlp_autocast_ln.jsonl | 23 -- tests/triton_tests/mlp.py | 64 --- tests/triton_tests/mlp_decomp_autocast.py | 166 -------- tests/triton_tests/mlp_decomp_autocast_ln.py | 165 -------- tests/triton_tests/plot1.png | Bin 121873 -> 0 bytes tests/triton_tests/plot2.pdf | Bin 16044 -> 0 bytes tests/triton_tests/plot2.png | Bin 51996 -> 0 bytes tests/triton_tests/plot2.py | 69 ---- tests/triton_tests/plot3.pdf | Bin 20122 -> 0 bytes tests/triton_tests/plot3.png | Bin 58335 -> 0 bytes tests/triton_tests/plot3.py | 193 ---------- tests/triton_tests/rowwise.py | 43 --- 33 files changed, 420 insertions(+), 2514 deletions(-) delete mode 100644 bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py rename bitsandbytes/nn/triton_utils/v0/{quantize_columnwise_nogroup_transpose.py => quantize_columnwise_and_transpose.py} (54%) create mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py create mode 100644 speed_benchmark/info_a100_py2.jsonl rename tests/triton_tests/make_plot_with_info.py => speed_benchmark/make_plot_with_jsonl.py (82%) rename tests/triton_tests/plot1.pdf => speed_benchmark/plot_with_info.pdf (76%) create mode 100644 speed_benchmark/speed_benchmark.py delete mode 100644 tests/triton_tests/attn_decomp.py delete mode 100644 tests/triton_tests/attn_info_ln.jsonl delete 
mode 100644 tests/triton_tests/full_matrix_decomp.py delete mode 100644 tests/triton_tests/info.jsonl delete mode 100644 tests/triton_tests/info_mlp.jsonl delete mode 100644 tests/triton_tests/info_mlp_autocast.jsonl delete mode 100644 tests/triton_tests/info_mlp_autocast_ln.jsonl delete mode 100644 tests/triton_tests/mlp.py delete mode 100644 tests/triton_tests/mlp_decomp_autocast.py delete mode 100644 tests/triton_tests/mlp_decomp_autocast_ln.py delete mode 100644 tests/triton_tests/plot1.png delete mode 100644 tests/triton_tests/plot2.pdf delete mode 100644 tests/triton_tests/plot2.png delete mode 100644 tests/triton_tests/plot2.py delete mode 100644 tests/triton_tests/plot3.pdf delete mode 100644 tests/triton_tests/plot3.png delete mode 100644 tests/triton_tests/plot3.py delete mode 100644 tests/triton_tests/rowwise.py diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 8e3a598..c6141ad 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed -from .triton_based_modules import SwitchBackLinear, SwitchBackGlobalLinear +from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorized, StandardLinear diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 0344464..ab76f4e 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -1,26 +1,76 @@ import torch import torch.nn as nn import time +from functools import partial -from .triton_utils.v0.quantize_rowwise_nogroup import quantize_rowwise_nogroup -from .triton_utils.v0.quantize_columnwise_nogroup_transpose import quantize_columnwise_nogroup_transpose -from .triton_utils.v0.int8_matmul_rowwise_dequantize_bias import int8_matmul_rowwise_dequantize_bias +from .triton_utils.v0.quantize_rowwise import quantize_rowwise +from .triton_utils.v0.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose from .triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize from .triton_utils.v0.quantize_global import quantize_global, quantize_global_transpose -from .triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze, int8_matmul_mixed_dequanitze_bias -from .triton_utils.v0.fused_gelu_quantize import quantize_rowwise_nogroup_gelu, quantize_rowwise_nogroup_back_gelu +from .triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze -class _switchback(torch.autograd.Function): + +class _switchback_global(torch.autograd.Function): @staticmethod def forward(ctx, X_3D, W, bias): + # reshape input to [N * L, D] + X = X_3D.view(-1, X_3D.size(-1)) + # rowwise quantize for X, global quantize for W + X_int8, state_X = quantize_rowwise(X) + W_int8, state_W = quantize_global(W) + + # save for backward. 
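+        # editor's note (not in the original patch): the next line assigns a
+        # plain tuple attribute that shadows the ctx.save_for_backward() method;
+        # backward() reads it back via `X, W = ctx.save_for_backward`, so it
+        # runs, but it bypasses autograd's saved-tensor tracking. The idiomatic
+        # form would be ctx.save_for_backward(X, W) with ctx.saved_tensors.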
+ ctx.save_for_backward = X, W + + # matmult, fused dequant and add bias + # call "mixed" because we are mixing rowwise quantized and global quantized + return int8_matmul_mixed_dequanitze( + X_int8, W_int8.t(), state_X, state_W, bias + ).view(*X_3D.size()[:-1], -1) + + @staticmethod + def backward(ctx, G_3D): + # reshape input to [N_out * L, D] + G = G_3D.reshape(-1, G_3D.size(-1)) + + grad_X = grad_W = grad_bias = None + + X, W = ctx.save_for_backward + if ctx.needs_input_grad[0]: + # rowwise quantize for G, global quantize for W + # for W, we also fuse the transpose operation because only A @ B^T is supported + # so we transpose once then call .t() in the matmul + G_int8, state_G = quantize_rowwise(G) + W_int8, state_W = quantize_global_transpose(W) + grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W, None).view( + *G_3D.size()[:-1], -1 + ) + if ctx.needs_input_grad[1]: + # backward pass uses standard weight grad + grad_W = torch.matmul(G.t(), X.to(G.dtype)) + if ctx.needs_input_grad[2]: + grad_bias = G.sum(dim=0) + + return grad_X, grad_W, grad_bias + +class _switchback_vectorrize(torch.autograd.Function): + + @staticmethod + def forward(ctx, X_3D, W, bias): + # reshape input to [N * L, D] X = X_3D.view(-1, X_3D.size(-1)) ctx.save_for_backward = X, W - X_int8, state_X = quantize_rowwise_nogroup(X) - W_int8, state_W = quantize_rowwise_nogroup(W) - return int8_matmul_rowwise_dequantize_bias( + # rowwise quantize for X + # columnwise quantize for W (first rowwise, transpose later) + X_int8, state_X = quantize_rowwise(X) + W_int8, state_W = quantize_rowwise(W) + + # matmult, fused dequant and add bias + # call kernel which expects rowwise quantized X and W + return int8_matmul_rowwise_dequantize( X_int8, W_int8.t(), state_X, state_W, bias ).view(*X_3D.size()[:-1], -1) @@ -33,12 +83,15 @@ class _switchback(torch.autograd.Function): grad_X = grad_W = grad_bias = None if ctx.needs_input_grad[0]: - G_int8, state_G = quantize_rowwise_nogroup(G) - W_int8, state_W = quantize_columnwise_nogroup_transpose(W) - grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W).view( + # rowwise quantize for G, columnwise quantize for W and fused transpose + # we call .t() for weight later because only A @ B^T is supported + G_int8, state_G = quantize_rowwise(G) + W_int8, state_W = quantize_columnwise_and_transpose(W) + grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( *G_3D.size()[:-1], -1 ) if ctx.needs_input_grad[1]: + # backward pass uses standard weight grad grad_W = torch.matmul(G.t(), X.to(G.dtype)) if ctx.needs_input_grad[2]: grad_bias = G.sum(dim=0) @@ -46,11 +99,37 @@ class _switchback(torch.autograd.Function): return grad_X, grad_W, grad_bias class SwitchBackLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + vectorize: bool = False + ): + super().__init__(in_features, out_features, bias, device, dtype) + + # By default, we use the global quantization. + self.vectorize = vectorize + if self.vectorize: + self._fn = _switchback_vectorrize + else: + self._fn = _switchback_global def prepare_for_eval(self): - state_W = self.weight.abs().max(dim=1, keepdim=True)[0] - W_int8 = (127 * self.weight.float() / state_W).round().to(torch.int8) - state_W = state_W.squeeze() + # If we just want to do eval, we can pre-quantize the weights instead of doing it on the forward pass. + # Note this is experimental and not tested thoroughly. 
+ # Note this needs to be explicitly called with something like + # def cond_prepare(m): + # if hasattr(m, "prepare_for_eval"): + # m.prepare_for_eval() + # model.apply(cond_prepare) + print('=> preparing for eval.') + if self.vectorize: + W_int8, state_W = quantize_rowwise(self.weight) + else: + W_int8, state_W = quantize_global(self.weight) self.register_buffer("W_int8", W_int8) self.register_buffer("state_W", state_W) @@ -59,80 +138,29 @@ class SwitchBackLinear(nn.Linear): def forward(self, x): if self.training: - return _switchback.apply(x, self.weight, self.bias) + return self._fn.apply(x, self.weight, self.bias) else: - if not hasattr(self, "state_W"): - self.prepare_for_eval() + # If it hasn't been "prepared for eval", run the standard forward pass. + if not hasattr(self, "W_int8"): + return self._fn.apply(x, self.weight, self.bias) + + # Otherwise, use pre-computed weights. X = x.view(-1, x.size(-1)) - X_int8, state_X = quantize_rowwise_nogroup(X) - return int8_matmul_rowwise_dequantize_bias( - X_int8, self.W_int8.t(), state_X, self.state_W, self.bias - ).view(*x.size()[:-1], -1) - - -class _switchback_global(torch.autograd.Function): - - @staticmethod - def forward(ctx, X_3D, W, bias): - - X = X_3D.view(-1, X_3D.size(-1)) - - X_int8, state_X = quantize_rowwise_nogroup(X) - W_int8, state_W = quantize_global(W) - ctx.save_for_backward = X, W - return int8_matmul_mixed_dequanitze_bias( - X_int8, W_int8.t(), state_X, state_W, bias - ).view(*X_3D.size()[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - - G = G_3D.reshape(-1, G_3D.size(-1)) - - grad_X = grad_W = grad_bias = None - - X, W = ctx.save_for_backward - if ctx.needs_input_grad[0]: - G_int8, state_G = quantize_rowwise_nogroup(G) - W_int8, state_W = quantize_global_transpose(W) - grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W).view( - *G_3D.size()[:-1], -1 - ) - if ctx.needs_input_grad[1]: - grad_W = torch.matmul(G.t(), X.to(G.dtype)) - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - - return grad_X, grad_W, grad_bias - - - -class SwitchBackGlobalLinear(nn.Linear): - - def prepare_for_eval(self): - state_W = self.weight.abs().max() - W_int8 = (127 * self.weight.float() / state_W).round().to(torch.int8) - - self.register_buffer("W_int8", W_int8) - self.register_buffer("state_W", state_W) - - del self.weight - - def forward(self, x): - if self.training: - return _switchback_global.apply(x, self.weight, self.bias) - else: - if not hasattr(self, "state_W"): - self.prepare_for_eval() - X = x.view(-1, x.size(-1)) - X_int8, state_X = quantize_rowwise_nogroup(X) - return int8_matmul_mixed_dequanitze_bias( - X_int8, self.W_int8.t(), state_X, self.state_W, self.bias - ).view(*x.size()[:-1], -1) - + X_int8, state_X = quantize_rowwise(X) + if self.vectorize: + return int8_matmul_rowwise_dequantize( + X_int8, self.W_int8.t(), state_X, self.state_W, self.bias + ).view(*x.size()[:-1], -1) + else: + return int8_matmul_mixed_dequanitze( + X_int8, self.W_int8.t(), state_X, self.state_W, self.bias + ).view(*x.size()[:-1], -1) +SwitchBackLinearGlobal = partial(SwitchBackLinear, vectorize=False) +SwitchBackLinearVectorized = partial(SwitchBackLinear, vectorize=True) +# This is just the standard linear function. 
class StandardLinearFunction(torch.autograd.Function): @staticmethod def forward(ctx, input, weight, bias=None): diff --git a/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py b/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py deleted file mode 100644 index 50451cb..0000000 --- a/bitsandbytes/nn/triton_utils/v0/fused_gelu_quantize.py +++ /dev/null @@ -1,190 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -tl.libdevice - -# TODO: autotune this better. -@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_rowwise_nogroup_gelu( - x_ptr, - output_ptr, - output_maxs, - output_fp16, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - cdf = 0.5 * (1.0 + tl.libdevice.tanh(x * 0.7978845608 * (1 + 0.044715 * x * x))) - x_new = x * cdf - - tl.store(output_fp16 + offsets, x_new, mask=row_mask) - - abs_x = tl.abs(x_new) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x_new / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - -def quantize_rowwise_nogroup_gelu(x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_fp16 = torch.empty(*x.shape, device=x.device, dtype=torch.float16) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise_nogroup_gelu[grid](x, output, output_maxs, output_fp16, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs, output_fp16 - - - -# TODO: autotune this better. 
-@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_rowwise_nogroup_back_gelu( - x_ptr, - in_ptr, - output_ptr, - output_maxs, - output_fp16, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x_out = tl.load(x_ptr + offsets, mask=row_mask) - x_in = tl.load(in_ptr + offsets, mask=row_mask) - - cdf = 0.5 * (1.0 + tl.libdevice.tanh(x_in * 0.7978845608 * (1 + 0.044715 * x_in * x_in))) - intermediate = tl.libdevice.tanh(x_in * 0.7978845608 * (1 + 0.044715 * x_in * x_in)) - dcdf = 0.5 * (0.7978845608 + 0.1070322243 * x_in * x_in) * (1 - intermediate * intermediate) - x = x_out * (cdf + x_in * dcdf) - - tl.store(output_fp16 + offsets, x, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - -def quantize_rowwise_nogroup_back_gelu(x: torch.Tensor, y : torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_fp16 = torch.empty(*x.shape, device=x.device, dtype=torch.float16) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise_nogroup_back_gelu[grid](x, y, output, output_maxs, output_fp16, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs, output_fp16 - - - -# if __name__ == '__main__': -# torch.manual_seed(0) - -# x = torch.randn(1280, 768).cuda().to(torch.float16) -# out = quantize_rowwise_nogroup(x) - -# x_real = (127 * x.float() / x.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) -# max2 = x.abs().max(1)[0] - -# print(torch.allclose(out[1], max2)) -# print( (x_real == out[0]).float().mean() ) - -# # for i in range(x.shape[0]): -# # print( (x_real[i, :] == out[0][i, :]).float().mean() ) - -# # print(out[0]) -# # print(x_real) -# # import pdb; pdb.set_trace() -# # print(out[2]) -# # print(out[2][:10]) -# sums = x.sum(dim=0) -# #print(sums[:10]) -# #print( (sums == out[2]).float().mean() ) - -# import pdb; pdb.set_trace() -# # import pdb; pdb.set_trace() -# # exit() - -# # repeat = 16 - -# # for _ in range(8): -# # out = quantize_rowwise_nogroup(x) - -# # triton_graph = torch.cuda.CUDAGraph() -# # with torch.cuda.graph(triton_graph): -# # out = quantize_rowwise_nogroup(x) - -# # triton_graph.replay() - -# # torch.cuda.synchronize() -# # start = time.time() -# # for _ in range(repeat): -# # triton_graph.replay() -# # torch.cuda.synchronize() -# # end = time.time() - -# # print(out[0]) -# # print(out[1]) -# # print(x / x.abs().max(dim=1, keepdim=True)[0]) -# # max1 = out[1] -# # max2 = x.abs().max(1)[0] -# # print(max1, max2) -# # print(torch.allclose(max1, max2)) - -# 
#print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py index 2ecfcb8..69d4b0c 100644 --- a/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py +++ b/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py @@ -5,10 +5,14 @@ import triton.language as tl from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +# This is a matmul kernel based on triton.ops.matmul +# It is modified to support rowwise quantized input and global quantized weight +# It's purpose is fused matmul then dequantize +# It does support bias. + def init_to_zero(name): return lambda nargs: nargs[name].zero_() - def get_configs_io_bound(): configs = [] for num_stages in [2, 3, 4, 5, 6]: @@ -60,130 +64,7 @@ def get_configs_io_bound(): 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, }) @triton.jit -def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr) - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -def int8_matmul_mixed_dequanitze(a, b, state_x, state_w): - device = a.device - divfactor = 1. / (127. * 127.) 
- # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel[grid](a, b, c, state_x, state_w, M, N, K, divfactor, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c - - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _kernel_bias(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, +def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, @@ -236,6 +117,7 @@ def _kernel_bias(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl acc = 
(w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) + # conditionally add bias if has_bias: bias = tl.load(bias + rn).to(C.dtype.element_ty) acc = acc + bias[None, :] @@ -249,7 +131,7 @@ def _kernel_bias(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl tl.atomic_add(C, acc, mask=mask) -def int8_matmul_mixed_dequanitze_bias(a, b, state_x, state_w, bias): +def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): device = a.device divfactor = 1. / (127. * 127.) has_bias = 0 if bias is None else 1 @@ -266,9 +148,9 @@ def int8_matmul_mixed_dequanitze_bias(a, b, state_x, state_w, bias): c = torch.empty((M, N), device=device, dtype=torch.float16) # accumulator types ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch kernel + # launch int8_matmul_mixed_dequantize kernel grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel_bias[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py index fa0b516..4af054b 100644 --- a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py @@ -4,6 +4,10 @@ import triton import triton.language as tl from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +# This is a matmul kernel based on triton.ops.matmul +# It is modified to support rowwise quantized input and columnwise quantized weight +# It's purpose is fused matmul then dequantize +# It does support bias. def init_to_zero(name): return lambda nargs: nargs[name].zero_() @@ -60,7 +64,7 @@ def get_configs_io_bound(): 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, }) @triton.jit -def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor, +def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, @@ -113,6 +117,10 @@ def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor, acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) mask = (rm < M)[:, None] & (rn < N)[None, :] # handles write-back with reduction-splitting @@ -122,9 +130,11 @@ def _kernel(A, B, C, state_x_ptr, state_w_ptr, M, N, K, divfactor, tl.atomic_add(C, acc, mask=mask) -def int8_matmul_rowwise_dequantize(a, b, state_x, state_w): +def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): divfactor = 1. / (127. * 127.) 
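+    # editor's note (not in the original patch): X and W were quantized as
+    # round(127 * x / absmax), so the kernel rescales the int32 accumulator by
+    # divfactor * state_x[row] * state_w[col] to recover the float16 output.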
+ has_bias = 0 if bias is None else 1 + device = a.device # handle non-contiguous inputs if necessary if a.stride(0) > 1 and a.stride(1) > 1: @@ -139,9 +149,9 @@ def int8_matmul_rowwise_dequantize(a, b, state_x, state_w): c = torch.empty((M, N), device=device, dtype=torch.float16) # accumulator types ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch kernel + # launch int8_matmul_rowwise_dequantize kernel grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel[grid](a, b, c, state_x, state_w, M, N, K, divfactor, + _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py deleted file mode 100644 index 5f524c1..0000000 --- a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize_bias.py +++ /dev/null @@ -1,160 +0,0 @@ -import torch - -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - - -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, 
num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _kernel(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr + rbn)[None, :] - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -def int8_matmul_rowwise_dequantize_bias(a, b, state_x, state_w, bias): - - #print(bias) - divfactor = 1. / (127. * 127.) 
- - has_bias = 0 if bias is None else 1 - - if bias is not None: - bias = bias.contiguous() - - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _kernel[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py b/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py similarity index 54% rename from bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py rename to bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py index fa3a9a9..4e53475 100644 --- a/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_nogroup_transpose.py +++ b/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py @@ -5,6 +5,8 @@ import triton import triton.language as tl from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +# This kernel does fused columnwise quantization and transpose. + # TODO: autotune this better. @triton.autotune( configs=[ @@ -26,7 +28,7 @@ from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_tim key=['n_elements'] ) @triton.jit -def _quantize_columnwise_nogroup_transpose( +def _quantize_columnwise_and_transpose( x_ptr, output_ptr, output_maxs, @@ -51,7 +53,7 @@ def _quantize_columnwise_nogroup_transpose( tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) tl.store(output_maxs + pid, max_val) -def quantize_columnwise_nogroup_transpose(x: torch.Tensor): +def quantize_columnwise_and_transpose(x: torch.Tensor): M, N = x.shape output = torch.empty(N, M, device=x.device, dtype=torch.int8) output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) @@ -61,62 +63,6 @@ def quantize_columnwise_nogroup_transpose(x: torch.Tensor): assert x.is_cuda and output.is_cuda n_elements = output.numel() grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _quantize_columnwise_nogroup_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) + _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) return output, output_maxs - - -if __name__ == '__main__': - torch.manual_seed(0) - - x = torch.randn(1280, 768).cuda().to(torch.float16) - out = quantize_columnwise_nogroup_transpose(x) - - - x_real = x.t().float() - x_real_int8 = (127. 
* x_real / x_real.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) - maxs = x_real.abs().max(dim=1, keepdim=True)[0].half() - - #print(out[0][2,:]) - - print((out[0] == x_real_int8).float().mean()) - print((out[1] == maxs[:, 0]).float().mean()) - - # print(out[0]) - # print(out[1]) - - # print(out[0][2,:]) - # print(x_real[2, :]) - - # print((out[0] != x_real).nonzero()) - - #import pdb; pdb.set_trace() - # repeat = 16 - - # for _ in range(8): - # out = quantize_columnwise_nogroup_transpose(x) - - # triton_graph = torch.cuda.CUDAGraph() - # with torch.cuda.graph(triton_graph): - # out = quantize_columnwise_nogroup_transpose(x) - - # triton_graph.replay() - - # torch.cuda.synchronize() - # start = time.time() - # for _ in range(repeat): - # triton_graph.replay() - # torch.cuda.synchronize() - # end = time.time() - - # print(out[0]) - # print(out[1]) - # print(x / x.abs().max(dim=0, keepdim=True)[0]) - # x_real = (127 * (x / x.abs().max(dim=0, keepdim=True)[0])).round().to(torch.int8) - # max1 = out[1] - # max2 = x.abs().max(0)[0] - # print(max1, max2) - # import pdb; pdb.set_trace() - # print(torch.allclose(max1, max2)) - - # print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_global.py b/bitsandbytes/nn/triton_utils/v0/quantize_global.py index 6d23aac..229721c 100644 --- a/bitsandbytes/nn/triton_utils/v0/quantize_global.py +++ b/bitsandbytes/nn/triton_utils/v0/quantize_global.py @@ -5,7 +5,7 @@ import triton import triton.language as tl from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -# TODO: autotune this better. +# global quantize @triton.autotune( configs=[ triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), @@ -42,6 +42,7 @@ def quantize_global(x: torch.Tensor): return output, absmax +# global quantize and transpose @triton.autotune( configs=[ triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), @@ -97,34 +98,3 @@ def quantize_global_transpose(input): _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) return out, absmax -if __name__ == '__main__': - - - w = torch.randn(768, 1280).cuda().to(torch.float16) - W_int8, state_w = quantize_global(w) - r_state_w = w.abs().max() - r_W_int8 = ((127 * w.float()) / state_w).round().to(torch.int8) - print((r_W_int8 == W_int8).float().mean()) - - # print(r_W_int8) - # print(W_int8) - exit() - repeat = 16 - - for _ in range(8): - out = quantize_global(w) - - triton_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(triton_graph): - out = quantize_global(w) - - triton_graph.replay() - - torch.cuda.synchronize() - start = time.time() - for _ in range(repeat): - triton_graph.replay() - torch.cuda.synchronize() - end = time.time() - - print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py new file mode 100644 index 0000000..d956647 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py @@ -0,0 +1,61 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# rowwise quantize + +# TODO: autotune this better. 
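+# editor's note (not in the original patch): the launch grid is one program
+# per row; P2 rounds the row length up to the next power of two (tl.arange
+# requires a power-of-two extent) and row_mask masks off the padded tail.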
+@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_rowwise( + x_ptr, + output_ptr, + output_maxs, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. * (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_rowwise(x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs + diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py deleted file mode 100644 index 7e63f74..0000000 --- a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise_nogroup.py +++ /dev/null @@ -1,174 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# TODO: autotune this better. -@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_rowwise_nogroup( - x_ptr, - output_ptr, - output_maxs, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. 
* (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - -def quantize_rowwise_nogroup(x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise_nogroup[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs - - -@triton.autotune( - configs=[ - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _experimental_quantize_rowwise_nogroup( - x_ptr, - output_ptr, - bias_grad_ptr, - output_maxs, - n_elements, - M: tl.constexpr, N: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - P2M: tl.constexpr, -): - pid = tl.program_id(axis=0) - if pid < M: - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - else: - real_pid = pid - M - arange_new = tl.arange(0, P2M) - mask_new = arange_new < M - offsets_new = real_pid + arange_new * N - new_x = tl.load(x_ptr + offsets_new, mask=mask_new) - s = tl.sum(tl.where(mask_new, new_x, 0).to(tl.float32), axis=0) - tl.store(bias_grad_ptr + real_pid, s) - -def experimental_quantize_rowwise_nogroup(x: torch.Tensor): - M, N = x.shape - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - bias_grad = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - P2M = int(2 ** (math.ceil(math.log2(x.shape[0])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0] + x.shape[1],) - _experimental_quantize_rowwise_nogroup[grid](x, output, bias_grad, output_maxs, n_elements, M, N, BLOCK_SIZE=x.shape[1], P2=P2, P2M=P2M) - return output, output_maxs, bias_grad - - -if __name__ == '__main__': - torch.manual_seed(0) - - x = torch.randn(1280, 768).cuda().to(torch.float16) - out = quantize_rowwise_nogroup(x) - - x_real = (127 * x.float() / x.abs().max(dim=1, keepdim=True)[0]).round().to(torch.int8) - max2 = x.abs().max(1)[0] - - print(torch.allclose(out[1], max2)) - print( (x_real == out[0]).float().mean() ) - - # for i in range(x.shape[0]): - # print( (x_real[i, :] == out[0][i, :]).float().mean() ) - - # print(out[0]) - # print(x_real) - # import pdb; pdb.set_trace() - # print(out[2]) - # print(out[2][:10]) - sums = x.sum(dim=0) - #print(sums[:10]) - #print( (sums == out[2]).float().mean() ) - - import pdb; pdb.set_trace() - # import pdb; pdb.set_trace() - # exit() - - # repeat = 16 - - # for _ in range(8): - # out = quantize_rowwise_nogroup(x) - - # triton_graph = torch.cuda.CUDAGraph() - # with torch.cuda.graph(triton_graph): - # out = quantize_rowwise_nogroup(x) - - # triton_graph.replay() - - # torch.cuda.synchronize() - # start = time.time() - # for _ in range(repeat): - # triton_graph.replay() - # 
torch.cuda.synchronize() - # end = time.time() - - # print(out[0]) - # print(out[1]) - # print(x / x.abs().max(dim=1, keepdim=True)[0]) - # max1 = out[1] - # max2 = x.abs().max(1)[0] - # print(max1, max2) - # print(torch.allclose(max1, max2)) - - #print(f"time: {(end - start) / repeat * 1000:.3f} ms") diff --git a/speed_benchmark/info_a100_py2.jsonl b/speed_benchmark/info_a100_py2.jsonl new file mode 100644 index 0000000..53cda62 --- /dev/null +++ b/speed_benchmark/info_a100_py2.jsonl @@ -0,0 +1,60 @@ +{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.28139352798461914, "standard_gw": 0.2811811864376068, "standard_gx": 0.30258670449256897, "rowwise_fwd": 0.1994594931602478, "rowwise_bwd": 0.16159191727638245, "global_fwd": 0.19502267241477966, "global_bwd": 0.16080215573310852, "x_quantize_rowwise": 0.03306940197944641, "g_quantize_rowwise": 0.08210167288780212, "w_quantize_rowwise": 0.03385916352272034, "w_quantize_colwise_transpose": 0.08635595440864563, "w_quantize_global": 0.09237229824066162, "w_quantize_global_transpose": 0.10007619857788086, "time_standard": 0.8651614189147949, "time_rowwise": 0.8776187896728516, "time_global": 0.944625586271286} +{"repeat": 64, "batch_size": 8192, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.262625515460968, "standard_gw": 0.2806223928928375, "standard_gx": 0.31118839979171753, "rowwise_fwd": 0.1828707754611969, "rowwise_bwd": 0.21236762404441833, "global_fwd": 0.16665831208229065, "global_bwd": 0.19929558038711548, "x_quantize_rowwise": 0.08227676153182983, "g_quantize_rowwise": 0.03310292959213257, "w_quantize_rowwise": 0.032648444175720215, "w_quantize_colwise_transpose": 0.09015202522277832, "w_quantize_global": 0.0988692045211792, "w_quantize_global_transpose": 0.10057538747787476, "time_standard": 0.8544363081455231, "time_rowwise": 0.9140409529209137, "time_global": 0.96140056848526} +{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.5731917917728424, "standard_gw": 0.5709454417228699, "standard_gx": 0.5963630974292755, "rowwise_fwd": 0.37662312388420105, "rowwise_bwd": 0.281747430562973, "global_fwd": 0.36768242716789246, "global_bwd": 0.28043612837791443, "x_quantize_rowwise": 0.046547502279281616, "g_quantize_rowwise": 0.15532970428466797, "w_quantize_rowwise": 0.032436102628707886, "w_quantize_colwise_transpose": 0.08635222911834717, "w_quantize_global": 0.0947415828704834, "w_quantize_global_transpose": 0.10129809379577637, "time_standard": 1.7405003309249878, "time_rowwise": 1.5499815344810486, "time_global": 1.616980880498886} +{"repeat": 64, "batch_size": 16384, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.5341619253158569, "standard_gw": 0.5690865218639374, "standard_gx": 0.599835067987442, "rowwise_fwd": 0.3233291208744049, "rowwise_bwd": 0.41359663009643555, "global_fwd": 0.2831108868122101, "global_bwd": 0.37280842661857605, "x_quantize_rowwise": 0.15563145279884338, "g_quantize_rowwise": 0.046741217374801636, "w_quantize_rowwise": 0.03306940197944641, "w_quantize_colwise_transpose": 0.09020790457725525, "w_quantize_global": 0.0925213098526001, "w_quantize_global_transpose": 0.09945780038833618, "time_standard": 1.7030835151672363, "time_rowwise": 1.6316622495651245, "time_global": 1.6193576157093048} +{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 1.2199915945529938, "standard_gw": 
1.1069811880588531, "standard_gx": 1.09761580824852, "rowwise_fwd": 0.738043338060379, "rowwise_bwd": 0.5549229681491852, "global_fwd": 0.7219798862934113, "global_bwd": 0.5512163043022156, "x_quantize_rowwise": 0.08748471736907959, "g_quantize_rowwise": 0.3023110330104828, "w_quantize_rowwise": 0.03182142972946167, "w_quantize_colwise_transpose": 0.08632615208625793, "w_quantize_global": 0.09445473551750183, "w_quantize_global_transpose": 0.10032951831817627, "time_standard": 3.424588590860367, "time_rowwise": 2.9078908264636993, "time_global": 2.9647573828697205} +{"repeat": 64, "batch_size": 32768, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 1.1040829122066498, "standard_gw": 1.1221766471862793, "standard_gx": 1.1548101902008057, "rowwise_fwd": 0.581938773393631, "rowwise_bwd": 0.7480122148990631, "global_fwd": 0.5537159740924835, "global_bwd": 0.7232688367366791, "x_quantize_rowwise": 0.30193477869033813, "g_quantize_rowwise": 0.08745118975639343, "w_quantize_rowwise": 0.03374740481376648, "w_quantize_colwise_transpose": 0.09068101644515991, "w_quantize_global": 0.09645149111747742, "w_quantize_global_transpose": 0.10189786553382874, "time_standard": 3.3810697495937347, "time_rowwise": 2.9659420251846313, "time_global": 2.9868967831134796} +{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 2.4533793330192566, "standard_gw": 2.1938569843769073, "standard_gx": 2.179361879825592, "rowwise_fwd": 1.4615543186664581, "rowwise_bwd": 1.0522231459617615, "global_fwd": 1.4288239181041718, "global_bwd": 1.0450035333633423, "x_quantize_rowwise": 0.1691766083240509, "g_quantize_rowwise": 0.5951300263404846, "w_quantize_rowwise": 0.03337860107421875, "w_quantize_colwise_transpose": 0.08653849363327026, "w_quantize_global": 0.0940859317779541, "w_quantize_global_transpose": 0.09976327419281006, "time_standard": 6.826598197221756, "time_rowwise": 5.5918581783771515, "time_global": 5.625840276479721} +{"repeat": 64, "batch_size": 65536, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 2.1698065102100372, "standard_gw": 2.1875128149986267, "standard_gx": 2.2887587547302246, "rowwise_fwd": 1.0762326419353485, "rowwise_bwd": 1.4638006687164307, "global_fwd": 1.0450668632984161, "global_bwd": 1.4308765530586243, "x_quantize_rowwise": 0.5953535437583923, "g_quantize_rowwise": 0.16899779438972473, "w_quantize_rowwise": 0.03240257501602173, "w_quantize_colwise_transpose": 0.09106099605560303, "w_quantize_global": 0.09546056389808655, "w_quantize_global_transpose": 0.09852275252342224, "time_standard": 6.6460780799388885, "time_rowwise": 5.615361034870148, "time_global": 5.621790885925293} +{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 4.858218133449554, "standard_gw": 4.3631307780742645, "standard_gx": 4.404045641422272, "rowwise_fwd": 2.9063820838928223, "rowwise_bwd": 2.094462513923645, "global_fwd": 2.8426870703697205, "global_bwd": 2.0792782306671143, "x_quantize_rowwise": 0.33241137862205505, "g_quantize_rowwise": 1.1817105114459991, "w_quantize_rowwise": 0.03374367952346802, "w_quantize_colwise_transpose": 0.08633732795715332, "w_quantize_global": 0.09231641888618469, "w_quantize_global_transpose": 0.100012868642807, "time_standard": 13.62539455294609, "time_rowwise": 10.998178273439407, "time_global": 10.991547256708145} +{"repeat": 64, "batch_size": 131072, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, 
"standard_fwd": 4.246581345796585, "standard_gw": 4.42587211728096, "standard_gx": 4.581417888402939, "rowwise_fwd": 2.1114833652973175, "rowwise_bwd": 2.9050447046756744, "global_fwd": 2.0806826651096344, "global_bwd": 2.85966694355011, "x_quantize_rowwise": 1.1816024780273438, "g_quantize_rowwise": 0.33330172300338745, "w_quantize_rowwise": 0.033445656299591064, "w_quantize_colwise_transpose": 0.09065866470336914, "w_quantize_global": 0.09239837527275085, "w_quantize_global_transpose": 0.09984523057937622, "time_standard": 13.253871351480484, "time_rowwise": 11.081408709287643, "time_global": 11.073369532823563} +{"repeat": 64, "batch_size": 8192, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.4859529435634613, "standard_gw": 0.46338513493537903, "standard_gx": 0.42321905493736267, "rowwise_fwd": 0.2761557698249817, "rowwise_bwd": 0.20775198936462402, "global_fwd": 0.2713911235332489, "global_bwd": 0.20639970898628235, "x_quantize_rowwise": 0.033095479011535645, "g_quantize_rowwise": 0.11894106864929199, "w_quantize_rowwise": 0.03125518560409546, "w_quantize_colwise_transpose": 0.1424551010131836, "w_quantize_global": 0.07288157939910889, "w_quantize_global_transpose": 0.08071959018707275, "time_standard": 1.372557133436203, "time_rowwise": 1.2730397284030914, "time_global": 1.2468136847019196} +{"repeat": 64, "batch_size": 8192, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.3920421004295349, "standard_gw": 0.44424086809158325, "standard_gx": 0.4759356379508972, "rowwise_fwd": 0.23231282830238342, "rowwise_bwd": 0.28430670499801636, "global_fwd": 0.20883232355117798, "global_bwd": 0.2741999924182892, "x_quantize_rowwise": 0.12018159031867981, "g_quantize_rowwise": 0.03195926547050476, "w_quantize_rowwise": 0.026017427444458008, "w_quantize_colwise_transpose": 0.14733895659446716, "w_quantize_global": 0.07734447717666626, "w_quantize_global_transpose": 0.0788569450378418, "time_standard": 1.3122186064720154, "time_rowwise": 1.2863576412200928, "time_global": 1.235615462064743} +{"repeat": 64, "batch_size": 16384, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 1.0111741721630096, "standard_gw": 0.9267590939998627, "standard_gx": 0.8254274725914001, "rowwise_fwd": 0.5434826016426086, "rowwise_bwd": 0.4077926278114319, "global_fwd": 0.5318708717823029, "global_bwd": 0.40537863969802856, "x_quantize_rowwise": 0.059738755226135254, "g_quantize_rowwise": 0.2299174666404724, "w_quantize_rowwise": 0.02545863389968872, "w_quantize_colwise_transpose": 0.14269724488258362, "w_quantize_global": 0.07300823926925659, "w_quantize_global_transpose": 0.07878988981246948, "time_standard": 2.7633607387542725, "time_rowwise": 2.335846424102783, "time_global": 2.305462956428528} +{"repeat": 64, "batch_size": 16384, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.8095316588878632, "standard_gw": 0.8607134222984314, "standard_gx": 0.9204968810081482, "rowwise_fwd": 0.4275888204574585, "rowwise_bwd": 0.5485899746417999, "global_fwd": 0.41000545024871826, "global_bwd": 0.5317628383636475, "x_quantize_rowwise": 0.2301819622516632, "g_quantize_rowwise": 0.059254467487335205, "w_quantize_rowwise": 0.02466142177581787, "w_quantize_colwise_transpose": 0.14865398406982422, "w_quantize_global": 0.07582828402519226, "w_quantize_global_transpose": 0.08231401443481445, "time_standard": 2.5907419621944427, "time_rowwise": 2.2996440529823303, "time_global": 2.2500604391098022} +{"repeat": 64, "batch_size": 
32768, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 2.0658522844314575, "standard_gw": 1.718364655971527, "standard_gx": 1.6660578548908234, "rowwise_fwd": 1.066897064447403, "rowwise_bwd": 0.8070804178714752, "global_fwd": 1.0473169386386871, "global_bwd": 0.8021742105484009, "x_quantize_rowwise": 0.11274218559265137, "g_quantize_rowwise": 0.4518181085586548, "w_quantize_rowwise": 0.026501715183258057, "w_quantize_colwise_transpose": 0.14259666204452515, "w_quantize_global": 0.07484853267669678, "w_quantize_global_transpose": 0.07976219058036804, "time_standard": 5.450274795293808, "time_rowwise": 4.326000809669495, "time_global": 4.287026822566986} +{"repeat": 64, "batch_size": 32768, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 2.7549192309379578, "standard_gw": 1.6954988241195679, "standard_gx": 1.8179528415203094, "rowwise_fwd": 0.8649080991744995, "rowwise_bwd": 1.0746456682682037, "global_fwd": 0.8023083209991455, "global_bwd": 1.0471977293491364, "x_quantize_rowwise": 0.45225024223327637, "g_quantize_rowwise": 0.11286512017250061, "w_quantize_rowwise": 0.0252649188041687, "w_quantize_colwise_transpose": 0.14732033014297485, "w_quantize_global": 0.07537379860877991, "w_quantize_global_transpose": 0.0807642936706543, "time_standard": 6.268370896577835, "time_rowwise": 4.372753202915192, "time_global": 4.266258329153061} +{"repeat": 64, "batch_size": 65536, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 4.098430275917053, "standard_gw": 3.3501461148262024, "standard_gx": 5.560480058193207, "rowwise_fwd": 2.112947404384613, "rowwise_bwd": 1.605246216058731, "global_fwd": 2.0697638392448425, "global_bwd": 1.5953518450260162, "x_quantize_rowwise": 0.21921470761299133, "g_quantize_rowwise": 0.8956789970397949, "w_quantize_rowwise": 0.02710893750190735, "w_quantize_colwise_transpose": 0.14268234372138977, "w_quantize_global": 0.07259473204612732, "w_quantize_global_transpose": 0.07899105548858643, "time_standard": 13.009056448936462, "time_rowwise": 8.35302472114563, "time_global": 8.281741291284561} +{"repeat": 64, "batch_size": 65536, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 5.586959421634674, "standard_gw": 3.358360379934311, "standard_gx": 3.6434978246688843, "rowwise_fwd": 1.6269534826278687, "rowwise_bwd": 2.128206193447113, "global_fwd": 1.5950687229633331, "global_bwd": 2.0831897854804993, "x_quantize_rowwise": 0.8954145014286041, "g_quantize_rowwise": 0.21914392709732056, "w_quantize_rowwise": 0.026203691959381104, "w_quantize_colwise_transpose": 0.14658644795417786, "w_quantize_global": 0.07478520274162292, "w_quantize_global_transpose": 0.07964670658111572, "time_standard": 12.58881762623787, "time_rowwise": 8.400868624448776, "time_global": 8.305609226226807} +{"repeat": 64, "batch_size": 131072, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 8.229725062847137, "standard_gw": 6.791356950998306, "standard_gx": 6.806455552577972, "rowwise_fwd": 4.252471029758453, "rowwise_bwd": 3.2062679529190063, "global_fwd": 4.175614565610886, "global_bwd": 3.1837262213230133, "x_quantize_rowwise": 0.4321373999118805, "g_quantize_rowwise": 1.787092536687851, "w_quantize_rowwise": 0.0270158052444458, "w_quantize_colwise_transpose": 0.1424252986907959, "w_quantize_global": 0.07348507642745972, "w_quantize_global_transpose": 0.07829815149307251, "time_standard": 21.827537566423416, "time_rowwise": 16.63876697421074, "time_global": 16.52171090245247} 
+{"repeat": 64, "batch_size": 131072, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 11.279478669166565, "standard_gw": 6.7345499992370605, "standard_gx": 7.206875830888748, "rowwise_fwd": 3.209315240383148, "rowwise_bwd": 4.256397485733032, "global_fwd": 3.180190920829773, "global_bwd": 4.177983850240707, "x_quantize_rowwise": 1.7836056649684906, "g_quantize_rowwise": 0.4321075975894928, "w_quantize_rowwise": 0.03205239772796631, "w_quantize_colwise_transpose": 0.14675036072731018, "w_quantize_global": 0.09316205978393555, "w_quantize_global_transpose": 0.10086596012115479, "time_standard": 25.220904499292374, "time_rowwise": 16.5947787463665, "time_global": 16.502466052770615} +{"repeat": 64, "batch_size": 8192, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.5776733160018921, "standard_gw": 0.5300231277942657, "standard_gx": 0.6005913019180298, "rowwise_fwd": 0.33330172300338745, "rowwise_bwd": 0.2957060933113098, "global_fwd": 0.32876431941986084, "global_bwd": 0.29108673334121704, "x_quantize_rowwise": 0.03466755151748657, "g_quantize_rowwise": 0.12264400720596313, "w_quantize_rowwise": 0.033874064683914185, "w_quantize_colwise_transpose": 0.1775398850440979, "w_quantize_global": 0.09503215551376343, "w_quantize_global_transpose": 0.10617449879646301, "time_standard": 1.7082877457141876, "time_rowwise": 1.5277564525604248, "time_global": 1.5083923935890198} +{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.5164109170436859, "standard_gw": 0.5367249250411987, "standard_gx": 0.5876161158084869, "rowwise_fwd": 0.3132447600364685, "rowwise_bwd": 0.3396235406398773, "global_fwd": 0.2943649888038635, "global_bwd": 0.33209100365638733, "x_quantize_rowwise": 0.12357160449028015, "g_quantize_rowwise": 0.035997480154037476, "w_quantize_rowwise": 0.03213062882423401, "w_quantize_colwise_transpose": 0.17676874995231628, "w_quantize_global": 0.09861215949058533, "w_quantize_global_transpose": 0.0998862087726593, "time_standard": 1.6407519578933716, "time_rowwise": 1.5580616891384125, "time_global": 1.5212483704090118} +{"repeat": 64, "batch_size": 16384, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 1.2096501886844635, "standard_gw": 1.0663382709026337, "standard_gx": 1.0961703956127167, "rowwise_fwd": 0.6396733224391937, "rowwise_bwd": 0.5173943936824799, "global_fwd": 0.6296299397945404, "global_bwd": 0.5130060017108917, "x_quantize_rowwise": 0.06211921572685242, "g_quantize_rowwise": 0.2361498773097992, "w_quantize_rowwise": 0.03260001540184021, "w_quantize_colwise_transpose": 0.17679482698440552, "w_quantize_global": 0.09361281991004944, "w_quantize_global_transpose": 0.09913742542266846, "time_standard": 3.372158855199814, "time_rowwise": 2.7310699224472046, "time_global": 2.6999935507774353} +{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 1.1065565049648285, "standard_gw": 1.0664314031600952, "standard_gx": 1.1266544461250305, "rowwise_fwd": 0.5352050065994263, "rowwise_bwd": 0.6464086472988129, "global_fwd": 0.513765960931778, "global_bwd": 0.6284862756729126, "x_quantize_rowwise": 0.23620948195457458, "g_quantize_rowwise": 0.062271952629089355, "w_quantize_rowwise": 0.031460076570510864, "w_quantize_colwise_transpose": 0.17675384879112244, "w_quantize_global": 0.09486451745033264, "w_quantize_global_transpose": 0.09898096323013306, "time_standard": 3.2996423542499542, "time_rowwise": 
2.7547404170036316, "time_global": 2.7010105550289154} +{"repeat": 64, "batch_size": 32768, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 2.4367496371269226, "standard_gw": 2.0806193351745605, "standard_gx": 2.19624862074852, "rowwise_fwd": 1.2554042041301727, "rowwise_bwd": 1.0227933526039124, "global_fwd": 1.2322552502155304, "global_bwd": 1.0152235627174377, "x_quantize_rowwise": 0.11792033910751343, "g_quantize_rowwise": 0.4639364778995514, "w_quantize_rowwise": 0.03241002559661865, "w_quantize_colwise_transpose": 0.17657503485679626, "w_quantize_global": 0.09655207395553589, "w_quantize_global_transpose": 0.09958073496818542, "time_standard": 6.713617593050003, "time_rowwise": 5.149658769369125, "time_global": 5.106087774038315} +{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 2.1935217082500458, "standard_gw": 2.0055584609508514, "standard_gx": 2.1882541477680206, "rowwise_fwd": 1.0396353900432587, "rowwise_bwd": 1.2542344629764557, "global_fwd": 1.0161921381950378, "global_bwd": 1.233428716659546, "x_quantize_rowwise": 0.4642195999622345, "g_quantize_rowwise": 0.11782720685005188, "w_quantize_rowwise": 0.033117830753326416, "w_quantize_colwise_transpose": 0.17696991562843323, "w_quantize_global": 0.09416043758392334, "w_quantize_global_transpose": 0.10101497173309326, "time_standard": 6.387334316968918, "time_rowwise": 5.091562867164612, "time_global": 5.032401531934738} +{"repeat": 64, "batch_size": 65536, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 4.804681986570358, "standard_gw": 4.763372242450714, "standard_gx": 4.064023494720459, "rowwise_fwd": 2.484843134880066, "rowwise_bwd": 1.9691288471221924, "global_fwd": 2.441786229610443, "global_bwd": 1.9574686884880066, "x_quantize_rowwise": 0.2294592559337616, "g_quantize_rowwise": 0.9196549654006958, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.1768544316291809, "w_quantize_global": 0.09644776582717896, "w_quantize_global_transpose": 0.09847059845924377, "time_standard": 13.632077723741531, "time_rowwise": 10.574690997600555, "time_global": 10.506659746170044} +{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 4.0907710790634155, "standard_gw": 3.9793066680431366, "standard_gx": 4.302978515625, "rowwise_fwd": 1.992940902709961, "rowwise_bwd": 2.4996213614940643, "global_fwd": 1.9551962614059448, "global_bwd": 2.457551658153534, "x_quantize_rowwise": 0.9200014173984528, "g_quantize_rowwise": 0.2293996512889862, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.17882883548736572, "w_quantize_global": 0.09540095925331116, "w_quantize_global_transpose": 0.09880587458610535, "time_standard": 12.373056262731552, "time_rowwise": 9.831476956605911, "time_global": 9.73566249012947} +{"repeat": 64, "batch_size": 131072, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 9.655728936195374, "standard_gw": 8.261296898126602, "standard_gx": 8.064884692430496, "rowwise_fwd": 5.007706582546234, "rowwise_bwd": 3.8615092635154724, "global_fwd": 4.920527338981628, "global_bwd": 3.8330331444740295, "x_quantize_rowwise": 0.45276060700416565, "g_quantize_rowwise": 1.8306002020835876, "w_quantize_rowwise": 0.031366944313049316, "w_quantize_colwise_transpose": 0.1766495406627655, "w_quantize_global": 0.09412690997123718, "w_quantize_global_transpose": 0.09780004620552063, "time_standard": 
25.981910526752472, "time_rowwise": 19.621890038251877, "time_global": 19.49014514684677} +{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 8.033104240894318, "standard_gw": 8.2889124751091, "standard_gx": 8.622754365205765, "rowwise_fwd": 3.8747042417526245, "rowwise_bwd": 5.003921687602997, "global_fwd": 3.8315393030643463, "global_bwd": 4.9162134528160095, "x_quantize_rowwise": 1.8304847180843353, "g_quantize_rowwise": 0.4522763192653656, "w_quantize_rowwise": 0.03413110971450806, "w_quantize_colwise_transpose": 0.1771189272403717, "w_quantize_global": 0.09519979357719421, "w_quantize_global_transpose": 0.09930506348609924, "time_standard": 24.944771081209183, "time_rowwise": 19.661549478769302, "time_global": 19.51393112540245} +{"repeat": 64, "batch_size": 8192, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.7954612374305725, "standard_gw": 0.7456131279468536, "standard_gx": 0.8799619972705841, "rowwise_fwd": 0.43267011642456055, "rowwise_bwd": 0.34622475504875183, "global_fwd": 0.42615458369255066, "global_bwd": 0.344250351190567, "x_quantize_rowwise": 0.03748014569282532, "g_quantize_rowwise": 0.13304129242897034, "w_quantize_rowwise": 0.03294646739959717, "w_quantize_colwise_transpose": 0.2407953143119812, "w_quantize_global": 0.094633549451828, "w_quantize_global_transpose": 0.10305643081665039, "time_standard": 2.4210363626480103, "time_rowwise": 1.96877121925354, "time_global": 1.8842294812202454} +{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.7120333611965179, "standard_gw": 0.7622130215167999, "standard_gx": 0.8262209594249725, "rowwise_fwd": 0.3702230751514435, "rowwise_bwd": 0.4419572651386261, "global_fwd": 0.3479123115539551, "global_bwd": 0.4306286573410034, "x_quantize_rowwise": 0.13308599591255188, "g_quantize_rowwise": 0.037495046854019165, "w_quantize_rowwise": 0.03398209810256958, "w_quantize_colwise_transpose": 0.23782625794410706, "w_quantize_global": 0.09853765368461609, "w_quantize_global_transpose": 0.10247156023979187, "time_standard": 2.3004673421382904, "time_rowwise": 2.016782760620117, "time_global": 1.9123442471027374} +{"repeat": 64, "batch_size": 16384, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 1.6292817890644073, "standard_gw": 1.5109702944755554, "standard_gx": 1.482747495174408, "rowwise_fwd": 0.8386112749576569, "rowwise_bwd": 0.6844550371170044, "global_fwd": 0.8220970630645752, "global_bwd": 0.6802082061767578, "x_quantize_rowwise": 0.06883963942527771, "g_quantize_rowwise": 0.25641173124313354, "w_quantize_rowwise": 0.033054500818252563, "w_quantize_colwise_transpose": 0.24027004837989807, "w_quantize_global": 0.0967271625995636, "w_quantize_global_transpose": 0.102948397397995, "time_standard": 4.622999578714371, "time_rowwise": 3.6326125264167786, "time_global": 3.5382024943828583} +{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 1.4877021312713623, "standard_gw": 1.5015341341495514, "standard_gx": 1.529306173324585, "rowwise_fwd": 0.715944916009903, "rowwise_bwd": 0.8529908955097198, "global_fwd": 0.680088996887207, "global_bwd": 0.8224695920944214, "x_quantize_rowwise": 0.2568177878856659, "g_quantize_rowwise": 0.06864592432975769, "w_quantize_rowwise": 0.03343448042869568, "w_quantize_colwise_transpose": 0.23645907640457153, "w_quantize_global": 0.09399279952049255, 
"w_quantize_global_transpose": 0.10286271572113037, "time_standard": 4.518542438745499, "time_rowwise": 3.665827214717865, "time_global": 3.5264119505882263} +{"repeat": 64, "batch_size": 32768, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 3.261040896177292, "standard_gw": 2.8816498816013336, "standard_gx": 2.8357282280921936, "rowwise_fwd": 1.6594752669334412, "rowwise_bwd": 1.359265297651291, "global_fwd": 1.6287527978420258, "global_bwd": 1.3503879308700562, "x_quantize_rowwise": 0.13146549463272095, "g_quantize_rowwise": 0.5035959184169769, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.24086236953735352, "w_quantize_global": 0.0945068895816803, "w_quantize_global_transpose": 0.10332837700843811, "time_standard": 8.978419005870819, "time_rowwise": 6.8106986582279205, "time_global": 6.693687289953232} +{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 2.848360687494278, "standard_gw": 2.8955675661563873, "standard_gx": 3.0499882996082306, "rowwise_fwd": 1.3900883495807648, "rowwise_bwd": 1.6595833003520966, "global_fwd": 1.3514049351215363, "global_bwd": 1.629263162612915, "x_quantize_rowwise": 0.5036592483520508, "g_quantize_rowwise": 0.13118237257003784, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.23709610104560852, "w_quantize_global": 0.0951625406742096, "w_quantize_global_transpose": 0.10216236114501953, "time_standard": 8.793916553258896, "time_rowwise": 6.851561367511749, "time_global": 6.708402186632156} +{"repeat": 64, "batch_size": 65536, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 6.4978525042533875, "standard_gw": 6.462603807449341, "standard_gx": 5.5987648665905, "rowwise_fwd": 3.2996535301208496, "rowwise_bwd": 2.6320070028305054, "global_fwd": 3.2426007091999054, "global_bwd": 2.612769603729248, "x_quantize_rowwise": 0.2561397850513458, "g_quantize_rowwise": 0.9984448552131653, "w_quantize_rowwise": 0.033076852560043335, "w_quantize_colwise_transpose": 0.24232640862464905, "w_quantize_global": 0.09618699550628662, "w_quantize_global_transpose": 0.10257214307785034, "time_standard": 18.559221178293228, "time_rowwise": 13.9242522418499, "time_global": 13.771317899227142} +{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 5.5702440440654755, "standard_gw": 5.717620253562927, "standard_gx": 6.08203187584877, "rowwise_fwd": 2.649586647748947, "rowwise_bwd": 3.315173089504242, "global_fwd": 2.6132799685001373, "global_bwd": 3.257807344198227, "x_quantize_rowwise": 0.9980201721191406, "g_quantize_rowwise": 0.256560742855072, "w_quantize_rowwise": 0.03356859087944031, "w_quantize_colwise_transpose": 0.23729726672172546, "w_quantize_global": 0.09495764970779419, "w_quantize_global_transpose": 0.103779137134552, "time_standard": 17.369896173477173, "time_rowwise": 13.207826763391495, "time_global": 13.04202526807785} +{"repeat": 64, "batch_size": 131072, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 13.058379292488098, "standard_gw": 11.480242013931274, "standard_gx": 11.092845350503922, "rowwise_fwd": 6.637874990701675, "rowwise_bwd": 5.24790957570076, "global_fwd": 6.521012634038925, "global_bwd": 5.214303731918335, "x_quantize_rowwise": 0.5057565867900848, "g_quantize_rowwise": 1.989319920539856, "w_quantize_rowwise": 0.03439188003540039, "w_quantize_colwise_transpose": 0.24280324578285217, "w_quantize_global": 
0.09520724415779114, "w_quantize_global_transpose": 0.10240450501441956, "time_standard": 35.631466656923294, "time_rowwise": 26.138298213481903, "time_global": 25.908246636390686} +{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 11.13397628068924, "standard_gw": 11.371888220310211, "standard_gx": 12.12756335735321, "rowwise_fwd": 5.2495077252388, "rowwise_bwd": 6.638709455728531, "global_fwd": 5.215313285589218, "global_bwd": 6.5222084522247314, "x_quantize_rowwise": 1.9870512187480927, "g_quantize_rowwise": 0.5058236420154572, "w_quantize_rowwise": 0.034634023904800415, "w_quantize_colwise_transpose": 0.23674964904785156, "w_quantize_global": 0.09457767009735107, "w_quantize_global_transpose": 0.10183081030845642, "time_standard": 34.63342785835266, "time_rowwise": 26.024363934993744, "time_global": 25.798693299293518} +{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 1.2125298380851746, "standard_gw": 1.1111274361610413, "standard_gx": 1.0840706527233124, "rowwise_fwd": 0.6057210266590118, "rowwise_bwd": 0.51865354180336, "global_fwd": 0.5952082574367523, "global_bwd": 0.5167685449123383, "x_quantize_rowwise": 0.045686960220336914, "g_quantize_rowwise": 0.15827640891075134, "w_quantize_rowwise": 0.04361197352409363, "w_quantize_colwise_transpose": 0.34067779779434204, "w_quantize_global": 0.13644620776176453, "w_quantize_global_transpose": 0.14925003051757812, "time_standard": 3.407727926969528, "time_rowwise": 2.823755145072937, "time_global": 2.7127638459205627} +{"repeat": 64, "batch_size": 8192, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 1.0731369256973267, "standard_gw": 1.1365897953510284, "standard_gx": 1.1498592793941498, "rowwise_fwd": 0.5573518574237823, "rowwise_bwd": 0.615488737821579, "global_fwd": 0.5220361053943634, "global_bwd": 0.5939789116382599, "x_quantize_rowwise": 0.15765801072120667, "g_quantize_rowwise": 0.04369020462036133, "w_quantize_rowwise": 0.047359615564346313, "w_quantize_colwise_transpose": 0.5526281893253326, "w_quantize_global": 0.13606995344161987, "w_quantize_global_transpose": 0.15017390251159668, "time_standard": 3.359586000442505, "time_rowwise": 3.1107664108276367, "time_global": 2.7401968836784363} +{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 2.4274885654449463, "standard_gw": 2.1799951791763306, "standard_gx": 2.1426528692245483, "rowwise_fwd": 1.195710152387619, "rowwise_bwd": 1.027170568704605, "global_fwd": 1.1747106909751892, "global_bwd": 1.0251589119434357, "x_quantize_rowwise": 0.08098781108856201, "g_quantize_rowwise": 0.3052949905395508, "w_quantize_rowwise": 0.043764710426330566, "w_quantize_colwise_transpose": 0.33987686038017273, "w_quantize_global": 0.13646483421325684, "w_quantize_global_transpose": 0.14739856123924255, "time_standard": 6.750136613845825, "time_rowwise": 5.172800272703171, "time_global": 5.050010979175568} +{"repeat": 64, "batch_size": 16384, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 2.1661892533302307, "standard_gw": 2.0948275923728943, "standard_gx": 2.306375652551651, "rowwise_fwd": 1.0587647557258606, "rowwise_bwd": 1.1999905109405518, "global_fwd": 1.0296404361724854, "global_bwd": 1.1749230325222015, "x_quantize_rowwise": 0.3054030239582062, "g_quantize_rowwise": 0.08077546954154968, "w_quantize_rowwise": 0.047225505113601685, "w_quantize_colwise_transpose": 
0.600133091211319, "w_quantize_global": 0.13613328337669373, "w_quantize_global_transpose": 0.1484006643295288, "time_standard": 6.567392498254776, "time_rowwise": 5.387119948863983, "time_global": 4.97010350227356} +{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 4.807606339454651, "standard_gw": 4.170913249254227, "standard_gx": 4.117622971534729, "rowwise_fwd": 2.370934933423996, "rowwise_bwd": 1.9481778144836426, "global_fwd": 2.3383721709251404, "global_bwd": 1.9443817436695099, "x_quantize_rowwise": 0.1547597348690033, "g_quantize_rowwise": 0.6000511348247528, "w_quantize_rowwise": 0.04361942410469055, "w_quantize_colwise_transpose": 0.3403201699256897, "w_quantize_global": 0.13600289821624756, "w_quantize_global_transpose": 0.1474134624004364, "time_standard": 13.096142560243607, "time_rowwise": 9.628776460886002, "time_global": 9.491894394159317} +{"repeat": 64, "batch_size": 32768, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 4.1619837284088135, "standard_gw": 4.181284457445145, "standard_gx": 4.635505378246307, "rowwise_fwd": 1.9684135913848877, "rowwise_bwd": 2.3750364780426025, "global_fwd": 1.9445866346359253, "global_bwd": 2.3551955819129944, "x_quantize_rowwise": 0.6004162132740021, "g_quantize_rowwise": 0.15468522906303406, "w_quantize_rowwise": 0.04730746150016785, "w_quantize_colwise_transpose": 0.5999617278575897, "w_quantize_global": 0.1364201307296753, "w_quantize_global_transpose": 0.14847144484519958, "time_standard": 12.978773564100266, "time_rowwise": 9.927105158567429, "time_global": 9.521059691905975} +{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 9.52371209859848, "standard_gw": 8.354485034942627, "standard_gx": 8.69860127568245, "rowwise_fwd": 4.717472940683365, "rowwise_bwd": 3.8843750953674316, "global_fwd": 4.645414650440216, "global_bwd": 3.8761012256145477, "x_quantize_rowwise": 0.3024861216545105, "g_quantize_rowwise": 1.1897757649421692, "w_quantize_rowwise": 0.04366785287857056, "w_quantize_colwise_transpose": 0.33988431096076965, "w_quantize_global": 0.1359507441520691, "w_quantize_global_transpose": 0.14724582433700562, "time_standard": 26.576798409223557, "time_rowwise": 18.832147121429443, "time_global": 18.651459366083145} +{"repeat": 64, "batch_size": 65536, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 8.307881653308868, "standard_gw": 8.214320987462997, "standard_gx": 9.21182706952095, "rowwise_fwd": 3.8919784128665924, "rowwise_bwd": 4.72346693277359, "global_fwd": 3.8761794567108154, "global_bwd": 4.673641175031662, "x_quantize_rowwise": 1.1893920600414276, "g_quantize_rowwise": 0.3024972975254059, "w_quantize_rowwise": 0.04708021879196167, "w_quantize_colwise_transpose": 0.6039328873157501, "w_quantize_global": 0.13624504208564758, "w_quantize_global_transpose": 0.14867261052131653, "time_standard": 25.734029710292816, "time_rowwise": 18.972668796777725, "time_global": 18.540948629379272} +{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 19.30372044444084, "standard_gw": 16.480475664138794, "standard_gx": 17.61433482170105, "rowwise_fwd": 9.49602946639061, "rowwise_bwd": 7.768530398607254, "global_fwd": 9.3533955514431, "global_bwd": 7.749464362859726, "x_quantize_rowwise": 0.5977451801300049, "g_quantize_rowwise": 2.3684948682785034, "w_quantize_rowwise": 0.04375725984573364, 
"w_quantize_colwise_transpose": 0.34042075276374817, "w_quantize_global": 0.13628974556922913, "w_quantize_global_transpose": 0.14671683311462402, "time_standard": 53.398530930280685, "time_rowwise": 37.09545359015465, "time_global": 36.83258220553398} +{"repeat": 64, "batch_size": 131072, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 18.041003495454788, "standard_gw": 17.770148813724518, "standard_gx": 17.70009845495224, "rowwise_fwd": 7.756810635328293, "rowwise_bwd": 9.502101689577103, "global_fwd": 7.7384114265441895, "global_bwd": 9.36170294880867, "x_quantize_rowwise": 2.3686252534389496, "g_quantize_rowwise": 0.5980581045150757, "w_quantize_rowwise": 0.04723668098449707, "w_quantize_colwise_transpose": 0.6035342812538147, "w_quantize_global": 0.13603642582893372, "w_quantize_global_transpose": 0.1485198736190796, "time_standard": 53.511250764131546, "time_rowwise": 38.64651545882225, "time_global": 38.121502846479416} +{"repeat": 64, "batch_size": 8192, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 4.598241299390793, "standard_gw": 4.294309765100479, "standard_gx": 4.261095076799393, "rowwise_fwd": 2.0976848900318146, "rowwise_bwd": 1.9718967378139496, "global_fwd": 2.0763762295246124, "global_bwd": 1.9703581929206848, "x_quantize_rowwise": 0.08216872811317444, "g_quantize_rowwise": 0.4405900835990906, "w_quantize_rowwise": 0.1553371548652649, "w_quantize_colwise_transpose": 1.6110725700855255, "w_quantize_global": 0.481240451335907, "w_quantize_global_transpose": 0.5061514675617218, "time_standard": 13.153646141290665, "time_rowwise": 10.653059929609299, "time_global": 9.85119491815567} +{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 4.35885414481163, "standard_gw": 4.29583340883255, "standard_gx": 4.5370906591415405, "rowwise_fwd": 2.0015686750411987, "rowwise_bwd": 2.097565680742264, "global_fwd": 1.969795674085617, "global_bwd": 2.075403928756714, "x_quantize_rowwise": 0.43984130024909973, "g_quantize_rowwise": 0.08216127753257751, "w_quantize_rowwise": 0.22544339299201965, "w_quantize_colwise_transpose": 2.4342015385627747, "w_quantize_global": 0.48087164759635925, "w_quantize_global_transpose": 0.5099289119243622, "time_standard": 13.19177821278572, "time_rowwise": 11.576615273952484, "time_global": 9.85383614897728} +{"repeat": 64, "batch_size": 16384, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 9.09888744354248, "standard_gw": 8.230950683355331, "standard_gx": 8.465446531772614, "rowwise_fwd": 4.182614386081696, "rowwise_bwd": 3.747660666704178, "global_fwd": 4.138719290494919, "global_bwd": 3.74777615070343, "x_quantize_rowwise": 0.15515834093093872, "g_quantize_rowwise": 0.8699297904968262, "w_quantize_rowwise": 0.15544891357421875, "w_quantize_colwise_transpose": 1.6132444143295288, "w_quantize_global": 0.48100948333740234, "w_quantize_global_transpose": 0.5051903426647186, "time_standard": 25.795284658670425, "time_rowwise": 18.955007195472717, "time_global": 18.128734081983566} +{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 8.378107100725174, "standard_gw": 8.923027664422989, "standard_gx": 9.049762040376663, "rowwise_fwd": 3.765825182199478, "rowwise_bwd": 4.183519631624222, "global_fwd": 3.744799643754959, "global_bwd": 4.1590481996536255, "x_quantize_rowwise": 0.8693933486938477, "g_quantize_rowwise": 0.1553073525428772, "w_quantize_rowwise": 
0.2258792519569397, "w_quantize_colwise_transpose": 2.4386271834373474, "w_quantize_global": 0.4811100661754608, "w_quantize_global_transpose": 0.5102269351482391, "time_standard": 26.350896805524826, "time_rowwise": 20.5615796148777, "time_global": 18.842913210392} +{"repeat": 64, "batch_size": 32768, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 18.266115337610245, "standard_gw": 17.671160399913788, "standard_gx": 17.10302010178566, "rowwise_fwd": 8.347474038600922, "rowwise_bwd": 7.514089345932007, "global_fwd": 8.263226598501205, "global_bwd": 7.487393915653229, "x_quantize_rowwise": 0.3021806478500366, "g_quantize_rowwise": 1.7319358885288239, "w_quantize_rowwise": 0.15519559383392334, "w_quantize_colwise_transpose": 1.6133114695549011, "w_quantize_global": 0.48247724771499634, "w_quantize_global_transpose": 0.506427139043808, "time_standard": 53.04029583930969, "time_rowwise": 37.3353473842144, "time_global": 36.44480183720589} +{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 17.73649826645851, "standard_gw": 16.359902918338776, "standard_gx": 18.0993489921093, "rowwise_fwd": 7.493957877159119, "rowwise_bwd": 8.352488279342651, "global_fwd": 7.486194372177124, "global_bwd": 8.28903540968895, "x_quantize_rowwise": 1.7313472926616669, "g_quantize_rowwise": 0.30205026268959045, "w_quantize_rowwise": 0.2255477011203766, "w_quantize_colwise_transpose": 2.4363920092582703, "w_quantize_global": 0.4815347492694855, "w_quantize_global_transpose": 0.5103759467601776, "time_standard": 52.195750176906586, "time_rowwise": 36.90168634057045, "time_global": 35.16044095158577} +{"repeat": 64, "batch_size": 65536, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 36.309611052274704, "standard_gw": 32.85098075866699, "standard_gx": 34.34552624821663, "rowwise_fwd": 16.74525812268257, "rowwise_bwd": 15.026237815618515, "global_fwd": 16.574162989854813, "global_bwd": 14.977734535932541, "x_quantize_rowwise": 0.5954466760158539, "g_quantize_rowwise": 3.4569576382637024, "w_quantize_rowwise": 0.15521422028541565, "w_quantize_colwise_transpose": 1.6133897006511688, "w_quantize_global": 0.4822872579097748, "w_quantize_global_transpose": 0.5065612494945526, "time_standard": 103.50611805915833, "time_rowwise": 70.44348493218422, "time_global": 69.44413110613823} +{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 35.40017828345299, "standard_gw": 33.037226647138596, "standard_gx": 36.30436211824417, "rowwise_fwd": 15.043705701828003, "rowwise_bwd": 16.756191849708557, "global_fwd": 15.011314302682877, "global_bwd": 16.580048948526382, "x_quantize_rowwise": 3.4548528492450714, "g_quantize_rowwise": 0.5951337516307831, "w_quantize_rowwise": 0.22584572434425354, "w_quantize_colwise_transpose": 2.4329908192157745, "w_quantize_global": 0.4813261330127716, "w_quantize_global_transpose": 0.5101598799228668, "time_standard": 104.74176704883575, "time_rowwise": 71.54594734311104, "time_global": 69.67006251215935} +{"repeat": 64, "batch_size": 131072, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 73.40333238244057, "standard_gw": 73.76311346888542, "standard_gx": 70.41774317622185, "rowwise_fwd": 33.37597846984863, "rowwise_bwd": 30.345775187015533, "global_fwd": 33.00366923213005, "global_bwd": 30.218638479709625, "x_quantize_rowwise": 1.1825822293758392, "g_quantize_rowwise": 6.902601569890976, 
"w_quantize_rowwise": 0.15529245138168335, "w_quantize_colwise_transpose": 1.6109198331832886, "w_quantize_global": 0.48149004578590393, "w_quantize_global_transpose": 0.5066059529781342, "time_standard": 217.58418902754784, "time_rowwise": 147.33626320958138, "time_global": 146.05870097875595} +{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 71.5160183608532, "standard_gw": 73.76786693930626, "standard_gx": 72.98104092478752, "rowwise_fwd": 30.291248112916946, "rowwise_bwd": 33.36654230952263, "global_fwd": 30.181586742401123, "global_bwd": 33.082425594329834, "x_quantize_rowwise": 6.902430206537247, "g_quantize_rowwise": 1.1815279722213745, "w_quantize_rowwise": 0.2262219786643982, "w_quantize_colwise_transpose": 2.4421699345111847, "w_quantize_global": 0.4816502332687378, "w_quantize_global_transpose": 0.5105249583721161, "time_standard": 218.26492622494698, "time_rowwise": 148.17800745368004, "time_global": 146.1080126464367} diff --git a/tests/triton_tests/make_plot_with_info.py b/speed_benchmark/make_plot_with_jsonl.py similarity index 82% rename from tests/triton_tests/make_plot_with_info.py rename to speed_benchmark/make_plot_with_jsonl.py index 116d1d1..0920851 100644 --- a/tests/triton_tests/make_plot_with_info.py +++ b/speed_benchmark/make_plot_with_jsonl.py @@ -12,12 +12,18 @@ if __name__ == '__main__': fig = plt.figure(tight_layout=True, figsize=(12,3.5)) gs = gridspec.GridSpec(1, 2) + dims_to_consider = [1024, 1280, 1408, 1664, 2048, 4096] + batch_size_for_plot1 = 32768 + batch_sizes_for_plot2 = [2**14, 2**15, 2**16, 2**17] + dims_to_xtick = [1024, 2048, 4096] + logscale_plot1 = True ax = fig.add_subplot(gs[0, 0]) - rdf = pd.read_json('tests/triton_tests/info.jsonl', lines=True) - df = rdf[rdf.batch_size == 32768] + rdf = pd.read_json('speed_benchmark/info_a100_py2.jsonl', lines=True) + df = rdf[rdf.batch_size == batch_size_for_plot1] + # first plot the time occupied by different operations for k, marker, ls, color, name in [ ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (sum of parts)'), ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (sum of parts)'), @@ -29,17 +35,15 @@ if __name__ == '__main__': ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), - #### time_global = info['x_quantize_rowwise'] + info['g_quantize_rowwise'] + info['w_quantize_global'] + info['w_quantize_global_transpose'] + info['standard_gw'] + info['global_fwd'] + info['global_bwd'] - ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'), - #('standard_gw', '.', '--', 'C1', 'standard_gw'), ]: xs = [] ys = [] - for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]: + for embed_dim in dims_to_consider: + # average over dim -> 4*dim and 4*dim -> dim df_ = df[df.dim_in == embed_dim] df_ = df_[df_.dim_out == embed_dim * 4] xs.append(embed_dim) @@ -56,24 +60,20 @@ if __name__ == '__main__': ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) 
- - ax.set_xlabel('dim', fontsize=13) ax.set_ylabel('time (ms)', fontsize=13) - # make a legend which is below the plot - - ax.grid() ax.set_xscale('log') - #ax.set_yscale('log') + if logscale_plot1: + ax.set_yscale('log') ax.tick_params(axis='x', labelsize=11) ax.tick_params(axis='y', labelsize=11) - ax.set_xticks([1024, 2048, 4096]) - ax.set_xticklabels([1024, 2048, 4096]) + ax.set_xticks(dims_to_xtick) + ax.set_xticklabels(dims_to_xtick) ax.set_xticks([], minor=True) leg = ax.legend(loc='upper center', bbox_to_anchor=(-0.64, 1.), ncol=1, fontsize=10) @@ -86,7 +86,7 @@ if __name__ == '__main__': ax = fig.add_subplot(gs[0, 1]) # now plot the % speedup for different batch sizes - for j, batch_size in enumerate([2**14, 2**15, 2**16, 2**17]): + for j, batch_size in enumerate(batch_sizes_for_plot2): all_xs, all_ys = [], [] for k, marker, ls, color, name in [ ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), @@ -95,7 +95,7 @@ if __name__ == '__main__': xs, ys = [], [] df = rdf[rdf.batch_size == batch_size] - for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]: + for embed_dim in dims_to_consider: df_ = df[df.dim_in == embed_dim] df_ = df_[df_.dim_out == embed_dim * 4] xs.append(embed_dim) @@ -125,13 +125,13 @@ if __name__ == '__main__': ax.tick_params(axis='x', labelsize=11) ax.tick_params(axis='y', labelsize=11) - ax.set_xticks([1024, 2048, 4096]) - ax.set_xticklabels([1024, 2048, 4096]) + ax.set_xticks(dims_to_xtick) + ax.set_xticklabels(dims_to_xtick) ax.set_xticks([], minor=True) ax.set_title(' Linear layer summary, varying dimensions', fontsize=10, loc='left', y=1.05, pad=-20) - plt.savefig('tests/triton_tests/plot1.pdf', bbox_inches='tight') + plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight') diff --git a/tests/triton_tests/plot1.pdf b/speed_benchmark/plot_with_info.pdf similarity index 76% rename from tests/triton_tests/plot1.pdf rename to speed_benchmark/plot_with_info.pdf index 1fe71682174766b2d551d9aa055a72e6eb837737..d186e91b7d96c6e605fd2802ee37881e6294cdd7 100644 Binary files a/tests/triton_tests/plot1.pdf and b/speed_benchmark/plot_with_info.pdf differ
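For reference when reading the JSONL records above: each line decomposes one linear-layer training step into its matmul and quantization kernels, and the aggregate columns are plain sums of those parts. time_standard sums standard_fwd + standard_gw + standard_gx; the comment removed from make_plot_with_jsonl.py spells out the time_global decomposition, and time_rowwise follows the same pattern with the row-wise kernels. Below is a minimal sketch, not part of the patch, that recomputes the aggregates from the raw fields; the time_rowwise decomposition and the 1e-6 tolerance are inferred from the data, not stated in the diff.

import pandas as pd

df = pd.read_json('speed_benchmark/info_a100_py2.jsonl', lines=True)

# Standard fp16 step: forward + weight-gradient + input-gradient matmuls.
standard = df[['standard_fwd', 'standard_gw', 'standard_gx']].sum(axis=1)

# Row-wise int8: quantize x, g, and w (plus the column-wise transpose of w),
# keep the fp16 weight-gradient matmul, use int8 matmuls for fwd/bwd.
# (Decomposition inferred from the data, mirroring the '+'-joined plot keys.)
rowwise = df[['x_quantize_rowwise', 'g_quantize_rowwise', 'w_quantize_rowwise',
              'w_quantize_colwise_transpose', 'standard_gw',
              'rowwise_fwd', 'rowwise_bwd']].sum(axis=1)

# SwitchBack with globally quantized W, per the comment removed in the script.
global_ = df[['x_quantize_rowwise', 'g_quantize_rowwise', 'w_quantize_global',
              'w_quantize_global_transpose', 'standard_gw',
              'global_fwd', 'global_bwd']].sum(axis=1)

# The stored aggregate columns should match the recomputed sums up to rounding.
assert (standard - df['time_standard']).abs().max() < 1e-6
assert (rowwise - df['time_rowwise']).abs().max() < 1e-6
assert (global_ - df['time_global']).abs().max() < 1e-6

# Speedup of SwitchBack over standard fp16, as in the plot's second panel.
print((standard / global_).describe())

With the rename in place, regenerating the figure should just be a matter of running python speed_benchmark/make_plot_with_jsonl.py from the repository root, which writes speed_benchmark/plot_with_info.pdf.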
zq_|M>3FJ%rwrqW!+{_ETOR*ej!{pAhH8OX9m_(e)=8z{#zt#77`l4XzBPr=v;p0q_ z?e=J<3AzLw2fVNt71HUnVKt_up&^t$!ej`$2MI>!cHSX)2~FrEOvh}(Ee;?>6Qn(q zF_M|NIgFJiAg7be-P?c}j#N@N9X-48d-rIFTFqr%9%;$2nLv*HNBn@+gR(|U)*wZW zlz5>NByf6swqDkwIWJ^yqPGPhFCroWK5~F?IU)dp44emRsMprk=mfWe+h;#~SQ4oQ zL984$OR%N*6`YtM_+wHKGH#Q9r6rvPccSw~9e^9V2L8X5!AK2*H&?+L{yz%PDbCU= zkZUlE5{4_pOgsir(A5LS61F0b>Gx+5;ByD&amJ-xXGlmOS6FSJ>_!i0gQLUXu70*L zhD*XQm-3>yYx@ig4mwXyl3|AG?afIh!0n2UA5Uh!y|yr_g?WdR1dC!}$b8S$qhFqq zIB^2JTNQkfqrLA>pfj~X-9iBjKvM^LHUpXl5{XXZo3+uNcj6%OgS@6o=XVoC3FJGX z`vWKQYjiXK&>OPg5JVWl2?i66?Apw!#IhJc(=el~=SabEadCrov^D45|I@0TxC&QR zRpC4CVla^KY5j!><`9y+pd8en&YBpVU7$wM4x7YOp;>hdADX*@Z4VN^{3j( zaV(?xREp0cZ&aK(vaf=@U~NZWq2RaVH7X+yYX9F12A%OSf8G*iHQ1~Mp_H7LW`ik? zsAfc-DJA`nGl2+pGE0w^nVFd9<<>~z0DVJkf&A-N=G}Pl2l!B^Ui6M}Fl?(+--?Kc z_-7aiV0cWjhZRV~nuz-%OXwOfK94zP2OE~`GD78jFxIYRXTeH}c^ zVIuD#Z{w0JUn3I}k)|u5HWRaRxY{$&2}DCvP;iV89LdQmdU_6G^c2*N)0F{iAi=UC zkgb4ERetq+B(Mi`%S@zBVu}qsyPmWZVI@MfB$Ds$-Mb}EKEh}f5kYvgLEHi|@wb=y zdkT#bE%^8mourne4Rv)#tJhFYcubTtGc%zY+Cn=wLc*t5snK z>p^BhN&F1w6h>abs%gl}lc3nOD5t(^v4QwV_!J2WbFUfdB;izt`yC2Tir&ll0_Ar( zICXU}ERqI96}$*m{TYq9SPh3fw0b!xVyU-59<+;M+%G7sfT3y?jTHg*4r|) z?h#-EH*KAr-eR-wxE&p0%n~Ap=gLqlY@?BYbh9mIS+y`PPy&f*yabj961;0*tYI3( z6lO9>DAkonu;>9newe=KTX1q#+HgWN434*{|2z{1_=wB3{|s8;)@gtG)|2yk&| zm+i^&9CTVh$=xp&yr8ghz&)SJ-| zAB8bXd!Cq`2%V~d_HX9Q50f4#yW3sPU9BaUwB~1L_n@*6YYmqH=4Zn&<9Ect=*`uv z03o|VCNQRG2W#(=QMBh-1xLiD9bs29d)>fZZN2#Rp_A5Yc-}%Vpaa>Ou!4{O;3wv% zF^2vD@j%#fA~?(}EKb3Mt)!-=>-5mZxsT2d^$mHB@bV)Ucx9U$AUl-f;K=}VWr6=l zR2rjU9y7msRo&B58hiRh=YGq7>QjzkJ{OD!m(chSlfh7joiN-=T*WPPNov$oJ5 z{7}WTFn9ax1vV8U7dNdVG4Mc`ytX@n1HKZ(w_Wk~Nw<%5P@A3;eqEwZN=jmZZyBa{ zf)pmavp0>d4*_pn=_(3Ey(Qj^0O8>?Kl8)g>b$g%DH-+zAm zMA0B2awrAEF2>jA*q|OZE30hir{8p{#J{Dh^`p@^DedOxk)J$Obm1$ za)FDY8()rdD5u@+jr2-ZRgZ(Ob`~0`Hb5Rset|(jztCKeEQ~K7Te~G*Y zbW$D0@frhA$e?d8x+(o|ziUaaiB#$PkHDoR`Ipe-K!ZTzfF(H;c^v*D!g^J$vy*^_ z#9K^oW0*P;BXFng2OeeV5*{}kKxL#*QerX*sJ$Ap6rp3o6$Rx6?-)^$Fu*?(sGxj(i5We`06Mv#H&ydb z;2|M&9=zit8n|2wN=R#KYt)(ovV?%%CZL4HKvgJgTySXkgIv)jYk*msaI4`3HNbMH z{(2D?80nC-A(7PLIA!SPuwW{a@PNUU04gs4_5rY-Dvhx?VHQy0Ugex)>Oqrh?vVbE zOwGr_C1>fK3ftX(7_Y^xin_! 
z`JeaWXPP^M=U5^+(o`b)-kCI=xkI|*a|6plPy?Vr0fs0EbPAmd(5c1t;R#rU1zj2b%{{}5PwlKGuL31eJ-u!4f~BQ20QVr&Px-95bjmb2UF;N;Swx} zjzgD$joa?Ex-J{GJVEFLOA@N?%`EwYfZN$1qS!OFME9>G#EC_&k z8~~@8eg#6+!Yfv?j0(-`qIjf~1M@KiLp&iYE64G|ky!SRgw-DC>m6Pe7!7M4}t} zk>z^h>&BmfJ;SgmL-K%7dK3gp8juJHjCpY|K;>ib8NCfL_JYq!S*0ddqAis~M@w`# zpr`btu^(d}ffsgCQ4tP}y4=Sy9!2bfXU?3t4r&F~t1-ARbBq3w%>{Rl#gBQ?tdTEA z4j#C63n{sni-R5g9L8-3T$ePd@_Tvo#$gms=&+DWcO^P=Lo-3fZN{)r zT6c-OrtxRD#lJ$UeI48U$hKUvHb491HXFU#QR&#+AZHW$ky~@;2hYE|YFtb?-gYm3 zf5<_K7wm(5#szz6Xw7OgU9}&}YX+4DjC{`zKjw5qQ$<%`OTCwyw<2l6#QDD^R*$G0 z4)y3R*#!QO*WJ~9dmFhMN7MJV*i$okLHtWMHM2cyuR7k`b+qJLk>}A8iP*eRZJ4OE zXYaR|+ghtcvV2!@R`MYU=M{brXCnGlA3_~{xh3MsAXqj__)~!zc{qDO=*!Q{<60WD>% z5Goc`Kd!BJypz^PfBtwzwC~y7WmL+_PV|h7Z7Ivoz)#Ufn2@JZDup(hTGpz0-;{G# zEDqo`Jk4cMGYH(ep^%qJ&(6MFfFhtmWvPyOl?|r@YmS>mB{$D z)5f)`mOQ6FUdXBr(z{f9G;ef7=lJNX#ap-WzvM4?!$+dmxk&`=q?=94R-oMcl?M-HeF!&*QE;&aeM=UAr}PBtl6)K!EeX|Goo`^W-H1w#PYFw|_8j zXb>s4GGM+Cn|nTIH$2ss@D2#rs%S83fWe6|H0|(6RD&V1U4|@9xatU^{pQWfz<4fermTe4ZGSHwG_p##uv?zlz}pbW*s%2khCoQcDiIN z?D(*eVtNc08>@%tF7o9E3J9EZ;kkYvhmi33$+I0{pbn(=%-f_D;gD>p zV4GHO$?QxE$#P%ZCv@pfBSgR~VmKWyaqE6}>jAh*3>cU;!{Ouo=jRR*w3Y;Q2Tt2V z=FC2aBif`d*)KlYAzV#LZmwSQRhA-2fU(w1HP++8&)-f-uJ5I_Ocpr!>@sHzcXCpX zWz^mu9ZfJ-y>ex0a&f^)Im&*=WvlQb%Z58!hAL10&#MFPvPVWd{5blg<#c*Yq0wVK zySg}ci@F~fZ96fIO9G6fbnYA{#v3tQIa)w?V~D8*pyjtEB{4qHR;LyYcSI_rMMTaq z-2XE#P37ED!y;_h!Zi{avQ2sG3r4oo!Skmhxq=0$y9R9^KM#M%PEGsB@5Z^UrLy|; z%6Hrn;&iUK&9F_M{{I+z?{KdF_WvK*6j`N=LK&4+DI&^Vk(9lPjL57I*-7?Z**ir@ z5g947j5L&;9hFr?^u0gd@9T5@zTe~X&+j^pYaG{gT*d44d_ErMdEW2$+gW}2vgE4- zT2{FY|C5SzJrffn6f)lbe1`ZaMbJ}-Elq0(D|Lu*|JN{iR$|fvb179%PbpkK2sngd z=G9q+cRCF#pj<3nJL(^D1HMMX!qMR3;G3{@Ydr2JiM#y(SC6=UM$@#px; zFG@<6?12Y(5@eFt=4%BH1Wqry@F*nhNCHqq^z!SS7nc@;4)79bIg~}ym^T=t#%uu; z3=$vSwRP8T>!`IcVhKQyeRO(S|H~z!n~<9qpI4t2v9A%Xzc0~q%)%hmN`I$;^rb_m zP|Q1zqVpsnB7ShUZ}}ew!kL?J9LaSG#_qs4E%&nO_3hjui3!LD;6i<_{mZt0$Mxqi zlNSqjbvF&l#4ayef8;qseJ*JGY&Us4x3evAHtamPbOnn&eEJEhMgrVr=E4$y&@qT}=`9aJL zKqM5@_CdHO6YmVs>r26DT-z8l4*3=;0+HsIf?D~BeW-NpAu-7$7oPKiV=D>CWv^@wfyYeu3fXjZ~&TXzDoaUHqmJj@HcqN zJ0a-k|HH?GRpcwj^BZxm_;(!WHIw+ubfcK8*^Bz_E0oRde^$kafds;J7}lcGNWb_K z)!+Z+(>RoULeAMiXxq$;(!lfn?Y{jte!0jBO(f4IU3Iv#cU$iH`ruPlGAX)88-2L; z_ntgcU}BiUpdG}kT;e*`{A$W#^Ge95q_xIoQMg(TPw)Qp)smLuYa?QFQIS(`Qh%y= z4kU+WwIwColryyRh>3mEW8)I3u2bAADi+ez$H&DS>mC+9CMI!4?qMhSp1o%`jcZI( zj0}#8pLI5Kc*VIvJ8-t&NozQTKK?szZODGsmayRCmruSF8{7UNg|W*me0~woSj4E=r1fCv0j&Bm~i(7oQE|hFJ(j zVYca8V86z3QPOa}n>*-HNz@P&!>K5Os<0d_FvwLHQH`0@qNi8e<7&74_%)qKx~VLkeOm`>o_btfZy9yE z9zt5kx9isBj&GGhT;Hfx-m&S8q zlnJC1gmMYRbDHZ(DAkWhlW!~5<C)^b+_*W)eYWPWGLy> zi)2-~@g!e0dgi=5)T6@=tB5C5t1^UHt%lHfp&ZCZLoelUU9zueA!$B*k<&*3QTg^VQs|i*3DjwpPnc;i0)9|$mksE1cLq0JBh~rvHd3|g6@g1%xr%GVlr@e zN5C?d#t11hWfw$}$HXQ7<{bscbe+O|jvn{%KSCXy=Y;|ANrjYh_`tULM>Co+!;ovU* zV)wsWSRGg1p9lLS7sKTRHS@m6KT1gFv;KE;44c{zuMQ}D@94ED=wOu zSa-7Ho!yz4nXI!#g=F;Pc4T&{+XQvhnd*ejk49Q-7dWrAQ#D5ax*hw+v670pcjCKF z9J#>tI)k6!RbdQ4@GE^YZmoqtL7JszB#Gpfy^Itkoc@Gx4uyjtaUzETVoxL#jUF_i zFdKwEefl(_<>miBz+jz%>m9)MhzkiLCFS~C8>^VBNxDwb+x_efbSHd3unV`lX=^iL zSWI|kF%*S|fxMOHHMo6?9JBE40p2ClHYm5v+pye{1_TzuufopWhDw#xVD3Jo861Cv z3$$Gd;U0JlGVpw1)ZK|8t_SSA1iTYL7~Rhs@EnEhJ{AG-Bm(|-j4W(EJYWY_OF5^9 z2t5$U8+e;hG*sV~hOSKz42aP$)IGvd56B1cXse|{2`YCUQ-a|XiBe&wVG{i5S}29bO@7}l@k5}R9P6ru)@zVQRfIXMlH3W zL9P~YLLVl) ze`Xi8dQ|LmdU!nZOunVMo~6Ye{_gktd#|yExGgD=m5@^TZP7Yet(f?p#PHuzO=NmQx~?oof_tkbKrBqJ_I*2WnI9P$X#jhYFjBFzvlCWm z62f2$s> zul+L!UciKK31ZjG9Z?v!flK2g!O=?~UGW4eLw@;Gb3+9AtS^|F=j-IDkq{<2oOLsO z(&!i`YM}KXw+&L0oN^#kYQi&dHsS~BR=Rh`-Nxe1ss4noBLjsr;nI3z269<{UnP_^H}oOE(^WD8;Az{`eF 
z2fwK3Bl~a>MPW12a7ufJ#{3r*{lkU~HVx0jlB(O5mg%!jAGiaq@6Bt9JCWyBdmBK- z$i7!0DVwpHSy+TpkW<5?;ig2Al#EGw485P9(-`IS%&GNXP7`;57O}0W+S&bZy(3GF zm@f2?WJ$9cZzI|M>GL${wdhv~339G`L?{}$Ru3GG<*d2hgZUM`n&5;I9F?mtmoQc&>r(&oZ%K5M?&*BQND+cw|-`Muse{F){?r($m7 z^N*!dX$-UuIT*0R=x>42MkPi8pL(J8-YJnqd|gQ>l;9|V@q#g2X_J9SsG~%BeS^>| zIGtbMTes#!6lL6A&=%P(8hZMd@7~2hg852KPlDn1E8ukCvH}1^6SgfJpYk|0FkFX` zie}rkX1Iz`Hcv^OadB~p&?e@YWv~{)u%!lv7mAa^pgT>^PI=o|UA`=C-N`}Zj|eKA zq2Vs{Ad)e4wqt3Pa8C=Y|3IOJ0z#07rq#LU6Mha$JoNC`go8l{SB=Zdqcv4k0%RH% zYn1Sl3c8%$kCqvEuKn8vBpVjbb&zUjT{I2#(5);q#Diga1Z+s0FSz)Fm2w*2zmG=` zb@!-zs|iKTX;!NIoYh* z%Zfy*>U!oA2Zyv<4w|$-C~~ZTy7XF(<|zFG_VwiF=HX#(glP$s$-jJR!a?@<@nh>> z88TcNKl#!Ru1i0Bl*4MzH)Cs1lbukNg3|#Qb0V!sC|sZ@%%g)5>;P5N7Dk&>r%oYN z-dpS83eX@-rYQ250Fl6JMby*B1Rda7jeB|3kf_}0=q{m4LW32|aB}q@`YrmTDR$nK z_j+dVNs1HTS-omyHQIhBYda@OzzM+;%RA%Tx z>e*1a;h&8JBj?AJ2DY9&G0)Dl=-m-~?c{6czx?i-Ua%Ar|Ln5=NMwK8lS`RrW}JTB z|1;Qo;O`TX8)S0rB%xB~(cjXF^B8tsh-$LRjyx4Ksg-!%&G+wStc(RY1@9rs)7EnL z?p2c{HjXR`TZUh*_2K@oAr$NpFT${~-m$)>sEE;-I<3>a0#jjS-*D5 zeYA>Acys~xpoZ&>z_Jh*08Zbrk@@8v9bmtM`-MPxT;K~5Xz_e7#o}u?jB?bgihlwr zyGzVsgLcxyL0pR=vm8IEw|PS2k*(tZyzy}3{A%4xIxbbi+f29^BVPcAoEcd#O-Z-(VygPM&J_h3rL{ z6QD<-vm}54Alg^Kq8V{)D>OW`qmSS^hD!^wQ1goy*($$aZQ+FXkC-^Ez=FLxX1 z(A8D<+uJzXfeV@c`IEw&FbxmYCU56en#a^s(1Ad^10w+p$Gf(cxA4={t$jTJ1!(aN z)s}K%+-szoX&T`AYDEGcgH05Li->jNOZ5k&Vb7F>q|*oq;n!j zdHLe7&-@gdWMX6}U2X5wtG)uvL=Ct|gWXe9R#>72pEsr#_^Ydr(<+sIW-ZWF*ePGR zm(-DxF)c>MU;UT6#}HXq!d$gnd3Hg8GOSue<3l=oj1K4};0V(}@^y1_0s=N#~Hy_+rnW1s|q4m3F$+wmwb@6+-Q5!bW8=S@_m?Ls z*+1Zj(N)YYTFTOLyGiHE z$wQFAq9_uj5=L~++!6M0W*BT&oN~1tYwzvOPVn7r5E@;dQkeTdRdsO>d=$`&fl7X# zNUG=eI)Bg}yLCIYILr8aVZGWh5!Hw%S4JjxJ+h<5yu9=8A?;tawtp#)I2h^}E!ah| zt@HiuVE6cJmHDUbS;o;_ebrme+>0tmJrZ5KXX4ty^cIhO3W+iDWG_5~syN6?Cf{~4 z54!Yp=2@9@#l5g~g&Td-a5`qS0IXT?X_ z-B!sQP)im6@GM#5;V3@pn>;VvWpaexK7Sasone23dH>oc?^G$jWGcSE#sKre0h9Dn z%|8cXmQGG=a#j6_zjvkO<@Q=@c}cxNiAe6I+drLFgbUx#f9Th~^DZg~QqMkRN+&iG ztwiNdpLdefNpC%5clVyYUS{Zjf>MYdZ->7EwIl)cwC&FHE1$#XY6+z>^9kn5!P z*4kL3X#Gu2K<)Xwt{d2saOT#k_OCfnfwUuuB78cO#%2p{!-mIh#r%Dx@zliav9Uj@ zl5M+Zdw1MZ7v&;K17#tt4n=PAZM=Gi+wanJ2Nt9YhexTa&@C$ey9CL99?A_AgXl}Q zaAu(mB%tCro^GRH1@o|xu(lfSs;hxzsB(0+*UWYGy9Ehy%lcoRvM_J#$W#cCo2M6+ zDY~os+;daag!g)TUQzg5L!Ecy`qQ5QSB6jh@lXjNxkM#;IKXAXvMXZ!?8vl!t%JLP zYGhEi0pm@z^y^$CB>}Fq#y5U6to@9N-TX=Z?SytpuTIDwLkG%mIiHn$YN_Akq4R(|_ag&i$kN^2kE)=*>WrHLnFilTg zAMAS-Ipsqx_fV(e(fiXviKWKyv5-@Yy^Yo8>+(7seT3cVX274pB8F*J9;$+~6ptPp zpxb)(Z)>$B@r--2^6;2yZQs8jWSm4M7@cW-aLT95A-Tdtf9Ict6PA{(>krIi1soG9 zOiUFD8V9tGPQ8&@Sn;v@YS1`AempUFI{VNLlNKi9py`+cVyzR|mDwGF<54*e`m&6w z3)!^gro>MDM~^lA*WLtB7Yr|MlVI5W!~aGv{ciyEMIa4M?~9g}x+(dzm}Toe9O7R#Pu&`pQf?`_Da>TK-CLJE-D3#%8T`BQOW+dWGJ$;2f)XD7 zk*85r$Ml`N zwfDl-#w$WQEtl-0`Bta6kGpB7ml*AMRx8Lqc1WNo`Oze$@txoYF1~fNL#6u_5^JM; zQ>A~pNRk-0<)u+Qo5(gB{`@rW-&`$Yn1SCySgrMbX>Z4k>t_i$M2FIt*j{I zfW|7>z^a^>l(i!<$6yTc1p^TFJvytLC*BqxnW-1C+Dn9zJ%#=x&1PUth>OKp$19He zk0!-kxphC$@5?>CFUDRTJ(B@#Khj`9Qb!=dRm66G7z0r=Pm?e6x4Zm2*hH%~nY=r{1H~c*Qymg63nC4 zXsP#o{sB`@vZU+fh~F>+H@|yq<1xE3_vxH*1-U=Rou&KspQ396#)?`<1xkgqrouYs z)^D1z8=d#M>tYJFgAA2Ac~|P{@oI4~axVNQ)t>r#S6ib@18hjriy0v}!t(5308L zoRlnKrMGZD-^VQId{QbrJ%Fqm4eF%9=67Xn3D}nc=>-i5nriVezh41+r6Sw;OS7JR1 zHbuNdSjE7*CtE{aHiz}Nm3 zHt*$S)6!7C%vI!#kOfjo6qc;mvIa;9%m^P!6SwqlkoE9 z7&HIcwR0!SZ@;ITOh#88>+cZpCP|+$VL{ z?C_RV8VtTRYq>Yy%cGhsMn2C!oH=9LuxZfX%EXLX^Lz-^y9%M?Z|+#; z{as>u4-FMz+i-UFIC}1wYK-oP{wk{A=%}a$d<_{D=l%7fp;$Aru;eZoKnWm`{WHIX zcoHBYy&4=$BS5XzgQy$;whzt9sv5xE^kPBu_g*3Zm9M%bv*4zd=gnnd&@=pB#K!T3 za+8>P1oFH}_YVxsw5E=Du6EhD=E>qPRin$NS-qo@j-F)qQkQjoo#(bxPEH6n3{VGW 
z+=`({qGdIo?)Ik>UC%M^7BlcYODe-@5&vU$U&BkXe!q~`p@Ru>C(PLx0}nRnk_?e0 z@Fd1YP+lj=Al2MX_PRr%=&ooCUm3)hk`@!|PeoQLI=`~@_h~)%{E2`rk;H7T#jtU7 zZcby+zA*&dYBr{HSR)4BTN7H~Hq1BTCSWThC0xQVRw7lfwKbg!2uCP2l70RC2$8no z*g=%S92YeXh_UJER&;Rjb&!FvuTd&XiO_WNbYSERCv)hB?y~dnWqJa(gY_GjWl2<3 zRgD16k0BJ`tATGnbaiPd+hZ2M19^qmStqrGO(SsNWCM8Y5wV+*(YL<7zO$BKoT0HJ zVD{nRTM67Xkv>Wi3qXz}nP|)6)q>RWbedED*Ve&U7Dsrd5rGNDZwD*N-Jl=};?>+c zYODDSj?g7c-_9buNG#NQY;$(oO_=S~!DI(NJqP4bfQ2KGKu&YGP^3$wTF_7u%L0&= zpndlg2|D;oiKJ3>J-L4^F3(&vsZ!hz6HSnKD1se%&tDM&s|xc1GY7|O$Ww#}gp=fu z{450`bSj>gr83BUUTpCFzUIo=FXc5fza;+T>-&9zf+0xy0i)K60^?p0P5;jj#PJMClJOll~y)tYUwfvi6ZAM#FqK@o4tYomXDs^ucjaZ9Z zmrBVkT?dvzmyn#}H@4l{INupMHb*ai+t-SeJWjdDWoCvwi!pYaaQ?ie$?7Aiq}neu z5kI#0kZ^CgMsj6D)10)NN`rEQdYNjIftn)QH@v|nQ+WRFk8Hh^i^04yDxM7i<8!vY zZq5QzzVT(OHI8$WFLLNejQ>Zai4FcS!aQ(rP*i)j(}_`uX1h!FnhtahW6}wy`@hdpS;lP=KmHExJY9aMCXIWWUxP;Xe z6v&a2!;#*Di7Rl@O$66j@Efs^>}@Vhv&XxY(1?irKqBIp7FZ&-P9n+=)?X$JZrHHq z1fw%1j{w1gF&d3glw#o9$wvfwAGgQ1<{q9|&S|fJf!Ie90-*27a-Hb?15lqe)qEUR^ zor#YJd*nn)oA?-6QX!(p#KpY?z$giS3;uzzn50>mE?P_%Lh878SBkaSrdl6;JHY?jOvry_hxX7R>l3Q}yvpW3Vx{|z+Q~(7 zW&RDzU{Uq2=N;|85|u=1$GYr#n^f3*^{&#&u)?+jZztVoI`YDIYsrTi@+5}ddfv-W z|BS^b@b|{z4=t6k@ZNvkf4h&sfXnWNzEfdeR6xqcu$t#(0)IzIEM)#mzG$7O3 zn4NLI`<>Y>u??&7aE>@Hh17q6G6CLu>)scZS|w|E4lJ%*QHLvym}7moz~+kyk{gqy zKgQu^bsS)W;}a*=Ai&q|wORe=I`~_v#Ew3M=4QsN1h)}DsDT0FmIT<$3ACrx8cs9< z2#gsR2D&Q1abVO3TSOAVPG{?Hc>`&y@~>yx0$=n$YOa{$;4|=-b?+~~s~6?r@rMzE zm{ROhfT(QP?t(D8&bf20a}|(K1cpsaOxS@f!x&MDCrl}akf4aYz4NeNmBZ6cwBZnz z2tL+t#EfnGFZ`n0ARePT#t`Js*5Ue#$E*${UiFI?c_u0>ov;*$4e~UV05oPm0f;Fx zm<-W7l(g+}?57ncjzAq>+8~@c{?=u%Op`T{k&6Cfn${6Y(iON=Y^q5B%`rbx0Sf{z z>sD;Kv2u8OK-qAY4d;jFi|z%*ZzwJelZL4uw*Yy}Akf2|aiF9oRBzDA!u7y24P{xc+D-CiqAHxLGy8GitV&4)u6&0^46Rd664=d8O@VNR~xo11ES{8InVyH;KVJA5ciDYR)^U8bpRC3I0DuYt;wlDLk#mWyWf?(2mbf4> zbH#&^<7$lMds&Y=z7C%2t~9K_dKJ^X4D%AdUhe*figlw9oEBmG)OADsE>U zCno7QN6JwRluJy+PkcS!+DdZZz@rXDbJbe1p1)+CBkxMbTaK+HJXg@+nfR&>)E|$A z0^hrN;ol4C$Jl9{7CQLX>7C}bo>v~{x)qncEmh*=PX*7MEz%v5dc2JpHzwZp3hw$e zW)*c{`8xSe_04UEnU4owt)8CK^0G@$2)*iePkUwG`>*8tywPklG)`a4PK6)6E${pyl^d3t)%B^+^J6m`W*>7!CdDW4N6~~0?;JtA# z7==nC%0EQT0M;mvam+?xh-)LO$42z%(f8ZA%5zvtp%<`p?ZN&4G)9D6g#L08GQwrd zov_eZz{9QF6;XeJua2IZAWE>Yu@S{XYfa4X$%E9e$mHXEAs!)A5jdYZ@W2r$>4-7I z662n`Rf@{WL@IDE5(^kw1@Q?9-Ysjq)YjcMWL zVjhpvS0lGSO`kS<^04u-r8wzyrjDP3QPuM^B{8~XHPW);+2bvqL(+AmJM!fgIVJtd z*2yt)zS*DK^2lP#Ks9f$`A>Jc+KjzYgKGYz!i~TAkH&i+G_R#Po}4g!Ic}G2Ylt&u zJ$v^E8|}U{&eUV5mDt>HV0?~OV1HiriTo>XL%cYOcpFGmR1{7|@R^)DzK|vGA0ZB- z1bhVY5fgx_@fZKcQCAzE{TEfUQb| zQJ%u?2l!|Z-t`exLsUumVM4EDrTwQER1~PR9t0kmPz2LR^w=@}BS&)?OGoBJAj zInl{ovaxAJV?gRrZ#8VK9Kq2D_A0UC62XSIe# zOB(Jpl*|w~=&%}vZaiE!s;=@M%2JH=zakF#Tk+nVyN@J*>`OZTaJs{Nc)`m4eX$lQ z$|`7$2B{0}G0CK#jmihq&hZb%Ev1kgXAm$R?y(hJke$A+vbj60t44pF|9nige8cyU zvFk6@_rH*qrY;yxq}a)Ovv5gsh)hG{$e-1mgAF@(gXB0N(qL(4zM;AEx-px%#yio^ zTNB=SRFfz1jngU~_IuJAADXg9PyBPl!Gk7AVa9`noTq)aPwUtN#%W$C z!8h`TB>P;p>%j`BrP>>R=VIIA{|XdMTLXl73z-sUVy&0;evvJ zB?uWi=-F5m!P1tF`e6)EF$koqh?yco>QeIl#$prVi(n&)VHYa)qOl*bVMLFyS4t{{ zh=R2|f!B!zex+CE6xAWd5UEK-1}t&m{*EYu;Dk2ixZ5-%nh+`g%mZ<)E;R3UG`F@^ zMk$T#cf+izZwYKJ7>XVP@EvNRIW~|0UV+n<2e=tfIgkJ^xg$Z5hxZvbZyc~L=GdB% zmc|JT^#>X{*X!4tVZb0($3pybMw|gPS18krQ3qzk9%JFhXsv4g8O?X?R*gy8_+h_! 
zbCr8faIlDdCF2&E=sPPOhr#d#!gvOj(lb>RnU+<~@@M>wK;)bFGU0MXW{;odVF9yB zC-1HrfvE%0zRW$Mf07@qb5=c5B5&FJko!J?O4mHZpHU(`nYmp)wY%*uYHCyygsBpi1p*HZ zCP>Mc+I^r+aMchiy3sJ^;CqvdS!ZJbIfNjOp@RXZkpZFR*Xc5;Fru*D&-lCX!w36g z%CMOV*Z@_8!12Hk+hzf;{H?yvR_0wjcN(?Rf@5z83ar-_|B0~tf zRd;W16HqZ2yb#nZL`UJ2v%q@>gE);ZE?~UW8hD40O6NoL0jUCUtgAV>HqDC~iMYMd zQ(2+U3;MiCYypOw1z!h|nU2*uxmWmmk$fhsJGkLTG48R3)&~kkBp`Wj(Gy-=*-Bj3 zXdbffnEtDy3n1cF{`>wP|EB**>4Y96&uc`k)yd5*7rzYg+rbks=}uFqQ?!brtE2Tr zVrr<5}24vh86ivM5gZ;>_EmG{4~zbOun6mtJ;p3dSaVm$@@ zu*^Mr)y^%4OJ7Mp)BUE?d_l(NU(HYa1VvH8YIb7bEo@DVp=1-e{Ze6inESr3a+YxN6^wrh0lj)e7;As`I7IrfT#V9r;LU_~hW9zP!eq9H&j! z)|7pxhdELl-aOxEV}9gaV|?M6;nC^$yTig<9B=MD^Xpc1JfGIn@}S7UhG&8PTxwry zpM_tY$X2g@Toh3<{pGH}j*a`s2W}lbx=-K;s7)AXI>A$2E|K@TtfGRDj;$PS|MNoA z{wE8h=*vDXGhNAOZpue{PC-ul`1gT$oVUEKrhoVT+^g_Cu6~Ea4*G^B-87UcLt&z{J4@lChbL38Cl4w7+QO; zeQikHbP*q#AuVG9U$y#zi3Rg zFuoRG!W1_Pqbe&0rQ{{>_TwsP{&`3RaxHVAD?E+~F15|e4=XyBZyzqS@8S=B=h1FX zuC!y5`bx)#-f)&Z_9?6I!5n`PA}BUy8-K{JJp$>Wy4p{wKlXLnu$)?=;`Y$bK|gNx z%Sh7aB~W;cZf!c#@SJ6Jl2)Ttv9SB%$1`je%krcjcZf(jz80iTWhh!9Wj($szbK

E>=p${Lt@f3{Wt13LCnf6df z$MfL8O(vbOKUYVWB6F{dl=6R3U1>MBpw772$Zu_-%(rSNf2C&NW2M7Zy+rxXTSd!a z4~*sV1(d0d+6&l<$BwSiR;}dz3Ub~zze;~_N#zKAR0>)B2Ia@nj`ev>pTb1-;}Jrzn=hWs z_LGSIPq>Kn;oY$gg}DtrkhxCBjC<|PJpERhra=E@bYEd^xkNMo8w=RtBN)#&jNl6l zg2Wq*WAmVsECG}zcy2vC(HKjgEjYt@3vSo)e0R{_kD}CVnR=TyAZ}J`te|*GBaDM- z?1IUSPif(qs(y>#%?BUNdvkjUB<4I33uI#*_qg^jGo#{9nHp&rSuL|4k=Kv+&5X5+%t_x#3&YbfUprFEYopD3ZA1NBM`R z4m1o<{dsRk>GoCOaAit}+|6;9o^U@czxpVOq{W8a4fdAK``P*!ycR4uQQNlf8m0M9 zSZ_YY_5Y=k4|H^Aza9G$?R8OB*J;6tT+>tcfJ{KT6V_&8al;D~b&@dE;FZUJT7nC% z2Wbb880ZG4;Cm!&9=K7!E!dR3ImOBP>V5O5_002}Jv@qywEoYx{NlP95|Z6)^RDk^ zY1s;2N|!z}b9hLKehyD&U(MME0@-9d{wyp?!|u?8mh|-&UF!&j~2Q>{_tr`Q~s-_3Yo7-33Wdd}`H!e; zHQ7oj^o>TNZNhT2pA$x;XTnwDuw(Plm>$xgiJP0X+;KPg#njwrAJj0^iHvtVc_vyW zUX{es=vOPRJQAZoW@u=vU@9TSe0XTw)t0hq?^PFyeY?eEiD)!NLuz7LU(!Vv#uaj1 zjxpY@qR)yqIlv0_OPV2KZ;N4u_??@{`u0zNZ1h7yo)|p9NgVv-i3E&qmKgaWy29?d z+y=+?^o4h{XwF)ms~y9@g{8FcANC+WyXD!=qa^)`dU*SgU$tT#;+<24ZegD(EB!)4ORPjP0#1$39vEJhtx ze~P`6pwMHUa5%EuP`kZ$KB&Aa^caU~3gzDV)2$+V)TuPjIdL(tX$}pqOltN$-SM`! z@dMl1GR5AX)l{YQ3*Z!quZiX`3gHMk0h$B z=G$n31Wv#A1(7VciD)Ut6+18;MX9fd=_peOIgmmyGx`DA4)eQ>-TguqU%5T2e9>)| zucc*T*Lzk-9~~>VKHIBUI58#sI4(MU|HOF7P4aZwH{SB>x(>$DAs*Ojqk*})Ek=;L zcD=e0!il+lczAdW$rv1wfQiEBMPw_GASMDt7b}VcT9^IHDz-d->h|!_mc`s!sSn!u z9z!kR>Oo`d6RyuDb#m`p8C310P|oOhm6S1S`H)06KhHQ&keQT$7s6}0QJ#Ru2KD!Y zF?v#DkLF3H$}(+ZZptRpccuqjQl98IUwr!GRmZvb_7SqZ ztDCz;cFSSk*03kr8g$L;G@6s+Sa#Xkl^>NAvnD%5bLyY#Dr!qthO9Mm+QecF3=1s9 z*>~;Qg^AqW*fzqwfyFD<@CBkP62){7hB9^RB{=W!3?CeMpPMVQGIz(1%;lp$4SKz{ z-Ztbe!rYFA>CtRs^+>%*ZA0(j?%O?%y!&QiW5v1e-A;d>xm3D z2ST~nf*vS1-E{O5@(qNFg^emf*>7OVLk0m|m;$Do;JFf8(hzmTDu*VH7VK!>FDYqI z#<6=hT`RW&oEDR9C9l6NeHJ-(mFoT$yW^4tu_;0f&tvl&V$*i{2D&_ds~W31G2yb< z;Ad^er~cmgUE)~-k?;WxuB!~kE*+uPjCr&i5tkg_8^74K^I5>NEXvQ$Okp{DmkoAJ z*t^_~Kd`ghje0{pJnErctIj@~o$XG4Bsg8Q*i~A89=~|WVqIeE#Y+>-HPv~&PtyL} z%STQ?L4QLoZMMv%GA|`7&N=>VQ6~Seqn&3o2h%Mknz8rqe~XOeUWjhf2W@UP%)Lpb zsE9asF)__5DoXwsX}G5~!QablV&%jcp4f?$0~&-EK#!n{zsvK> zV17p1cvu-U8f$#9Brqt1qZdYssp(({3pZp8RQF2Jz0i7b8Kwemj2jmSO*U|AIn_}dAUt$+pf3L;Rm)n z+4Y!D$s~h~=A)CU_n)DYUGFvf@`hQ86b|*DeAZ%@?cJNMeMQB_V=GZ!#Rqlx+E-m-IShu!tll6S8EtN?n4v9A7-N`&9@&Kd8{P|NUCbJ$JQr;7PGc~E+jT#FXs8S2mTfUi z{5YNcBZEhh%=7Qv^QZs)rj7dP_KZLhX19I*V@6w4)uV@PX4$Xw+)6rN#W}vCZok!X z;m1#YD&hU_uKdW?8wKLGZqbuc9-T3&kYXKr zrg5h%+UW|Jew{Bl^N!DzB1wI}B*X;+f9Q=Z{IFArpSDqWY z&a4(LjcnGqDxNkpWWVqE*m`2iweQUxA8U52v!7GFU3rEzQ!NWCc3zT(zJI5mldq(v zRFYVlr*m1AFG;V6?0npcQQwl<9$)`A#Tt=~wK{V=uc0DJdaa5=Tt)?5*1BSi2`q7} zVlmCLXNxcGp#}W`2=TYjR{gjCJTKViOsIE+jt-@mpl!|0cEFT}xtBNzzEvWfK^T{F zlqX6$kd`8pU`(7QJ!3J*H^f3G{p{K^fPos+Tz?3|rJ|CywsyLV1NdryvmYV&);z_y zlnc&ysY&N}D>EnVqLS;NJ;eS+O5xQ7)0g_I|5(RvhL_J$Qeg zFEWbR`T5OY>>)s#@b($lZ$&+gWQfO@I{6Btt-!o@=d?Fwwy=gFSUhS4`WXK8K z^n<5bgjV_`i&%usZZn@;{Hjc%q{O&`zl)AH+V{1_i#xL^xovW0oOMOy-yXh;VmW(D zQ1H>#BudLkk*gFBVpIxCO16Ei&Zu~iqDR))KW=S0af6F()}&Hne-j8{xQzk2TlG~b zBs&Ks1bB>G#~}715pz&f zZ6ON&85tSbKk!s9EH0i#%oh0u7jVMz%CN5VCCDssC%Iuy6jUjUrS6Byr|74<-^#8* zm#bNtJTpdhAKMXw`Lq0AG&Rpm4Tm3#C*L`>W;G*KcOit;oCL2$3n-Zt)S`RpB{;<{ zDkNM%WM*(mz~NIN9>B`^MT)s%;#Ep~PdzOm&YW4Oe@C^!dXayp}rD zpZ5AV_ADJgi^pv7em}wU4GPR$b+J0eLQuT&$w5HrRW!UgX zP@FxG`?h+(I9XVVgzheik^^#hlgy+69DpJWP+g+PZYogH z`FsiK4AzaypT6U=AcDbIeKd*`L`n+rm?#Rx*E}~R!1oAe0{NQHH%IK@gyn8w7s}E#`cDh^!m$pSfF6{ahcAXH}$qgRZ5RUK(P|1FZ8cI@TNSGKOD1nF&70BjbX7CBMt#r5(Vgdm zKc_b28-35Kie;L3(b4?3Eoi5MgU!b?!gX;X410C@%(h2J_%6LarN?mjLcs5%4cT`d zzhTT2=_aqD-$~lxn<{5LelbaIzDV1A`y__!Fe=KI{XykPENhYD;{92VomJTJXepk| zLr*zQMkI{pS5~OTRUHT@r| zZvPRJ4{TFbLMRcECgGJMSc9ku$;@p9|4t%9+|u4oixqB)X84y6@I2vDCZri?YDDD$ 
zjd~PH4t!HRZ{I>j*<;oc#fkh(6OPvz9Bk-@S~`i6No`$3ujF3#XJ0w(cN!^2 zRA&UUHTLJA=UM*xJ`^r3uB;z;-F7pDy%tpmDh>OsJX&>`2=X;tMp_ZGe}7ML!@ZL@<$So5*rTe2+dbYC zrvof-&|3+kfDH@~h#$TD{5b_@_F0fRytpHUs(F3tl;efivkr?>|2{u-xtH|_{o={X zI#L{+?$_T=Y;-C}zHQ)mX1#3YLS=fHB~knVy`WO2a8Naq@h^B)o3We?bcB6cY{?o* z$UR|Kv#)gG3mi1zAZd``aM4rS(~nO-FLk-r+}2hVkgvXH*A9p$hheu21h;6b8uYw} zv0ahk~%E54NTUvo-IB7myizJ2>|soVG5S5D{RaV7%D zK=F*u%oMdYXSCsnA{G>bJJMQyMeviM zFoL+cfEaH%q$|*J3r`<_DX&GPyLeB2T1vh?*SRb7;SRfsyR)+=J``Es`?JBNQzUIa z*r${DtI0&HLRiLB7Nu1#3|++57dVXsOU$sy`3>%OLdJvvp<}6vo2mR3w$MvQ8tJI2 zd&e^5t2KX9TBf*DPh}^iU8lU+{L^FGg01!$+Dc*_n zjQjxmNHGJq$WhAgrMGv~En02iFBE589*$L8dPP^gqCUwYb}U%w;T{$4{euDh)#NPF zcm6u9pJZX5-z`ef8n(rT@&V<5>ohN;mk^77AT}FeccXUMy_rK~nORwWz`lql)0>$M z+pnA9KEh&;ci{RQ>)+EgzWHlwEmlMM&2eC_0}sn2DC4cyQ*p{$SX&dp2Vw_@PEj1b zVt|7dhZcr4+j*l^${jjbu8UKoK3)1UUfEfNeTk5pU|~K`Yk_--;J)nIWgz=GE{+98 z2rWQ6kd!b7gOacX;NqLAVgP{>GVzk7p2hsH>JF9^@h+47fCQ^8Y3>nFOPBWWkByDJ z=m8fM677%h&B=X#zVbvb6XQ!3$}+*SFe$OSfk%GJ2Vej7ZNB9DL&KcAcb44*3vS4c z$krdr>VI-x({|(hR<~t!J*G}^tM)z7Dnw7Ovno|v{p-NF{4kvw1)j6C&)FK9l%rzE zZr?VKjea`ee0xXOHU91+ZR<(m>~r4xef}nE-Zs}EdHilXOIlu}X~Jf8jAprrmeTe&=@&)!hziONOEZN8LQJ=T$q+^%H zO%Hsw>;5hi;vma%!h7`z)4_TwS=pFSPKgqy9>%aQxEUGWbv-8ttkpO_)2pK9JYPnDPh$G zq>TsIt-u_E%y$fCH`! zO{R4CSwxmEJkW1M?~XI(1y=6Cmyw;Fy$ai3e8f-a90v``1A&$hvfsqQ27&>M@0PHr ze)`^ojS_gf5GB)uoi_f%x1r~$aJ_efrG0&_uMvVci-#8+&MnBx5YrH(lSk1+dV!om zQ2p;}^WD68lNfa1;ljcUkMzz7%(=QheCT9i+6Nx~lP6E^$=pnU2;u~?JS>l`u%EKb z^78PAinc17WBSQB-teq=_V~u!5t97R`44^>hLE_laj~n`X(Jyma;E6IF zVRbPwvi4wSXD$D0B)pTIeWFVC=+UDDf*J|WAg}&J|V*4WNU$O zCc)9@$kC$&tRA1+Hbg>*?+UWPxW_%LRZ2i_xcKE29bQX6oST>lPQd^TQUsCtN2N=A zB~bbZkNMp?JU9{RVk7_TZ z-lIq=R9OAJ=fS3_aD&Re`6@EwLaKyq^O}l%pGfLTDP-EMMH*Q4i;rk!3W;)7lk_qk zXRw#279~4%M#a*Re_xwlZZ02K)PVs3@)4?r6x<>>>xf|}?stsY3+!A$5Xgn9N-QnJ z3wJ)1SKzYlPw%*U_g({1|LLhCH=Yd8(^AkH6DY_4T&%={F#00s)#Y=hcf!M8VVySq zAsssWe<*wNa4g%lUHmpnq(l=kgbuzkq%yr&U&9%Y zs>D&lSxj|QEFzk2d!CK<3s~}r4QW-xRc?-vr}j~gZrE(^Tza!z1wyfH+d18dEYw%pqw990M*{+8k^krN9 z4+kALs^?Zb%d%Hgoh6z&pKgq%c(bE-glEkiGvA&5$=3Y;n2}%6;HDS`qg3g&7!Y3jO8KHLr?iv5P#e7Kc^(kM<;+yfBe^ z^TzUu;lkbxdb{}yX-f5vF-p=O1y-mraegngm&L2VSnC|Gs#9X!9+ckA4r&!H9Vux` z&KI`P%+U<{Gi-J&^WUvBJS3~eyCD28x9dg01mqO>^N{Q-5xNi+Oib>|KJR%-lPsux zsgw?CU&WVr&?%AryF$>f&-U5$pYLlrcP-9mMFy?`x?2BXtSe7+9F8Znq5l9hdoSvi z$qPp9yFB8XXAl@MKqYeb4a32#rJM7-TUSD<+*Vx_6XHwDUlW$c#L#~AiGjySug8@E zopI{wSI(aF*nQrcY1RBlb8l})s$h2^i-Tx(x6k@btGU{Z%T>&&>?!reA60o57?Bi! 
zi_?n&KzI}~RdUE+^H@1JQxIce>z>!(YozN_{hnxrw= zvgxV+g_}z^H^wqGBsC9gv{C6#u+v-BWB@c7M z#Kd$|UtL-VjW@8|FUfGD&F(9c7z0=60GI!#G}V8z0;n*`>-4E(kK>lm&Ax~dJi^G# zEU;$}F&mO;04Rr|*_I~$q3$T|LmxE2LsRs-dl|(xuqeZI>DF)RF^Oa^<|q z6@?u|^cgv;wr=f6rpnIywB|dbi*syP*R357woHrq=z}%d`Y*N5(2=FXgDa;2Y!{Pb z3p{NYy3~&H5E^M>lY~0sI?^y*pirk!Q&WHLE!#mt-hg+GVYUKjj7=CJg|gw8T{(*O zA=}1!L9^h;)JmH_O6Sj<2+X`P!TQ?6YgSOrw^~Cpsxmz^++XN>P)~9~SJv=iq^XrT z@{~4+8P~#=5Za!sr((vX;4SLwyooVb#L}2U%VFGEA%}UTjZyWB15K`Qt&NTN-@JI* zFZR6p*S}mq(D2&$NcTsY9Cm#h!n8vMs-M2PJ-M~TxSXE2y_$7;1| z)7{AilUMxiXQo^TLuSd9Uk--xDc7LMMbjp@e}7d+LyWq>x4uV3>J6Br24Z3w=nM7t z?XqxaMyd+{Y7y`I1+yjoV^{Fp{3U~@59#aQ#y^5I_HPG9i6WMgltD2@%IH-XMw0Xq zNe40%5Qn~fJc`5f0Qm94R#zTC((;m7=;N0Gv#Z&gWjI@2)OV@|Zs15L?Y`($-apoo zC`%od_EhkrP|xoX+D(*w`<{vw0W4;u8ryFj#4oR^e1-b+z5tKsOSc}$J_nt|@j17n zAWiEq$E80?6_jm}N?m6pz1)ASmZW^A@boiE#=P3WSD-|ywhPn8Ldl$a+`oc4-Y!pq z`-@Mb?CCkAqr(a&i%==hKDcsipFM00;v3+SNC7$@EMc19Ls41r*-KnH$H@F6eD5rQ z8xTV1J-y_J)`vj}Dq~E!yQ^1&>>^+TK)Leto`Nz4PSxs`1lGt3I^*|Oc-ij&sRby= zP-*c>xE1ZL6xjAb`ul1^+V@@n`a}}cICyg#jJd))B-04&)zjT{xRX_pKS2j6db&kON{k>q*=rxL!u|@FHksgZS`jPlmk05A9l;H z<1VN?cXoAa{MF3kKYG`(GE)^+ckQGT-B=_-UAJAk_^-5L1$TYU$$7ipzRi8PXX>l| zcwPAY)1FvW&^vtY&RGxk>@-jsGMf{5$|JpR5i^Vih0x%tAk+IWgo#f;AU#`KLeFRE zhZ>ofiB^}T=!Z{OIhfWSE}0?4D> zXnX7+BPDgLSFt4Zuz>-)d}Km`77RO9@_V*TE8tmocXunikFD@F|Fep=;)h6E-kqQ9 zhm$sGTU!ZG+?u*imG*ts^IpRrFf8{(D&m^4hud}*@!bkF=U=}57*+anmxp(sRXXo% zE^pF7U?sC3)c7M|XTCk3(c!j~T5Z!&YmtBN0hFZpltR;(^R{vga{wyWJ@FNaDulY1+2$t5dx=^abkg z@->5p1SW?tjs>{+0L5G3C4`LF&uuRI^yE_Xfyjp?C1eaUp>ds^$#G6OfBrl?mC5lut+xnpdnh(6MfiU!0`b*eZt-o)*&e)~~O7e9(fL6Sj>?vs9GRX<)PeX^V_ zc07)VVMLlj)j(#WBJMI3P(3({t9S2iA_=6>R3C&#;C;3CKJ<+tu7hJ^ne0y8Pys;X z_s~6iwpRfZ4DPVdzYe(MT(( zj$xeS2==3LIhKXr1-L3LE9?4ybUmu;jw2mI5hRJUtmiEIe9^dQ1{YfK3f{entcySkg zpf#+-pvR0}TZl$KgbRkSH$R{YZHf_ZM)AmD^{x-eS~8d|40z``n*1Ey6l#dHNgx-v zEYa43%5AV0cYwj+-~0#vMWxU3J!n&kG0qTx%?q&jOuNP~$m;IC%T6(Kcyp;)u{C+G z19}D4S~p}#7eSBaxqjY7W>kpslI`>f!Lp%uk)57PMM?H+z4aN=XTGwzaf_TOvvA%1 zjsNm9gR`bSV}{S3UT*UE1d`39(4Nqm*6C*Fo7uZ$o=1v{i5;@*-1nh^#$iHtF2(Ep zoNL5L*AICS?lm@YXL3@-emw>MrK~(7s?N6R9ebMgM6K0`O;(rR%&WYu%VB1Bb5@@d zS9GPNrV8YJRvE4##3!Vhqiw1~vu*XsPQ9++HJ-CqH>^ko-DtQ#IiwK2ZTh0Hjp!zk zU%Pj`RaUdO{o*nEP}=S2o{7z8Q=Icp@_IgU?0X5OT+Vkz0VjKXUtc_s&Mt)efy7jg zleWR=!__zv46jv^fKpIdWcD_WYBdNNeODH@uB# z$j2QgpA0<_fH}4}hfo*-1>0P`^7||XqfNZY+X%1rC)82MS;ry2fr52|Nl|oUq5!GP zalJ`mN`m6wC0Aw^7OZI&&=`b^BFo!n`8W0bns4Dn2nZn=oZa09^vs)*FPd*bV0vbz z5V>VRs)AU9Wpgtk@`+jms{=qcq$O~ouVLXF<67{9H=O)}jB@oK0$n7{4`wq!3#bo5`<%171iXxhhL8i}3w}rK$dO5j zCqBzp@F*K_blblz2L$;@i8x`zw^PXLT z^76!O$_AhSf6dO%_0Oln1>?k1^hq4UrEB4b46#0k^H6E!B)-H+xb@zlmjeJz!z(0o z*#3_eYuuyp?x$p=9K_cmF^GjPE-T9bU`BwtcM|*H79ykQkcz6JU-Hw)Y26)W#WIgy z4OIsoyJ(oN(oj2j{Z~PcetmwE=ap!!biK_R=NV>yzSLKZk4{@xP@Nl6cuK0lsPdrl zm|ZMq)YL*K%OoJ)R#Rp`_^mkI-%djMdA3rp(5QNV{`+OSi^n zxrVxu6V`O^pbDQ)a`Ug5uPd>2p&BdMSzj{HP#dc>_QqVolE#vW{ia0)^rCB-#GZZF zyBHSAa5cI&;cIlpo8&Jq)V9wyT;5pneS4%*EzdmkU!rA?xQuT${VH%jFCTDw-;_u} zUSD0}mHS+)t&5;K!TmsA+t?Tkl65OmoDrpw@mP5`E=neCd-$<1ekd7ZqVdimku-ys zh?sw#{-+0kZ_=}9J9(a!l-b=zcAQd1{)>vm4_V_-i zRSWU~a1j(qJQw8dg`z-(n2p{NM=j5{!6>8SMdlCyvp_kGxpsQ?_aOpC?D2EBY_SA} z#>dt0`v5GHF+&TBi$uMdl*EII?2zd`+sN;@9`5YO;_BvPV%Zr=RxBZ8P}(an+!5JZT8 zWyBdV<7H;k+;n!j0@2oGRa0`a&DaJIdGSe01G||NuG}sC% ze4P+c`M9n7;M#*=d$H;gxfh@qaUFWG>Y`i8@yw1lZ^R?MB0dRWs*<~QrQ;ri^A!p7 z5rWDWPraB=I^dg#>xGM(`*lyx%jV`#h*I@67W&H0ZbYMKcuf}pMo~wMy*3RmH{2Gw z{ddySXg>4mX$B6{=mX~abv||c<~(%0*>|6S1KYhhm(-x^+QmV!vGf>#<-%e*kEaFB zsva>?V{MMYf4EB5D|fNyGfhjl$iT*U6$;?xi=E3NH>iTPZ9GNULR~{m=NTrxmXYZ; zb*SX;m%Z{qjP`c9r%meC#%To&gzs~fR 
z?Og9JAt3=Y;T4Wg93lGd9`5e$(+Cd%r}+YRbANny8wRwk3yN&SB<%%^fRKmg`#LKR zNr@((fK0y{8VZPu+l+*E+`J{{2UW-nO%SddCCmlTkKogKPZyurZf&jRGxn|zIbdOE zV;`WcM`{7WB5WZb1aCUh7>=4_7D^3B_}-sucQi1#qvsyQds%vTd51wZMOQeu>?!0v zsIt$yxv3f%#iAfCfz})62}g9tnYM&rtZQfiv!MNkOlk(5X)&Ns>M0A*=HeRm8U&Uz zh*Ai?bnt6XSmEy9`>QmCbg0u7=Evg&w!f+uaP($D!>PO|D#+WDMvEeYoz>`JTJ|&3 zBilMJKEAxsOxovo^E~x0YHOZ%v4eD755+cle>i%{L@w|{->B5gwWl)v6WPgZp-Q~3 zqOE>#MrStO2-VcvU`erF%X2?$*Y}Scxd$Ob%Ndb=wZ06Rlce01?VqNYHp@$)oWio( zw98Jn{3{A4ZD2a2UBLlQ5_X_k$#6vA`oniI7GmP!9IG6ZC%rGAIa9dF7p?hYZY~pT z04cUm<3PwAo1x{5AJEa+IfNingOPny9-G=jH%k07G5nl}?1zsZk&55aIf=k}NO|S2 z{1isTqdx-Su>eUMLrV~kLi{x-Q#@LYqesUOUz7?_1Tv6qpBA-nNjfJv17<*XzgjTp z4+^J{`PZ*s19H|mc4^_~MAtvr=@4xZ)3utK+T=&iVb!sbk!yv8lb+uf#^RkxQ-mG> z!SV;1e*=*dWn*34uY*7t^hH;uc1VzW24?tK^nN84a_F#ySo*JWM1sagEJw+DnSbNch95&LMaSsmT3yVIqp31n{aI21&_9vV2^R-Rh zF@K9;w(>nZnzB=L=aG{M>C`e59jG+7xhS96gRB8YP5$~B-<-mn!|ge48X@gp>{HHW z2R}K}7|hpJ+318C1UDOUw>xw6w__+VnM1`fs15~f3hrAImpdab5E+F&(l$n~B#w+g z0<}q)XvB6cVB+j!q=~0$>SQ{SF|Md&>OXu?I1+xVlE|J(L5uqoM>Rj#JOk@*e|5dWrzHmCwF9ZMUUa2abapm5TS65L80w7!K+0v)MXSZaBYhmo3C2yK-dT#Cv2x>A>g-qx}yb75Qg@bUmZWwA&lJ^jFcy6OO7?KQZ z&YzPXgz>j>a2$b2N96{f#Lg8^vMW94n@)n zwTp8)aDdJlUtcH`Nuvm4LSOUzH~iZxSDtkGAXKWPv???xh#u$eC^c3}y$LBOB;X1>z=@}zqzT|+3CT?Jtj9w za!Q!QoA#|L7WroCW~kwp^RXv#c3@FpT#g_6r56{gSK%hhg_ssH8OjcM2f z*2h7GK#PbQc7`T6lb6!S$V4hsVyC#`Bt1D=10b>E+c=VxIqFvQMNLoHOiqU&q#v@=Nx{t(XH^$KD#GdHfZXdn>+Za&)zH5!1~6MJYY5 z8cv<0XK$Qqh1ZBxR&kz!o=Mlw~>pTQVcGO3xcoco84)ZSD#Rp5p z2NMz!f<<2ZNUC++^|XwPdL*_eK5pvFYC^>#2SGKwyJ2V%fJ|FDIpKD5BYCn|bg6I% zLBv452!vzgM1$aK3_?fGziS8=8K@<6dTb)@!>yRr<#UJU#2lV5?$(%CsEX3CN%*tx zM%sY^esRN<-J~$q=em{W@=>BNmMx1k*tiBwpCnQ{Ahw4RV{90T=;8IBzd(n z@*L1m8})VLyn38!M&ZDX>zq=_JN?6X>{a>&EV(+E80kc5#{GhCaP9>q%@=S5>zkJW zh)N6Y>LVDOLSmS}I)V0d0dpb1whCY+u5vOs9)kGrs;a77?G22yqE=n4+FagZHEG>% zyKTuK=nx&=!Yra;&o#jID(!1zNVzCW0fb4K!J>(}_uPMeUdOU!7qJ(TY(T?w)F2}e zelDR3#9(uyz~i|w4OEuX)ah*5zYXhNHvJ5+cc9upC+vJuuY%5N&4*XUEA+-|*m7T)b?C`&#PP>;Q9)IC`qJXSq3x2QO6<;N zq6-SKHGki+%{2MBDLGZ%eZLh-yVHFdA}r+I(;wyMytI!ox(T{g>Re}_&0UgMfr25kGiTdN-oBl5DAMNIukJK0)iqtUKznA%PeR{rowfQD;{$M`P@NaaR za6Zf^OhPVE{NRDE7@m3p?u3W9ElCBA_kczMZ$z;r=>QJ}y#?x?I(!r48|n75W->DN zivHz1M&B#j_*YKveY|dZ;T{*A!dw2!wY&X?sYW)oHXa&$i?Y}Nn5NR_!3XxJ!*BFGmWz{qG#?t_G_(nteJEGS>`h3|W-HsD2@OhB?%n2_q=#vUGY5L= zg&DP}z1{4Nm7o8nFSzQ}Y}LiXJ8K_P$NEJw{&Um_zc*s~H&uXEFoNZ+?5Ft`tdxV| z2YcKLlr8ivGWJy60fFebW2 zu@lUP6mO1q+>%AFLalW2AWM;isD6cV8=p+Tk_;xNt>KbWf``@~cq_EGs03vkdv^gn z@96auZc zPsa7t^ht0%+CHTCersDDI(u+K5Z_;V| zraduH`bxdrOp`TF{gu5rV*gWkXGLpVkKK?A;3B!_=pk{yAAXZ5`~x)+=7u#vnZZ=W ze-k|+c0O?HC+ZCmpp5}NQs{U#&H292iAQf3fmwpp<xH=@g7S7`)#|ZJG{&PQm zPUz#ZzX`@2O_}I}Rx#K`%K97^}S=_Sm zQ~FUi>)z-GI^Jz=1FmN6^Xj?7f#GY;I=mUm%n+oos2tqv^L1PFbEC>AK;e>a9@bvtV6{F zp6UT2GXg`!YSG|j!+1>w6++$`GKZR78ixRSoxs9Er}95Y*x*a@o->KOL)%|s%L+U_ zWHd_j5C=UOWeO^R%op0k8M^uc4lQ?{|Dk$UHQM**lDpCT@q%L&ytkVkUzEGM`p>|% z#lU{uZNtH@mQHM=-5-T$tg+ z1oMN^L!G(Nx!OC5t!0ZK57&$Eh@OzTYZP+wnup825dEi>z6qyo z32;opi{W?$cfX0uAjM0lm@6wTK7|>H-~$X{Wb505a|4-vA`V@lcOZblNMRMcGVs$` zC~MG{xkB1Qcv8|gLnedaTUHp2Ow0>A8NIx{ad_4vui9P<5E1Bzh&0M=REC(!sq((q>fnbpJ+}y4B#+bLE+#W1OCd96KuLvV=*neeYRc*OJ%0!(3umgG z-3KN)A8xeHS$kT{*RZHCJoC)T*wAbB8#>or_I5l7qyHi}J0NLl6&ZWKKc?7>b-|Fo zPGdODXODR{yY*2fTKUK%kLUpp-w&4ex*@8KCZPV^SfJm=*tYbkgKsomvZ z*9be-9fQY*38)v&pH~8|h2|Gcz?S;o_?ZvzN>^6?IHRV6;}vRxe>E-yjVA>jGIPK5 z6f}da1w6n{y1&|5KfGxKbHuNff^|ZypBoO}y1u?Y%iE=|f+vK-4w1?m$v|_6vk}Go z8UP4UP@|7rqeLhPV5)3p1dfujqJ=31Ij%d(JUu){AGf3VOerYPhJ1wG5r_{+GFBv~ z5YMiQ3+5jHypaS)Twjj9Oc{3JN*6snJpg>;3qOWU+6d}QM&yEB4{=RwgpChKG7+-? 
z5(AQx=H5ih8?EV@U>gS{{55Kzf4#sXKz9sIV33$KdQOghRWy1nY?mLyhlejt_(Ih2 zc;>qsT<4+g!GMndr1zy3z@UcGNv1(DKo%_Xr@=vg=v#;9IuQ8A^WDEi5_PlqR zD2@#BpFRoiYClS2ndT*zMfi33MSYF zlr$th`{>c5pocd2flUV_pEbg_l#-s2@jA~)f|#U$h&*Kciv(}EefvsKjb_Y2)IpLp zQAp4DD$()&;<&c5&G+wP>bv)p=q{|{qKayMXu!?8ey#8K6PD8kswjo_mNp@M;C2!^Tj&j&o`&XG}e z*rpvF9iK=nA;F(9(gQE&2lNQ&0}VgT&X-r=B7&o{IvEY)d#BqXC-Aum_lfg}<0}>- zrmfKd8$Vupz-s|kw+J&;64gMc*Nk%LZP}0<42H4-o;f0sxO4vJ^t3LUFQ^Jqp_ZOR zY&n*6hHG3SP!-b80XfxAaP>pWPu>OUs+TxcU;!ddbX<2kevj>G?=OD}KMI*`3j+Ez zbcJ}I1;);ZRYh${hKk~1IrrglB;a_F9v%hv2y~Ssi@ngiDg;Czh`#2zN=#BEH{y9n z3c-z}*cBlzAJBsEI>8-B_7dj_nkiC{0J;(le)Z+c7lf=7yizDdci|w<=5`rB#VsEq>EaP06E{=PoOw z?;8J4U1_yG&X0~N4iD9RGJEvncPf)1r>W~DorqLzet*9Mlec5|Tg#^WRBuX3z8U=1 z&v4h|tA3Gwo6{Yhfd4&-d!B=kxFKnd|IZ}uhPQ%*#I?5jKaseapYNufJ7y-nP6Ne z2QKb)Xd2L3%l2J!MgBr|ULMQ?*%#YTvTEYxl1L5?M>tJ4u5EElwCVgW+c9OVJ*gdf z8AzD!TYZ3jNVvQVB{4DrN%si6oC^71aKj(FCkx`G#f6J#E-+xS4)}ZMeV~rW(E_?iRDem@$zhD)+#yj)xbvM9GDHUQTxLqs=|uSazh) z`bu4?FwNnNLv?m*6kfSYP~}pz4Q#^!GDy*FIF&0~QkQaO@ReQK{NO zS4-JNT7S!(9=+?Tr+e~XMW6Na!sw0;QTLyB^78UJAHvK6$8NXvIGNp*~h zph@$}lRI5cd8U&F^i@q4MKen`>(Ov8>$$}Kj=X?y0~8uZfwMj2m;?kf`|JF-?`tPN z0L~|za@*{Js`BT7h?On7^7&e5LBo43F5Ubky?IF{_1lv>TDGNP0@U;UCt|3K46Q-5 z32$aMe$ye_^YUT9sX_^EZ*5{D+PPePlXfzJ<@vK7-iGQgTU^%P6nA?qyVG6h@8$iv zsHF7lC;R2lhowS6qm>}H4ssj9C4rVEq1haKq7{SxE8$xH9!)NpVgY>z7u3%LF-Gti zUNy+~xXG7OTQ9#vz7x1$uG_Y9~na zC}3d(LDJ(2Ad(PVO|Bn{z=-Hga*#Ss#N`nld?&6cA+5W~a+6xl@mh8Dm_m)N8Z=5H zu0Qs2FJXB>7N7W~mXb&Es5;TDrvU*YQ$rwamqrhtu{h0liBNk8^-wkLdkTjfRbag2 z^l5NN30T6Tk5>p{5;TS~qf_lahMvDQMW@WKny}bLKaNODUoe;}swxzyeY{6PC(HRP z6V>5BOnK7F(aYd^o0pRVA;O*E-?svP3b@}xK3*G3gwkjeZ{s-kD}PD#SY?)N=cT>= z&I$U9!_}hI?f*pdp(=F1%prnOMZ2^Qn;AsS$gcHch6pY)lfyZn{{tyQo4!-INcgvO z`o?AFHLvgUwsRSTC<20)hGgZd4^R36Ji``v0e9Mk%a^k-8Xj@=B8~uiqwC9;tnBS` z1b8#DvM>p(m36WxGxIPK3y6nw6$^{^)Erna1oByEFjFlqI>A~%l4`IqE5@!;LD!7& z?vj1O#EytIaR;8`@sN_Xmes>J*-^*hHSapx69&6K#JBm3@0-HEwN>Qhovk*zF)&J$ zWHB(LGS%YkJU|*R?pw5z44CYT{E7tIeGp_f;C^1j8DfmFJTbojqLCo}5U#xe3FwyS zAZKmtIg8U#ji@)#)#cX0j-|_Pv~#B3mXrc!NayFihp7A3jnLn^T!VG8L0HOZGf zb*c_;lVo=T$Yti-qcu3#g{E8~cr(yDNN4wwzfa?E_=6v!Nz4Am)0Vn#}Ow`|~LiCOe$rB)47py24 z=&LwhLCdtwUs!`_K~yyK9Uvt!&<5ei zN;AGgn|&iRGz~X^=6D^nY0$N)ihTWW{=$V6NH#SGcjK4_sKSA}0jW|!I4$(v+rXrZ zm5i(Xd2KBvB2*Hgo>Q$1G6>B=TTX;h{MH!e76**hW%;)o7ML0?^F^rVT$j$d64Vdt zfQa3Z*58H|e-!f@C?19dsNIOT8aggHRDv4WMm@ddL zCel)2DN!&y&T63Ad#ZKEwg~+kJwYUbq0J(T8@3$;-M7Kx1$R9dLIym1!*~&BlV1JL zKDx(wZONA-8m-L~*;>F9vQviz7Epn-f;)AU6;>$R-JhVn#rlIZgk*d{xY!Be9{@${ zXi8&4P>-Xj9>1WSNo`43<%_nq^QXxk*$i6K#DfR0|~C87E*1n~>1Lc_PKYJK~7mpt)Lb@Xu!)1RtusIf36H5<}&I zc;kPl&@jD#aaKO)i_pQo#s`~c%Al^huE#cR2&04dnxBIn&s-^2X!X&8JU%bc2)VylyVe0h0!Ku~yV^%L4JZ17@>X&$$> zEMKDfNv{>@`TR~4`yRo2Q43G`gqO32E~gnki^TlAbbM$K;7K^PR<&QHPKIB4I%M6* zO;zr z#Wk-}w$W5KyB}a8lr^;t31g4;`gx(dzShXXJHuhUCYdxwdVszcACgp3RsBDmw*LRS z;(nS1$2{rryu!f>?W0`j-~qPNqZ~8;al~c_NWI)19vS)C)3x7QOF&Jf!=b`9CWz|8 zADw=4q?e5zR0>Ait&3x6x4_7LS~CV_>?!am(|c5|!Alvy$_p2V;E@=3;|{>Zk@)oz zEV&ndysfHv=XQA`<(|Vm60eY9-ndqdlzW#OxK#x;wyoyw^&JkWxal7vzTS%w{kA1S z3JF?Gff$JlKSF_Ic+TYY1y@n^+avuZ8mHF&-JvV^*D&(GDKx>qyG24+Hb0OYUY4zG zn3_keI}<{zwEmxHHpx11p-WH0Awle7$L=4&%W^7IH9K9mNEb31$8u)KY9du7bVR)? zBjG*gi(kV!?2dcdgi05m&_AIUe11?ZLI%eMX|PCUdFWfhs7?&j9Dw>o=F1lzi=9a? 
zY@Nl`1?^M=4pB$npH+-8p!c=ClQC(vjVCL8m?43#G;;e02OiJm|F+E%Z8KO5GFvTo z56UWbwsou4h5h0@a&H%lUwfX{QKPIv(Q>9J^DLKhUE(U#Y3z=>70%gpHm`DXi;Sq= z^#D3U0Fo*w`S0$sk4pVb=0F)k4Ug=qhW`G<_a#S;4vjy1mQ8=iBFf)@+Ch3KOyJ#n z|3shuh9~TF?QF^x+$Unhy-Y&`5LQvA)F^sR0zunML$PMyN)`Q3%8^k~4l*z!^F>A_ zJ~JLDJ|uLQTK8@4&h=l8>J}TWg^RQ%#vJhp4i0PzibY{+15GM^BC4eR!ILhwZQlww z+7F;UkUQddz|GJJ;S&h7d(N{;^2TaDBFxBwH9ZfnA{^j&PZ9xgPuocDjHmNy)iJI z`T3(YcBx`Sx+rlwB{{7~V_eFq+wQpL!$>A}2zWbyL36#mJkc3=|0Txib#%kSJ4;_3g>?jZO(_{oUqgCA*HD{H+zI^|EYONABr3RoukE7RR6Ve{b$M zyy1yx$&m;m$Z^#k+PYywP>HKzRQer$tJC?ERijP*7ZlmPtgIK+aWQRYZu-hm?iSe2 zVtIgz3A(To1+5f=vE!K`veTFgjTwEBg%w_|oaZkeKktT007eJpmxWX~y`#9g?w zFnkN@Q*+fG4%Q;8eE%hnI)RMmDUyqSc4oX-ckbBBfkY)6!!7U{Ybp%At)l&eS{tkx zfM&G7*}Vx?r?Brp@&%!m&FQ@(2DOPUh8W>4J&Kpf8c! zJ3LdBG>_b^U=Wt5N8S zAaPfm!n&f%Zq9q6`DCIv>}pQSa7N-n>aU9QsFA1_lNc zCkOG9bD~lX;o^#i(CHY)agnGZ0>OHt`vih*5ZxH4=_382!GyBd%rLy3>cW4eV|7-r zE=B(E$rWY6V$BG!+IVUxHA+21N3}s|{Xv=g`~w515$~c1M34@&-E1&|68APhRd}2w zW|nJ?T#_xsErM}V%)?6y4l#N}XoO-e^xP7fn-ZWnB+?hY9ggn+l=y{&=z;llcR$n# zE}90VW@b!T zUf;1~$Ur_435$0H;i42qpCkJ9#z?{51awvZ5KPyuk;;weo4`S?uBZ3^q|YZJ9!xUm zHKVr2+1Y)JcwKfluO)H3u9(Z*_v%|I5sIB%&TA{B-J)-Hrb)-s1~x!|%svGfa|>2h zbdLpmrVu*{mw9XPw_IglvzB&iX|XXrF{Z71r_qvxEVPX(Rk44L45bq3pgON#AcCFF zuA9%VWoE#(@67#e&<{Kb+Oj;hTUd{83_lXb%351nTf&9lIXrgzx&eZ%5j2+x`!27M zk$Kbn6(ymu35WB#r#R4d?KX_{(={^Mc=BUvsB01iP{IIYjrd^fqID5QqSL>AjY6R( z&m@TeIn;XQ6O9I^W<*vwg0Dc9UJ$4UB3?F(*SCc0z8%FHlQy0_Ivub^P=Zi{Spn(w z62dK14swcNyB>Ku@uo-PA4pakhZURE?h5`SdJ}RaJjO9%rZX_l<)`Bw}OqtI&?? zBD@K>Qv>`f9&_I|cV*Qq=HeVNdA1LgGjSSQd0}*_ z#DvF2CNM$i17%VB4+S!Y5Ef@3UKkLH+^Gjh&%{;-CP3USXy}H=#$KX{Vqb>yzYdA7 z9L@jISLesuxRth+r(swWzKi_%PZZ!epa*EqgKfe`(a)^Z(4NBt4k(>1C^P5N9~~?z zw5SQkkrfma1Z@BzC?GXRH|D{(XP8J|zj*^ka17eJA6OV%w@ge;<3X7egW)4%T}hfQ z(sV$b8sr;GHTXbPBLskM4R`)R`$EG&z8@aT1R;=L>ZIgZw4cjWB zjw_OBXIrvJPj&7*yLi*u%1krnf<=RX?}L+|NQpm+nMY9}v>_0)P|#MjH5QS1W<}5595Z6V4{HYOgq0-3XC|gfy2jUzk?Hw(h@&$_-17lc>Of zF#u+gmv#D(44V<=m&8H;K|?4}lAes&)BlVyDaS{Mx_3_#SE(HW^?+}fVPW+-%Jji< zR#H{~88Few;$OL7RA8o7TAiY!563c?{msJJ;3DxEzlj>)Nk1*pY8#fHkwyd zJw!Z_xR$+UTV*TeNRlmbbv6040H2OxwGI+55N;5#!Zf(D>*bjk8Fg>yOl_S9roza? z)HL_l^&Ts}ZyF>a*sa7Jw9}%x1puJ>kt6y*e|`bRW+W5v;Ftx98BKf854aq;biu{t zE_}NMb&WC5zn3_bDsZS1J`9jPNm4Cd!JiQrhRg$il81zXbm!b2G9Dm$&ot+OYJ9A# z03bwqzXG$v;ma|kXMY9)Z6bj*xt1G1VnH)nF+CmnCd@!-h5e)g=1_P}^?Hd44&_rs z!qVVCFvLYo?>kC>;OP}tK4?@)&&a-3g9HaLv=ao1&^J`vc#>*;n0)=@ZgxR?Z(Rell9?Kqf0(ZC;n~ z!9yqK?8h~8-Cuv6su09obPMUS4sX7d_#0xPF3h!rS&WnB+2Fe$Tuwb+k|(feav?=K znZZSj#>4Q#3txoi;mbaN`v-mlxcCvrE%A0b?a{$>)bI6(YbRD_peQgQ6J!i?rr~|0 zAKAJN#mO)@{t*~|+Ae&MI{mT({8#-9zJ8p72hi)ERF+Vv05N|E$B@s$AOqGT5sXWi zS1KJmcoj}!B0Pk35>N}_-w14%wY`Y*^VnDr{*-7U@TUY~xjboxGYoSn27x-@s8-~e4CWL?V0Dj?%7hTra$4i~22QWJbgB7v5h_oNR zL=u2ORoN7`!o2`^23Z<+5)G=v72g0}M4=`PKVP6xG(H7RRrKT{&DAdVVBedjL6RDv zAbq|=Ri-aHp_@X1u+`1YE!QAN1-e*F2S0^lAjA<8-~8!AE|)HiL8q91qAUc@l1fG5c)ON@ z{JI>iNjK>iHjD+|PH;TsF5+jC`#H>#`g}blB3r|*vuo8xo%s2covSpr1A1*h;sKHK z_YxyF&`df%T=)1@Aq?ku)`O$=khrbkKKJeUpRL3@VIWJLrT2-eoJpPf3r(xxIYRF2 zN1AI+e+&Kfdoh}hy;o*Vv1pY|>@Fze|5LN;q|p4e$j1Grtfh7JV>_JguKxYfMY*oS z%;(@qB&g+cMV}1(6*jw){@FWm=ZOot%fLx=oK^@JxR|^}58zVF`FqBnTErsP1s_<*qo11;GB_mtz$Dvwtuz)Al=M9z^TtHDxM!{LvyT_i{!UOjmx9r+6=We_EP?`#{H zYr10<1rLIRXlwSN7?y0F>09MTY`nOwA46}2_6Xt@5Iu)fRH(09xk56HA*mR1_={YZaU@0Bg3Tn2RUO_`co!vmf{9-1pT{t~BDjzvwi)gn%Og25B# zHH7_`#tC>MXXww$=~NXuMy4pf+Zqw-*6-eV-R-z56u6u+Gk@}k|3mh@r62{IQV2SzCa+J<9Zc1bW+bw21@X z1K5JwLxtC^?5pr7OTL=tqSX@b{Abt?fIduuXu4E59t}&R6!IX%zGL!_p?lG2^rPqR z0lcOPk43}4Vf1Y0)^tTmtx7BPt+!jb$r>lhHt(`3%BDd#?}r=^@h^p3U9Br zx$K`G7Yd+`O7bwg%gPL#Kl;TV6mO>-9a|AF6Y{Ue8c6+k8v7~5uhgFNuiTg9==SZ? 
zR{T11YUYrVK&2LyzP8eZ*n?vYzWQL)R50inXSji5;8#%KVqv*S9eA0vC4hGjk6R!RPL!p%FhpYZD=3iK z6!t{q@UBJFD~eST{h@yJ=<%8KM&?!wh>5n+!ZO+*VXk%Y;!n?dMs&)+oHp;fB!-89 zgG(GDyl7OSnB#ueQC-K0B3doTqDBgW)8=_)%l%v$3O5`~2~# zEMa-@mP#=>2$EI2dyJ?k5;;iupX$yT7vDaySMJ`=16r$y)v@&j1 zUfDGo2tbJl4naOHq6%S8(#UK$qNNq4_v%FEbuFhiA4wn%!dTdwaRQO_elw$*ED z8^{N=Y_E~Z5`Eloy_XsLXl1Y0SsB@)0a_)CgGoufOV_RzA559tCfli>LHo_8wxq9} z!;!<`?(0flo6?Jh`jHd*M91EB`hg@6sj|+paTH2+?;mCi^`_!k0xhfVm@|r!l=M^d z?!pi)M#Q?P>CZmb)17Q(H2q?*M)5&t(+!3m>&m3VQpfj_{6V_<#EGYmw?dLl5MqN)NuTKt4lZld+Qi}sA&hDt6ot^`7e-ws+5FI{ z;_nV#km(2agaxY#`aN8>7=;}9ai~#{2~dmTyaEzf@E;5#hB-h*YSGIte#t9>02O3@ z1nI3SCfcoWL=b8X^+{R^U(y&BQW`khB>-3<#RK(75r86wbj<+KC>|Vq0NB7=9@E!9 z4fPGyvxveiY>vYi#|kB#uE43epFh*Fc1d7zr&>_3DTNMs}cHUT&2y@EzFi`}q*_C1mu2*yUrT=b5bHZ3xEnX+5(_g{O* z9=>$(_Jib2s-^|ym%69E-MDv;(O7ur%s(+3z3<)$E1bmPYaR!K#SYLvI2OIB^oM(7Tr;s0(a`5_HM7AMhdIZuwywTSHPZhRb=G^VTjxeAw zo&$+DZZHy2CELL=QkIgGq>NA_QT}Kbw_&OTy2l^r3b^QK zQNo*7dujY#Std?lqD=nUqdn+9_~lCo+H3;eL4l1gDSA9#9SgC9TLRJ_2yvw#l?{xD z@(@GQ+qPS*5x@-U1v2PGb+ZGk_C&#pQW^z zAG$PK2f79wjU+k`(BZ%*8!*}L$V%e8l+Thow&e>f1)xDzQ2hagr1!&OCf=bqlnt2cb$wE!>5y4zWME@EH>owfhU}Cq~Fkl4<|DO)wBA z6V1#>FOaC>CB>Pj^+Nw+R1)@A;LV#ZYb#OzW7U%F*RJ?S>vc5%1Ckk%m2e#GvC?@h zN&TZom1B<4Szha7RB+rM>iAbeWk{}ch5pCu8859b_Z)UjS5VN(JO)M6Er`O zn9l>f>MdklawDuZzs6(sOpcDR#a-)|3@0n94T5Izn$*%)Uu9BE^nb1fw=$`2&7+$p z|DTzO5oZPOb=G355S0lwP8uXexqH-fsfnxvIyHV_VRb%*RWohagv5dj97WfC8wWis zyU0`w!u>BLwGP#Fy^E7Y99!^6)nJhGERGW*Gr-NG3_#^hPsP)xX-I@7oPUka@?CjX zp}#x*VEugdU)O|NA zswkM~bN%-ZVuusoQEDAH&eZ+?QT86-T=)IoFiD9zLqn#0#-|_yu->>!N ztlP#j``LcI6xoy)K0e+l+L5*@dN3@%<-=M`!UZOa^UNcz^TtrXOmrk~Lp*~sF$=GV4K%cy$ST;|&a0Zv8J;$2-wnyTN_y>hv5`~sZ>MeUbI!>5gc zX#Cgd{JG+0f4d1yYu%?$=NUc)PrPBLw-}zBYzB@^2!60hu!VHZ|N3zm#G+=#I+g1o zQNE(@KBVw45}yDQ`v*70sPh`=9k{TH2XA5_T3jseZUy)^e>S{-uPgUHTp-Z9g?hWZ`?aGUYfDc#ne{{FJUED}bNx#4jZbXMk01ql z+Q0tPrt$Co8}5+Eo2@!A>g=DkaPj-sRsdw)TQq>Pj9jTigo-Td_g-_S@=*Ijai~k! 
z;$hFu#iI+l42PssI8(2?DYFOV$n*6-J(5?GY+`xjh}xe(pcd}y>N)aOec-SJ^ zrdga@+BDsSZEpvm%+$jlLmN@D^4hEc{=xm*Ab)hw5Z71I zn)PQ|`;Q(ju7BHm;HqNl!t25pSe$pmFXL?w#ia?Gx26`V9}RWY%f{={8(-B{8Vh|rAM_q2nW18cI0*>OB z-wF=(kM8+=@WMK`WJ4{y^t|Jto8`$Wj+!w{K^%&)C)FR+rqv2h*77iRGE!W!8vJKw z@)Iss%wILJ;wdT3%s*qFQl9NWCibhYr(a*kuQw>08{V+`&yUw9-%+%1Dz|F3QXEn~ z{N;V&x!6sTS^_mj*E`^nkSmq|3yC@#_WdU40b|FODrXyVu`-mU~M>yGNCjFh0k9L*4;fHj+tDDYbAIrbl*dr$u6nr&5 zMF3dmQEIGKMerw4ra<#pcj?K=*@^$YVZ?_*z3v&U!>)EgO`$Nd6z1l@I6y)F658%J zmn2J1^t;G-2_7^HG4FNt6bdrid-!>Wk&S-W1287KkzbG&RF|BI(ZiL%LCZWDyX`;Unhgxo8MGpoKVvAbG zPkl!%$WgE7`ma7~3Y8*NeIybaKTK+Q`)GT7O>q1bMwhklq&L|*)H`oxqZcV<;Q43z z%mDV`Ry)iNo$9olDB^z?*yESm(bL_q^KwyAG1K`&6gC;A5oYzFzT7EFS{<5eMU%Z4 zLP74RA+;8dgw@|aE108Ix-ar7DBVA83IO2K+?^(-3uJK#2#@BgC6r~W~FFEt=jq*cN;MfOm& zY3?*cZt{f-C(-Q5m1*2g*na$f^m^F(+j<78eG$XacU@@o)kd40j1^TGGT|GBb)RKG z--FmFM$K)@^RtoIH78u_Nb|krxmC;7bES43|86_^m?h>kGsq>pPq<~fpw_vhOkI14 zaE|@^VTj@BbO)+Qok?Y}j{J^@Z3f8iuRO=*}j`gD^} zy9}&;UXG*r*>F(0QJ)itd}cMqlB7de!Nmz zYgdpeQO`cWbU%;Kd0tr2K%zlof!p=qxU9xa6h@p=^iCe#O=hGJ> zs%Ot${IfiV6_`HQk%2DzD-aoI1?peEOnCcK6u5EfD0+3xN*70BwLlAL|Lr+kNdoSa zfyBfccm9~YkxiGqJPfK}( zCOzrt7Djqj)^wX~>-X*xd&8fpt%Rkem|Fp|@n$P4C^Ys|dDCdEBguZ4OMt|KLDjZ9 zo1f=6r3-C?R?@Kh;3%P2(10T9;`{2Z@!K`Wm$vSBjkr+EwD}+~+-AnziSCimO#rlQ zd-jUidlOF*8bHmNV<*sf>ck+1qf?e!PNd)Ih@bWRxr8=m@dfAb_5ceL+(l(9V9i@o z9lI}l^Ip-eG5_0{H-o=vQS_J?jpL15xH?;XN$JSMh5p;z?Q!beiG1PNe4&dHexS&4 zA>SVAxzImht&%Oxti`6KBB9dDS}N4?*g;}EL)(k;F_eE0GMM6nVXU_QQojZ9>0a7= zmY7N9syzRn%Xx2N68ayO^B#p1$hqGNrIU1RFk%^mydU%rpB*_r8m{`MW10Ee8PV(nYUq+WkvO91M3A8FME`A2p~Yxx9MJJ|49H`N70?o*AA`+}*sy`oMuA(viCeZ*>9KkO1b`VR zCZ8)kFB+r}ql7`3)?J$GHVCey%*-Y{h}w4kK*wP|efRjQbgTn&MvfDLo#qL zR#fMa>a9AFf8KR>r^1*dcgzDZ@aXu5kx@uqA>8G8`~VV{iP~uZ zM`F%uPxd)^u#o|^s-+o0l_^7T*n>mVvAaiZ$p56zr!%4V3s5p^pKn=@eP%Acz)hgV%NDhK|KkS|`G%y!`?<{g;o!XmfQV=bB>C z$AOT{^fa9F4cZN1(kE6=IWA1%EZgMC7P#3mU+&(6*`A>krGWA1bU-ASX%7T^g?X$I zLuYtDQ2KjS9&=$wJ)wL0uU)lu1=SyYmF|TKjm*4YGUiK{*++RReD2fJ$4e6sUdeLs z9#fkG&;#{*fThWu8WKT_z9yr8GX;ryCW0y$;&T>$q9cZnVlSM?$b1O@=D9M@g|06Z zUQVKx$AW3#Fon0G36qLp6aJz5uv+0PZ0;yCLyn08X9%N1+^4i&*~kA;mvBg`{w;zy zM+s&}IlkHl+Yhh9bVcec6#e8;g;R+D%s@dP5xG%U0w`U%_nx2MCc;DtAZcd@ut{u~ zh8Elc8I?+A^|70S?qTmHIfp?`!g(CTQPqmIVxFF!c*T=5N)X5!J4)B{?wwZt-|u=C zGe4qw&OzE?YHf{g$$*HC@n36w;xl=Cfw`2`k)N`7611t;iyzBN)VRCzp`Z=&G#=DP zn9cZ8AN7+gacdH~@J%E6@DgL*g{v6~8td~DGT18zSN@ioeqpB1OT0KZUbYp=h}I}< zFm-B+W!!1_%^B;HEs|OGXTP! 
zOCYHOWD?mqIM%5fL~q}@lh_M@+4)69MTMPPg|~u-i56TfJKGv0Dzi=8@f3Vc(cH$l zGN(^lcRytY2}$U{Y7cklFZjPp!$yE_DsN)47Zt4vWD%0p0@xP7{t$-&F;A0i5@?p7 z2_+G`gxCdk<1cwkS~?zkqTiwcE}lm+0J4+r#KrBw=N`sqU3V)ne?%RPbMlV!f3G~T` zVf;!R%Kcic@8GzF!f>adfK4&Y<=2tYT;;B-hhxU$hK90_P!#<*vU1^)orF7^Md8oL z#yal+w_S^Z-~V1}NNZFr617p&eKu+Qw(b)(LT*5$+=~72I9v32=Sq8Jo z^$WZHF3)Z~a_qel+i#`8s%)b9IRhu|?bu>&{KC<8vefS(n`l)@t3vha(k?;_ zlUW8Rm*xr~Uf@1LLdFi%p)n|+N08=df!KS?>HYin-^KOLLwo>CV2Vmg{za?sQ{x?J z2KB3O@+1XVPO>@;>Uo?xlFW|qD++oPw`Zf9IPLcIt0{$vY-_YaMvUZ?S%+bBunh@ zp~c*m58-1`5oy6KS62~aH>P>_&tOfyiWeczEHkdA_aSy%&uCI!S{r@(r;E`1vU zf(HJKkN}K=|4Jc$V2wWgqH87sYLrH8br?$k670@gJQHj}(nPlffW=v>2i%-l5Z--T$WcWj= zZ3bKv0udF}Q;SF7{z*X51{4!WbN2| zCn@ipZt>aU)cLG@hD0_lBH4s`bb9UgG${$Kr0{$&j<> zMN53Z!ADU~4fPM_caA4t-*WBQs!d_WtrY4o}6wYbVN>EP3n|8%H7 z-?w#GJBA>&98XTp4Q1t*S0aZu#d|_t*PTaR$Ne`nwFjm`>b& z;UivRSWe#TB=RNa)(?j3+?k zs#bTh-2f00d2$eKia`31lGimgWQzzl_kFmsho+`lAf3ejF$AFch|S{U9sS6A5?p_N z1c~125&cMefR5)@)xwa)$gsWx!k$!8#ZoyhZt%|h7XH~Bm=*;nRWu*$&H~HD!?HeW6)%Gt$TbP zaJYXuc3r#zjMl~SGN6EhPrpk*-+M!Mk}X6&Sj#J5ukpU4BN(wJv#=_l_k93s-P%vj zwe({VAuO!QJCzl2XTJ0Xjf0iH~<#{jm|r^OqdHJFv#AWAS?(q3atjoFX{f( z(%znq%7Yhz%F1e7Pvk4BT-cq~R+a0hKEY8@2A2ji**eVnBJanoh?P~2oTzix+@|9` zy)qGUym){4kzv|>-p8HiQz&ZD+xvg42J=nn3vnJ@V!1fW)61A_@ND^iCwd-w9J`Sv z5Lg@p<{*oTuYC;cF3Cf+{?O1MPtHJxIrzY3DW81d<6#2khi|xS#4b9Bt>b{3P{4k#9ectpaFN}DC@C5t9&5kYg z`!S(&p|9lfrYFX*B;1A0d^`ceG*^F z%k4|6u9r1L^3pPP=xZ&rd5Zmf#NE{1!g^oy%+pFITaz2-Ez+8LezmZVU4QV{$tG^r zaGY!GNi?@NJ%^Z}0U>A6uISS2llAbZLU*ChnZix8&%2kt>eP+i{!@QkR`lk03cp&l z7v2i~-5v0eJAMqGyJld(xU{s?w;G(e&jj7WUn{pj zdJb)x>t|@L55<4)#CD(kVB%|WjA4x}M=j&^+Cy)4q%=DxHxF$fwK(IW)R3_NSy`<6Fvn6WIz3~$yjh^Jww zlgJPJSmCTiv#%ECkT-e&v>Z(6@XSEB)pT@dxXj>nWJO(M!lzxD787DgZ>zs(I0Iluk0IE{B?4e zHsw%{W!>B=e1n2m`}+FY-38GepN-zn`~!IrJWIXYyjRYWjVYAAIOugqG$;7z07&i> z7ak^|kOx9g$bc6lycyuZ0bp`TU*bX22+f&t9 zUw@|aT4Xr3ZXH4#%imw%HH?iZ!#|ugG~5Y{(Z&&8M&%nnZ*>-sq!QSyJCT)Jd;)r@ zaHAObi~M|isBy2)$jjeHjq1Qu0RQVhOeNDid9g$)ib7)vsmLG59MCa6KuNQ;vxCB! 
zSF?nu@i83b!Cru`e;NWNNR*x)8N+qRq;_JKNzmrQllE$0U^VK?!3!`m!vRnTX#*7d znzohDs>nDGLqia7_bwHvO)|CMvUe0;*5kGO67Zms2mH1IDp1NyB@YUBio<$2lgrdn zq10$Pp+T(2lZs_lpYh58z{Ee||Q;X}7#FduiJ09peMXqL(#)H!8JT4z}x+vd!hL zxPRBxx%5lp=#e9X)3#SeRb|cA2ebO*Xg)L6xi?GzAlz(k}AtfDfRe4Mum-wUJ~pZc>da{uwlN0<;`0r@#UL0 z4^gBGFLiJ<84n{ODsu7!va)c2vC=>Rw;xSl=WktH_teW=klVlvng`bmO2WpI*olKt zFv@TcevP5KTcs$pbG{z6>H3U+iM7U+k7AKYUJsk9S0@g-#u&r1R*O{(JnL|wz8r3oWF zXuP`2dHue9vjsr$8b=U9{3H6wlr6owxD61nrZKPyYga`0xXr%WXklrI^Ra>ClVGGG zV<82ae{++iwa1tq;aE@ESIGl!5+5YRM52Q762 zWi+>I{Kp)oVruB5hc>G^<`;;1EH$)wSBX?!5_x&|&SPttlQEQg+=N?pH5fAcOX_MV zb0uYld*u5^CuQhn@1*s)192C}&-?f9n_F6P4SZ&zmD}g@CUovRIa{cGKOkd;ICjAv z*~}eHn3a2~&f|Y$fX5Ff$2`)Hh}Hg+=bC$_grMLStS-gtm9gaa-~ka2wc8dxqr?xS zMnz*keq3L(l%zc18s`|5)xg5jedPdVs}Z81y>}1$nN@K#SVL=d<4>q#d_6Sc2riRa z{!B&GYS2yq-HbUh1}TcBT?MdNqV`bTG{*^M+o$yxQWo)b`bC&&DbSL$@85s=`-`KH z`k+;??aiaaC&SE5R#kKjFH6ky1`J!XmM8H{8uBA4`Uo-JA<~f0{dnUQ#~xFg3q9Pe ztE)>CT9_@EV$uYmDd5jy3@avB#oFvZ)XeetYkR!>e=!^M+7Br|oC>}r`sB?*n-cBz zEvnSnQrTx0&gT@Y8xvD(J9|J_kw0i@?Zj3=F4mg^&xOexv9``OvONYVqw zUmTyx7zq814i)C*B>-H8;;9oT6DIp4mb*yQ34`v(485E?t&9)=3z)CMh z$dU{H8d>vN_&I1855WS})1zk?CO9bpSPWLG)CKjC!WL~>-C0XQ4X!(88AxcuNFQwQGKFQ^?P+X}N9?-dnv#spa*-fw2UioWSuA=^z_ zh7SZ6^BfPK)>6LvM}jNbaA$A%V^iBiz4=$|w~>2+LEjPu5&9=hJ7H(QJ9t+cpt?;J z#+IYvUCrnwx;$)=G~Mv()hWTpSFdN%#QWgAzp?*dAWlhg{SXS?23WRx$=Sul9zOw0 zO^Qy+_$w4MOao|#x|r4@wZul+9x$KB-xVvcA($QIf%u@L5j*~)Cr`33Dr(X1vc%Wb zjKPa#fvpahJ+LM0t5x#|6iDO9Fwd&T{U{hDnyd=2N1u#=pL^h zC>voBk<_U*gr7s&YHDTG08b+ZpApa#5qESE!}9FWZn8}ihj^V-9_&IS#R)>8mpwg^ zP*i0Z+_ki{L~m<6HgoFCnX&4o_Ziu?{@SYA#Yc+_KH41C-5{jqx65^6-*3+?`$d%< z2fIbzr>#a7=Ve}8>io4V#1OT;>7MVFZ{_)P^4sK(d6w?bbI{u%2KCOTb zS|7R5=E>VZae>dZB4)Clg%y7Xq$@3QGa3C6`rURxwc8wxuV0_$1LsoL*H`vvU1xv) zPKYegEMpM}HSlLE;xF#)Jp|Sj)&#yCdr-x9p;;%kL8v^+0JWf!*fi0k^(5ynlzIIj-NO|ytZcM=H0FT#HWCy-oAZ1 z{OJuq(^Bm5+M##6i<*b1)ggER8=%2QmrDLLc5FT!v)5)M@>4v}#8*RrLwfp`&^&d2 z^uUs>CcKFlV}e`vklZIwhoGqvqa7at#lfq55j+4^CT;e@i${`_>>V5eiZ?X1wvr>y zZLvch-|9MS1?WylKpbT_Zz68v{xc6gBT@4u{D(L>?O`;`j8ESoX3vR9B8l*IXm0=x zn?r97B?(YX4Z3T;kKhObvT6UV_psuoLICoxpqzpn5Y}?CClb&MsL1n|z9p0&Ff%jb zWKFpf9!x$JSPWw9hR_^A2qzQO32bck!6h2!4fLChc+Wv@kZ(`89L(sLsq?W(VD0bB z8bQMj9y}|Uye%Wl0Ew81=(}5=h_jNT9U0@$rS4N~66P2KO{IE~R zA={-9E%5Ww$Jc`U(l081(5g(+YyKE1%WtvAuf)4MuhIM3P>=?a%$=K4LuL z@yoq_FI#)r@9~TblVA^>U%`$C10W;?do_V(h*pw%*ZyM#Vm2ZgiW=I-p0Js1R0$KKlZ5A!aK+em{yMWh{v`y&y5aH4Uyyz?RkYooH%0Ahw2hgazqRwM9 zWweGe+-k^ydgLfYXI%1N&db4*QVE+XHcCvrk!16gaddrq`lzDt`q<{TM;QeZu?e9vW{?U zn`46yW{g6(*D!3&E-nK5E$FS&g!~@wwj74ZBRdePn_~E6-Q@`k@3<$Fv}K{V{fsz6 z(Fa4bW25xAQ}aJZhY8d=sOnMMiR0^TNV{4@|NdMr8$XE3&{#$bff+SG(ckM@Svb#WaI_u<6;(X-9d15i zoUOsIzkTOUe~f>uw0wV#7#J9s;S?piwq)RFg?`Z=r(yu$4i?uOS{w&Hz%WBfFr@N< zB*g0BGW7kdC=x6%V!I6Gvo`kOT{OLT@c_j{wCm#6;O&Plc;yupF+sDO1O$)pt#T#% zy!jGlL!~F~oYY1?5}NY?#7ri&qtvy}UI@Npg-4UDK*4T~QuRlZXj3}K&jcJnfJXoY zwV>9#>3wonpzt7KePTVvO3ed627&4kyRX~W%0arT3#%Ct!Bh*)71k2&#sg~ALWWmt zc&UeTgK#NmAOFMa;liC_89v-B#f}zLKKaGn>j*%FJ3z4x4>X zEau>rmPfM~0CYg%=VK&Jc3iVNtzqTJHuobjx31%Zyj)em<4ezkg6B84o&!|_*N>6) zZ)9A9WjJjFmzVkgx)WUd595>Lj&lWZyJEg*Q#x;qWf}I1-6%IN96$HdoBq2ueZ<Oee>fn8{m=~Rvz)A~w|=!pC3*5NyfMnq1V4Jj4!E-xr-|H+ z8%4{QU40NF+AsbKN@yRMM0!AI9Q*d2g5VD*FEl&^6f?t{`7fRYJK8W%WCj_g)wuYu zFir}r+gsedUt=8DycwpxVMJzgU{?@mE;T&cuy6wi^u{LGTqG?pA^~q0ZUTaenPJj2 z!?Ywr(`|&D)zdK1=o(=M_iizJSrTvf-Ccy6TOKjreo)=|!d`=L=d)ys^|3X;rOW`? 
zN%As19*k{hs?0Fcn<3UNYl*CmlSRcP2`{LN-pDXd z$Ux|tf{F@&?tGlK{~~Bahl4!-f4}JT^g+@@6f@A%v!lta#f6l`-NK~(_|n^i+n2z8 z&hFDOc%ws-q0{$d5ct==rjE4|uCvR@(v6$qy*t;x+ZY zwKQyIT|a+BO>I70DHXZE`AFT{1_H09xmgKQpg#^-x_wIN-V8F1Y}y&>r^tN+*=t1F zS{u4rCSlgrh~pry6;2O2^d#hfK(qDT%+ggW{1ELvrNEJ>@NhX5mCXo(PhI-% z8j2Wkb}V_ux&Nz7rO)E;F657dMn^v#)8o-P2APF2WRrlJU99b@F8@sKk%JlxI5%hn z*5*lFMpSg-j=-^Ak*E0dfeIycj}HF2DEEuUKICNWrncQ|*V}u2KZ{HIKS}3zP7f$t zY=7Efk&-bi2nBU#N%h9IA0sD+Qcv-QZSEEKTG%w6X{PP7r%kf7`ll2|r)21LM{p*G z@Aj#RkZQZw_N^*EMl#=3|7#(=-JAI`y^6-wNxA7z3Nm#d(ZK?v0Fo5R>WTUJd91K@ zP205?)7*0FiuU4PFGaoLLtWl)J|&pQuzSzns!yk)ww8FhZY(!dZ-@ZiN(tHny~Ux$?=_qhdKZ#jE38m$_mVmyjIgw(A^m362ar3)+`e z>myv;4m~>-5&rOe>363n*NLCYU;6iM&Jd$;7j%#QDw^nbbK8cSJA5P3SR#)Inc7*s zA@g2fze;cNA(!$ci>E)ePaJnjxa)9#eveF9KE_j24HC)&e2lbHx8{bQYf>&0EQ_2! znrMAht6*K4gnE+FsVx_7Kl2K97C|$50S_@AP$JdC1c4lJb_lL`u69_QXT19PyE*_( z{2DXnloE^!{2(3@x+Z(z(JaI@?-$N}6dW+g*qW+&Z8&JkNaB_jyjYasKFMkds%6UFI#MnQ*6aYiKNivzTo`3Ph4+?1jn3u z*Q{#LZ^Qd0>xj_Iz6u6(8U=1E_JGFEe@d!g;N;(Y{7Fi4YiHR5>r4|?Cymsk_Zz+` zXYlV|C`Tp0b4c`2pbhcjnPawuT_~+WF4S4EQ3>xF`4!b8qj@u=eBBuDq>f9z_EO)D zvTti9+pnAYX&Ss?3AcXVLaq8{En>pO;4!1H4_l9udiOKFZ9C6?9va-LSQ<;wO?+D3 z4D~NhQo_GM2;WI6LxXMe>}7G4&&jvsV*@yo7WkC$cCpyk?0yJe~N+bjKs$900N z?`m?W`*+&g@>@ zV;@-0t82YHvivf5$>jYnHU10pVb0%z2HL$UslHYvi;pL@1v8}i2b*8M(&8#Oc>3|C zD(MBuy?=gCh6bsfId?i{ubao`&+;30=y-e48144^atPbyf*N_vot4PDyr z8>Gp~DnJF6sArmeaN2A?af1iA5f%-b@XbI_56lAOZc*4HAH)$|jM59zt7 zYNC->+$QIE!yoc(i)T^OyvgY7?CiapF44L$4gO%$oXdd^gH`?9xjw3x{%meZk#5^? zQu*B7_p6c_y4Az{J3f3kd5U5nD=0^JXl7Ga@w1k#QqmXVE!%@X@3**kF$s>3Qq$uL zcgM}sE}!YKF`k(2WRurwm}fB=V@rnjE_2#DFJ zAYf<}v5=Vf7Q8zXCCR&9Z{4D&^-}+fcJDLWGHg?ng3Zh=?%q$`IL$KM zYtuJ;Whn4|s(dEXA&vBzxrRRrA?xZJjUum@Ppw%oABx;-bg{ET5lsSxmm#xFhfwm~ zz5ZLR$9n@ObGn;s9@qU9|4n)6e5%;s(X~J*s1BFEx`bDLL20`Qo5Y*-F3q<1?=M-A z&t)gCDCl}M@qbN*^P5=ya}!3bv_hy0TeDID*TejhT2V(aa(T#Z_=#=}GP3Zxur19D zLS45NW^#h(08z9@)Y<{HIDu}#<}I}_)FRWz8WPbt%o}DGE_eZr08hdtD$1%})=kh! 
z4)ru`+I3IjXsr5|4VNFAQYNm?lI%*@bfoDj5=3lXxjMB+ya53L?$0tQLJOTv8-0Jx zvBNujhm6dZnQr8qM8IlX;JO!y*wy9TG`R=sV?LaFf6hri{!Z#Kd=uL>2=)qu39Tt| zqxN$e_VPpg;mB?uPutf!$rc+ELuls|3Xx%v1Qv&Z+!$u3H|055V+xTX<`z-@)>CGc zg~=iq(fvTMcEX*`K~vJ??-d9-!=D?H5_PuWxZHHj_AB7bQ+1+X#S~$JSk=2CqQjWF zvCzY*{v~)u#PkKxI{mxdzKf0Lf@k=#2em{7U5|Pqt*Bsq?i_13lH0BWbIuZTQHX`7 za;{+A$n0hUHQ?AP8!H0dhRTdc1@X(*(9+KI`fpsN1=tPO9xo)vtQ`mG1!lqXVe;}% z`mLa$pjyT_{|o(t9YdpK~0j+4S13y$t`mI23cKt-vj_^?M*500Sf5^cGKk z&8e5$E?$>1#4*J%qGK=lWET*2)2|T76E{GTFH3eghgmNXeJmd2&{Fg7d%TDo1oIjaErMUF`EdTKpyyF2qGNiO`HCEAytl}rK zihq*dsBEJ>gkOENVC)^dJdv!aC|=v51z5e104$OE^8`m!n`9OfVt0 z&daojw1g@#V@4#oa7h;us!cE0RRZ+|HPC3?JYBsIwR+fi8s|FjV^?lhQ0Pe)A?X4i z-y_wQG!Vj{cmBd?`<}5Pu-rl+&kKT=Iw5P~Ddd2OC_YQL+~&v|@I|B_D~IgnBv3PI zA^psdkP!0k$ec@U#9l(ginFW?&~(w$eLSM(RJci2u-=B-F&!$F2fKB_%CNxbOnuL{ z><9lnp-pWfoeLh*Bcf&`KY5aRIB#X|>9?KP-FWU&C2T$&7Ux($ipZS&8+1*xOHL~x`WisP_KWmjcU1|zw2c56NwAW zC&Z)FFQ}{E!g;SXhqQtH@8$~SvrE&1LJ80Rdh^kCDQYyl(o!2r?Pf%Qmg?%`7Xe!Z z7Heb6kAg(?7_)&W0%0l>E$NY1kIxjDWHrD44(0OXMtebu(8y!t8vmu5(L07%|N9~! zbuCH1{>yIvUw&0V!w%_+3z!2{R(QiT$Qhn<|L-@?j~2N~uJieN0QXAT528h`qefB- zc*WJ{xP#Q*Op1C8*d5U#e&qYXf4*OSk7Ur7KQi*l)xh|5-TL)AK<&jY!)VTK$5#0+ z_Z4(Nnv#xhL{5m}su!m=tQX94Ou(R6k12yyHe#T5u@i5*4b}nyOn=gu=gxl)pa7=f z9l-4&*>S)uBa(|lwp09To(l<45|Y`*WoaBi#FhsBGeq}BkWW1MNlhbUo(&2?VoHVH z0;x?g;l2$DSU()m7Rc zJ7-yVU@DDbqF;%rvBM;_iYUbRyF-aGs-cVO9rE(sv&|Jd0Aq)#uuf((j+L z=<*%={1JgW+|p%8)C(hur)!19^|oO@;CYn{xZD2E7Pu3Sfl$OCiQvVElZX#i@T7q8 zRx~_@qDA8Ib~kU4yH3(n3{6YIfJB4mrth#Y4>3UmJ^xC}tQQE^AZggNPkXuL{ xBD5uhY_flAwODe*|NnSo{&~;;TMy;R#+%a%lO=SXK2-QmMOj1XxuUt>{{aU*`a%Ez diff --git a/tests/triton_tests/plot2.pdf b/tests/triton_tests/plot2.pdf deleted file mode 100644 index 56b835edb943c428073df1f2a1ea9cc52a593485..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16044 zcmb_@2|QI@)OV(EO__=c*OZKRzFbr0S(%ctOxKW^ONJCeWR^JG!?g_1pG7>+E6ewbt3|zxP>3*g#D~5+#L&370*Gm)62ia3tK* z?kG%N9*!^x^diC$su8NBqAKUPB5?~k%&GXa2#|ALl`^TI=FZ^!SO%t`gqzK6G?DO zFszyeV1*b!f+Muu0ShWWa@8NXHr)C*c2Iv#0OA|s-iHXdTW23(MD+Fa^RWl>f%pgG z>k}PZY*jo1z=%ljheczhWZ-Bl^xqoD089+D_JtGH*;DrL@B~*VsOs;+fNuYko*vP| ziR27N{TNZh#SP3Cj?i!eLQo^xdpZyywtY!HL|b=QKu(L9uIFxP#)j_eJ%vh^jcDyV zEDFu~u5cw5ZN?>or6f0e0@ZK~pG(j8&nL1kV=wRoQ}y4f_#~B~n#{NFOe*t5&o$X$ zwQ+i9jo6F-Ep$}G;LJk)jQ%iY5VYAiZw!DSMvL3 zA5FhqlX~_N)1ez1L{ZI3ne%xl=~GOu4!=dSvF2=E6Q0^wmOxLdncd%|>xUZP-Y9MxadqR|!ya-y<0Kz-df|j_xmeGos=ld~l>H89ok)J;T|vkE->2ptE4uUq zAMK0wK%AWj3q^1VxQq1)2o?q>+!Bi6jlF(o_KAH{tMelDr}#0B{5w4*`hE*ftNdR! 
zvH8VKxGla4Wa3}m2j|EmsJ2(?`dTXYMiP1v_wb$n6_KK zg*8v*<_?uxCCqt4eNqwB?6ic8h#a-bsHDqRH4ozV+=`!nIycy2 z8ZzY8a9nY?!Ps-n^w5z}tI-SJs9sK;Ytx}{?N_0ueOfv`xYv}KUx{vF;r2r2dH;cH zE%)UqWMFuzwyn7BnTAJZ)y}Co_!|0XA=&Np_V&%fo5b7O(jVR~EfPxqu!8%T=}QG#n_dNUH$mClBAp~^*NOzwz_ zMm^GQMQ3H(N#z~%Ni{HXtNy~-()ZNg*p1F8sh&B&+@5Qv5Z#u(6(NQh-za`$jA#Xq zwxDZpCTrrh=IojYoi1eVndfre)7cC|$GEnJQo{WO?@hd?i~72UbaBjD%5PWs^~t~~ z#c=N3aTJ%cmKArqQa>B-%CVvLOo^%3W2QG0m`Hz9$V^E>QH{@unyTG#x4Z0#xli9e z-<;~pe&FH3^gd5_Nb#Fvyw$?vF_&2DJ(9)U?UAGVTLtwUnk9<6J6`vehn=UdpU~02 zR$O8IMGM7vexxiVw&K8)LnbmcGHc*qlH<$JvSFmN@$(@p%8ZBD8w%^*h`pwa&j6Iiw&Q|VR#*CWtn^Ub0E-8kdQBf-n zbeEIjM|bqKZJW2N?5IAj*)`)Pe>YBGI5@&MOz?$(#Boi!Y6sp_mTOJ!7h}9t&Klfk%{FKvXzR-BzPYqNsO3JzuJ_ZAwqPIIXdG+JNqu~J z{J@ET8A~=Bp_LkOieN?)drGg-9(nr@d&L_O4vDf9wg)d6AE}`Ka&IhF)hRc#`igSU zVO`0|l&ZMhev@ebbV=1bSg4cqxTh4w#R{U9m7$jlVH zBk@z_xjnosIm4?BW?H&ZX^%C}9XoSSHFzK=)4{u-)$3sUhZaS3&q|->Nt+J;>Tiz- zF6T6#X6jywy}cT$#dT8Dj;{H58^L(6S?Rj`hde0)#|_bDedXrp80y9;=0gJ!*yf4w zQu$~bCazSGj3TccujEG!jED|N3CZz$6L09pGTA>rbxWsztc-i4v#hsxAc-OGlQK!s z>Be=1^vD4RRvPn0*Bj;na)Aa@wxnvkArAAb7j;$xdbe&_9J-fy;J9uqR!{lLm)glA z%XFV_LrCF=oBM$o$UYJBlPGB%{wI(iJ5z{g3Ag@9L;vp0e>hYOQW|*C|L~>?DY_n1 zC`Q#~#Ym}bjh_cEd??skn>l)NSVZ{Encaii)sLpv>X{q!CSlJ;oP4w3RdIe%&aKOCEOC<_Nqym`x_gt|2#JaD( zCztmrPKsWdbm$UM6l;Iw%gM*VU*phYNA;}Tj7o#~k(9@d)BL>ci^U7CFAcb=bgz_) z3omI6L}7eH-Z{5UK5c)M^nEm*S~u~UGyjLN>mGelHuc7`>UIwMeac*75AI(!;COPo z=?>$zk8bL!qQ1$`W{xyHVsK4&+a93vnTGe$$R1s0)4}25{qtAl2YZ(JEc!`&e<}$L z0hf(n1}XF3ve!>gW^07gQO!399N5CqyGD{#7+5D* z%(Etp94in%&$4DODzypI5c=G}G(d6x&9r*5z7GvQUBvQ*e8j?)i9-tA1xDt2@9r{n z)eDJRL{3sYp=8%wU{jrqNn__jA4g21k5{AR7GwvrybjcOSTzxwFt(Osa$*nV4yzuk zGTL`O@5=m6!l;xHjzo007?hXI(3uo{jn$kap5dqTB=w!q%bjkP;T#`z%0$lRVy9Uj zEF4R9J?_VT;CjV`Nf*CG*H!kNXUg3Il%IA8vEE{e6-oEyH(b54t5jCX=B)6mrO@ZO zX^zK(jI>#ir;e9(ICoOwd(PH5?P$%`y2L0c{!x?IOG$R@7zHokR+|u{1UZf@W^ry9cqX!&x zO2Sn&!glt8lnB%cA^Q1B3hvUL6HY#dzDgX3)VzOnOq(MtGWI%+ z#oW8Mw}*N5_MFqJgZ$3cBbVLni#2rGp4?^9*H58aRik&R!oJ)7{bcu@hW;-s`}s{+ zHxU@LdHF+NsJ{@{0YVBjKjXu|NJZp9u3@FSmoHc02Ubtkrey6;THx<^!p@TwkR>Qs z?$|bL;Q)6WGuCUl`{^bN`<3M}UV+qD2HU7)ufDQzj|&Mq*8J1D&1mQGD`(6PZyUI; z*LP^UlhiPcy-Q9rp~EB2n!O_4@9Je^GfiY3V+5(HDHZN(Qn1vKfS#O|Nfx;fJO61| z!k=)0`7HxJo}Y7SMJy;Be=2`AaGr9fkMf%J$O^A|-Lx=#7yb4=L%Vl5uOp7!@h9=# z+RNsvlC3Po&foP$tY}-dL0RRcTq8rT62;5YWfxDrK4K-giM*iQ-9~v~3I8oG1p~PR zHGamogh<4K(4wFhuSTh)*K;#bFYQnja7N?@#^iV z`|wg#Blx2rmT1y!5k8q+mL~_Z&sN$#p-~;RHRlbfNP%__;mdtkgzqlmtf`hr%oTxz z8BBuc1+n0Z0ox_!+VPemDGC$tmqtc09fcQDAMA5~kP4G^?4_1qyFKQ>t}pYjPTMGs zI~6^{TjVbgQYPPiNp{*n|52G>*5H!^!&xi4y{eB4IqXksBk^{w`g`-Q{WD(&~v76O|{5ZX>}kRXPD`wIyg z>3Xy%(G5>=Smi!mIM*F$ba15pi>IrqQHfZ645}3w@*wHqdlO9ixYxzMDjN~?z#))1ScW(yPP9VcDrq+dT&r%<2F%C51ws{E_qZOSFhHrctHR2-gG@l z(^d~-?jv7Qn^X-ZDj|aY!hAq1M>V4?)m0s&BFVrz#)2L`} zMk}6=7Y)jnDBqb~uYtK|yN$JPxv8CM=Qp4FZP?4M0yoVB&fT(8i$CM$t8j<9Pj9}3 z(AUI%-);VUU)~K2MvOmvs5a{8TP6R>h7Vp<7x!VN^W8Vm>F-A+}k(dXp?gc^xr(E`i9zPsDLAw;{2?{fB0dOgq^1=$xa+2b=f}3YN!anbN zXteOHV|j(=I!o7AtDHY2h=ERqHaw?IM{~{>A374C}YwTb6FLkh*sG;bF?;3lBX}!!y`f&3Kgsv^m={@Tj zqqlJjcxYVY@Syy*u%J+X>VXge)v}_mSRWyQq7`j=8vnG!Hdo!U_}S!Qj*CO@$19dJ ziUv6?KECTM?8-OndeBa2-tQ|7^iASyEyb7FZ_>1L552>SI&V2x z8>@Z)(qfQ@f4MEqNMxN4KPhSWm0RpZO-PWN5DRwf&j@y-`$$ zme=;ltqFI$l?u~r-Cnt3ujku?=GP3+B7LN(xuO8>}T8vvsSG;l;thjyX#XG*2IS8vy zZ+GafGXgHIdmnbIg%6oL^P=qzB z&ak4?W%T}$J5&}g@1&I~49)N3^_N}TWU`w8cj>=?o)~mBSOm+l6l_u3_rN9{x57lD ztdtr`Vs5KQ?a*Ruen3&l7m$r4wZ6_v?O@`eJQR4n$)1{uQ>q8Bv zQ(L|eE*ScCLjJ4{5p{bF&-BOv@1IfpEkZ}Ih)rUOBM=$VvDoEM_2wACRBjgyhx*>k zj)T(`l%;aU%~`H7N8j2LCu*L^ZEe~0$eK{%;C(61&%{s6ORqcaqv6P^zj zQ0BYaZR~-26PJj_xvTG(mt)`uM2t0C=HP=1v?B2#X{F*`=iNT_>LQt`zp1ciPQ)s` 
zWejWWGG3Zsn;jqbp9-__3}n?B+}57V&&3n&f8J{CRp08JHSZ|7?pxf~cF(^$mf7du zcEm1))0ZOlxoGi}tCU4Qx9sRyOxr}!m#9a#Z8GXk>CLb1>+Amfor+h~@$)94+yrc* z@RmIgN0xN|DWe1IKw*k79`^=j%EiEenPlw9IlmIee#^#P>I%) zdcK2)(S=r0ejQ@wwu^T&RZXI*qTQO| z+om5l8oDK=ee8Y1{NQ;myB)8mL!RZ0sxQ(kl_6^sN4SF>Hc`bUXbz3~izisY;C4JU zit(X!$Cqh5PB;DyE1 z5ogPVmwRCELt-X5mPekYSE^z&oCzlwFk)AB@jTJRXM2gU*XBNN!hB|OCsCQ6=B==V z_Y~OlSHXSSShyJPWwFN!FzV=UUAyL!c=2%TF2$zzsvOBF%#*k^vu}&OOxKMQMUp){ z8(tmGDPg8!-JOzMFKBI6A;re(f>YM-ZnTgLt*bu#eb-IFT1KT$G0ooDMhSU$UA+ps z5wAsz2R0GWCh$_`ZywCJI;KV>omy~_4fTmVpA-#K?~mSP+9vE#BQm03#Y3MZ>Vaue z-FkshBFfr3Y$+YdV5Mo0aDMiZ?X!j%>iCaEMjz%`QZLMwy^4BEL8-FS@lV#Uo8UGy z<}Y@r`UV~DKzm>6U^rys^zJ%d+F1D;-%hGNpijwg)_s+?m};5+fVeD3@a(94t$KQY zn#-a`PuKN|@0S=U{nrdkwp|sj(CM^#>`{T$MhrH|#AmqPN|-7MjxfIzurNOGL^SM; z0{Wg~oZT}(uAmDYLh;w6vE!}J{8VS6d%qsI=^4?HCM*P>EPoD zTNGMGgfh!oS5)y@vEr8kk=CP)&C#Q((tWTbmT z5Fp8A(V9k+tVERJfuh5A)F;0WCzl5&=TZw%J9hJ{PNwdgF$#VFJA3!DgypNE#rcjq zyI$+OuMqZ)ewNyOmenpM?lRX5_pNYl0lEqmP27|aoKmlAuPu8-cco3}-F`cN_kGVI z{R=X#rN0e8D>;gmmFreKGHAM&C**$j)bR^1FeZd3i_)EGb~QQ9ZT!+{J2;jHBkw7W z&uKff=i2e;#C@WKn@lhdo=UE~aJ5Kok?=LO`*lIL`PIIPk1-NS$`m|F-tvZFQYBjb z^Sg68a+7h$v-kOu!!O3@>$_ph3d1zRbt4~@$)g4;d$-VXdK0{54CsAwcb*#llyI@F z=)y@;QskUBLGi{xt4jyv{4Z*>P-V|GA3fsfKVp>j?nR)CrQ?#qt2+x_-OH$AuTrH; zQbBJUj`5vVJ*jrB<|xDa#yQUJuE0q4tax^w-uIuW&!}wWe0h4jjsQ!IDtW&(K=_mG zMA6OU&(pBi6J6$))IZn6zr<459CC08f8Dm7#yK?{`#s_Q!LMIwS>If5-9){cKv6XI zFCJNa!@8^B9R96Bh{Y)!3u)*WIPGHE@>M|W?R$VxE9gI(5#H<8^Y$QFb73m7kH zr0dfHXB_s0S?6L_XeZ>~kZJ?&tW~9)Ro7s)Swg*ziqSc44zIq{*IXUnsTBG0dTD25 zx7KjGZ1lsV*qRi{$Y3>z2~@LubNGDedtTnvvd)f5+E&EiF&6PpPa}x@<1O4nww!IY zoD*i{e*9O01B?rrN`wZ+W7ia0d?l$Rk8AXV9+|j!Jz;D$WxLCAru&m?LFIaKR|?8^ zyB+X7yyu98(`{MaZrYAd*9KUP>L@uPZ(|<$zTEr7RO9A$f4tUFD>u)CZ)Q3@ksekW zX9SZjmI%n_7Ibv(fxSx1;q?)tvYpAkM)KKp(EIX*PTaZJYK*qy+Xk(Qtowo1GL_#M zjU8_Vigf$rKP{n~uW-`s=^>%#%Ul!lJmpk#DLc;gB?b&31v@yf6Q=>|XSZmLQsKgZM<1?SiERMqdhB;N^^i8>q6qWHx(!c-hYOn~Ecup& zD^x}k=dwGET9&t0a&${x*X(AkmV{9ovMZQnAvgt|b}voK7JKPj3_C63q8d}=6*W;Ep|kiH_k5Z?Q@n-3oy{DwNxixa`Yk6*e>s7InC2aJ7V~|*WaWp_-Q~*qB^o8lxL89F}Q0{DGao+!$?tK3js(-g(dEpoEyl zo(q+QwykZv79T&bn7lcDdJ{!#wrltc^oGITG(d(naj0IyZsD1vGTNl)O)xqN^A~HV zdZI2fH9y$e<>V(cw((7Qqu)gOuFZv>_h7eIh%^bi9iFEkV0)ddV;#S zqDhdEDND!L>9W>`{LgbG7Q(f~qWBGBGj}FS5Bh-> zB;QiNFJ<4UW--L`@mikt!oIrE{Md$*1r#XyO=hzR%*6oo^xr!UBZ3D2N^ifxLg1ku z!}Eh+$MN+pQ^_C)l{d*KRnEf1|KaIKlUuW-)thNUY|<0Qh>^|o9i?n@Mp0VD^``vK z=Cjb(t8q@$?M8+7CkiIKJHq!h6<%g&Tdk_N{<@tfk=Kkn(-l@xJ=C8y>q*@ zz2agFl0*H&hO`(7!4m$4d`plKxN9!8RRaAc$ZU!9p|9-}19m?D>(nzwHY_ndrIq&T5Fsi*%g{8STJ1@*H;)h1q#h+g~s<<()jjdnqmlKT0{7zt5eMmibj>)8X4;oH{#PQZgpvtd+ui_|=l?!viFd#|lfLt8 zUpb>=8Ozr=(dW}GFHzi1)&0b$+h(n=D`Y8mlvy39fmGti%MFJ-#l18`rbJEB0zWQ(e@|f!Cf7T_f2eC&h>jJ z;h(*FQ6D}2CGPsv)aqBNBQ}=Ne;PP$lbrzSZ;s{lleC#t!STUz&ix(55tzm)`MXzY zo#*dfRi*U{`#J&ll9%uC7Lp=4OvfdNpW8j&SP8d()hn~sQKQ(5QXa{dG|TXB8|-BB zy^lswR%vt$%I7z-v{})yIb`rPG{Oa}6IV??n4ewhu9h1~ti9=YL?OR1W|pp{oZ?XE zx7xNa-TSMDpG;(k3EQ)uUJRQNFv82_ZM$VM>vQEy<;Xd4FV3Z}y{hl#Xl@D0zu08j zn*e##-y8`mB;I3gLeiw>y<3_+CkagjYeyRKw6McJGi zZ+R?i;=01oq9iQco@K*aSn!xfGdajkhRcfg=B|WS71N(t>rxy;dRkR#g`Go#%Zy!~ z7kRTc^WbP^L}zj?)o;6fvBYTrn=*o}O!2K@x*ObW+`o3YwEp2L&7EUX-#3}nCiqqQ zFLoyiadOO|NU|<9olgkSszis4o-7o-8T)NXXt<@<#v+-ry0K$IvE+$Qw~*ZNZRhwW z4(+nIb?w39Y{uB^xVua*M)dm`iez>)Zo<3 zzqjlA-LAM^Q{H=4Q8!chvlflc7W2P0Rt(CL%Bg+c-+2@}m3~vwH?*dz9&t6LF~%|a z)IFEX8j5Ur3f2DJVnfv(ygg3~jrW#6r&c^u?ZGl)e=97bihn8wZnu|PLb){KzA0JjXXv;9#<<8Uwp)JBcy zYwzRYMe_84gEJW@co$*pXGbCjIzwXscf8-$9fS+3Z0vfyy~=usup|lzLX#x1;5ZQ_ z4c_aZ(7=&{IALfAeOr=`3ptKh3W7_GmsC61b2obT)+hw$PJEghao^0U=S}j!W)i)EEwnk 
zbms@e0y+x>?0|?a5bKOM68b%CnyiYSZ_9qE`Zr~fCHp^U28i9w*2x#h z=C5?5>`Rsz0S!)dpb;PeN}@4fHBhzn(pnb<5{5AQSp;F`AOPOQUfILRjR+V=7?X(Z zrcgoOHh?@MI0^$a@GB&qJU?hUmT){0`Trxq{$KmS0MTNB6=Ly7xC}-Hj>q6(1mN1F zWsq>RG!Bj-Ajt^__=N65^)bK%$T=`EQ2+myU_b9;KwT(7ef&%~3Gyw+#3696%;W!-V1Uy#Kz>KA300Y3}$T@T$0}=@Emj>;j zJ_#}ekgwCAoIDwfGz`iiCZI_{36H{%+v32)*ZTl=*SqM-LjZan@BtXCe5GV{969?Lm zh4Pa*C@0$wG!hoNKtdvd*oLMCiU@jm4Wlh-+I7GgGwMg6Jj5-gMYCK5y{2`dV*U0(+k8D5(y*U`_TabPk{eS zkRhzw0AvUMPGngFyMS!sw?wuF$R>U!vaS5E3CJFPPY_?^YtRQ|8$S~?b376UbO?Bb zY+#c_kO4Y@>;kM?zo(xz@iT{n0`v|e=TP~#MAr5CeW=bq39<`Ft5E(cr1f`PX$uD$ zBYUBLmTz*{C2&i2aG*o-Eqm}C%6iEG4)jT;5#eZPqY9kI`htaCUs8Xp+~^;!4O;1e z1lOC87k_eG0_fcdn8Z5S8S*LQo?L+OuG8G$U>4+Y+`)sgUh)9;@S_B|3Gl3wD|o_z z@YhQal{^*-)mN0pfaz?1ey-=K2+IOjz=uz)6u^9rnk- zz|s8}b={u=$wC!~R;Yq3fUCM+4aW zdRM>h|9(=WjQZV6|MO83LRA@bg0Qy*$2^dyQ%3)f)qqRkD!WBuBiB% zclqt4=ogESgnmrn7ZDQH&w1%Xs|>gXD$xg1{2^<7G#uFdIu$^fbvgg)fZXC|59@yU z=Vx8NV3UTYk2}CiKY)?66b>#4{ri~!D*73?4PmhVh}*^(w7)_dEKu4BDi}vvNm*~B z+2d1_l*w&nwcl#X&3W3TO>pAx>iOR@N%(6fUp>Yd^xz(24z|ChOorz)6~@{)dV!;T ziY+(mHf8iT&Hebc?tdXoY1H2{3OMblL~f;^v0fLe$q8mQzDEyUQflXYNUrhPgGWVI z)XJ=C1(A9?pQ%?L+EMxzT~m`&vt@}{Y?JqQWIOr)W){Ht{r61VlN6Xk;f#?Wyn#X9 zNcn^n&z-1P)cfzHa0_)UEf?Jvn#0QAg-M=+%}rz4ich2HHa3)%KsbX^=Ev=00a^lXtoAoK$;le{+k_yz*^ zyVb?@Top$2__xmQgVjYMOR7&IP<2m2Ul3sDH1|6nV? zhv*0cfB**r$l}Kj05cqL1CH>2XpjX!Ke)#a8VZYtmeqgLFgOswvyp~Jf{AaaCk?Ie z8)iy9l3M2F9+yK_u*cKd< z0ATX({!m!VA3TFy(Vy}|5s<*`K)+uyg*x0w!=M1L`gc7f8V%mGZlvMM4m#<1N!~LGZKphZ}K%q4=YP|1C}`)jfd+xc zjWlWeM!6ty1OUJ`)B}5mKiUG!`e!{H0H7P&68@a8H2P0jNn;_*1O0x<0ASTW#+3n2 z+{St`xIgS3DMR?f){ww7{*W^P`-feDT3{2iffs`GA3URwD6qBvcRdsm+T3iU{V8V@ z5(3N{>Y-47%nb!L62I92$;Z~kjp#%Es@&KmhzQmfIKs%&6Iv3<=tJAX(GyNy63OQR Yz9d^85_u^?VP){(<8)#50~)aZ1EOsslmGw# diff --git a/tests/triton_tests/plot2.png b/tests/triton_tests/plot2.png deleted file mode 100644 index 94659c0a41e63112a0a69da5cfc3aa655e9219e9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 51996 zcmc$`2T;{Z_bo_JR6r3$FrX+%5|F5Xl2i~0f`ST2l#HTC&OtFCAX$)Xbno7K?X}kKduP-XH`4B;rJ$hL zsB}tRgMxxeje>&m+bzzBEjijlxAt52qrsQ&?mv zr?=6uZ;^lD$3w+lKZf@@fOOuS^)huP6Q zQHvU3Kfe&grKKg8<>{s!(r#iD_*w5)@vh#VAAQfI>BTS$Tr{$FK43D#3SodBQG#jr6VF+xF0yRYj&K z2JcyW;MspY@}=pO zPwnC1tH$La`{E>QOW2E)9~|(QX|c$@*0FHowkDPfZ^WbAd*%5xe0H`?*Pf$CkG^*v zQ~UAaJf(!&6i)z)AQvz1y|-7IzP&jAl8WWsBcT{oz4s0qSXfv(WK{kwPH7PX<31Ug zJvKHrfnVH@vX)16GqJK(mjf^_cb8~ZVt~90PA6_nhho8vE z$lMmV^Pg|RUNS#9z^j`xEIov$8U6d~Q~qqHyG{g`Po`Pbw<2%vr)u%BiBH*f96Wf? 
z$0<}#U!N{}TH4C8sLh0sd|3HFYTo;?2HPo8k3rKSC;(tgZS@~rm7@o|^G&tzWa z33pUP4g6&>^a#Z_36G47OyNIt=mGwFvO7SqKXtGyNI*BoNND@iW-*2Pu9N-STsbeC z%^shf9cWOO6^JO@NaH(tO8me9THlYwAAQ%+DMYp>$=K7DhMz8EN~NTx35kq6nPvV} zJI8v5_v*4Z`6IjZm-=h$Ar1s0UT}T1T{2+mon1PM;**p2{eptNeEm97Pn&W^ihX9Z zqiQI<$e`KyXHU-w2M4hqFSP`%&yH582o`*<2uZ`!o!-+sBA>b>&CsybRYaEIuf43n}ys}|K&2fguwwF2&;dnYwI z@?9tYPEGX|-=O~f>@2l~g+*(wjk<$FcB!j|$j6m{lvI@?0b3x|7pW zujK{n<~PR#6NN4v&r!A+X?;8Y)0e@Hv&dD+Z17u(m`x`uK8X3CUUbT7#!Kz*Z08mh zj4dqg=H`l@xVz~`mFU5SXxR?ALxy&Wx*Vhw+OoZ!S^qKH{)14bb~WVM7$RzpW4m;+%V{;``&$v5NPRWt=QytD10RrLwv*UbrBOUL}irc1`y*!bDG z=UWg3eQ%o#8aC!!%(nc9T#^;%zBFfx51BaATggYh+|p!i^+Rz6U0q!R*S>9c7}(PI z80c6Inj+dgR^|uRtX*sSvzT&Wdgw07h_w66nrwYUH+2Pz}_9bu!KHcnPvw&e?oVv{XEl!olGg#GwhK1FS zPu!jGC@2fsU6q>X^`fflwXV3Hp59#7>dIXlGyV4tm&o%^|Ik{HM14rIZppN6%efza z)cISgI@`)z4@2ysD_;_%-L>C4@GdXTre1B!S!n#e)V_66G~cdIxnp$f&ySa{3<_c$ zhZ_4Xdk7ZJv7fj@e+ggwy-ADr%KTlN+)VSYJAbas{-~ZBY?x+~pm3iVzV!F!M|?0f zstg4)zqVYAh}r1i-K$G;;~rD>imDF}-;WWs_*UTVT$3OjaQE)I(9qC)1a1EKmt9k% z9S@_Uw{O|9CHl}6W*mqfM6gclR>q_!Pj(<0M@L7aRk^UQgfCcm0t>>`Wj>mU*p9RwD*5zjx^JqvS^I@nx*AGi zMotcwj7-9Shsm+=_K^ufh8J@9eMbMOpWphxlN_r{ zlMH|bQl5)5rInTb+1X;MT;ZK}kB=vFu)Dar+O90kdn_98BU>357%&Wf7{75MjcGrd z_mc3+$_iOib#?W|bVKzbuVo4EmHZV2hPrpQCQsE9ui2&*6|Hha9=1Sk-yvi~fh{Y= zW1=RBUMg8r;4$2sf#3HXNc7(0W7k({JKQW>xUz6Xf81qtd3JgMZ%@xA6Nli^&QI3V z(1@1x_DYDeevy7&%LeOnZ~8G7t-`9aJo^#`5!-ieCzh@L4Qpg<*yILQvpj5 zvqsHtF1f`IOE{0R;u~aJHPGzZwadfP)2i;#v9vN{^SQsjZr!;<_4mszo4U7Gv<_cu zKZPx({N0cmETdxf<wkjE0G0vhF7EF3%L|U4G`ynY6X^tqN22 zefw_A&AF&@Z3TR4$+;3)T3gG2RWUYZ@9gY+_u+%Z4i!~Z`U@8>kgDkezCsUEZF5TQ~9VHD7;YhHQu{xi|d&+oqGLuJ>o4$PE(q$2%EiIy)wD{4|GGDmq zfG7w1el9M%=|ATwl%ozV%VqZJ+KR-eddzk>ztYYqVZYEZSu3j^yhnOMO3261d-(wt z^>-PN7F-}vdPqYmmE~Wh5SM_! z{kB{i-R)zuLruZRm6PLTY$jqmb+RoP6FnxB5_;D?(@O7@Hgg>>Wtm6r23TaDd#=H(5`lGO5;SP<&R^;b zM=n1LWN^?R{}d9JT}rg+Q}@}?DSR@()fVJ5AdT`W!J=2e`^0CrWp-`__%QICKHpyx zuYzyXT;wgQ7{pqUITaZb*wkK8u|Z2qYx&~syLUD5uK3~(Yfs3(^IASKGc(iD*@Yxz zef6q9TI$=khb(FnN*f!Q@qWv0EiEmVY;4j6!{6uS4bIpH1qE%te=@)c*+ofBthF{dtxnyCX2t<_UFu29m))vv#Jvy3JyQihOnIKq1mZ@?i zzm@RNpZ@+Yz>)$Js22P|uV1~Ia>B-+#y$W;pf5T5N?Ta}xhBubMkHGe4UIRsxwTPV zmvWBg17;vaR8Jo`?bNGc`yPo72htS%5mK$byKa{G#+O>@=E!15!dy*G;^ra;PMhDn zd2_D+q0KwTp$loOn%fRjCATzR|M=s<>}js!$HbG@jjUZqI<2}lmrexOe-*bw+IJZ1 zj7Q9F-MQ2M;lo`jk^GlX2brhiNu@#V@y#c;FClhdIw zfhPz0qptMlSI-=a+4$pY(Qna#m|FJ`_bp6J@(6ZhL>XYup5OIPP!f-n4TbHFKjL^? 
zUK<_Y^iY$<;>^f<5xTq&>5>&qfH0Sv(!SoNV_8o}_vHM`3v=`H9hwR5`h2415AkSi z{D%$JDw(ep`(QF@ZyE`ptK8h&{QD9!HY8K3Xja1-MQHr*-@S(qMc5wtpXkeq zzfn0RAwts2{hf#W?%(G;crfVi-|N`a=SgflOs>WC^^A3$9Km|Fv0_$RkOIt*Zjm{a zS2N$aPI76cs+qNBTd5{U#UP#c>k5e;bsB!88Y9B2+yGQ$YiGxfp5)HPT_U#K@_+_2 z#h%;NC{ko?Xq9;qT77{(MfpJq01uK)9l)%0+gwku?BB^rld0eJZU$cr%Na@le+PfO zWT};&WqrhZ{OsAY%#N>KzaDJK;=Xk0QgdtT1p3~gR<~=twRC7Ra5nF$#UBy2tlLZ4 zgf8#Z(^yfo6xl*AWF7x7A@?)1TX?zDxqrig$}CT5l0yk3GqF^Xm76;-KagngtK@c^ zjAyKm4+ZM|lc<6I!hU%y4)VdCE|1BLC6y-*BI|mZWZ}Xc~z_Xp-thvtSnIkEr+CJ z3;^ENty@36x%7!gD~%O(!sO%4wclTA1$TKZ-rvS|K|4D%AtA`GUCU`w(AvI$cH4nB z9`kl+>4HBR8KJ8}Dgahd&;|Pap1z295vUyHF3KBSP8$KHRZ>?^Mdp+^a%2Li_1Djz zK4`!2RN0pGH-Yvv(ha0h%*G04_h7Y>Q&V-boNINygL4QD~F-;J*yD(uQDNbesv?FX#yxi8*X zTv2fk5EzHj2*pC6>`1>^=jvk7YERxk0tXk@N%8gKh6B~0dcY$oo;$|`x?&$f9XoI_ z&LrVbf;9|^NGNhlbE0XOfHS4vM=)sOaQd;Q<9j#UYh@>?Ht5w%4zh&s;pf zq|V;rd2rHyeln&bP}86=d@@Bs|K0T)zdqd^(C$P_&EHl&*-(ZSU%o0*Kp^K?Fq>>= zQYYAk)9A>eC2W&Q)7z_6bX2&ip92zXU5{*C2?A}0NC}wLyMW z^eX3G{xJHvMrbAIB~_LOAQ}C3As=jri1Fw3I09%6pCBzDThs9I@g;aISRpl@x_tQn z2!$v112z0QnesM0-=AxKE-me_uco-WX*WCC_VlI<6GfTTC41zVn=fDT5lCBDSO@@a zZpDXI66HJ1u;|)ZC)xRQy*w&Q|1lf^yND5lHR)~0LWWn|N=#%MD_lH?j)C$KP*$9j zYZy4bo?pL?CCYd8i_x{csz%_l?%plwID)iv2Mp&0g};r+>17QGges9<|)5So*ZSE|>k` zWL#X_DHW9-vhsFg55d|7~i|1_8tU-&uPwP{8>qvUL6*K9`jZEVd&L zUh69Snsc=+Q(mT`>hhgqBD+F+sRRWD(H%eFKiXEEzdX~%ulKIbc=4J6DJ@vTQX`=^ z*u_P+zCB`=n`Kw#o~*Ec4H`C&^YHKh-f_F}`Nh^pp%1pLqo+}AQC-(49y51PI^Lh5 zz2V6*`DYj39zcS!K zyaEa7^bMD6HJXDIEvNVH^eH&WF@68!%DSn-aIJHBP9r-3t$u;6m>OxDH5?fK@CY^c z67r69$NLBAg$oC&ruA~K?Ivj(dCsFZGv(=1yPhu_!C9zZzka>2{g`PP*gd7?h3Smk zTyCHhe2p`irWGC|cU+^RQi%io__FB%qlNa9mWDkFHaFaIEoa}cX! zQd3q=&Qqi*tn&^WL_)nhI_B$#?@*tlP^|y3nh+*{7`Efx3N-ZFqX>FIL`3;y@-Z<8}NF`4AIP>b6d z>dBtT&8c8+-O2B+p<$ZnH@qdZpS>jhyN-O?HtqDa40}11BSHWcnR9s5*LZn(b^rNu z0JH?3_1Vlfu9N1J$!OT^P!XxFIOBk5ppVe`;PjgHh{GuWA|9@A-ma0?ETP3mz3IY` zGO&fA_?L7#=xPyQs~uvNA?O8^$Gc1Wz^~Bxe?nB99XWz18 z2d`y#o!g(5tklk~)#a{LrQ5XIJ~uXAz!^6EmaN1)02WFQEZRQ6H;pXw)AN9Ub`jn{ z6W_o??2xoehUjn+Z7es)>@b2rbqP;K9@*Bxfk|9k-0J(Y&{F4VR1E%O$BrRPx$)WI zAQqLdqy}CSy-HTC+0UC>T9k~89=+1dRin_!FuvU>vlooYf+PsC&2sD<9Mk#e?^MFM zPJiy~G^mUe-~%-iimFr4UR$fqrJlf_UN93Yar9_FaBwi`-kD7#L6_u1cR@>Z5Zx5{ z{tzyl)5^kPz<^bv+keTHnwnY_!K?YXtIH6zF~$6AY^HUKNWhg1X;@yu2x>7M+9#DER|JIy|Q`KLoS49(@m(Edu#Q z>g{aN>asc>`6ZkB!^5X5qXccv4&c+nK}u-%rlh1OVdYE#HvffQbHutKQsmYLvOP`CnqNpt*6xVv=9@wC|)=L#H^yM?4JhP4hEtHfBhZTNoMlLB$CR3E2#i#@El%aHc9skVOu39OscE7m;;& z+1WWx25tt(iV?R-0Iog8`}sID3G7JfVQ6_sU-!ATb}T_~3B}5)@l}kr zj!p%77Q0tbv9SlPUArdYF?U_kb0M4csFNyCMXI-#Dj54L)Hd+!jx z7U%UEMBqWNG?BjR7%BnoIC*$@YW*7q8&XRvDwJgAd$$5UBoDXeX`<>RYo!}3NRnfU z!*A#@emm}nqcVi611F!H)l%;6=}7?<_AT2=v!&4U1NpnigZeM{m}hsXIGJ9E<2oI? z|NM({p`cEc<>haILc13q9}jA_)c6&;(+J2-<@HY#QvmR93j5|WaA(JwB%j_t|YQK4kpnq?78 zA>ur$_gw!&4roRt^s0ShMXNO#uK|ITaX?c*akzCN#85#_UPw0(;0jj)!_1FeBZP=f z+)?xD?St)kS$^f=k0oLtY9J+T194SJy`02UFT)@6*2dmHFfamOQpwWt89wCR{rmR; zl2w6jqAgOCdh7Yr%x3~-s#>HL#kA$O(CJa#PWoH1#Q9*abK;_6{Ai7tc35~f)AsF_ zxsWyA3u}%Tx z^S9Bq3lN7gJ%!P+oS;}PO^m^Il0(fuQMkI|&a3}E+s|)lWpT_HtP6{v!S^a}P=Zo? 
zYUBt*jp;~AL}m%7xBT8WGdUTqW#FnHC?2D#k!h+lk6J4)-wQ>K8M-LJ5`|yFcJJ7h z3JikeRxR=gj0D?Y>)V&5+TzyDkD|aPUZn73CA)gemqsVM9w}1awz54vW=V>Uot@>N zUI><}FWT?F5~>O-B7fd*R5MenT*%#Bvi0@ z%^kyGD{XDPm>xbJ55C!nSMzD0-iN2JYneDp2XC;bZ=|6{#ZCqeVHoBBYC2FhCWB}v zlr^$5A=NlJZ-Z&aUh5k zVn)!AUV(6$-Q7fm=;5%fZZtvB_#M_D45NL!!QsXO4e@{1*1=kNe-8(nl0~`b){i?10 zL?yJ2XygP1S@6y}#$#sro1nA+^gNWkH#}Ss6`lwQxsm#>lcz+nS&~(y^;c1G1>}QE%P| z(bG@^24y{4tl&<3_g{W(J1V6Ni*5_$FlBq3K{AnROysuH|`Y68%cyf=B8DsFJ(Mc-Xpoor>3QMl|H+| zq-$*{GSK>s8#%?qA^-+pHmE?R#t*)BKbPh(=gyqvII2D;IsY0MT#LfNA{C4U;;OIN z2@E`z&ha@L1lS%fG`$w@7<=D|^ON~V^@=UzmET=OMYkXB z5Zyto(N%SHc{#1mH>i|#-zD^YI^ERfou&~{ix0FWTAk2?H5Yoi2lXN0vBI3uH4*(F z7cPN6t{~bX1Px`Xf@!b~`T6y*J4lZ>(>I%!_GZBv#+biMs_3)Xk8ZKRmI zv2IcS(HSObdO3reYc>e&5Hilw{o%sBx?@l0=(#LGVbKrHV^0BOPy%dGnXk3K`v{fy znfvGpP7(i~hXsibEuP8IO%f^|tnwhZ1U9Z& zw+>Oh8@cwmwy=9D?~NNb;$*!Oe*d;yUi;pC_FBM^eu?|__4NyBg{T%6(Hp03A`)Ci zV*42&qW<~iG8(aZxEYu$R-*JW*H-un zS=6wi;fgnz_ZhbdID#}NZOk}cCXg**W9N`HMe0s^&P4KW_gPT;lOZSVq9BW*=6@BfmGT7vgk2>C(EE8XJOm(U zb{ZNQ8o2yg3ndJ3)s1&jjTO5G8JpCP6NF_qd_J$BoHqUPdhaxX=0Njzd!8uvF9iZLtHgB**j0`n zKR$Xia*+tMV2)kEDxrLdS$(?&X4Pr5{T}!Wa9Y(5;(#F^12hp%3(X z7j;u~(Sr|iAiM`q-V6SA#P+sT{t-@n6ZmPY)QWLKwB204baJ##NGb{yv##QL%$gsgsro46qK~u5^WDedH2|uAfgPJfE_IxliVg&iAPN_ zLYHom%Kg#k&2>d0fPx;{3jJ<;diuxx=&v*%ceJ6ms?U8U%L7d`5^S#u(N@e8OlQHi_Fh%(UyMVhAj(63f)=MfKN^^t$p)W zEv=@K)oqK>#zNa+a#{;D|Dg5RJ#?)(R|BG=SU{qCUJ97dKD#;bhr1_tp8~hb&y@F2 ze!#~Cp}#3iOD~sgEZ)AC$z`p?#_W@#I}^)4>Bj!^)O`$#jxGlkh>}CZX_#8FE`6eb z+-%sK#Jk7GWA$?@#4YqV-@ea1}EIKZ);w z?-PS;^~6W3j`xRPQn0wO{+W+OeLJ&*939uvm-tn|ZBPGWom---i0aM;cp|=ztStMS z&Js^g&pZdEf=0S6f<}k_t~FwKMSb~Eo(NeUd>c!c-XbC*9MlcBg5H~FJ{2lTW3=}T z+cWAV>ZNtRH-2VG()(x1ThhfV&;5sEiZYo;v&FBM$wwP0XT%blYD*nES>0O9cAF>s zpDS*aW+Fuba{YZE^3IaqU-M1H;Uj2F)317&UU*Sx%Yn>N=4e`LN{<_&a++dVG|wdy zpZzP=kaZcKq)1}tBv$K+`w!cQ=xfG856Ubl(gQfFIB9lb*E5A~Yw+wbA$Az(0ef7Fz&UNFzr&x=!4!PQpulb04(^0MIy|k=_%h$VH3RXY znkcgy2o!Nnh*aVl=Z$0r;{TDP&WvEk&b zr>E~F_)uXN#Q72$Y*8#il3@1v(yD{$Ul;e<@0@cn>XV4 zV;4(sn;{Uam$l{RK)504@yb7E3g*Kuaq{yA0#i*cjummRXJ2Uw0@Xr+a8(3FsPo2% z67gpSK@6%}CA-Ar>dGvEv(pYMORfN@Q+takz+<3pZbw$~S{xBa%LQdq^t!Wi(@_3& zqwXvH50U8juEU>n2kbXC5q__}Cm;4oh9=6n;_Hp*FI%04A4}p^$JCB_t=qVBzc01r1vV_gha9@n7y-7ILvh}Dw4E0PEs`)H=&n=rKRAh8{?3yJf-h$pfxr(`$^xOp z*4{oGiC^j5xd?+5!Zn;Z6G8|dd{I-hZDFyoFQO8qTs2}3Up+u-)b{N>c)P6i6A#;p za{=C>@C>9SMz4I~4*d>xP;YK|iHV6|0d)v{3to#EF*Lt0lrHea4lD0E-xpQd(V@?} zdp9BVG6BK#f;mNFe3zd7zrpzIUwrf)E7jkps*>W6I2 zpO8gX2Bg(!i!-zwQ2x0}kP0jgyGc9oP_VSZNXYN)+qcBT+Y%|L z{YEbEq={Fom#I{Re|U^9yTF3dlq#!iy{{B2hDmRdNpN{Pja5|tj z(3W|ybE_vrp#CW%e&Ek44=Qm8V@N**5hXjKeIK#}+%e~0$O zv9#OEq#~%=A3!@i84W7wQ}4lyj{XnoOgJ`AJMitLPj}G5G6S(aL!CMYbtO4B!{{Rw z2=6Fl8D$rjcY$oO24I#POL=8v4B#kY2Za!Yh5>DMWN&Y;hb$d_z{1U)%BqPRW-lc! 
zijWntZWiLzN{b_`jdnUhaw2hciMwn9F!>^i`IS%>#Qmo`8^c))T#iXeOQ$%Ev@$0O zGKPYWQ9?@W1(6Qv#}qFiIOd_aP0jRn5zXKJ;ov->6TvR`rZg(ZPb1FXzk5G>7!SKm zxqct7ocziG)f-H!J4L*eVpQ4hS>>tcYA7s>Y_rO1(XhIlr&Cq*Aa*fFM3q@5)3cRBIQGIce_EY)DO|*N-gZT0uEciJWgirr?0gk zJ!Fy1a^B+>PqC%oI++6UQSS`kI}u=Khs5^*X}SuKV|oI5v_P%DCN#j_=_bS6-^5t~ zI^HpFd1HShGuY9XuObg?YsHDoUg46hz3Equrn?s*Bl$EiOGXk_B11n6 z4Nz>^y!m=<%TvUeKaM2n0$L&s^IgtDnKK3YWu8ZhA~ZH3ywOlqq09)tH6UWsDOI4R zp%I2+aDoC3LuNQq2!)jSdzb6uM@*?6yEtHXctE%hqzeT9+^@TA@H^LlNV|KB`Y&%C zlApH9&Q7pV3FZy1a^$Kz;G=6l68i z=kBme!PrdvlgPwCzvG~u01HXFRnY%$nD=S-v1}tDGObZY484|b7yX%h=xY;4eg2X2bpWSYN1n9DQ1ut~0_xzuk9FS`H zse%6;o6`%bZ8H;-X=qf#^2GEsC(e4+rb#0Gf-QBQ@h-~Aa%ZLh_z8dMG^PM1+zJ2) ztvbLbA@tG38 zuMztC;m(4GuswGV43vRXbRCFy=7B5~v@CsK;fSz0rp+UD-5mu%%uoBIjW)JklUgm=s`A)n7>+1rVxzosCll=f&bTBRbNnDVIl0#* z!PoahT5E1bM*XTdpN)AbzbFwHAC0^aZDJF%JQH;F?(N&cu$B^_h%I!5jgz>dFn|Ha zh>+KkO9H?z|A3j7RsStl@wcXahOx1i2Xzj@SxCfjc-{F278e#^>(AfRpAnP(=^qEg zxo;n(4+Kt&vnPp`9yDoJG@)$JHVRW^TkgQp5J>l6obt{rb!a*5?d`;0jgN#QF$?xk zoV6ECP~^9R$?C`99f00co3oqo(r9~L{=%P^umD|*cyLhvNEJh1aj_g)flb9ojjj+F zNiPHOjERDL_*iEFy2+mL^KsQsCRspS8hDH=K#VVk2Tx3>cl-+&Gxs zhdVw*!4&k{71iPGGm|SG!Itl(#84>pz(p_nw zfqBlgj!MWE_(wf56QtMHJ$tT&IzwGUr%rLYMeX$IuicI1mSW7&8Vh@w{JMNV!u%~` z^Zxr?QyugGo&Je)AlxifoWA;K)t6*XQCy%jq-;_5z#<2I49V>i5xpO9!iVlw*%j|y zEKRL$(Q!wegK|6F0uWnS;7c*kQGskkarFY^!$@K{0U?5}x#aV`tpYW(m`Q+L;PQ_H z#QOjkkKv0za660~CylOLQ9=19+B6YXXZkeiz;lUUm?FWoe+LFCHZU*@AWJW(cp^lI z1`9%Z0tP@Mv7H3h!+UuN3?rI!V#8_3vQS6&by81{6;Ye>`w7lDNM#|UMyQ0zD%6Gx$Jhk^jwMF^f2;AB&a>tn}$qr^-Fc z>||m4*)HQDVN*EI#T9-J&=iD26{rzgN5>yg;iF|C`%e6BN@pX68W=PPLHp@0gJz+J zYe5iP3#fcY!IHtSvy@Nr8UW?*?|zDva{3cV1(@@_yep`{0?}y%iM_9x#`X?ZyghgOo>3H-BI$i@?SG zpwURFgoK)j`2tEZI^-6=VJnwkR8$mNS32i4K*w9~Oc@y&jNl48{93Fx$7S4yP~jP{ z;A=NOVMatKY{Es-jsWhVc`^~6Az~Wp@8^#nj{%3JfE8!{5Dx}7MJb%i_{tSNWbH`2 z=Hm9d0e(J4>g*IbUGO2VRl}FT$!4Ewe149IU@O8ZhXZH+Oe^UU=9opWP}u9 zOh*K#cUj`V0p*p+#8p4QR@mK?0mV#_UGny*n2YSPdZ@}p9LSt|_sUsYTYqk7&^&(P zgrnBcJjVhdNF2QI-$0gDL4q^;UH5nkE9`+x@V_b(!$<^j!Bnb*J>V9&zZ^=eyb?vt>naa&wD9$4E0MP}bJgmN}QFcmjt7 z0xFb12#psXT}^%Zlni+g9dF=yeaTPt?3DMK&Ou!?T+~jI36IaTCz9DB4i11>y|k;Q zdk*0kzA#~hJ|2rwisP!HwRksByO%#rKTo{{s=F zhy|1KBTy6lC#Fa+LP>?@obFRjoX~c9X5k2?;C1nJ5l>}`FaP{{)O&!EVRWB7-q5P| zQ)`bQagOdfY|UdW%(PZDjfs156Jr+J281r%z;%D_0FB~w%PJ=u%3XDl!{*e2O#j6X zF&)m+h=VD~uGXY9{4V8oitIJ-sTX}%ZX^_2$bF1$;QL26xr7MrN(E-=XsBpJl>|7@ zE4;12w418VL;C(nckM#z%a0{%&k=5_yXz$84I#&3RL!5Y3Ohl%=?Q^|B+E;5s#G~i zw=M2b#Zr$^KbK5MA>>hMxFUXGaR+s*Dwi4c^qOyLB>eiV+6g_xc=4yvZNf~gZ0v2@ z{jq@Q3HT08c&e8Uzvd}<_gJ7wU~T)2BA>nBqAuIqC$CI!up6P6{4iG7gFg+y+^1td zi+6d1r*uLI`9$y*M#E@N9#;rWrfap<6g1-aYnbig+?6y^smZfP3pCU*ATe@o*sKT@ zg_(vRB-t}C3n;ecQb8~*Ma<8mSkmb-P4hF^+DZ2~O!l8f&|&!Ivyl+--7YuB!I}f} zE}0<$OqjS-`I?6+LSa&9ML2f%_se3rYm1n{{3$RRLD@qzz;7^rY9YYiKM8_LuKV2}>!P)SdqbT`*?=gvji`F<=O?f5W#M0R;< zE6MIaFlbEA;-Rfi)RAcpiv(91pd=IsOSFBV%`TBcJ5te+`{EBZ#gqDSw0g>z)7TSh z8TpesR|2z>Bh+oP65$dFMELya4qooA0n zs*S(PfDsNB-CD?3(USIDjg5^UAb3=)-yx~iK}ub2E-mF$(M5zlLfKe=52|!}Ra#1F zJCr=Q5I&+d!y-4{dL{iV6mFy{N$E3a`Lww&t*v#x%jJtjT5eJ5cIM>Kr2=hZ@BD^C@-3CBw1IQDag@7Jrif1rHU?vhC{Mpp^z3Nus39HjKs5o3>xUV}Z65J(9PkT+ z3}j9T{dxi%WCcsW5n@*w=`nEuovspD1a$Sn#04kOtLqr{Ag*8rOBonx&hQ0>0N3RM zH3$7#AJD&B9*Q@{gdRJOb;6nXb7-gv9|XGTSIJtSV1JCYsi~=vW&+(quN(g?rgDTK zhXRw3AzRQt-9TH}?;;`M;wgzg8Z9f#QH;NSMYfp%M#zIu1k3!`rmHZ2arDE&L>p$& z=0-n^&7mcmiirvI+A=u-ryKzu5Aq~*2)=N>_1L6<%8OW&uR zeN_W)wzVM`QlPRq%E_tV1zsH%J>I`LZ0ipDo!Y^8O#T%?2Fc7X((i#hVt^LXQ$CR9Q1)4M|>~s`L3Gs@+Skg@56W?>AQ*V$GWRfzi_@E zv&2XdOmhEs?~uBNrv1vAl;q@7h@;N{QK$~#CIy%_C;TLO)hdAJKIn`8-qC&Lm7gEn zwf_$sM5q2a*YI1N`8pIZ<<&1rU;pVtK}>e+?C5HlkzeKkE_$JDX4(&^ZJ?zM1NS~S 
zfMJn)ckkZy00m)(emv%g;|uUPlD6Hq{QO|hRe>T&>_M4OF5t6>0Hi#JVa{N^`~8%{|{$*$Vt_yLd-Z`nD-GHY#tU7tV2 zM2khD(y3E^gq%T#!v(G__u0U}05>K_wtze!_M?Au5F#;Vv*Y88uxz4H&LFoTNX}gO zFD~o%Z^HI&WgOA52Z4z^b_yAl04ZpRd%Mnv9C5*dG=bm+LV7qP);E_M55N;*8b2 zt5un4rNNgtrc^Yg!e$ayXb8YMNB3xhL*Vnlng^Vkp_Tvd?MI-y`hUBkL)j*WnbCw1 zN%WSH$P$&%v2I%tu>=)_+}exoX4}%f{2!H`Jwg`G9Kt#LtnX_ab8hfo=@eSM3dDB_x_l`jk(d zS`UDWYd>~l!U2h?3~5vXjUsdy&`@##FU}xhgdt=yXZk^&J5`fum-suMQRycr=md6oWk5uka%`&PI%*IAdV* zTh3M80AhcHCy{@`?!6%p$fJ7GNGQ0{uEWyy|t;VwRr*oY?A_;eLyE-m5G5 zGp$!Bz@}%w!iK5EDpWzrf5s+zv_1FW0U^E+^t#Yw&L+uk2si@WW&j-`ncN1ZIa%v$ zYx@gT9z4?X@~1&`I!|(IEFIF`nP< zrDdMIR+!JGpGC=>$bb|bUgbtYLY8%pSlc$y(V3vBA`%)Z-~^c3EX*CCe??9*f^Sj^ z)7LsRRj-ewwfJd^H2bDJz4(O%lU$FmCiJ_Hh*iyF1s7I#r2D#B_YYk!x-$RB1iwmQ z1azkp5n4SX5%a<~RFb*T9qk zP$w ze-Yo3%eSo*Gymi;qW{a$O8;Hz(N#5W-UJD2>+G1s_AR^U3~O&rIS6f+>m4=Zk1rHJu(guJ5H$c`UW7sg% zOmfwE;)VzdIArr)?84Y1kO10h_Wk>B;b9qJCN4UAc3dqN~yj;YuXULkL1C-O)?9AM5Ol#`vkq^A&5Vh}DMJA-Q%p#^17^uR@RW7~dj9hAQmbQhM<^iq)vaZLR z6v7>}J|UR`w+oZGRAPw(8$oW~L7H}T@y$D7YPug|CjBuM3H62{scP}x4F8yX3H*gS zB)&8?ML_|gM7sc+iiBP7dQc=u$de>w;9Msu5`Iho`fLR@0-npt?!rwk4ATrQH+gO- zF3O=8v>#(mxCww zZWtiVAAqd{5KiI4>*&4!rSGCS!1pRa5p#iMi+CO#6z0)h;);|DhToR3G|{lWfDEl) zj6LG`Z(!~CZCYp>fO9)Z%R%I!0(?sTCo)WjpOQYJXnAHAENfqV2(y9stCSXIUolDn5(dP%I$?xh z@D(>cylahxH5p8#Xu-rEb?`JE;ai@)1+gcCBlm}x4f6Dh@h~2hz_WXx8Hl+b@;$%P z);oZ`2m&&f0l9%6uX!u9iG{Hx!{jb5;yFRQNCX~Ybb+qtI%6n$a7_q`eBf9~876u? zk!pyca{@A|cDdop3j$C>UU{)%_GNXx)|)uDh*plGMkWLPY328`p}Ki245mU4!&vfR z*i(5y!em0+XRL_dm+v>jifncq{8M&9x;3(Ll09w8wj6 zto5_8RtIM8k|D5Sq?rqR98vnwQ-y+0DyYH2>=zO`gV#D)iO$n`& z|6VhH@kQ0|R@|+o?)&jf@-}XTUk!{r>sa&cttn*WbW1cpj56)8<(s)4)z9RA_ZU<( zge|+67!#liv!FpWDWKbx`Su7Rp07c7e6vd1*)#&OEnuqn!m)rr*)uXHJrUkj$2e={@J^b;Og zV(8MPSD<{tJiYDqaoX!auavhIR~+s8XKi9SG;yfMv_HLQ#h7R+3HD!sRfxp9Zj0J@ zukqa+erJIboyUKgWXC+!ezg9U-9LV+fT-t# zyGdQ5(ekl2eX)m_P0xRxk`pVmjHW0b{vFk5p&J0e)UjpXU;+eRD5tEfoYt{wVcL>u zCbc-6g_tDG7#Z0ICs~FH zmJXr@F4P>!>QS-3t^$FEWK%fUCfY}VY{_L;z@9%r)209~lRG5P&-`{IV}C_n%PxZ{ zQFAaB!g%}(gUVG~M3u(pMAm*S99$z*5Mp+g5PR$#*fB6j7#(d`72kOux3Vb-I@{a# zAY-^*o5eeufz!#K`ldq0;^0LlSNq7U%$an|kt<)w#FJ{AL@1J}$dyL^KTYX2kM(%> z*!WW>w$FRBUw`^n{_GN+RejQ0vGDedvcxpGD|qOIn#`e#`jm~+7rS4{+WKiAGvt&DATqf_mk(rWo?o& zMYU3s^Z;dK1pmE*6}e_3;9fXRBbgzTTtpQS{oPz?gUBq-fL^a}K23rHs8WSpCpXT1=H1yZRr2^WRFvO5I1*!0;&Khv< z0Y7~vhk?I!7CtScCVJdUNBChM{EhZ^Kp?A&%AK}y~1ImaqGEmnnn zVJ?(4)pty`W*6OKq18=F`UIv5-jcdtgCGToRl!S<*3^Rg2O)R6 ziA2RL!iKU1gR{mkX2Z*U3$uV3Fpy9+NRKLcvC~7DufiP=`!UO1`%u{A4!VJVo)S1G zv;Fob7?QCoj57s?6~O62x&O$K)}&74(6u3u!TOsoz}5hH;13CYs&e34PEO7gxdw>7 zy;$c+xYV|}w5{e$d0mX%%ihTqt|v$}@hoDIiG_t59rFYHy^@=oTj96T^MjcwDZDE0 zvyXU>n5h9YgWc!AzuXKkUSdXSE}9w0NDsP81FDb%1pW}C(4WTFq?l>oQU>DeBDBP} zh6Z50<8@q0)8FAD#cc#pAadsaequ;L+!6^0_vnyoEdnNS6u59%&}P8i=eQIGhXmF& zq^5)Di6bFEo+8#YG%piCB+780!@72@od;j?zq77YfR&t@Nc37zLhl8-QVp~oH~6v# zFxnO@LE;xe}5wQX=U!H6zTDmDli5eSg? 
z0sLu_oSW{EeY@)Y< z*r-ra$&jYbdxz(xZ`TlW)xWXN>(_B_#b(I#(Fz~7a;a_x4MRpjvA_zKdmg$c^4VJlIiw0jA>CKkwf ztAn}+_1+c#*A)mmjyU|L&J_nefvMv}PH%h*5C<~X7h4`vNyu`T+ns9SjzFBvKqdlU6VS( ztv7z{5%$20+4O3|w{PT{7h<#kWsdlx&Y$mph$i$KbmW2+a68Z8cK|>ocb*VK1HWz# zJt+SE+C*|c61o2q83NgAD+@~*s6%o$AOOuiz;7}q05T6NfR2;G2)|Ft5^$O@m^xB5 zAe?99)GRpSb|}XSue6V0WCR1A3?OfxA{udL03!>F#aVIOg@hXcLEX9iMRy$@7FL2^ zM3=f3cfr7h>5G2Jdm z2Z5PbUWn#s?t1S?CpX3|A`*stCjv^Cc-LFTrq6*8)~M^(Fx=wA$Z0xMX!cJMef#{!@@R(>}ss7A5<^>A&tpa5y?rZBnz5 zp;vP0k}Z@d#Hbve7+E{ds|S~mQDX2Ow{EJW+ayHJ{QX;=@fGi@!{Y=(A1!zt$U!Je zbn>GwD>grT;dyMNrQq(Jj~fxSPop=wv~D}avZ@@nk>&{1mCC3AYD2@Dw7Pp0(oQKW zcO&CMKQ^(kQD3+^*OPbCZ^D$~hUO=`)x4|Ot+yVWX!Q7ZODhHx>7B*+07#I*$WlSr zAd_vN(#frFaLrN`Bf14ziBbaq4t^OA1QL#(0=ILfLdx=)*|Gp<-8{_MzuR&JDL|63 zi=Csh8W|p@FD7#|V5cdgam^4M*gt`;bX1urW`)O9uAeMDA~4mGOwiH^)_0 zHfLvX!Nopo7fM6|gbGS>|7O7w2FUw~bO8>A;yW04+mFIuJyO0g|GK)~(bl%UNDDVYO^xThe?I|Wi2@`}x7Hj^Em9nzi_ieSVblm$RDCbX z!rVwHMiIyv1wU(s+ysZ4FUCLqbSITmG4m^bP({EQ1;PlBZS1rk!;S!< zi7K=YxuuJfCsS`*;XzF98)NlAMFmkXf#bsF$lp4@;+Fzc-(b=IE;56Cwlf7K&`&;) z$H4(?11A~azW&%Oh5v#}i$>54fto;15Q59!Kgz(j=ILoza|OELkh$(~+u3SA;QvYf zPY%|PAXOq7hLLVbXA8GWKZ?Et9c5vsd^~&h5F76HHhT2Bry!Z3X(zg!!9M21_32-_@J-_VNp1UeOG=#EKo~p=ehVj)$4G}5^P}KvJ_wa^;E!# z|N0M{$`JYC2-)0PeD@iOw5zW0S0E0koYqWu|IzwWw}ptj2#LfLi5(oZt}uA__9(id@`-2M%ry=h+jV0c^;FpnTcG@+~s? z+ZCSjNF5_1dMHoNgANpFkp*gg17&nN#4oEc0=);uhisqpfuiSM#rZUdRp(zl}MI|f$J%!i6_Temag;4m9aQ3p_P0=7f$t?jT!$p zX=8&-{p+=*^>G1JC@HaZTv+c z)-W1|8xSeuj;C0F91Jv#&tHQe^Gtkd(@1*x;Wt5P=s17hx!!$>Ey;JlRsOSP9hL=x ztGR{*TTqEx#_rLBS~mjM0UiasU!;x(8ot%4rw8p2Nq|Z2;ZM_gcU-6!a38=ySA{T% zXX`&C!-0iBvY@#QK0S`OmvJYM7CJ!uYPbWmKWL}VK?(#cgxg(z`u%RhjRNsi2UK?u zpX0+~5<Kg!;7+i?5(4vEY?H0mOg(ZJPr-HKNm)&&d->I-^#z<3{ootqO z3vBAYW<`pngxNRyRYy{qme{({B#|=rLw&Z`ub=}#A16LO_=a5z-q63&g%U5655NV` z3ak1MLgGF1!@#q>B*lQ;*t)+G60w*_ofy%vh&~=pS|KD147@N=cDh7J$il%90jKw6 zP!_}TzeGw3lml^SG3Y>=<=jW~Ulrgg-Q;zL|5E@~;}SSwDf*vXv869N{~xZ{G^yX= zKyL%qwbSjF6<(WWuf+e$6`Oj(J3=jFtH{f@5d=1chrk0?oPp@2O0NlaJ%?fC4TQHs zBas7oBj^!8#BBv{uU>zPF#+lgcckt?N|FZ^EM%ua)fV!r9?6loeof%B2Fhx;a&ldfMeLhLGzu`tTXJN1GtsV71 zi|Z35L&oSVzfpb34Y9M}*Y{ayLX%3 zAdIyp^7yZkaVtRi;E;rOliM;3Hn?_xg++rCm9J?FODX#`R36-r$}~s{Ak8f>sTTlA zf;F6n!1THeG2Sa_=uAL}Mh3D=E65xm%hzssYzrR8`*0IMj+qA%1W>jh$XGA~_96Fo z@kcNUy@hz8AF7tHDpcSgy5l;BF*7p*_vMrq=p8yb!k~$cIIqB^U=JYyTwQBZ_26GL z2#80YA=x-cEVH4;h};7R9}JFBqACgy+C0BF^MV{0aVdrQsF;0~lt?r{S|SjdKxM{aHS$7g8BJYh8+MY#HmNjJy1@+q@sAAb=hg12aZ(pB zHzP=AsKp0E##7}m#D=8Yz(H_fjs@^4*t_5ZZEr^)^d-n?pwU(dIWWZS&m?X_rt~Uz zAB+7%I@ZQlkV@QuZxl$Yc7+yxxB401cUShO$Q?c&J*KZP$k|J5?BWr9)%4N)XVNPc zQ60kS)j-@6g?~6$BEB@pCn~xb^p7|Fz!g1q3HMQ))_TU{82yH-lmji3x zH}Jn}DnZN|84&@UMq7vfe-&tJIQ@Km!1jWquCOk1L3Y+45e${tWe?A_AIuOCBII`j z;{)l@0}Rk#46mNTsf{!&k$e_PZAj*g*tPgT2|P62{|?a^L@`L{%fv)RzCkby$gc{` zPDB?2_pQc%!}R=*8cQ1p6Ioa)5nK|yG9?)Vb8grv{=YSs8!;CDliV_uR>ISn=085) zml0oJz2-t{f^c2H)PZEs;GK>G4*0|BMc~r9NYFtMvTC@@)EN1-K}h7wNOuGhcigzG ztu5{kdb@jlyf#heboFCi+GfwNLhiBM&BdAnxDS$QL8*xP*$^%wMgEVvlIBO5IL2k# zioSoo;;_C=Q~Orrbc2_=)+{92VBQrkKTjXeelD|DUYM7ZRM9=wYG9FMPB4RIOEiOR zE0PTgyYLFN!K$Y$5|(ZN;weZiphYnAbh3uU4p=}N6mo8X|5sBm$CiNyK;)phoCPcf z-08nos8)>6XAk_JiJ@km$IuchO*zje3m7$|;Jhz${G*Gt){uGAk5z~>ljrxy0mjum zhICd+lnvTp?yiMNrb;$Q3diIBC-?BL#m(F7P+$XTdS&kg939Z)!GSG_xO>TN{8)UD z_DDW9w0HQ{=xtTQ`+nZz2C8;X9I70-O>cHL#YFA&SGo!h$lMPfX@L5N$%?6fjEhr7d_MJpnU>4}uMV zb|#?$nw?ID-GzO|>74v}`7N#Y)oa%?8shVe&{E88}bjxsb93w+|@#ZJyzg^Sc?iQEV^zZgF%F@%WTd~Sd(S#dsK(> z!HNFYGCXdnz25N=(|UCMDlz)EtcL|_1+B}{eb7uof&Zm|GVh~@bQv*?i8K7IA1=^& zTw7fvVu4qe6dejHtMU|TxQ{kTerscTU#R<1_O{pgWvT#~1yq3yxnk6o&e8ojp@|ej 
z7Qa1Q?mXF)aTSnLtzya0zwJU(RL@=~?Vn$$7&(u|0{kQWVWn$_mUW0xH&m4sQNhQ& z-AN19F$Zt1fJ7tF{-4OHMSBvL&(+Taf0g-#v{=p0sqqAOuE5VqH_`Lv`hKNRTH_SB2dGQMO6=RrtKf@UZdS!w_H1zN7`D`i5L|aBW=z;A)7#bqUWX5RPaA zcNruFyRyNz-1FY?NE$1t8f?^w{_w_SdF-I-8#v|mr2jC+FT_`)&M$;Umdrw$AEURb zE=7gM9|*GG1o{b-6_B4%-Iuu!1qe_*K*S!I;}6yTMLFqW7J2BEfQ%EF`$4v*CuvT~ zkK_kPhZ9l>A84#{cO#lLq=F2^6|J04Ceqya?=o;}On_Mo38McEO#EIbViYTwcF*|@ z@4;8m&&|l2jQ=w;$Ek@Mfhmpog1ShE5aDCN#-n-IKNPH_PzL~H$aHFA5=Fk&Sc!s^=+HF>W|$-Du&&K3a9OU@EZSH3)L$YKsX|BM@Z|ayY2KcHGjt7 zpOsAf_%Zb~?@y*5sf_>OmQ=_!_}yNE4g6QWgCeE{zidWv;<>R_Rp5j-ZmnsE)7Of> zO^M$&NQ{y~D>Z`rLZoGBD~Mb=|8&W}RkG{^Gst)x7mz0rJ|(cH0b-kVNhd+N_{eA& zWc*0m=1yX{Y2!YMMUsnWZ*sdW_4BuRq(^fXIZYeAwV9fRq)HFaq+O3_M~;{slK32xHVi6mMALNDWapAM|v{ zB1$i(c(>I#Zt1Yn>M2f!*(vOroZH|$8^?XZ5IT~?B)#{XhepaV>*yI4E%6PSEN-Op zWez<)$_sO7 zSns}FpXFBHknhmei_#Kt<7NfjM<>9OLB6;K1)L@zphH|raM3IR^l%x*I|_nH?pUhn}NSsNtjxapYs zSQogpS-}{HM0}xD>p2$n9qsP~#|hyxzvd1Z4)ES@)`>p!xvy$_fLE$2-@&ppGbd<) zUBPfjx~x|)_^(#{zU(da$dgC>vU&zQBqbU3qCZ@zYwg^h%&ivO(u9AM%rZBFCZolV zl6~bcM!;eA4hv~_Equ(sWL(4%$)c}Ru!_lAY%j6P2D{e6D(LA;avI$7Q-X_We3|s{ zt6^(17POiSb(m4yCR*?h<+pJ%Y*VR@I!6gpd>#Mj)34Q)@%&zYH6WSvp>A;@!}E6U zfNd4U+?&=X<0CiflB0XDy{>L?GX=gBv2RXG!I;hB856$sE{_`dC_V-F?pUo~>E%CH zQ^mJ_ZXJl)AW@gUOQ!a+XtnIlbktWRx19=92Xo47VQ|Ln{wwc7XmAcOQsnk;a=Dok<-ViJda=&HkkMT?5 zI-K_DzO1e_IF*Z^Dm`?i2`JVk;Bo}>kWoK(TTJY=uv+pfs< zj~l<@vg^dask0{)VZ@UPT0*;6Trzws0w1Btv4P0;!Q}xE`g@?x0a{NEtUbi|^Q8Ic z1;l~DiCWvCkTL3sdS?qfD0(hx-fu!D$!)HQH^(J(lBb8}Bg?;ve3WGCf9MX#xGYm_V>vuDd2b_7wZ*M4``r5s7i$yxhnF&j{gIFQ z8HPUtt)JgrP*i|u&l>a`NCgghP6IGz3z1fX(!Izwlj;sEx)%4jMGTuYW)TvlGZF#^ zvhuzG**kMsDWCsC!>G?iU$a1rpy1g-6s`)PK%#~s5>gCC%!YwLN0al=`}s0D>>mZu zM^;3}Z@NMnuJgAr`%~cDbe?x+iIO4wlNUbpfpzOB?Lr2ZR4VoHe^%j+Y~&E)BLJd9 z0i>r;7O{c=0dZdf<^-7p1eX@P6l{n)iRpz3GOT$A|CtFjG&g)I(1csnT*fLO(ZMcPhAj6Gp7FU&F+Vk03e+Lj6DOCKB3%-+)s$S z9guUtWV;E?T9xw2B0Vmo)(ORYBMbQ}dXJv8jX#Y`FEbR)&%Y|>N17}73)Rib-%w=x zDy9g2XZ)geR!(%@vF$a7S=!{bN zFd47F=l=e1*NEP}+`PADG4ccPL*w?j);awW{+s^(_S#r@xW8xHS1BlaRVeLM zhNfE_94g!>Q)S3@;?_LVh0M5#G~SNt_P^|!d*IEw;KP?OE<^3^{0Lb&R9aA< z0&h0pi(`Wme|_v6xLmUNZI+IB3HL)IkxU@ti~89%_7CWk5pp_$yuZqPzZZjBG1@uh z(jKKGZ6Cy-_pnB7M==9!x>XgQ>S)sO)S8N9{{8oL^DUtOz6SX`I9GyUpI~T0WQcS? 
zpfjq?)F!1~M?I9d_&q&@T4p-UZRKxN9#fhW5xa)GzvS9<@l9ojUxLuG90uoQ3%UIj zB5j9`{L2`rs-Xjc?fG(Z79JN*H^VU)GtS(E+}zrEyLZWfZ`H0S8%BdGg)55oHMqLBRb>ERRA;Z zdbryN`Y>pwL#IC>`x{q2mN&+e?2cLq?7s>+8J(DX+CkEi|2~-?ugksAP(>K8b7KYO z)jTw7kPI351M-5G%y(ZP)LBs5nD$pn)6`LiU|y(YyQ69FPDOj$n5HI-bYjv2XkL(o zUzZXJ>mH|j6F|r|#TQWT*-eOK+@xbz?DJ z&7KAO_lf~lU;s)+*LsY>atR`gm|ujd#7bM+VV;@%e@VPQ%Lus;-c$+tij%L=Uf$9c z^=E5Z^6)9y=~ccbliL058YdU=@f9*l1AkWuqcF4j+iJwHPf+4>t7)J%L2965zWeMR4YcVs#Bq!Q=O;U`!I30YgyUMoGPjNyPL^}2kkvNoWL}#h>JH(MBWj&>W@8EWl&m+- zUm#^Rf6r@B2aO|S%<=CPcjsrf*p&UZldAw!iAy(oCM8J&1TOrW+ccpcLKnM)yzuGx z8sERKO*IfE?>@HgT+$%a`ed>7yzXt;xJ-1BcAd(j8xzh${?&7>7=4(Fm^H4ZLRy{k zFnUlF*Kek?3 zIgp*!m}Ew(WW8?JBh=uOP$S(PZDK){Qdxbe*Q2!$t!n=~1B>wfzqdRUaks%9<%=|_ z;eyQEVciI^fUbF4v8e{!lk>qhpU+Ioe^GM=?O7dLH8$fnX%R@@!JSodl>T6p$Ze9$ ziAfZ6(4UZNgQl3npVO;m@cl)kD)#TwAXjno4F;Zo?Yklzd0%=3B?x`lH@tadp50Te zt<@}ilc%6plH+qbFx5McIJJdD#;jlRRZ?Y0(c>^4YF$O0MAED<7efMnd~O^w>=}Ij z|NXf?i9l+lT2VPFU$0woNkfVR9E%=L$*rGsrml3yDNwD*QZ^OBL5LN zb;F*K$y%@8xdd;4&@R#o&`(fwQ)3eL~ z4wD9)t{A5(t=qRJD$TqPr+ra+nJO4!jZ~F$2k35$P5ie05La!Fd{H`9=+AkX2)Fz;Ws2JHc-6$tOs;B$L-takWJQs!iCK~E zyu=i-D{25`bT9D(A1anr(p+H^g=tR~iq(=eF3TYQ)NYH8{~B#14WkBf+lR$!rxM2X zw8$L_VX%rU^mgdr@t$K~ZUz~IwD23)+$}>tSWB@%A07ZL2L!0a@B| zlEBP_wuNQ?Fy-;v@Nv7jS3{Z;nDUWVVz}#DF(k1iNlruaWEqr+I8O<-c(=^wE{FYs z!=F%*^rwoO0JN=k#G+wV5vbjIk~CvkUtZbZP-DY@Dx7N@9Zr&lUC`N#Vd z^IOWwF1E!P{InJiZ6efzu|?-Uw+M^kb}(X>wi7PLEV-TfRo8iGfw!13bG_S-#QVWn zA`F`Cuu_G?76L%SYvtxG>7cE64NZ5D1^IMltwHf^34zl>$NhU1Kf)=42pBR4^7yX_ zRs~5Q(gWAfvsaSP{*gQ|h`V9h>~*xhOMcWlZ8b?Tr+pN|qDN=Ki7K_5pTw8CRO_i( znvyuEUBlqh6+7!u=VQ_GpYX)xPG1#{|Cw-3E~=)+HqI=rJ3jhyr$LC^i7D2 zG33mO;&CUvn%oL$Xl6%SBgS84!5fU&O~FP_p^Or#tXLL&YRmSY%89j>**9d^1auDK z;ko}VG=(DXmVi^f<1QU2kM+NxtKM$d3?2 zKQ@8+`df-HSKm54W6k9Um-d9tW^j*6+l%8k+=22KQF-pyOz0^boE+~W8?2?Q;*4mjoqwjn6oiP4}}PZn{Zha&Pq>8BY^O}Sn~WCR+h|I56jT@VZ) zQ0mylg#`?kV?JTw8d+ zC}pfypL`hjVyDE4PXdnbmG=*#1e5Jq+t@--qNOE=H>A==eazDWAKebzCt8t^g*STb z`o5ho-}+Kd=PelL1Qn=c^XEsnA*~sFAHkTdfXunr233fKUDCxT4RsOqWB~8cfbl#V zB+MYJ*&VG#d?C+I54n#Z{A(N7zP%H}Y7ynuxsq~K-(aeu!HBrH_%Us~OO?LxCAI_a zLDkMTW-DwCH?8g;MBz4O&9kxG>_CNvE~HAlcf-%az%!Z70^L>kz`OLJKSxSIdZc?1 z6+H1S_1U*x&TOxK{61PEe^g#Lsh z8699kr?`RVUNuij4gUq`|MB2bw2=qY2$PIFlb=KQr)Tp0aUbcDaSMQ1C_G;#KT3`59AJDu?zLg&t ztoAXS*R~cywUYAcWOgpDOqa%IqvcgBQqwTI>gAQjoW-BITW4&!a!H>@ZzOYgcy+fd1~_686QiBvee{iDrV_glvRKk{#|mM0QeO-g3^m*;rs@V4~U)vbmJoIir;) zu(sS7aQ#iW!=KLYjUFzMhMd~p-?lbCYvs~;+`&!K8W51__2gk4`S}rka~k$^gIqG! 
ze$!0R4Q?@rq3EnXqTFK#y4TOdf^!G0*=P>u)v*d*ft(71K4Xd=2?x|+1SM$1sKFPG z2YEoy5>*wrsSqLOYG>bDR9SNh%U&Ie{q@f1hYeQ* zu`tnrL9;;tbb|_L?7!m$n&{numn9HSDW-!O0aWUU%0C;*b${2!k%=dNY!F00=#yc} z)(sdp0f7dhs^i)D)&&mRJWvD!2n^9#!rvtU-_pX#Hva5ixsx=Y2(+iRSQ5{BDe;)70D?n ztunFSWeh79Pe=y!mj8U?B#s z##X%4Zo}{DciwijNwxi&{r0?k!~OZoC9w^ zli)2IklLodV5ZKDkyty&UrI#@ezJH>RZ@3p`Ya<}R8l=p!xL>bXCz{Cv;Hc_529rs z_FPQA{)7TywA@^ugoC6eRxeu-_Jp(Ub~xK-xJIee;2K?rk`4GaelGuMgm+@n?EhiT zC9;b&dyj!{iF#lrH7Li4*?K_(zH9EyspMR7f+U0YS22{$)m~Tbh3>xkBJ%eE61uvL zITXC5217$#ed{Psg_XoVU2^&AiLJrAs-|S`k}TTRuG2K5?PKE7e165OZJUb)*p z#5nO5tGG)6wNoGO{z|z2P(6j%1Zy;l=T9ki88*?z4b~K?gmIA~Qb!SZ_>xCIS8J>U zk~jWj|A9mB$As-3SC01%|3g&vheVZ{fgOeFT+wOlJ>q_OQopQV7 zE({){p!Zy-$6VLMakjVnbBTUP;jn*D!mj4U?Ul0ZwK?Up&?V6#`V`D#bs_A(;<9wrhQ*8n zCd^1ax~Kw0eLw}bAU+<%B?P)@u;C!1h(RiVs8tXlBfw^^RF_1uUn-9|;Gl}QgK>Z0 zj=}c+3+aGNZ?}k7ArDYvHP&7^AN#Qk!SUY!SI)KO}3y)x{h{xu7{R zXi~O)WvY{l>)OcmAv)MWV|Nt!rGjStUue3U!;T7!C z@xmc?eZ=|Cv{rDWXmEFXt4MeSS7Bkr0=w9Ac7K&gPjZSK`}LUb({82}d*5dQxv(Wo zjf_XR(k~v4Y20{i-}$Qr^C~JzJWsekv{7QOXxr5#dl` zcs;#XTG&s0t85?YC;gfdW-X+nWw<)FeX(Sh z&Dm)UNNh*Kd^h5Fw|Z_aH{;tX%lfZm$LAL4kkVc{ja+ix^_nBeph#gXXGHM+l9it1^I$Ld_4R7!dN@ANUMVr9i3q6fveFaYrVJuw>mSAFf>s#}$O zr}^As1+uQ<`}So-6PdU-rF*Bx3wqH$7W-+|Gb460g$B4aI#3&s^mVV%wF5~nEiufb zx(eFovqQ6ddSW~<;1KEl2v*=3k&=F=J%2ITxO-U8U3IbeLC|>L`b`|}8w>)8FT9od zG;VUOB(I#NwQ@LXB1ZQ=Dh6`$T6{42#4D@%Hc~OSlqEQVFhY`$<0OTg0u+C5zBczB zow@Z{C#Mp`-q zm7rI=!4;a`n^IrC1l@c!zb~PC`{*dgfj^HjD(VHFdfTjt=@)Ye;!V3E-k(Wbg}xkG z=H$Yy%ffTyj%<#?Thy*p-aW$3XdFk|Y+vRoE%}H<=8k!R#o;pDx3ViAn4I1oOL7n# zOXJR;t+t39+-N!}Gn(YRp&;D7Hu$S4{V@Zz`ojRM^Vg*LJtMV`TV2fsq8c$8;zaXV zhD#;A;>1@z%7mq0?GgliB2cc^53^^S7>tUa;wE2@ymQI-3=6Lrt5#j}4W`8P7u79A z#=bQj|Fi`h$eCQDqHdi{Y|Fe`>OO(-P|ej8t4WaJ`aKjsQO=e2CkN=Y^@?@p)ih&2 zN)Y4B$WFIS5^-a(WjkhSqsCB9ngQ8)7=a2Snj^JlL*CvC3Ztry)bfX4Hs^GWjJgp+ zAT%FfGb7#&sJZL{{zXUI9VPnlG7qmKj!6^MBX(?cv3=4t?@N)AgqCe>^XYy}Hx+Am z4u8A5^_jsMKN4br`RPV~p8Sd6$EdK%i5BAxedV6g^7XU$IIH)MWFNASV zO|4_z73qQ^X3+-^9#nmBefrwigF$&C$#qMw;90>w?$-mk1&xt-pT~M{gTG@5ar#(A-LZjPI*6Eg}8@^*5$eFGv%piRAm)SdS)}mji*Z3$0Su))o9EX;h z3X=mHPe!0A84Kot9WZ&h1ELemci%y!kAU_eQF{ahDNr%=mB(Mn`uK#rPwl-U1M}9v^V0ODp1u%D!xby1h|1RN&vT4P1s$Pk=sX?Q zS+5EUM(ZA&vRNTr%3hy>n!oDV;t8|i4Y>xBA2dH*9A^>^HD)EgPJU^0eaRf{X)KH^CyOYqd|x>8 zc}jzz>SS}chO}1s<^**NqXB03BUCu!i%WjRpK&o_XjPt3DqX_bqM`Rv__{&dQG zq2DD*zsqZ0X!9Z6fkWl4`5{VIE0(DgoXj^}XL;Bf*|z(RqDi8eSOb=G39E~jW8G+} zOVsN-^+OiJGtyU{;ulV2b3H%rr;5n9Evz}`0-}BstQv8+s64~{mvC@$IX#})e&RhI=PILaIi114iqmJp9Cu-Wv zoSLwGjqbQcHDnTu{ic(}C10htB5s~R`8t++zH`60gFV7&ndIbMnL+PG->aLO@%S((lJXvJKa$fW*~)4URYU}JL=$)FH^C;%Uxj8al(-4>#}(W|L3yKasV zhHts)zei=(cSqBt<-5}Nzj*Ri1X4fB@KOo z6||`-sQi|i+IgZW+0Cl!(MO|~zor!D=h5V2Te>e+ZrKRwyj_|Ie0S_}d8)xeQfGux ztnbV?jNI%{ZxIBeFe_OAwwJJfQ5jSqL8&=?F)Q&9)Hmk#vakv{Zk>0pLjiNO}nP_Dbv(618+R< z&uCBSay@p}Ym3Q6tf`I5k`(a6by>TdTOXExOaw&hK-7?uHvn>P3~2mg2Tecjdi+U8 zOH0nkCiwX0K67CxFko7DZp3vbZ zu}bnQl_Fab)#Sfp?~waSWBsyYJ|N=b8)VuLlRV~dp2d7GQk0UfArq$d-eT<`t;n^B zHq#$Wf>SdjW$RgOL*dC4W0)ICS-VL?wO{T8SA%pu?(Sks$MvYYo-JiMD zPD-7!vpZiUT;W3*MMrdn48M3?vV{O!3zI)z^xHjc+@GuND=kXOBl7ndbi491(}!2F zPXDpEFpRP*ipd}xXF!vVZCT$P(WC3t^-xYMxy+Dc-dYH74*Tme9$>(@ud2$*#q|or z1$|J9!W4RHjU%%TY`W$f!UbT+m zVd?&6I`ojKyGgLL79<}#N)#xC<+p43Oz1Bmc6?|7k@gMOk51n1vkQVLg(`RV+9x*Y z(}i>|H+>o}S_+6UjM(idpZg0!AK&SNzQFT%m|L&|Lpe|U)a9$|<`k9%$w-^zKPTsZ z*I~ZE=a9xzz93BAygS$%=ZgIy{wb~-}j}Q@>0pZ#6K*y z_HKKdSF+I*7E`zwS6xIPre8#h06GbzIq`U-ooK*$Az$_%S+S2sBSN^H#W#(EDdft7 zs!>j?z0K?k!W| z;S&yb{pJha{EimQpFgQ`&AX;=S8O=~p+R<*B7=jI&1oV@{^Y_!lXFI1eNaL*^>+_RV1EuPtv 
zCDnxNPfx}T%E5EuWG<#Z`pfPN??TYd`T^;9Brnn=EEmUI_qqMg@A&VML|rFgw|WaR@MS#VmTZjks>81Gb0pqXTXg<0A!zx; zjP~Mux@$=to5K2cgU^|Rk;C*p^E9+{rNP4y_YCbH9}B!URpWsQ;c`tiq0MSWq;IMZ9Z4QE#PYo;xV=uwx=Ce!7O4 zi$Mb|np#3l*3l302+fbEQ^6ylxmhTVaZu?6>g1)^z*DQC?++qG?o|~`6#VF6w&S^G zoVSu4(}YQP5t=iZv0Dx2t~Wn%(y293Oa*DfBw-IN z{G;gA8TPLXe^X-M4V5AlK2O1O!xaAX_r*l3?c?>TD_0sOR7XwzeC)eFebY&@eZ+9s z1?_AR-};di?yR>#+=fGDM{k`GbROs$y8*04au5r4BA~BS+7D#0rKOUuM}N&ruYC0( z>-cUx_HgEG`$sP)Tn;XKHd@!qe>che4hmUz;@L+YBuL0cU!Q)CMO8rh^Q?WOPLxhP zaWu4G-6u{5@8Y{QYkv;?YrAj11#hOA5l*{7RZs|hDY-nYFrd3|}Je_nvM3_jp`!jLW z)F9>h19E<}uw;Ew$?cRvabl|Mm9jw;A91rZ9uf?e1?)&D_rNwAcNPIlJdbb&Acuo zi{d2Tq|58@i&>bt$ec0>_1a0VTJ)&EI7q2Vk8aqoyy-d@X9=AY<~!o*U$gfo&U8HB;^3D# z`=e36l;ZRd#+ZfRk;o4TJqNMv^Rrd+D)40jad%_vE0G&BKfe4R@!k!y(H6af?Lls_ zt*nNyv32#DZ?9~%svu32-;{P;X7WGECDb%-Ms2o)yyAmsfw?*aha=m@qS>uR{6`vD zp`zokF&-PC{4Y7PfmX%2fUhaqj)^7PsSv*K7(VS6mUKW)>(56s@{Pa8Vv~Wzy9=W! zkikn}OnjN=V4w10=#RpfN4o#c{l@_MU6M9n7QgmXv)9+F%Rp|)&++#B8DBwO$_Qq6 z36}1rbbqnvHSrwBjLOPk0p8hqtl=LY$tO^p%1(UPPn}p3QFN3fFV`#HnMLGX!Z~hu z!47+DW~GQQ=)u@w_uOjsZRBQKvGHIY(GxDjTtYBZ5u^0~tqF3?z3W1x68sp1L<-ayQ_mPk86m(sbgSuogqm-+Tob!gOr5c_ zn^E^rD*1|~jK-6^un{vn+a3pDK|N#bQX30AUvBKDmAb+!0JgQ*&FhU~5T$w5p^`)W zXkzD#kl83GrYLPk-{9@jm~yhb`I}BDRxEWq(?#*$=P6PZuId^Rcgci)qt-OcsKxct z5!Ad*sfStrz6_Y`oU%5yMJ~2a)q)1owvJad4GV)u+R409w{$1BAN_DwnjSp4O~&bT zW?lT5_#3~3WCh80{b3oU1nToUM$^1h@*H8qA9CHpi>(TjGG`tZG+L{zCl^uaF7GHd zlBoj$idj%F9vW428?*IK+5h8K6WQ;K3ko*-5x(#pNJw5l6%>u;YYH}fcJ#N$+)NO(m>){ohs|- z-w*USf16e233htuZQOL48r9VosH%I{f`k85kEldtaG~`NU_g-xQqHrFhsmg4kTD`G z6K~*qUG-buRSJ%`*We^#9-_`DiX5(uv!>1vH#L%x&CP8)-YdfEzNeh-v-1;qA-6nK zH}AbZ7&Buif4q4UM=ptJ#82hYJmY_s$KTEy(_aGSrsB`mz1|^bb`Xt;V_9bgkBE%9YG7{)CqGm@^T6$6Z{a;FDd7g@%!)@K z$kGjGo{vCzu9pIDdh)1B=6b@8T?kt8@(ur4Le#XBrJi z)I$DIxOZO*#lXQVIKy+%XxdLHS$cGf4bPL#FGHKJM4y|ebMJTUcM zv0H)4rC|PKdzyvRER*(Sx5`J3$9htP)+^!-d6t77>7znr8GGqJce%vELPO)y(>FW< zL7;+(S@|fRoTq~x@9Tm5g4)Xd^fp<>7XO+Ly}X~?r$%Pqs0N0lW2rSGJXEeG{iUPFKW}km-=b5>y7g}R=en6FV2N(NFrV)p=?4IJ{uuyY|ewxQl zap#{Dr!q@$*CR8bDkl0Qv2k&6iLZ&pfof$7DA>Z2FGWNP-6$CDD@t28kr^_|>%E+y znYhY=SCqf|O!FeUaCY!m5_4N;cy(t-H%LWIFFPk^<70cfUeL;GtgpA${R*#W7(eCm z@v*zDK}=|C9ktn_Ort*<%F(Ck(aN4I7van$j>j_igU(Xr2^G<>8V;C=&l3LCIG3&)tJ9BXy} zEc5A4{|yCYo0_}bA#vV$WFVY5Hvi~vYOJ!$htNdTPUqns66-c ze$AY#r-*-*;fSWkx)Ej0!orI5ycaWJrEILGu;g34!kU99c|G-83F1zP3ES2qj`C3H zh$O<;#XX-jeI|*C_A0=Q=HlY~QSBqR*csFDsD?syH!QDn=cGOkpV2=KqBjH?Joe|> z^C30S*Ku)kW4Fl|rN4oH@Cb6{@y~vuz>BbfcfM!#3FeNN`Tn^H%5x=~mkeyn zmM4l4q+Pb*=!y7GGdwM9R8JsQo6^G|@eudOCA&jM}M7TCB{Tm7Hv%9~KSj`uS;v z9+H@bPfevv`YOlDYh7NODaDh`R~N3*nr;Y@6kJFf!MWe+=w0M4ub%8y(9Nlp$vLzq zl)+9({`;n`2LX;T{_|k%h{XP!n+DHJ?ZE4v(=b5+L)`cQ6DA~6d^Js-oh|-&WEtQ$ zOE6HiTy$|5@P)`B4FsCvPtkOOc#YHFfi4tqZl=9>L%d<(!QqB_)Y#qrVS4YIU5+Bj zA(=xOO#AK;*NnVOCCzp9I9n zv%xj6B_)~p7JlD1eq0|0GD|IeeKr^JP*(!=Q0d({G}rKZpA|t3DLsP(ygMJ-zxHKv zA1VA z#YINFVedY*9376*Hen*(efUddY((ez%mgQuL1FM-yiEmb`T56em(EO1Ir^YTqIc$j z$aK|jHDhL~06T|WW(ll-poWqCPc3KBPUW8yLFUH~v|?MSae!c8_XF%seDY*(RoTbq z4#4*I_6gjWAyqM=NPS3>JHE1S`n%XEJ&)-Ai-hDje?~FSV)C~qHxoW+^$N$9>k8_8 zjMs}GbW}Kcq)GmFvm6gSF6eqg54W&dh=V8W>|_L^V$#W1Irqlhxbosa!g3)sspzp= z7tJO((8J@)ln$$DW?32zp0gIM~v_z7U+ zyMQL;1GD z$$!~g8O!Tqxk6>Q-Qu2UOuHxr3WuD?xyvkz5bZRNeK~9UXZ(qfih$5ylKipurS;!8 zY~QH=^=7YCDw}7Z#|yzPo_{94PY_)#BrG5(yt5-z`vkuRCr|zNck!=Z#i!#vC@4LQ zEKK`|&W%V$GBhga)2~sxX6fpbQmY78stkXqQW4(a6x!j`-4bxbptl|PLs2kvlYH-X z!BU^60|w`5Ka$(VK%YMge%T#xRgDp;VSqCNL054bRe$+#_tQI|O>KP>>>Hj$beq8; z#!Cj1z?`c2;ua>x_X6vT=OLL_XRXX-RVCuElLu5AA;Y4=oQ=PG27LNvTF<64lC?+6+TK7hP)9iM}+} zMgF;ZByoUz{jREMaYqMUY_adWfh#ANpvP`E)j~(DyQgnJdW>u%IrsDw!R!@(YP5q% 
zJy}i9*L5_cI`{&QdRF=vNbw;^G;!;o=Fz^^45C$DnBe1S?8+IMza#iI^>|QW!#6rB zgsnt*vEq7kJ5>RtuoAoNoA`)%@jTmibetRobM`4pMkz0=g`%@qh;VV;;FrcEr^vF+ zeiNT|gMBsB_rh(Msb2rR`F*`G$Ktu$s&Dm*UK`{6N_lHim1a`)d1egnoupmi@NWV+ z@igZ_jKA`ugJVS}ya{}#$CtrOE(1a=spl~H%WC_bOgRky1V#*KHTpi>75!q`QVlcj z0JW*}IRFMG{BRb5He%Cr7g^c=)!CVcL*0e{pJXjdwj@g`DN9<0WH;d{q6vc`TZ?_k zmhAhIl9b54WEnGKDf=?kN@R`6W6hGKu|^2tcYmJm@2}r~zwdKhuIIWmW7}y466Df zLTcEryq}%j&nJ6_GMU;@P!OwiD>D0J%xJ9)M4s5=KYZwS^0h3LStYlX4>^(ynb^a| z7*cbyUL5E;>N$XQfj&a|M6*Z7Dn9L%aijl; z$}!h`f6L2t{@;9bf17mwOKBXjpJ_o)-#}=DvcH^b$8ogy)0>MJ9P_bO`;h)8T!ah* z=kt!chUH4vwFu05X|;GK&DFK6D!GuqeiV1DGRc% zjs)dE>TT9)Vj|jIuM7R(eY3LC?4V8zZz79Ym!4KYaKAkVzz2weLh-1HY0r(fJ4X=7JbZGJR`8!I2@;PV~o{AwhF_ z{~ZE*1Vun^6U*`{F^9G;Ne7}p{nt9anZ z#+g0FBLSt!CoUSZ-oolqE;U^?*czrO+rF1cojI?54+U362oG&-Htg$w?pCICSP^-{ z#Aj1BIY+4yfP{b|ajbFyMMKbnq{V0#A!@}G_#ddg#+bWl@LRIkcTJZJXt zcFlGNb}%xBC_nW|_&O9(;YBS;mu1Lrzkbd9)>1~Ju1U~LZE7mwfH7Fm?7IgjSRebcpdT3U`HS;_92>iEI4 zzJ;+OSzlk_rTS1&Y?ym5$V;3hNFT>2gO7_ zcysmB*NielkSRR!j)=Wdd`1M6t>rABS5)17Lf+O%j_GAVe_by%kn#92#m`KCsNx>i+EGQjcZO z->t7uep1pxiWJ9HI$ut-P)=a-oyrU7NJ!R9)ASJI=KjWNm`+Q~J`f$<|GE8=@^kof z>0s;YY}gik@7MOf?YIdBbeQWBdM&T8pNxR4yZ<*Ne@Xjb1Yp4fddChD+dAUv>Bu*4)Mc$Fl5+`!Er^#ga zUZQ;$3=xp8H2o^z>j?yL=h{VC!=gJ+iso&TKS9fc78H@ZGnTvqJu`~Hiu$`f(BQnZ z#)G~yj@Cc%TkU!}^H%L0jp^f=f4T417i&JeVsv}!J`;6h$Ywz$WpTvQkS%i4a{3`2 z`I*pp_>iuty{M+Ra{a_mT#`%_yWYL3R^@!Qd@_vrl08kb8R1h!>3XbLEv@a*SRB!^ zTI|8+;(bYJ@^4OIzZKfz`fYI|Gf!+9y+0itUG+r7abvPiIjmFlr@c3+8>#*me*3BO zyqfi6%Wnvt^pLADj#+sM%l3frqKt(_Dl<+vg=y?!>Tj+yH8;b@KYa7KvUkz6pYu{2 zYd?;M*N5E@aoofzV^$YWLs`?7CUI2GmXFJmPZ#YtxbUA|e6HE?5bToep((?&xP9sW zBLc_iQ%XKqicr?kmA*_EK@fp*g)|?6=+(_oHggFCwRXZlDi@SzS-)4y&%oct;lH(i z!7+L(G`=&9ITaZ8S=->Y_k$w>+ZHrjhU7!7>~;cy!rL98f6f*DwN8$+4k>vk^YSoD zp@*QBHb0x;$B*K7w)AWsaFZ4ut?zcaY0uSxKV!k%%S8N{5t&dI4&IDv#!JC;*p($;8pNSY&hg-=fApZ+i^=xj$$TEjgk^~JrORw-`hIQFgDEPq@o@5+=?eQ3egh zekU4}r7+yE0ftT@OviF7*4_dMO4fL$^Zk~P1Bt`uUSr;oaK_0Vs|Nwi_xIa4i)yHZ zGa0GyP>C;`;Yt%PVD$0F5-txAGQ)WhDWZ4i@k6Ihm@EA}%E!(d`p*~X)7F<^GqqCp zLR;N#xy7iJhTY+G3{JPQe=KiA*P|yUa1Pu|=a1W&LimK@HX`1)8YxF`kf67Nql-(x z+Hq*6YXuRSDggJ<{`u7^jF5>z_Wh)WhQ?p2QGD_)KowML0oUp_<5!f2nV76>0%F@Ad5lo= z5TS}}QBzkxxV5zv7J0lwLsm^ggC2x3XCP(vN&L`l=H5JRn`SSs1G=GUXR7YwbYk=$ z#Iay<2QdkS%HM=uvyS+w0C4S5BXe)jrlzBhZwGN? 
zce3aj-0Bv5Os`tJq`}DVKSPl^Tp3oit(nUwOnm?EDU5d*DcWy6-eST^=GmKT)W`yA z=5+GAnjQyVlI$G!+lV-V_*vV`jMp}4Sj~uEt|6h_Washly?qUBJ3Q@NW&T8g8MRy9 zgxDpk#$}y4q>$vwIoWLZlDSo-=%iCC^UKIbXRHX-gBf{5@gin@&C5nS-m}UPSO1m6 zP}1MM4D2|br$F%uqBMzjS9MYEI95c2Vr`Lceb6itH=HUUiRxA8ZyRpW>D=?$tLuQB zNZpq=o@sQ+{k_e)Y>k<}jG@EY(Z)BSBBybzr?8?$s;}?)Nt8^ISS&OO?D+y>PfV%u z%+<))K6zo8n{)zoGe$l1;m8|#;>aENzMX^wf^Nxs?KG^u9skI1XWJKrL)De07fdAb z^&3L_Ab#Qv>2_d{y|CiCsndriksQCa!6W=*%@*=v$)2x3d|ik@2*omU=N4Ff;Qn`| zP?dO4g3>L1J?63WE(um5fe=z^Ls{AI=O>g#M`21$vg@rLr70Y`a-k!@(;*`|u<0lC zvrKt4>uDf(+v517Nq+G?6-)8!0UDrT3-0t@58WzVEpTS^rj$0Vp5@l1u zvU1k4a!AxTxF7d~AG|H#<6c33MG0_w3MP)gf#oh+(0GwOpKF5J&l|C-SmGGrZ#;XG zc%v_#KFfMde4A+gYO0z1R9)GlYeBADv$pWezt) zcFGx4JCGdXE?t}Cn!jn$$x%&XF>FPJuGVL@<^&hh-6x2Ae9+_EBuLV_)PErxH!sUjxaMIg@Nc%LaW$^6r{ER8 zW-8WxVl?6#D~V(fzCceuHctu(fgMHd=;O547EN|=8Im-wLqxYP+V`KW@mhX9ICV^0 zr_Z?Vu`7g7IM7iQt@p9o{Rc()mc0yBaYdVAbYI*cm8(>)btxt`3ny}+A|TUke~_0o zS#gp*sBOVM(30`(EVGB%=#m3|5sf<*ACGq}l9zC45+{WY;GmcSfz6wFje>Kd2MyW# zAbhiKeyZsPlgN3;GZF_qZ?{C1a<;q2WU!F9NY9B@>ZPixq)pRcT22zdE?HcNoi+R* z?9{V!Hq+W^uiFf`NDCV?1*PrXk&}~5t83)$sx)aZ6CimxPc9nOa9GSFlQ1`m-s4KM_>#> z0e`F{dAE+A3ggIL?Cl_wC|=;@GYD)j&-KB2J5-L>^B5<*)!d6^`2In=#I4(s{sY1E zCAt^I#Ed=F^yM1;l02~=WxRtUpQ5qy ze34@(UsHq)EHl1S`O732%oX%zDS<%i+UZr~m!GS7o@Zx(#mX!$z_N%k%&xY6b>Qxa zyOz7VHC*l8s5*|)C8{U(5$CEGzdsibsjl3VDGIw=c3%C+BTt3%yL>P%?Z;}0Tw7v# z3LO-Pt|kDDanFcEp0midXfP*8w=rmY$5YH8)hTln{hX-aTg3m|O!K??I$`yjvq%X< z{+MdTUtdb`BoI(=8FDr&-`F%51>2^K%!d7MqY5iVqk3Oi0OG3r|)?mo)zkx=mQ z&6iuj_bPvFzVjp`<*qtba_|uozS(u3T(~fwrS+|FxMMmoRYrzz77+(CHiEB zinB7!I|JW~=P_}!U{rRpa@r2`M5RyPTX`RF;O%+dr3|Z;L`MaUkNcfGX|i+E@31cP zCCOm)7Z&&a-pP@>1MjmEn^&Om!YPhptavZLEHc_}d6Ab8N;xLOvcVrH?2wa)nI=?T zVysx$adh!x(FNFtY99*VM11bN+Z4oTpGaE%xQ#wvZ>_aC(&&OexZQQ)xME3u7v4 zz_Y_aS{NIaI`}EL`iT==^ktFxdt6Nl5w8N8)e{Ri79JFm*P8OjJwO2T%AabO8j+!+ zI&=y!ib63U&AM=La ziu}w%AJ_A+HoP5;jl{;W(=$GcwPL*iiiJckH`fz)O4^g-B#n0hbb&>|>qv~U0S#BH0;kJ%sc2Rd2$JxxzGQfAEm z4n1T@t27N<`lJJX9SZpM)Pk?xaKO_XC>_?wMS%l-=ka9mBYH7uc?ofb z!2sS!X{d6m?!Jb+Dmy(>eeO}lD&ZM)@M>)@McL_`*jAJnh?uGF2Ou|Sk z1%9&<_y7Llb}2(X05xT1Cy>{P48@<3K5JxX_zCXdh?074ZY~65ktobM06Rulz(AOX^_HkR8Cuy9fThd zsi^^T-x$!D1r^UWkWq^4`-zCT=_vU0Q+dZA z$nP@@eBdsi7k+_>C00t^sX`J-Pew*YG}BkNraa`^+S@^Tx`Wb8-jWkwg9!(!)Con! 
zI$W$&n60u~Axp1v+3#|~ygcNY0jx#;s$fF=OomM=q92Z^oW6}!HW9N83lEP=O2W;S zBf^@ePv^wt`v-tF04JAeg^e#LHvhfAn=Sal0UNcEC0a~z??hE^38W%jeemb)$kPF5 z=@|46&>#&2n)j|wv5zC?OIljmLVs~|6vO9oarK>^gd%Ll(-x)IiHXP)t$o#& zc7=kDQrq5>{~Zd_5i3w1p75f(nbP^`a<^g&bkhtr1gaqUTZr0sdOQEmSBUOzD#*z} z5Th^RKwc+o33I8`W{4y@C_vNf4^7Osper&o+OXJ|cguyk; z!XF@HT!^~Rz!upFS=bU|$-{gXg1}e`KkC+q3IYTLGFTUZmLth~=%|@T+{8SL$W72Y z1<=6#mGLIOzrR{NHv-`uM|_vbNPGDBxFv#pfCP!g?b~q!%V|kTH*9UsCg-)S%AiwQ z8lb7AsCG#nWb1*rWA*uk2=tKWn9qK9HzTGO0S5%7a}Pwd);FZ!EC|sjknl(A>2ZP# z9zq-M+^!Fsn}b{Xjhi>2Fdo@{<#+t5%{PIL3<@zikVOP#{J#q}Dr?&=GvHOcBG?4A zEGZqhKW_Jq`;T|B&vOH2szxTH?_Isg~=g@8wqj0|z@6&p?ol*xn(iDt@ zLA-De?BT}u606WXC>WRr9e`~J?2`kQp`U*at{n22vokXlpLe*j-}1@W#|M0os+@zP zPe<@t69#=LUdjur8a^KAqj&15*HqdMaISY*B9ix*oxKL3UwVH1STh z5f)3?QUoh6zu(lSzE7Pg6E;o?jo7; zN*9pq^f54quLsN$M0x)-&haN*bH_X_ZlE}0Vt&}+=$L!C z$t@k7Y2fdHp3B?+Nty#k4Lk$`(o{z-7KHm14T$e}Ru~x=tib22LB^9qP;fN9@91Pm zFH~gs41?uGw@hT>@tj%tY8Cz z(%1a1M;65cAnSYqkZK^^Pq1(YVFfNh!A@l2sZQF7=Ye9M1pETjj_=A@sVEEUs3AWp>D-1YayeHBkZr0YK}=AvwQR^JvxEoW|8|qTtApmP3T5 z02DB2N*YM;)kk}9KuYUAs3ZJ^?0sQn_rQF4Sy_KcqQ0ss4a^pC@9(fn1471A1osRK zzPqJ^pkXfww&=9FvrYnCNI08idmXE^8pwco3#jx*@>daFIeuhwU+`fdFA1XAB4^JsVH*Ab zJuw7!U~2J~t}aBl6xmsj`#<QbR>OW{6hQ>RBMMOo1?{-B(xZuGA+jfHr3oJ`JrpfM$0GO9MRx zY6q`Enl3m@98BDmQ4cO73x64el-uK(25JH18?!xlQ6qJoZEzG~%F6XVwF*-IXlPKp z$wl*sNB{eMe^(f;#yQa9LfNTs9Bc+!!(fdk)OUFYX-(6xMTWuv0IA$`b&;ueX?lA4 zBGAsV-@e@%4`nCJbNiDGK|J=h5d6T4h{e}1l?PW>SD_SQuBHplC2y{}pSh*7>#^NUEYjZO?qj&(S)k>h03Ca#y zRQ7(cs}|<^0x(w}gTd^2!VUm%@5?||r#gY_1^Q$>xD(XRUFi2bRlVF$v)oz(UO_me zQxk3!X^TK|k^>5*GNN(|(2%0iWgsvZ7#LgtLM2!qNcsGjRoTBvqvZ2@9IVT!$?K&0 zj**^TGvMdjy1QoqT2iQAw>V$!Gu}uGwh&Hr=-H)1IL~K*M+awUH+T0oXw`+i&hzKh{o6bw6wKTeh1wDR1gOP2Zg*|o@+{#K(cr^_#n}$bdXR|XNg8k&$y0jwP8?3=cC~o|FPsc}v zGgT4VCQ5zQ?NxQ89%0DuFP`Ju3G-a}Ow62bvXHuFyfqzO`K z4c^e}MQUmXqW3^9hCnT(*g1rSEo`(n$Dw_j2524MR$LuK!Y=u4I4T2hmRHIotYrf4 z9>WTj0z{vTA_@fc_=FnZZ(-f8G}6_*LUDPVgTv(iBLr8kNgQtieTYY=r5ru99jm_ z^}i%299;ig=iP#g9|k1IgYxVO{rS;RkF?+eCPM(}-7}PbBFwh+hhZ8=_UM<-pDn>b zs;jGW2nltS+4TW-K5D|@IKg)tVnML7E&=%)HlIsgOLxR2B+f}mO~dcSL_~sNvLH63 z2j^`sp|K>6?83NXkkzYANK9t*Q1LD2m-yva%?2z!1fOVpUE6_7R_53qPTz3OD{6f-?&08YpPjf;9b#Vw06@bGM z@W7doYYPn|erAp~{Gkv0(<1Dr+yyd?kN0;sw|?FVWN8F{$*1fmhd@4&^FG|3ii={R zpFaoZf0blf`COv7?)}#Hct}9Jw0MoYIq20OTLhrwNu|3$On2;e<>cffTzfTKF48r99=)QL@PFWSeQYe;W1NKHX@UC>LF-1{V$dcSA#0y$Ed?@_zu8nMZ~I diff --git a/tests/triton_tests/plot2.py b/tests/triton_tests/plot2.py deleted file mode 100644 index d433548..0000000 --- a/tests/triton_tests/plot2.py +++ /dev/null @@ -1,69 +0,0 @@ -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np -import os - -import matplotlib.gridspec as gridspec - -cmap=plt.get_cmap('cool') - -if __name__ == '__main__': - - fig = plt.figure(tight_layout=True, figsize=(6,3.5)) - gs = gridspec.GridSpec(1, 1) - - - rdf = pd.read_json('tests/triton_tests/info.jsonl', lines=True) - - ax = fig.add_subplot(gs[0, 0]) - - # now plot the % speedup for different batch sizes - for j, batch_size in enumerate([2**14, 2**15, 2**16, 2**17]): - all_xs, all_ys = [], [] - for k, marker, ls, color, name in [ - ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), - ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), - ]: - - xs, ys = [], [] - df = rdf[rdf.batch_size == batch_size] - for embed_dim in [1024, 1280, 1408, 1664, 2048, 4096]: - df_ = df[df.dim_in == embed_dim] 
-                df_ = df_[df_.dim_out == embed_dim * 4]
-                xs.append(embed_dim)
-                y_ = 0
-                for k_ in k.split('+'):
-                    y_ += df_[k_].values[0]
-                df_ = df[df.dim_in == embed_dim * 4]
-                df_ = df_[df_.dim_out == embed_dim]
-                for k_ in k.split('+'):
-                    y_ += df_[k_].values[0]
-                ys.append(y_ * 0.5)
-            all_xs.append(xs)
-            all_ys.append(ys)
-
-        color = cmap(j * 0.25)
-        real_ys = [100 * all_ys[1][i] / all_ys[0][i] for i in range(len(all_ys[0]))]
-        markers = ['^', 'v', 'P', 'o']
-        ax.plot(all_xs[0], real_ys, color=color, label=f'batch * sequence length = {batch_size}', marker=markers[j], markersize=5 if marker=='s' else 5)
-
-    ax.legend()
-    ax.set_xlabel('dim', fontsize=13)
-    ax.set_xscale('log')
-    ax.grid()
-    ax.set_ylabel(r'% time occupied by quantize ops', fontsize=12)
-
-
-    ax.tick_params(axis='x', labelsize=11)
-    ax.tick_params(axis='y', labelsize=11)
-
-    ax.set_xticks([1024, 2048, 4096])
-    ax.set_xticklabels([1024, 2048, 4096])
-    ax.set_xticks([], minor=True)
-
-    #ax.set_title(' Linear layer summary, varying dimensions', fontsize=10, loc='left', y=1.05, pad=-20)
-
-
-
-    plt.savefig('tests/triton_tests/plot2.pdf', bbox_inches='tight')
-
diff --git a/tests/triton_tests/plot3.pdf b/tests/triton_tests/plot3.pdf
deleted file mode 100644
index 19e93a24eb4a38dcc82cce0729c3e8995a096054..0000000000000000000000000000000000000000
GIT binary patch
(binary data omitted)

diff --git a/tests/triton_tests/plot3.png b/tests/triton_tests/plot3.png
deleted file mode 100644
index e83178d7a65f7f2c78c9b9ad369b13a6c1a3a917..0000000000000000000000000000000000000000
GIT binary patch
(binary data omitted)
z9^e#v{`KJt^^U**tC0+?!`xZ>S0MtO4y#7%b$G}6oBUE(3^zZA(|OrU+Oxpb{Q$1Q z>AZavX@%cG-E5kGBjrN9`VCGr><0B{rsH{1HW~6}W@cRybOC<|Qw?3Q!6NwV zzem|lmBgo}>Qoy~@fio4X<^_|${x(u+0K1tq6@m=&f`e{e~`kXqdf-x94xnqJ`_8a z?0PdfHANxeFH}=gGhJzv*&cBH*~|H-;PR}oUvyg9qxOq6-1&OPSJ@&SzotqbD#X&V zpLr1_A-1`z(S&D&cf!J)Sou`hwc`oSMvjn_r#|JJt?dT2#J*={*4GcX^#kH>3K9&S zh7t5C(hqXEZoEw4Fw*DhuoxgZY+R4Qp%Z-a1TC^x+R4c&{rvKGKiyoj$8)fae;Y?G zey{u7-=6#2p3PqFl;+w@6rj%YbC` z-k;@F?FBkj#)#;_CT32$|L~{J*p$Q+^CYsB8MQ-4l{bu_n{GMa+z5o2%-;7;O(6F4 zK3gbC34URad4D`E^KhclNHm64e6Z^+=FWVbz0}tJWbrEym^M=5mS5lFJ)o1wx<2a2 zT?xF7PBjTkM{HC&uXS^l8+gSBv|?!o3k_3MCNB^Hqna%uCoPR~)NwC?)Z%rt^=qO~ z?0Ub}X+hkq2P^RAw}Qmog?an+p{To2sYyrRkE~{23u|j$>rv`u|5KIr%PlH=TCr~+ zdJHR9)YJ{P!;ar&lPq%DUQ#kIg{4#t+$JF5*h?3yrc3W8Ju6~2#FC4|w5PPiB`0&J zb-VG#md9p*XOTP?TnxNEP|gyHe{pw#8my8jP~$Mib9dfx?-MY*xM;XFn*GYkiWz(m z$Bzrf9y3~L%FM2AZZ&fbi$2@g;PWmX)LW0`Y!;^)uhq>O7do%BBeCg!39qxC{p|hQ ztS_3XAD6RL(%qeJD24qwEN}&`Rc_X=-{i{l^~1w$aD6Pl#W5J)S8Ud!9YZZlP{~l$ zw2a-tj?3Ci^(_jfwp7yu(oOlsKf|e*Sf|!{BN3Mb84+#i>FNI9>boB;XY1G5K}uC= zq@|^8reTBvnbFnNT)VSX4<(hAA4F5~vkHCxVmsx%5wCCoJ{F#2tiyl({SR;= zcXoEZILsw*oA)&g=;0m@a_Jce|l_}upIuJZKHparj@?n49!EUw^PxAWj0q%DNSy=*7(q8ZZ z2mN!5T09Afh}IH{p;gA>AoCfyI3mNqgHi-Id2iPHEJf7DjkaZ7#vxq=lPF3t=PXeH z{vL;XbTqHfk_(U8BMW8fnpA1JDf#FtUVrT#NPGA(0zQ7cFOBu{BU(XL6t!@|J&1p# z*5htmV>;`_#y5!KEaI^{Uj6zjPe3rslPBW^a=1t!^9_LwPUEq}iG6V=1k&uLlguam zI;Eu&pVO`Ba-CREp~gFc?uneY0{SqDknELy$YTzgJT?x8Z%=PRdGDTMH8P_bg^+1| zZ_eX$UKS%FCQcXi;yBxzwFap# z!U0qX8ww9+@Yp5TwPbupkgS3NQAM08Cnt`Kk~d2pvpxY?jg5_=nU2osBUUFTC*MX! z*gfAaxB9qA%gD@qu_XqeP9m>a2|%KU=yKKz4bEeGRUI)SR;S+UH00#{h$?-4xI`i5 z&2`j%T^Zy&*-*$M9mruRN^?P4jq>`=K88S|;n9vOrK!Z_265{Ccf{T$vZ(QZtu9ec z4;`>D1Lyw%jYtv&uN6{UQc};~gN1a70FiQ?DvopERCa^D#U{6%rIv=F6Dp_15U*vQ z4=qRSnd9T?C#R7?wO@4EdQ(hX2blJk+ghY&lfmOi4cqQE3-#bqOP@89H3xqf&$DeDw?@r;h}J{a(|b)o#Ph+KqAkZHQ9o~!nsx-UQi2# zKY?cpX}`t5y8zhdrIM1_kLuTK2K7;K1a$X$Kh#aybGeph<(8m&kh#Z{yH-jvq$_;L zxyY!&$&xfH0t#5U)rAo~PUwMrCC>32rO16oucE6!tEelQAbgUJ4}4n&Hw+r{<5nk>qXw9ww%xeG5Uwyx2@Nu>s4F;y&)_ zk^TP3Kp~aG2n!eYv4B8IKdoOBGAhRS7h6rI_Dd^eW@3<{%==?##D4b>TiQ)sZs*39 z*jDJ*MFienkxrMne*bLw3Im_|8>r(xmz${{0mzG2pkWJ z=7X|~vAjkkiLw~yr!ktH;KNgyopv4g zjAxhs=FJzstDVq_GytBox?HWBRo9lhh-v8_HiC-w^=nzc-Q~0Z%H8OGQxkw&5CRYf z+InhPk_~g;g#CZqz6`M9ef{PQ2Qpl}|N82To8qBD*!kiN=9Htr$!J!*WD<;QL$w0S zORz^S08v@@e@s~Fxw`UHwO_$!w-4rOS@KX!qp1W-02=9)mvv1^NC;hW+BUhrA}>|T z-I{#Gxf#jXPA?!prXErQeklSZa0bXvu+I`;abu{YOu++EU z++k!!UujsM|2Z#^dkYf<7g>R3xf~*>VGE<0vM_z&^7xp^g}s_GZcG!B>~2a|DOH*q zT;WPB)2xmb1(rfYN2dlFJ1PMOq&yIb91C9@7g7Lf!KW5dcQCRA?{ipxy#zX+o+-5# zBj9*9?UZC*tiF8+;*at1H&jbc;u{Epk7^cSU7NMk0b%)vZudPD9 z_5$uL(Gj9BqF@~PWHN88*w|7}Gqa``dmKEbRz>3O?B;aQUze8~Ibu5!FFp1jHIHa> zja79tA6D4v?tVYhOq({W^wJuBR?e73KSYH4XnNJFDyWo4Bfs+QM`USZsp0peBB zF36TzPn|bd3`#bA+Nmz)W#n3q{_8{YoWooAv(13+bOOs#R7~|QW`)%qmc>{@5`1?g z4s?wVGgd*7p*+_q0TQ6CC4QVBOAKI?zd{q#NbvZ8{Gh=@oCq=%-vTL%Fu``OB30Lxt+3His2v6`Nu zjU4*^o~0pe*+ZcqF)*QADk}c~eUm3Rp5OkP5|a<9t-aECefs%C(!t&1V4rKf_~c|c zL&GdX19;iVB9%pHalDA7Qq88L)$mg<`ay%{r5N_dgl(|Yci;wJB9#C zTL?4{m-UfWdk-3eQVD9%Dy928PEpI8z=OuL?!2jDNrP!kPwUBdjHV#u!$H%4C@TPM zvii4{=J-Z{CXCde9Yj=U{QjUhDDtFEi3A~xe7s3l3wu*1w<)r7kt;=ZF|asIPfe$6 z!-@%Q*TU&?{=QuiI!7i!W^((aNjxYd+ZU-)qffiF`6A18e~n8+Vsv1+ESJxJgfElOCnk?%-HRy0Ef_6PMGo735HWgSRBF8MfZ&g{A9H?-<^z;9kj#-mp@sg)E zdWxbhFOLy${kPb-&A0Xb?m8_hK9G8;m4}zth^Jt3N{jEa(^4~StT2cTa3Br1j=qKO zf&y$I`*Af9Z(bhn;5puu2=RRVBM`0^FP%gXZjEyyI3b@n>l=4L2c zkxeDGx&?(bvI?4|mcJR__*jo6`0YOuR+XEg>0&kT6}bctv;}os6W(=qFJi#DDFu2o z8l2wAUzG3rD*FaX)JHvV*xA`ADK!`Mh^WQHC$UGB6BYOe=;OEl#;D8SV|XG$`3g$H 
z0D?WYaZrh{W;Ddwhz66be;jW-Ct!Yt7Ni#{Ks)`B|A72s(O&>8QUEQ<==`FIz-typt zhHGpH(+MQ^lL_Ti0}NARbiVMjV#8Yf(B~3cn^c>$4DA(8X|4R$T+)-KwIE)rVR}x^ zsvjqoLn+*#Ev60l1F3BLtDnMYGMXp#7Svi)9m6;8Oy-Fw$v=?ML_2P)+GPg?)?J-H z{1{HZ5sC{7=tA%PzPY^-=XcY%pbTG3WT`Ruly!ZoHhOz$T@z3jN1(;lX5-ZXqPAS; z;U^-@>ThEMT~=tR)Kbjb!HMkRGzL~yzv&ZGKL_5WSLBL6(5ti!n=aFQR1^`z{Dc+0 z+BRelg?5srr-mNP7)YWQ4Dt*MfKD-LMWR(scoRhpg%_fyeg<}0=BXwbG#z8&BkFgE zZfEO@ZESc`7P z#vjSxF9da@Lbs;d(xMRn_o+gKhtjC~C9lF+3kx~sV>+jFB(35diPRrP_!iJo^Yi`v zom5>}i1CcDcpjIHC;i7_8F8jEv;6vP-rMd54NXm-pYO=iK$qaP63{TzT&7vd$bxiG6}IF@5$#p|P|w_#~zHZ%-98p}d9VmbO_d!m|KvN32BQKrqF zthh;1aYE9_GXvjjs7xrZ!h%UcZNEAw$jfU>m4_*GtY)BlJU(W$91Uf4B%mRW?_^!| zTos3UbtHC*8`m2_O@bg|zuc!sJguN!BHMYIesQ#PU7O2(N0u{it>J9ZR8Ud`oRKu@H5l7y{tP0mk{YrUL4C6EjNOhQnnC9RXntZ=CrXBZ1M}C?*_?w_`F*L$-{)5{!mS){ucxrq%|5@FV7;ZUtU`{kV}^u4ni|NZUJ*WzCIdnQ52SG;y)k@h;0>urzYHYc^Jsg2Mg z$sD};eVV4Da14=2!(ID*(i}g444x4lj&*W+y0h5C3l8gQhJ9sOw>Cz)Arxae5fr<& z;6Zg(^Ho32-IBqjRkHz@t9tLD%GY4a&Y+NYoo$`irmk~i&*ACjBuB>EhZ*K5cOmxU zJ7#&>n|zvRmHhVKkq|T)se#vOCotVhmV!Fr za1XMJ_uZXSe;M~=Aa6eip{S***NYAi!f>z6a5VE06CFZ)ssRbY-qh4YE#xA(yP%CSV!>Du7|^{t`vI5J zU9WV{w+kyMW*+*z<9=(g>#c5Vv)`azbKEZ*wwfCBkYcl=z3;t-l)exL;%y z%*V#8j}q0Cy8co!=AlBU1f48E^9~}k!Bh{>(2oF_3V7O$Z}E>onfj0;?ssxFYm$0V z+_l8DkcabPK7uUpw6n%Yuc0bF@O>PF!KMTLE%p{e^m1&ooDH^Sv@Wi1!Bh6FAu~xu zW!9(H$+~38C=JJ(OxZ&vEtl`BdEC91#NQ@Q(kOzN**D{Xlha?cWUlpOu4^w=K2+|E zWBgGSxngSeb*z#J%QV;ic@JOqH%2?Nj?678E>5;>+1Rj1NG#8b9XIA;=b5l|V^Pjj zei=jsDLYY@x~qeAPI-~sFYZgeUfr?FyFry^WmiYd2uIO_J*j=Gns|D{pwu2^3;U6C zqhBuX=_vrt;^qKLuDht@JM-~KKp3ZP#z^r1-c!q+T(Z3uD)0%=FniMs;E+<^M<80* zm8GZ%gAA)!2_@OnEhZ;(1rMe_@3%;VTy+v3&=N$OM7)_PYLdS)ipr^;UI0k@qVcK2r7?En^YZE@fym7}SKm$D?uu zy~xF%#*^&gUA@UETj-TluYZdVZ~anBz%s3uSRLrYqJMZERwM}|MGl~GbKALEHJE?a z=QjA1=IY8t?qhOnS7#>SHCnF9B?>zIR@l=M`x)2t6T6viTG~SaZpKGif4x$IAISPp zS}kOQ{Bd=DGzJ($oi*+N?qSby{F+X?o~|BSc6P zdiXQQx20sF@{Re`*`Cfx$d#1wtk`hj@Du+;N&ipYC#jn)V^M@!EK4dSQ^@D{e|=vq z@lTb#RrR(IijKO66FdLAO>5IH^my+Q?*zIz?+Un%|1kBo2==dm-4UK#Y9kSWD;Kw{ zCYaxa*BIHFK~T|H9#yGC4tE4csBO2*>ByY9k|ZAiqR#Yiu?Y~63B$^y8e-<|=^}1h z53EPCo_+Hl93C!$6n6VPQYe7}?stjS0K{k;03dUxBZ8Zi{nMY+=%n{%%0CNeds49) zfN92ci5lm5;A;wsFi_l0YwPJjYFR*Xf5$nnwIz_Z-oKx$(Bp0BPdQ@K8k>a0 ztL<@X$y1dV;3a?3c3=YxR6A1sobt_h{*Tj>YKlVgEy~A_=6QHJoj4y&R2PkTxE;*X z;tq?VXd|GnQ&Hz4G0V51Z=DG_s>|_@!+Ls>cAUSF=ROBji`6mB(F;NEfT=;_{^udq z{C83tEYR6FljzG#NbDx1H{hFBkyky;+woj*;affvQtqRe_<0!Lt_ieKTYDMw0Rn=}urY3kqa_O9?FqOR zVAFNGIzv!#>k(wWxbGi&KE5xXGua0j_&6%3IX8a({ynX#T#<#*1W6J2{-mAbN9Dr7 z4<24E`a!}X9%Eb-?NZg(4f+FPqs%WvGqYqqh}Em%Vn4%DB4t;1elu9j^A1fOi}}0< z+cSnP)~R*^>mp#OS&99S_E87A48UPQfRrE35zmx*ixCf)vG~Ud2!t7Qdqxgc2J23l zrPl^LfhGbTghx>N?BGu0O~0PLJ(lTopANSA`b;8RwcyRwPN!d|ym=;(=&ljEgQO+l zzU-1s6|%n1%3&UqOP=rDL{}+9>KmeEKAA?BhzBj;LgjXYg2S7Wpnl&j#YlYK#ngW-ljP`- zEYkRpLwGs&1Lbf7Pn0B6O`BOm?d2~>rOjfR=!g|I=ubfPL9judMqotPxRLM*$@}Ja zf73zzg3WTPFqoDg2CSemACCyGK+Wvm!G!%O=%~D^%W~Yemkm;^+vi*Jb+90cS_+=S+iB2f3vKp~(wpDRlZ)HmENpsk^?#a08OD zOmDc0{+831)MudjcBr2(+=hrkJcz_fYD};kP<#t{+?r8x^HCTJ+*SGQBbxfZN~k~( ztW6nlaZwdE15&c>g*)~h`KrW>lmdEQ1O_Y%RV(IP4|XS0Y^f8nn=wOh4pxq&{TLeb z7}6AjwD**Hsqzodse2^B2cIF>yzc!(ASiB2mxpnswW)<_zPBdrn0WrAIA3czI*RG z3}a1;jgc?!n+*-dkxjNtI~aR$zj%*_Eud|R5f?fb3XjE+)S}1TeLk&HvK~~$4b#xv zEPK3+O-kiQ;*H^r?s0#MQQ5^t{t-q3Z-+UVD|RaMl|LPgx$>+=Pm8^#Pm+1E8yVv^ zt|+fb)Gexd>*)Tex+S{AKIzAbyo{D$jGX1OlqXbUV7qpNc5Cm$xsyw1Er&iQT)Q1v z=F6jcScuKR$5Xo~dr&{gtw( zg_vWPM;ad<>5W(ADO3gLduoz{DA8Hm4jSvPB+d8h?;x(fkI*cR4H?3Y&<(^(b^(Wp z#jo=C2~PqzbVB$3u?usbv|V2fRStd&zPSmt+&|B{a;h{kKrdos6l(Eu{@A-T!I#=> za92UW6aq8d{&K>FxCIo8EiPhO+y_-|Bj@X|H 
z?e5ehLe{fCdhm{0X?w@H^t|PFmP2`A-=jC<>i2(gnuq!2l3Q?!INX5n3a+a1fFPs0 z8_W4e-0^!nai_wCgAGf|e(s^j62RE-AY30k4kw?S1`3bZ*?L=KDSabdoI8E+<4e^2y*7SDTOqggK2yS4B0;Kj2FOFOEvr1@zolCx7!XB2U2v#2T5N*5i-KY)Ig2kJ=SW!M*90(1zf z1N`=31O>2{=B$nz8qZnbKcA^@Vp-fqH#^brmN*)J8Hm|k*2Wx@D)Nu@yZ-Qgw?*RN znR=(%Y_b2e+aoWES8}C)TxnbIS@k}}f7lUUy?41U=g$wkaoQ9sMab-wtShx5^+AP? zh-E2KrNNf{$g0kVYxne++W6s4)?4@lAC-sm$|KUF9g}?d6eAfZ-bpd+(3dJC5+XN3 z^Ck&}($1#+3N~Hl0~0;bfuu`@aF8c2ZDbv6RF=PbG)_3%Ifb6Yd2NrzYHrH?&OFY@g4Xi_ij3iOGeR8w|#=hf>I@+ zD%R*F`F4pHcU0drrSx&mnTWk(04BWy)V;S;_lcBq7_IMoGH# znvL~XfQbLk@-ygcn||`dJkKP=L_h1T>Wjiqh+JI{U^cnicc#WsNE?Zb7ydUtrg`R( z@DBEU7zR_YmC983C+;rEcX#CUP+pD>sd%px`!W8oemkI&xlfr!WRz;D>H9swyseIU z#O05JJGnAzGaK2;nQ5wp0Z=R!?2@t{=D1x{zk2P8OhqK^q4-4QmLLl(V5G zEX1*r0#qTAn#Q9kUy8FA0^+~6v8emFC7E(&?Uyr@73 zl$Y+FdWZZ7ljdxkTUZahiUzr4YEvwbFQ=tf7&Bv6(?c-eaooR4h9?&{`g!|{)e#fl zE|X!)&d6#*JDN%%O&1d1Lg(MZ(KnwCaX<1#t}@q`u$Dh-Zoxn6ye5b4ln3R1t~1LL zY^T*VZ*`I-Exn+g=@`#-fZ^F&o4w|Y%CJL8Zu%3tJJyHD&XZSAlCM<~pR_O3H7)$dabbub^;9;WhuHHjR+Y5l1YzJ7{qx>cm#lwVDj z^cdHU!7f~w^FY>8%8`G|nG1hz{8~mkLfw>^3u%V<$ZPuL4|Dadvd{!|a@5RVV_37Y zQ7~uJ=4P_uZw}9Xka1ptuOSKA%+<)4zo~7m%2E0~dzjx5+|`jhnnC$e8o?qPEosED zgq{+Ckv*EUp7lmDkx{et_Z70;bYlR!sYd+EFBCH}V(N5-iX2MvNP$2wL6KpU81iRfTYAC@)o)+pmNrfurOkFP`p5I0^qh&`8w_ZbmPEJvE zTE;|Yu>$Um8AFS%^9tMeo&I;_D2>u@klytt;}qC9+TRpC%vS zi@?m743621Ek9tRVd&!@<3-8J#f zTBa{5?JhV!+t^meG?=~UMn)@7-wPg{ywjYW@Sstd8s|r>2@>j- zL~Ah!Ulmq{A+s+_2W;qU)j6W^*kvJ#O4wVp0(d61DJvIduBFt0Oz#(gN2hJUIuMiZ z_6g0MJkwRXw5tlx$w{`=j^G_fXcxZMEu2dr5Q#L{PVV*hq~kI+G0Y5nb>Ag2Vi(#R zR23EV`{6qtl(ToIq%GMa3bC$o>wSy10z>4ggkKzB95r&NuQ0QhQ=z|B^d7{m^6|6t zbP#xBj4hdS$|vuZ~G^By-JZo4N`!_zw?tA@^a+h*9trkG@Y^ z=8`?~@RtR>Hjn2(HTTu*ul282;qb)RU3>!#X3$kS^i9gHQbJt?lfP>ns+#6}@R$_; zCkHWdy`+bWd_$&{k^8t0%aF$n30bD&!Sj&0OXgmo(G9&~q^hg@W^FAGwHd#J-aDzCgnupRl)SO@8TDW4>C{mfZ(5&2u43yK zvOj$`nodJr(%k^P`j|WY+l05uAJjuOC({rUjZo#YvR35O%UDTdvUstO5LLG2R%=q@Geq-P@LZuN4^w?npj4q1wtONlc21Dl^hp$0HhX-XZ zQihU$Ooufug#6vihll08KfUW9tgb1?dC62FrC)%fe1LmmlO4kf*&>J-G-Rc;ac$MM z`umn(MoS8#;6H<$jF4Y3q@+R|&oax{>b^_jebLv5dq#V9-}29yA!rBTDd`IK!;V&B zDD(|BmEgZ2;xWkvn z%SA)Sqt4E2Ymx3ik^|-|!m_GU|`XeZifmM@|iTo^9($aJpM;R#xQvxMWI@%%? 
zZ(l0<;*ye!<`Lc>Oxx!ne(EMr z>ELHDW%9`L4-Y82B=)_k?{B(;{C%=v&xn3!5n5s#7Ft=G;{0QB z8+12k=>tSwfDy_^$eJa#SaSWP4Gj&uZV+z{)O_iJ&a|2(%3UCuJ$9i8u^AX8AR+W@ zK&h$>{0lSXI)2yd&`P&X3+SvUO)~l!4p-;3Opa{88=wBP4yRdW;s3i<_TE#`9$;1f8h0%aegD z+BvXjuon9auO}Y;%o!>PN6}Ny&BIB!f9IQUhXvniigMq`Hl_RL(*C5nYgU~m;h+d_ zj;Y)Orjbh9gb<+B2VrZ+SA_xrO3JNnTuuuhdzY1$Z?4GXvn2wfZ_AU70g21dnm6n6 zYwPPk1&4S7(ihuxku3wDpc@z&*#t~y?{{I{PudmQ_Fewk+GI!qnRuBa+9mZu_0RYI za(wRk(}+D)%gkpa<6Mw@FDHaO=+W8tM2u@lqp93V=h*%${vd@uq~Y@&K`(%Gy0_gB33zwn zv$BSJJ`z7)Yy={Z&mh)-;2l@e*H;7y$i6uidi&t+1)8O*K4RZB)x>*&uq1-rq}B?r z&OX%{X%C3pQ|zysTXrXZH^oc~YhVVyou#$t|3>qpIwZ9XJIHQ4m|VntwWv3=WbeVv zt(TLE%4XWq;Q_-t0tuowPr#!YlRLUB_-#JnZ^iLl>j&mE&WA=vE5p8P#SAv9LcQ37 z(>+DY8=Q)Pd>WCSI0hL+EDon} zlZudaf=@;v>gn*$CSzk`|2bUBguy;Xb6V8S>9%;X1HEUr*)y5TBL^-O{@41eN$6)9 z64BQJCEU?%SHhozv8{L`quF3&&7*5K{*}tgVLh@{G0(z|X%0N_PJ>4BLY!tNlquPD z>Yv;v`~UgL6rHG@R;EV`?eq9Iw_!nUy-CA;uXai)z>pjv0!78dch_rh-BdCTytD+O zkVB6`9v+@wAehf~XV!1eSNbi`jHuJqfV!X0ZW?Cl_wOu5oR*i3ZP@YlX=JZPeV&HT z`CK?09T;Hfkmq?R8Mv1pLK{bgHl)n&0#_AP3O!%G!+b#^ebH^a%F$VA#n1Om=Sb%8 zL5Ne5?oDZHu3X6Q%ItsH)MU>#w9wX@wsQ)BDsItxJ(|yUGK`FUTCjX2=sx?gLd-Z| z=gHV~J44ZNasE#5MaN2jR@*#R)GG#|^-E7506Gi>AhhlhQMfIgo`xdy=+$EU{0KF0 z2EYB*NbI`7TxVftEi6BF4f1&H?DQ_qIvBe%vhN1HQ)AjopFx&%2(unVG~^xZ{%C1J zeWvVf5mD=lD&l{)Fno8YTyh#H7r}c288=E?N=gYn8Id&bCjr**=d4MP$DG2{7me7= z)YN_;ZDCDPo`~oEOI_VZ2qk^Pk|!1LQLsFF_7MnXYJo!mVRq^GQl|VlM6nYEr#6oaIpDeH z%k3U`3CtoQht$4|prB$IxjvsK+Y<#d>$^uD7Xc~2_`5Nw*f+{jvtr~xyPjKu!5C!3 z(puxTttxT*M^nV>Flx!E--+|?A#gAJze-G~Hts*w3&M*IeUtzHC2^Pw9`v8L9*#xZ z(#U6n9RX31U(}@q)J$tm369G!_yqH1G1qv~2PHxGG>mB+6{6Yk1 zM@tLutjUrOf!KlbTLBHX85;2DnAbR#sAdm>tey@`EJ&bGIjpp2?@X1}`qo9svHp{v z!>Yis6Y)Ma-Jz0_N{#x=x<~xypV9vFORA;+HQINff~zM!#@z)fP$=DzT0E*U`c@th zYX(x4Mt%<~Ls;!)=PVdINEq^_GKT2EhM9sHI^RH72e@ZAH!s*BWsw7U_9D%dg|?I# zm1UbuB&=1oHy+u?@69`L)&3r>i~)T^8ozzY&M2mAvhk zry_N~!W<9pe(==94*74oXMN?eQb*t|m%(tLkfLe1>g>lqf83FPYtJyHzYnf}3}#fW zEx9vxy>@kE!`7CT$fmV(z{q1M(P6%Dvi~>?O9Ej_wT=XRgd}3wJ>i73vnl|;jAn}_ zrl;>XW}*xMU;smcnjxOqE*27&R#uZRNMYz^ zjB;7C$6oW2^IFfxC>qgvQwiW%2n$047G4!7bnZhc*qsWamK2~WeO;rgv?7H?nACb( z;bLKrz<43uoP}F+2cCCVdz?Yk%_mH2myw^yzt$Z6%h+Hd<38ILej*#4{E#}&`@JTs zsT`@2Ta-L`B#!iqyhr}?jLi+hmF70?k9JIPmBcYuYG4QsWTefeJ|LceXBOBihYtc@ zzkcl$nP$L7R)wOpe8>`p#jyztz+|QTYCz z0sfa_u&a!f$fZI>y88U&FK0i3@t}5;@NGna-@m(V!;!a|#Nkf_!o^p+HBv_!Zbw&OrVqN*s^{J~E%p^VdCfO{j zf6OKDYfXVk9subG(G!4{K|w86X*iIqW32bgkpy_l{6N$28mJXp|4hDm!o<|% z-huz{;dY9F(@)UZBp@93P~fycM|^wou^BlKG^I zXdXGX<|@9X-(@ss&G2NGe>|cRmUt6WOYdjR0x1fZ08~T zD(1l1x4B#;@=E5+^lC6oO`0kFi8jP9>}ekuLvTD);*HAmF_TFRQGy?q%DghPMzc?LiKMNyc zck00d8eV`ESkR|Ra|0uK&0bFyDFW0T z1-c*ClMO}SK-}xej zc7c6{X)kq~uZq!*#uM!Il$v2OY%th0kbNC1p3!>%0h?v>5CPn@3omcn%CHAmUV4g; z*8f-vaTJZbmgI6J0=z+#mp7+_1-iFJBrgI4`sZLF6(I&|Z0zR#zP`4WChW!>aP{XW zun(W=Jx1*^UWFunY|0k-4IAj8UFBbuUd5e;{W(<^&)&FhhVhMtlm;QIK!Sn`gDHY% zY!^syP}QiS&cxN9`QtwhUg{Jwt^VD-pvy5>@oT$x8SbP`iq*FVu2y&_5hdjy@ac^L z7qsp73JL%PyX@*F+XMW7FNPmsyBpp*dAQx;du9jnw>+TcuyJtB%(}mRwFOZ1RVv5XOe{0FBSVHx@1UUb@-Pr&?7s;B857uzy;_~1d3q5LheeC- z4$YC@}jnFLZ(CENM5<* zZjhR8Laq7FgQO0KAUH1nUnb+}m-X!B&5#*z&jf@c8u3>`bPF&LzkpKKye}#r)Rz4p zHE;Xrg0euXT@7v^fh6Fx_zkGqAMx9PTTqN`g8*d|7Eb^6?Hf^5EVw!kK>nHeWRN;0 zaUBKB`~3%3fEyqtCf1DiM2~8NaVC2rsi3MnUa`HEfyok61gc-WYgt=cLEfg=cG zcjM`(sIcn>7Px`NYj8UT_rt~Xcevyz2p+**N%a^3uLHK>1(B^pGcaKJ;7q~-+y{c0kjP-A}j7G8%Ml@ z1mX1QXgP30i5&a3w^!79%EwPK(WekWeABms{lV&n{EM1U>leTTq_F9AgIgCUs;Mc= zdZeCZl+*yRE>3PK*i)p9tu3=UeZ7uAakUo*zSdko&JJ$nK{Giy`BGWA;#l|{K|%8g z>w6wxe6G-b4tNbKkVPvoA7K9k_PW;yw!zj^>y->sabqFX-n-)rd}>Mb1WJmb_AZYw z5ibfgd`O^kzRNoGF__{XNB(YJ(mhnH4lqXhccCRkECpSE!@lBzF2o#oeZxL{m;$+b 
z_xq>4srBvcZS&f>3!r{W1Gdono9!2h%E}uJD}i6FY9~sf?SID9J1(HaWn{2|8=c6i zpNgbggDqbKaw-HEd5ifF#pUjDdkXjxQ@g*+7%?)L+!H_l#8e>fGCOlfU>DZ%u}@$r zm_w3ia)tvb3(Yg{PLJA2n&}fMDawYAkSbT8G1|~S+sR0!`;dd5Aq#zh7@rfN)Z#E| zJ_gcS0veiw({;dFT~bT&Tj5%&Y!B}6v zQ@`5kVh|ma+S=NB!hN`cNOL@Ux!JrO zRgei;Bf`1bV~RqXudD)v{^wZDL*(EBWp6g1DHd~S#%dcWqh*WN2A}8MTv}6m^86~e zzBCu8=CDDs!lIrt6oK2@3q}ZYK&Vp!ED@CUv-t^<0Bi~`ji8zHZT}?duvM=gLjwb_-~<{rT8Q7eM9ZI$#0)#qXkW03cxwz z?Bvv2Y|M*z7eFfa4z#c}01u)Np8z<#?6+5^RCj(noK{|j5@82e^9cX{W*v9kxU~(@ zztu+K>n<9LjJQkp%7>TS)q_&RI2C5jy;PcOFr!jX^?OZx)yST&1FcFfcN%S+`coVq z`p~DSzdW2}`%%k-%h$2IaZ6sGyldH6^n5Ko>;0zqv5wyJ&w?nyIepxYaUtKQxbi2f z9k=w+YSsYaGi26c!A*JdoLW1uE+9Kmf?WPnrjif4lK0b|YdT8TCK}XO6QU6q1{Mbo z*nmK?4IUrX%_kxKMhU5_(t}V}T>Sa4H-JLRxepZnQ1C~Z0;M|mSB=Bkm0~@fz$oCZ zt^u}P5q22*xbHX!Q57bFW&g^%Z-5jc=wNT%7(^s$k~Ke=?vA#;fA7`Ga-rG~r2Jp^ zM4SiFX|7f4y@RFDSemZ2eF)0kYn)f=mnG~+j}ds=Ma-!7`OQ-Dsl8;&{J57}>qo{wA@e zLe1lL{L(os&x^m{CQXeAS-TW=3d8MRfP+r4S$T8#x;UGV{+TnAWp&Hn;Lyv1G#UJR zz!70uW(Pm|74l=Ke-M)wKzKp!5&-$M)tiezC52NqReKte`dM>bLM(46a?I zw3$Q3?R$0c^1!-;%D%tG#|7k4&7jf!UbTLQuab==%6mxA_gVj+9FvIS;m1vkPYA94 zP&mm)o;pIZ%H7NM!S`ip3`TL;i`0vmC5|bz1do7oj9%Gf7VImdxsk^2h&Xfzo~)8q zXFwOU19k-=yG?%(vyGj#X{ZJ4szxDfIQ^dMQw1*rsK?N~AU0Iro+K7F&h7$F6W1@V zL_sf&0Uwcso-dHH52Rj*zyZ3p4mo9@RU_c|kpdwNpiftbq31mZMk+{v_nGNJS`>&0 zidEorD$mx#JdoAOUspGdn3%P9!q{Ct0l`TjIrn!E1H!cF9r!g)`FGDnfteMs(& zD`z8nzh{$TUDh+*Pt|RjMXlPnvVOX`+(oPBi@v>nlArwqME7Q;i8gTQVcB?2DtQyL z>Awp$h0a~eS;I(kAd#e+Twc2^3+E8ItXt2Zo|=(i0N+34>mss7k>D)qR~L4$p0l@} zbNG9{2I9EzDuzMcjUXayn1)z4%=ZajGcS<$>^$s=Qni*=g&$GN-Y+P7M8Z8 zv}FLY!8QQtv0{k4XKVO(gkVxHnq4dUwCnZ8GQwC_G8*=(Mg`JxT5L+!s zkVa~Jjzfn-P*|8O96VuCp)hy=P%M(FdwWkO9tXb!EqE*mEX3maG1@FYiTYiHU-ihn zqUB64m7eUIcRF5;e&s1Yg#J!u)PFVj^k0p{9ojOEpH1o_AH2Qmh3Az$f0WqXRA0@x z$KY!u`1OOAx08}zuOAbaoMy?D#y@}G4$0@giGr@)KzuN10NcRhqh?dh%{azYozy4F zei-mtnSzr}2+~tPksVI5m6uQV^!6#HIJ(TjPlYR!pPVIp;ST;&(3ekd z8r?Wq1+H*pXs2jR0tyR2DgL@{~-pzzv33q+#`KXW|0@8aiHP2 zv(wb78Is$0;-$>yGWU%*ha*V+_sjsik!_?N`K~|Qv z$?QDf+k9eTCApo}pawf*WaPa0bA*OdE)FVpu)%8qM0^LLon@uN>9et{@~Paz@l>iI zZuS1SB>nsJ3Nuyc!u_+!nk)>~hW5vVpX5?%OPr=XI(Ibo7U#wyyS=*)FJ`c6;?u#D zCQmf4ICaET!lWk4ZUy7OqjKcT!pA%Q1g zGQyjC3+&QbW90n0%5597p$NGPA`)Sv^!Ly+y%QHVi7zR0m}-f(3f-B8;F@2{MG|b~ zG~x^-^JrxWsl}~v)+SRH&r5!5XvO6NG!Bbj|CdDXJJ*BrW_+IB zV<~{;1ELO%CK2I>ci>+h*MiZA;S-rcrb_^kv@}CM2#hKaDWS3Y&M~lR5q|@f-DdFS zhzpBHCT>c9f&3)RHYZE)F0=g4^mns3fn3xf5zl-1^=FYCmjpjP{;A2@O00Vk$7K}z zq^VY5RFMtcQz%{in@))Qv5bk(OU=|?JaOi)iqhoW5R}!$jvfJ$4{*9*lZLTuLxN3> zNzT?L&BdMJeYjV)8XvWtxehNn#p{@JteQ=I*x|o#!V|W4L(a$%9yS6#KD(Apk`J`B zHR!$Wf-MmOZK#&rjf=x!-`%6u@Ctl?;r*FOo5{?68EQQ1sj#NNic;l<>lPQ?j-|G) zG}*_y2xg&QL=0uV;ijp-`^T2F$&2c?feMz=Zds?m;|ljheKnKDY|GrTUkiKcZLYTM zMnBz~#;Z?oVnmMXTtqMx#?|C}$-Qzkxd~f4KQ1zsb5)H+QZ+m2+P#m-wV!TOvkx!Z zPXtW5jrl(zABxq{t{hzw>V}>V1&s}9KBwH(CRh^lH})a^LZp5+l*FWEN15%pY>4AC zbIj$%Y!i+&7PmZR<}%QA=E-^*U?<9*dgn*xD|s>z(IH31=w_jWex++z)vHc^Up74m zB7LW;?i6hwc73eAVw!s87-nuJ*{0i(k{=)m9nB8%!F)H35W}hd$5&x>zhtKKKCjQv z5g#;{a_3qLAq&Son1lO6;_|DSEI&VKh@G!Fty|XS2$nuqzavgxyar$^xXK!a!S8#L z!V9k{ZQVq6E#FtQQOYs8*3h^vf4j*E@Y$BN*y&FXbwA$S-p$u-ku|rtO(4b^%h{&5$2VA-W>QCE$Ai63h(@u|2bDcx?#bM9lAsB_5aS31t2Lf_LFi?})q$)ZK*Yan2v zKO5TN`}fRoANnM4peGfHeA6lSOO^j-q+4zz+!$XWFQ0u+&46pB*Bd@hkoAsJX5JGG_x!XHS5^K={5 zpEB|OGHTyw#vSK$vum&uq4*Lxxvdl(_V-3vel~r)ZFd=cA|Y>+fB2@X>TB(d-9sD; z{-hZ$sQUNo*zZ@XuPD=>rX9R*;)K`x#n=B@eg6&>FVp`PM5W#@)-(m=-8T3a>5UsV zQpW@!?SzMoavQ+60!I8qHSmShsG!@6BbvN+B_}3`%>(Nm>_$xIc4+MpSNB>DQl}Ok z8h_tctL}+^@Y|oi!?+CM3%PCLnUVrGNhKGlLgKsVA4EAV^yN*3Ik!iu({ga+Ir6d7 z%2(4Ftz>k+ynR3X3H$Kb+>hXg!Ny9X3)lV^)#3aWr3Li39l^~YoJxEAUx=JX)qr0X 
z7|oeo0n=7(lL*a}cYs8nF)$$JJ8|MZcv_nMzL#I*{oA=Dww7!ExLbgd#wKxpgeO*X zqZ3PddIinOxmb=mkVz+Hh$oz3Y8m>W41Tv_opG;}7dZEiuPV4DGR4M*8=F|qWReqF z6$cghUeQ<3DRXry2sO_ZOl@iPidYRe8aBOFomS1+^A+et;u%tm%++>7qvQ^WbUI*yl?*O^t8fn1iLVS-3PQW!_s^03ci93sr@Vs> zw}SZN^>^?i44%)oOf@Rx%vR@Vt@K`AqmuJU9}=O_jzI4EG2C^Oh^r2GPDE&*%a1g4 zNu1`yB#fM~`j&R%Xsqx__D#yYKyJA>lY*gWI@Ftp4La$hNXgRN1l~{{#_iqBB?GV? zB~rY#DGqx`0g7ZvXDN99sUIPB0rcNgfL7 z_1$(R`Cb5*OKK_MU>if~2Pj=7>sPhG{Z7-#>1k)}*@wb?P)|WEC2lJnnaZ8&qd{sO zJmq7?7(4vn=Gr;+RhCtrRaHC=N47^jOhZHNp+i~#UsexRtQq}HK4KO*4)4Ui<~(BZI^B(JT!c+IltLWc;CtySgZb)5tu}Rp zj)SL#4CBw%%5fo~!r{l%;W(TK^k1GR-f)QKI*!jPz=`xKJBCWH$}oH%?ca;39og5I ztf+KHdA&&yXZr$ayB}$vZ%DjRqm&P5zhP!!8hYQ+$&`I) zgf4h$M%5mdSZLUU-B{8k^qBF+$=Xcl2uZT2k>hBlj}eB2Je-M2O-77O6r+~1aMTU0 z??T}QqbRPz)uV5!tUS)@Zxj&zcAUQe0x zEr5UA6@bH#tweKlE~vBeKlnip$>QC<$>Ayb&rUknarCxT6>lH$qN(6Q|CT*BlGEZr z1WgDN?(!pm2_cP;U~rxSHku4Lb7cp7qXZ{GgNch2^!j3hArvewO373LN*7+$aBSyB zIcn(JUpUDXttE2AE&S5#3{Z)-5w ze>{^m{5c>lC4M1jtM=Eaqa^9<k99Qilb>Bs1-VmiX{cNqjs%&Qi4fS3kWtP+Ta1o$l@R<}qe1=^|EfB|D%AL2M3 zO)42&&`UM--BgYCqb*1p^0In)|LDqk=H@M$G`r^sg+C&clv7IQ)ny$-_iix{XPs|p z)1bX1gQ0lc?D~B@d_t5)N4{+A4bnJoIfMXqeWC$K=T`xvKml!Y!H%C_?)sM$ux$?A z-?;D?zK62GfQa%!<@KY{=0=Biu0{8suU^DUQ_EL;#X4quX*z;aWW!xL0`KX}^@6P~ z-UCWB67F)zjbtfifiuo1f8tK;ckvaS(|0@ao|aMRMVL$qLoV(hI0uQ0jbGh4B#(w8 z`6V9(sWeb%mK*+f3_id3E{;(UUv?uuxL)GJ567A&(zKVfX)#Gpu`1cJ!bvFa1B)XfPP ztcQ@=^ck|+Z^|#CbM_MB1FtL0sI}V1vn-Z%`K^ls*U&-Z)i zxo2sFsjt+p(*bWRr1=FLoxi$dd;WU3W1ax!1T72i)k&?|n@g7}+<_zm#wBcEXAQ=u z)l)HC!2`OC6l)R8cKBZn91D~)(_gtIlOFKido+DuPQh4>AN3PZQ_D(AOLORSbaYhm zTSyrVftHO$$!mnKL5<4z6!Ynd*m@23=pz^2schH#ln<|H`>f+mf!AR9aAl7#Tj3h5 z*y_`d@B4+1?4;|eS+JU-JH_AveZXivP|A9q)XMha^@i9W#Aov$1&pI_;%~7V{1GM!VFZu{v0P@VfHEJ zkYl|HEtMBzzS+V<*e@yWY=}wh7T57(qZzvMpC)gdZbOn`#owTD3US{GW^fl8hW|YqU zTzJ`ZPC z_U-{i`bGR@acI-DLjnPZT@TdMw$g`CU3)E$L62s*nUu{+6FS#vvM%RdA4KwJMiG@p z@P@NRgkggQ%;?q0WAgR&Qsm-IlIHPzI`{A%LwAOUEy1QNLBiPB*#8;VwEO?f{1My$ zyJvX9pGz)CiL)B)a7poz8`3|KUJ+!WwQgch2J@%`0W;c&KJNM*VnA=>E?;0Iz za=T0D*F@2tD4WZ%gKaDL*U4XX+X-KE8pD2z)#q;cgw^i#26y?V_sNe735f$Eww88f zW+mrM+v?`amiwg~;z&$DtW)4{fwG={|IZc}k0?RFP(%Ea!7=R@;Jtkn_NUZ$=L8Y| zjR7N8!0v(Jg|Ktvu9{e~Lvm@3m@NW0OuT@WcK!U+n|BMIIC3HPuk}x$Gu>Nc#08HW z>s|+Dr?OBhflc@sa9(->oB(1nn$;OS#_S&`7m=eaEH+m^Wd83{Ol_KfGKGLnT@ z5>3kPU^9(v>!zG0wR4Le#nPQM6kx#f;2QSS29YW`fow^Y*06eM(-QBL%e8>ciP67x zF6JI9(dsRHOvj&#jEOm7z`8QbR(+zfTzwlXV4-5*;tAchM*!v?0U`$^sVxu4_TwOw z+vfZ=IENt%U=-HnH)MUGLLKaF0n;+ByY=c{q|i&Rt~P5N{GzWKMVx8j8_U830`)&X zWy|W<^r5_X56}JW4mL`qG0#ndR%`r2Bxh}RC!g6#FD`(}$L-`+S@1>k9(G4DBJ$m7 zyi?cfq8W(Z*$z>LA$)y+H-$JjI7mAIWR3Xf^dRpou{roEP~xU$ZO8}nbl*vHbY|Le z8!M&f#o%v|`n^>l3QptQ$0=>XN}k37e}4%scZ_n99WD`mQq#1)-qH6nT3&ThrL9Eu z22(2VgtK8sCjhH;71FTJ(E0}GGV7nHsaDQ&dgJHFL&#YW_ib>+*7$j|&Y*7uvA2ie z8?f}Cujw+`6oEo4vVqK1IeZex^3IUV0Rf#j9L~`_IzK4-8JB2WGP>Qjf zv@0l<+KZ|bW8s`yA|#gKLF`M_J^o{lsrcPkfu*}Q4fcdHK+FCf!{uRzP;LNU1skIq z;N9B~CK{jV`7lFtW){Z&b`6!!pPY!`Yo%E+SUi|2Voe zIZDuDBa?9VR}XYpu7)s~*FVd5-Nwa!HA{a?kt-NK)y=ZR`JQQJffXh;pD*~mU>%|o9r#hzwb@<>)= zN|g!j8unLMJ%YJMvdXr=x#Xo@25R?ni!tmE8MHhA)Eo<5|5OiEJ*s|V@G>&-0i4M1 zZEpvFp)kAwIFAvHUBHe`b>UVQy~Dp%Xz@pdW2@|6NcDa5NCx$IFe15<#T)w!?$K1gKW_ZI+RJoU!Rc4MAH!sM-##qMM$JC1 z^*>S5^iKFl)q?VcIs7GedPi&8-BL;{2K=(z2rdbEFUpVPlYPcs)S@>Te(SD|i8DR> zM6EIZ0SFNa3wPxuUNjdrxiGV8PDC!~Z{<~f?Z3^`E2ZK9yuOFg{VBqDot*8fM%V5xUAPKX1pCOc)Zd$TdAODw5W=L|VamVfp_{A6@uh4dBcn;dlu1CFgVKJ{S89W^qphGrEANSR$n zd~NSaFy^N#Cu49SWU2PQm7`zh?#;M(4%H?toFUdT zQ0ybceY=w^UQC@ixc@FKiotmh-~mL#5zS@;lp-2P^qvBH_tmwb?xjrt&P#u=La~Fh zh!D}6F@=sz5+@6*l(&;UP`8G@Hyk;B&IvX<>H0t8fuN@F1RwMc-_VRtZD1rf50tLg 
zJX9AhKg-)MwQn31J@cFQYUCDaV3ix^p?rqYQ&*iuWdm{6-OYK^-DGWo2^j7v%E(7E zoa01MW5?UuzctLlbdAK0#gj@8iA;C=lt5KaVWMQ(UxYKhDCGQ zNqtJQC8a5u)6&bQdRSM$bsU5?h7ZBO02^(``$D0+T)*-Z)a1BfSY^`bxTj=Q%Jp1Cq20^wVKSE0~Y`ZYnH@GQ{*j!K@aK@^HyrF<`x>+;LC33Z2&079Wt=F0)g!re17VL!RE>Rm z(RBjk06~Z?I+R);y1RvigrcEH1ToPk1djvCP{^+Wg1GX+jb(4g2B`&6AL5ClU!(9G zDok@L8K8i36S1{Eu0#8ZCG}EWXIkTtv;+*whSElhG*42q_0EGmzkFz~9Vk~Lc$Llu zc8cT?GM)xv5aLROs6P>+<3ytd7z9gfRf2J6pWpZr0q!z#39zh@I)5q955n*oS-Yn2 z_yczNhh)BF;<)0j5}X&q!YRT-$MLlC+a!5W3_|Nb6Cg<6G9i8IPn2ZB(qh}TqJ>i} z-jYInH;MkZ&l-g^<)Ro!J2Dylc7MAH2qqlI_vC2gAk!6~gm17Lf%qOU^TZ`!F3&sZ zgkUgGcny6cPlRdUhp%0`dn@G*cN&qZN0&djN69b<>Esiwj_cIC)^=XbCbmI;+q+fk zOLYt1=qP?IZuZi*zn|5z1s^s5GdozNY^;i|%Nw2Qmn-KIs{c*iOFMUmGmMka=tX+l zq&NZ$FSN(WY0Cay_a0pL*-u<;beNLFEtUYb9J0Jk^bzcdvO8^i$2C>V=r^XD${SRm zrw89hBv!^n=PgjX5k#NOHuyf|OuaxwLjfP~9iS2&@S3$@mq0B)4X)HbfV=nmazQkg zm`TWtOoRx5`we}^l_g>_(1Do{wqjK&ep{^ z_Mt4MN)2u;QiePq(xuZdD+k=wreLZ|E-)jIbQ2^FFs$L$=Fh1PvGS+zm&ha=xS5@o zA+I5e7-7$4u9bIP*t&0SX?ugljQG`Gxm{=a{guYwR4I3&%W8 zTk_q__a$r?QSf8j00E&ns(24+>Vx|yVrKY2$e`D6NM#O6%zjXh0DHS;`&+`r*KVm{ zoKEAk^F7~oj`Y$M&y*~I!T^<`#2^og0GXo!w61n&q#}d60%4OY25(EFAbmxMnvl8u zf>O30=4dp^E6)v^5aF_$9^&rD1kai0s;nyD*;+>f$|_;CWq4gO3^Zx+0$Mb#)fMp7RW z6ekhIFfBB6hwn9UHMjW@Dyu*Ke?Gp%Ek34;LT}||554DHtSdkpyW*AuKAvIFrpPBT3!Ayt)J39~sxr$AIu`0M7nn9urmN z!A<7FGG<}7B{b#mlZ#@SI<1hP{LYuAkSDf`Ko0q@4;OXreI%bjWx)mu0^3LJ!<=2t z?|#x1omX|A_wPciX@UY1`O$_SN29#wpwR{C6CzCq&;yaClhO11{3}gI${3510skbW z*Wuo&`}=>iK76Ivbf0(){F9DnT{)K$SS9kI@A)hX3@42D+$Iu2m%-mtiq|0IW* zEirGh+*2?bXElo)xRG1xNhc9S;q9G8A#gn02q*W&#Ux~6Z&l8bcA=)L9Je^gK=m>s z{m+3f6iTW8LzQ03`L=A%Px_?Av`4nGU&r@K5L6npnftpT0TKnj32!!xO3$G(0^w_4sy;&3q#YS~1hxRyY3a zobOO)!7i`rLZcdZuq^78BHHAe*nn6y)bQ<|ct08uH_3itpm3M|@ zUus@^d0cM}b@AqUnB5=$+ehMTkL&3x-K2Atm|aIIesmWs41imAzH(c*)4( zO~X#)uItzAGv@k7gd&x4j7qUX;DEZiOCc7zHjFeD`IyLZvM*enF@Fx^+Oc<<60qDx z;!R<&v}+qQAiSHM`Cl_D4Dkzh1lL7@aIOorg8Ryc6#V6(_2XauPqn?s-9PikH}hR2 zHGTSxjN7Ald^E;`k~XUwt*h*N_E}83x2`Hp6kM#z9oam8{I`go_T2{z8?TigvBvOx zX_mi+Id-3^z)m|Z-1$Ex`J3vNrBN0aIeEE4r~< z7{GMld}b3j{`Wu#)#3*+(b0^otQPJytq!6)1`fBMdF>ii(a0+r(O1UNgA&oBPZm#=it*LU5*BEt9!DtEZi|96 zt%T5%CNOG|0%k?}k_(J0{cXW21d%|t-DT#Se3`r#CY7bBxaK6JAUGcq-7&a-M839_ z#18Lie4jp;(#oYvfy9PHQgdS$QUlGD|6c_>h$16MA>Fq(56iQ)DNd`?;#<@SaRD0a>;A+(M7(t38#Jp7_}_#99OG-Q?d4U-4&CrRO~>pS zPE63GG7hv>74I;a#|3j?l&Xs>_iP)~k{GnN$Dq-Gm>eU%yKqthlv`tG4dfP(OMQpI zm$Pq1Jqm!23BtijsG;7tkF)@j{d;4KQlwEj+0})C$JR4)wAz>PqPsU=cV!a~<|IuL zuCQdjDAZc^bnCEk`+51Yv;Kg$laU{pL4CU^CBNHAaMg;Q#cJU!&G-|w8Z=JJnlUW! z5d0}MRQ(a|%8|EP_6!S>c64sOs=On2FggcMPVgozWPyBppcR}AP>`R;#+RyiO`>3? z?EmHY`arC_{>6)oMcX1ToL^!3(X|7~t~BK;Qtt-xKdy>fuhWlecZ<>J57eStR3bjc z8qYvNXMo0d8h~ z|G4Cdc_n{U_--+iwwW>PB^nL16j3?qN-pcdxv|)T*xEc>+DuRJhky6d`6Z&Ff--~G zCPUAr8`Pqc@x-q`r8P2|$JG~l8bb&WU~s)+E{J)~Eqz|%Wllt9DMrN4BLNea?a%zY z5qBrrO~q6$%61t(^H`Z%C>?)Pk6GR78jPfaVZ5zHAXa#a&#=hk9w?mBCwTew>>M!H z-Wbkq@@)FvpaCFKT>2Su89-B8fkUN0BCG1ol{m{5651{jq;i03+V`RmwIKD^Ymj*PF81ggm z*L4ZN@LZNLbnENG;Kyun9<NA?A>uY_=!E^{-y`HMYxOsnen39RVDxVP9xIMPhVj z8eNoJy`Fay2)hc$xdC;w6{lBa4iL1she5v5@Gu5E>>zCrK zf-9?jj_@}LsgX4q!WCnU<%1dczmO)9TX3wen{0F&;T9iS0k1pW$MM^upP{GY{=-f_|6ZEq|nmYI#TixR7@kAY@$&I2~_8@_RWmw@~&IIENYP?DNQMVx5H6UPC9JNYU_e0 z|FjjzJIN#vhdcyGy+HUw0`EW_$wr|xOZxDTMGDvLBv5$+A~5=#V! 
zA-#F|zEa$m=)==2*Vme@o4n0Fuedsf@?i{y$bU(|f6jMGC(HXZz#LL?=}HyeE~S8C zw?+bEnObLH^rW2Q&tS~zxw3-a7rTAb4(@Q+8T*HM+)~cWB%+#fdfQZ z_WBn-Z4H5lx&kwyzLR%vg&=hj#c%nW$V%KD=7zeQ!f9_=?JM;CFJsinl7H%)YA zuPAceVzb)5aJ&XOVaGP zSz<}u3S>y|CUZo)*qzH|rFA7bRXm&wuZwnNzi?@;p>^}7a79n)`ZwHbyLTY&woU8* z)5}d==WdL2;GTd^pjn+J0UA8*enTubbr#3#6p*3KPh7I6Kr&tkxunO|t%YaBpl`fX zIsK2$?8~&FD%nwBK0ek8{;_%v-zIwI!&p1fe)VKSBYw;=wluP* zxjf&p$#u;3D(bM&A2f8GXEsUGxfW~9_K+Z^Gu>y%Flb?vXwv!9g@5+AxywRVtk?3W z%ux~g?ohH~jhGK%yX9{I`;ix5FlU9n9L(Wix$FUU42KV@nb}Eei~8SxCAnS@Q^X~B zER+6=3g?X2r?g+!lkoK6Q@CsvHWUDmv#GzROBn>DQ|4vcTrW>uhBDR}#@E6y&1k3` zKuEIJeqCW1nB~Y|DTI}FE=TD^_grgt8l=$l-nWw=>+qGLjcVUV#!ip-Q-`xaGcUu<@cEZ*QjDB+Q+XP2AzZIaF)}-{OQE_m}$No)4&% zbb;8f2-Ju6B?S6`G3RMu820PeuPzum&7&4n0iDMDf|8F{6{QOF?SJG;vP%&w1cp@~ zHKSQP+jPIgUk%R?I7%gF3aAQ2fZs~&PpOW|+htTwFNZ~R6}HJhkmt~PW{}8dt|$Fx z@^V($Et_K3>>)}1)C5q7t`tgS^eRkct{I;Jhnf1_g_3%x>j6n%3P;8j zK)+@Ua9-Kay>0z#p3!mu$&!JdfXaqR)`VWC8_f+1r@Rz_&tQUm-Z|U2IbxSa9a^sG zvlN77(j?o+%pCKfhRmGiUD+Sp7JfKy|9A7w=j!An9w_r2{De@YZ$ za{K68n!VAgST=}>4l=1W>bJnbWO(VO2hbWLok1d62o?}9Axqhd7^}QWe(!1*J$jIM ze02|1$lQo#<5=M$GA?k-n22E(fL0Zw6K7W}@sj??nONQRX77e{0Z$RDRR1mteU9d5 z@I^LNesD+0l23K%MUSR0mQ0!r3q@;T3}nqa@wg+hP1(y0N}IllO2nL9q7ywB9ib99 zSdnq6@axUmxwN=huArnADHMw@9nQJFk$8i!6FhPB5^Oh0a6xc|>C-TKU9uCXIGrFA zE5Vr;Zga8I1}DVxD7DScEssxaqVB@M>c@OVef)BSFcKIBE?HiSd&~QI_Irax3grTf z(4D9`@)@;YRV*C4XISCk(~-GL`Mi$rP%+cGpeXX4p0;Idi${q4W8Cv+zg^T`k-y#} z1jq4|ev#4%N5X)<0@KEsVQ%g-Wy4c`c9N@16&yHE@xN%hSu*(6K%pBiaW^SRqOvUI zUzRm0x@xaR+=t!Y$mC2^Ucb22r%sTbxTdR;mlwP6!?R1TF)!*8qmUwzs;f+8uakgr zzx2HF<)1gp-uww=2Hdk1W*X+t0drC6y-`Hhkecu9gyX~pY(GAhC5 zZCueJIB`8m`#4Dc##{NfT*|tQ?2wbg1}WRB$Ks$<#z0(o*1>K9W8=PV%Y6T|8uV?M zQa3MR&HgfL1augrk-{T;=uf_M^)psNt)UTJE$#DXx^^AdTMOj9J*iXSfnJ-k+kg0R z{q)-1#0K?=lgH8VB>5ck)YRX^-i{+oELRv(K8$&eSvJD)(#* zFiR9H-_hrfz`giiE=e}LZh~$)OOBdH-6r2ItGhmhbKD&|DGmf2R$c81e7|krw1T^U z|Fa{h>FJjoq{YO=K@3~r&?&)cP}6Yik!vG;_-#$u-mksV{wExZf`|CCS{}su6V*jL zX=;?C^@$s!qrh6HY{|A|^a%3npzwcCPUWJ0FVXhgT0IfIJOXVdv@VzolIkT)PHsUr()9Osa%!n8s5dwV-!6oHM3RpgNyTaZc#JX#kHcS2hKG1HZ0{Gj#@;G%?!wcb zqg8kRcFJH%9!EHW^+M1%0j^yjn}+y>$j5c`^n5^q9jN+d;s=p2QHF`Jlp$`iziW+q zhFO|8fM{d&`(*;|c^VCnde*OVHO(*`<-@q!IXc zt+y?9!ftyY)j^>6Mdz4%!X9Yz?iH>Q$+)kICy0N}^MoeM$ZoXBI?aLUkgC&MMMcNcvltnU@;^S)Cm8k zuP-z-{SBL8f}!u1F-qVW{w66xO>iw2aT=8{?^f|}|-_MZvgm6KNR<$dm5 z=Il)6KmMtf>vK-4PpP;{W;%Ue!N7_E$uUZuoe>MO3-x!J-nP9UGarQHfkIMHH_;AU zyb3+oV0JsG;L~J_nR2ovNOdF5%Jv~?)5D_LNuu|;`Dgb#{XZNMeQXMKJP@#>>m4Qk z#DXjpR9pu7N%^h0N37HvdzKcO)X}{<@`3rYT3;u!>xgE!(WCZFWIJKc?(nMd_8gu4 z!m|2P*=)L#iHoY8$9E}P1UX&y=B^XiB9BgRLJ!&OoN5vjRi{*}MC z=h@Plak$@QCC>W@5dHf!!+!agK^;Z^sLBAenJlj>H_$%5J+ zsr(FnDDyw_Cmj#YPpVBScUbpZA$_KS)8^zE7=2l-GsN=Iw%XRAcVL}qC|j;|N=@_3kW zUAyV?tAOy1Uth;^cgYXX_ek=CMaVat?G#L?%?}*Uq`o-=??hAoy^n6nmFNC~Yyp&M z$($_oxonst5Od)2x+VYG+SqqAx0uk3Zzt=G&8yT$iH1bMfsww+LXj$0<Zz5?TuVYWEPO%6y(s4wa zBOjxc-yE#iTb7dKdsWfaStgor!L+n<-2G9xOkq4JE>UzvdT!)iovD$_cxH#$P)K-( zH<4xc<6Scz*lLkKamAl+wC%lX$_mdR-vg`^oAsK6mH2;n{z+IVXb|mj`5pE%3{3x_ z`yqhj*l~PE200y8Vt?=8JiniQlDkaI@CEbOslWVdo-+76{j<;3v(n2)l*wvtDENdrF@{ULU8L8a)cim7SkD`@iLH9dW$Dx8z{DQZE)H2gR<(PApg<*Jx!0e#}jM2RGPeSHxx z&T;FUIkvslD96Q|I67s1y#_9BafBsFBJAujOG?e;L*7U}=0jV?R(9l557zOGAJ2(> zH+|b{7mRU{zk8rW6hud=RvX`uOnQD$Yu_|jIxz}LX>0`A3o@S!pM-}5ddYiVX&ipZV8@pq9#Ak;Dx^Q~b@8`8YrQt6UD=*x! 
zXe+L@UHzzP-JLW`Bt=I&A4_z4S}wu+v>68z3QLb`nQQcgAAPRMWa8~k6#gF5L5_Sm z_L2yUjJo+3o(Ti56T6PT>GIvlFb#q zVzPQ_o=HpZnfUfr<9u1_3v=W_$Re+bu z(j`t%%s}Z*g!<6gi8wi~?#$*C0KxIn#Ec0u&6$R?&vcy5GhG_}^6iMVbpsZbqJ{mb z=E`W~s)^?OuKSUC%jXBbHEt=eGt3PT zV(X8ViG{c73#qGw@g$_`|GnPkXT>-Fyd;eIwUfDyW$H%lM=LU|=P%-JuwaMEZS}yD zQ>VYHtIO`%wFG#S7fj3kKJUN3+1#kT5`J06#P*KWtot4(S3LY9x8a3dZJBGKOJ18= z`r`)=(6X{G9r{0g`t%Drz)X+9a2b62-mI!gNYJA0Wn?4(x{&myG~Kv=hr`jcEIx(J z>Yp1r(T2&=IX`E6wz=`agRk??L*&Z@@cEAbsrXW23uJSWsYOs{#DE`qpXz~T)& z@0zs~HX!_>Q28&&7@W!l25y+j`(DK;C?J0es>Rz44WZ6XNhdZraaS+qVIC^JAsDjU zkvMvR-7y`1CNM9l`uulHe%Gf@_aUiu57O6z9|Nc05C;Ig#Sg?Q{U1qHzvhz6l8$3w|r5r{Q|l7#>MnF{ae4hL7{!&KGO?46uQ zR8&+*$jH6{>HX5UR8a_Wilt9hw{+ZwgUIlRywV%whQZAOu|u0&ScnHV6nvHoD4Hl} zXmr5g+gG=)0VH@Sm=%_h66~(CoRoD%_`0?)AzJqr(mXm8K!FM;xej-qmUPEB4Fae^!s|M_jf!PWJf+{>fq zP17+g&T;JpHYTInH4OC)Z!(uSY{Vj}s&--*L7r~_^wKg43LN^zLD@n!$6n?~{&{^H z4w?J?c}5vqTCXT(3FU1=B&DOsyL76_zj^p@#m=2OqY@INy$THbJJZx-{qkYVRUjDJ(=yS|6WCb!PzQwSh~kg+;eNdN6A{(%JD^xe3af??c&G) zYFz|^{#tGs3I+Ph)rJF!V#dUy$r}Zva!Ct!&m~w+rHB0fMT9IZ37hQhI-`jkgHDYj zwtHtjtvL13mZ*nPR0DQE3RN8R!}oQqAKR~Azkbp;^F`qe6e;4%E8`7(_-xkW$oMtt zdqSQWc~3ib>}c{fmCT5YjC2VWYQkCf`trfzy0Q;Ld3m||00iZ8(3A*>i!)%VUpaI| zH+N8wE%!{w`JHp~nu)4r?#bovKHS{g4KH6>$OpA2iV;2GdS2cbM5>r>OeI`I{)8^B z;q~hcvUC=vf}L1TX_r~f|a+}J2( zw;+h8UvNi5#@fcFt)U^5OgTd(wZ3qxfI!-#&(J-VJ(}@&Q=LRHIC=7<$c&(qp4Em8 z8+25jjgF2+erp$(?xj&fsK|7&gm6g`OK4!Rq9%70J7Yh%K!)f2b=JoHBX5t~pGc0U zI7aYgR&?qHE8*Yn5&x1=b(+JIw*O21{kH_`sloowY@E+({h#&8pQN9+^8YnA|IZ&a z=h?u|-$i`vv&knQnUUK`@0C*dz0?SajDG?N^Q6xF*|joeY|Sk#>OvPyPVK;-)omeo zh`VP;yyOQ{R1D`$LrugBXSd;h3y8xv1jcaRVAY==v*~ZA4|~v_h{PGx)7J;+qD2&a z)ZCnVqoVZaJ!!l8^KkxDOE+j37~loN#e@0j3rR_lV#Zl%;B;l=<*A5s>tQjwDgd7n z5$7Mjd^$9MD!ckEW{zC??()wL*2MxY$sg}u4+9}8lqMIiUd^5N2lhJ$v-b=V>oBnf zUOWZ^4*P(fkohsH^9TTHZfV}Bz`BYRcxpe!DZITa52VPp*ls1T^}H)zE9aL1ffjCf zIbNo8LeD9kx*OV0S~aN0x%5wYdK&CvqAbIKGGkKGvQ!|Nst&uc-C{{P4&PxBOi$t8 zd6Js{0}r?0yVpNId|19b&x(6M*wAtE!OVV+|e$QP&k4$99!!?cQJ7z30X zOq~7Y?pBlrWKxNgV|WZj;_(tdUZrtPi~%;})lyCj)iWWtOaR7Yy&4#3LJMH@Gj12h zwIe4^+})wCungC#u_uzGROD~v!0#AnDUPg@suYTml@#Y?m)G8$(cZoef6N;{%79bm zKHA@u$A>eu8ljN~zQxqw;CdH-Oc((5nyFrOM4{yLuipn}W(6|bIe5i#y{how(zmM* z^ivp%l5H{9NAQ(xNht?}C}OOedP;FB3Im7|6c%22=FAzTkY6gi3T_c|bDrP{SEIF^ zH~RrcGT-5!XUwNjHG_L!qd*|E6@-B%2??u-_)&EbmBIP*=ks<2;EA|#|I35Xiqho& z=Y4T9CFccS{68S;|3I(*^M?G_UzKh7zrI}mD?gfk=nap3fl3s;pP%33XU~de3rJb% zA#Y=2Gch^2m)uP=?f^-gym#;3{kf-l_V)G^{u{|3pFQF*ENFS$*0vWf5c_HJc9HYL zy#f*6;n*?86)VE0o-#NUC8796P~4YmVuoMkq?dO4_6m%@B**_?6%vvhi!Ro=jXHN> zTp#DGslA;Z^Z7bAld>!HU#Ot1fza5~^y4rg(^F>>0<>BX_U%5>8|3{f*cG-)>WYhZ?3+}vx<)H zVNVYm9uode3vBE1-2n6l)J;BRuO3-le#z8y^Ei@bXMx52^K}J3k2dH_2$*Fr zz0cyfaLZEc+8el3BEQ<7I1!J?7Tb;I^>%hPzO=N|sL!xle;$=|G0ei;kA5q&KYH{W z-Vch=%If<@VqxLo4o8ld9k0H9do$+v@;}ccnebryh4M-|x?@2B9^!YS91B>NQ3ttn z-&uqAv`tcyFg?RrYNq+SLw*ICTUazcdbD!y-n|(y?3qT<_|goj9I&&G0SM42je{L; z1}ZTFQT800njO2b;YKC+r@2>6%y%>H;mvXcf@CkxqxD1O z>C%(Mw3dmftJ2HqIt;6Nmk(-dH(~=8zUI0rYa}oKw;0Z#gqD`dKRzr+0B5P5i;L*_ zxHw3?X(*K#!U7qEueTbLD@)(LMf*GC1h=B9B9vDSc5X#~+X;Q+VDJxjZeCt8cQMhE zov`0;)vrDD8&yuP#~VIpEv1QbkKcC&^9zk#T`UCmfEjb=A|6>+r;3tm+W?SkMOU$tCU8GoTG(L= zklPz{1En{7RN=8{m9LL0v8(*Vn;5`_Mx-i|d|{>*hGfpLaD7k6#&6%gnU}h-;I?&Q zVhXl{t1$Jcj;z??CswGpG8AL24Ja6Kd0e1QARA4(KEZ}o1ODlRvcP8-l9xB}F3y>C z^udE^ZrQ%{xz7%Z=@{_{oA58fDPPmp_T~jCa1hw-1P+ewWYvZfCr-Ko5Mm5&)c{5!dteCpg7L>22KQ6I3fXs>4hto zE`?#ifCId?$rWp}wj!D!MaUgf}_xGzEJXlnH|C8l*1qD+p!4~ne z5*g-k$(Ju1k7UoqX;>_9)?GDHJm?3pt^-8J#HbPV#QKdJ<1rSj=6#=!lXJ_R<)zVt zevH+PKf5J;N7|;E9U!0;kt=p$!bLAtehq>&r+6s9!&w}~bU<^ZQ?naLkxc*t3d<&C 
z!g#@Kb_8r=Ge{e(ag2w#PU0~;hN5YVB|XG39v&N81^lSu_vy}~Z|BzW$DsUi09B*( z*ZA5fdewk0RD>5i=}x$3V4gqf3Nf>V4>uDo4+`R?rKJX(LjoeG)RpWEs}S3ylsAGO zj*5$GMv$U%sYWg%sNq&HW5O#smP=Y_+qSjvmF^ZT5(nB?!qSd=$NHo;>^!*=(*)%2 z5&aILG1(Z_k9%9(-X#6tCo6W@HRA)r+HEcNFw)=;dnOc%rksAt(Kp1peNX(lHyqcC zuy{CY&4`Hiuu=l5Fb%C|fB)Uw6v^oI?UOTM2v9AlJ72zWgCEAKD=7G!8c5+d_;eJq z6XTDh>;BPEh&kxjtl2v=P^R(f7lLSn&W2N`PGQFBp1Fl-N*-&;or7rPv{!v~*45WX zX&yPZj3K_T`1p0?>VQ|moQNYnBs@`KTZ(2S^Y!4|x)nRo!to`D6)@rBaig*3Ur9XabYlody7wm$FUjFHrAB;>8(AR@Qy1_jxmtOiY-*b zXkk-AVKN3kV_y_=I6ZqjfBp2s__?)Gyz@+~GX6!e&q@;P47e3;wYPF}SE+7f*raJX z2!I#~DW?-YoqFaT3YM3w1$y}Uap>H+b996v3<(K&T^Dd@bVjZv&%$@xg_3olB@sM{ z4LxP67F@d!P#oOcVlXN?3gT}+zBCIP+X~7ORO#tOMRrG|2fBIvV|MQtpO}c!h~-?v z3PeEUxxL`R?`_fu_)N%6gdVnfU13%% zFQ8HNUkugT5`O>w{RvBB8|Its#~)#tg9)a+30)$bo24v+ulH9BRPLQ1h9w%Jh1a9KV$iy!Rd}6McmbR38&Y0i&Xb z-8tAcw#=crfMkjRI2_fGU)R^JT|0qAIjr5cRQ%LY)5C~GKyQT_mSkmoPDxE=c%Y3r zE1EQT-K?1eQ)yCUt7=&vST(R*<2AaPCJnFIVF0#++FLHK`G%!BB9ZEXKUc%>>adH8 z);Z=)TZA%jJ`Au&MmyJpVVX=zU)9?WcD!b87r@Ci7=R z361c>sE)?Ts_ST6Na-!i&l9F&%@F^F4LP9pGG;!*l?%%ev0dutkNn_i1Txl(U*f$? zwYCbG^lYu$@C{4$RQ*n$j{0>s>)@>e8#itgky=1;%SIVCzoq0AMlryFlsRZ=$*Y}d zG*Ihzf50aveg@$t!W9W^WhfGW&g(BovJ(I#BEdT1PBBKwM}5GLYlDHoTG1_Ci>T;I z%FDMP$Iz9Gk!xl80JnmJ*g4(B$kZOPh!ATBq%5- z&+fr)6zmajAKC|cEen`8*}UZn=Pe#54&x8CM&i+*dK7-Vo>XVQt;rs4wA-J0s@k=@ zUcIsd6a(TYkF%Enq1)7(nhMdup=zI4TxSR?8?i5%Z(^z(pAVuig04Vmo(Q^e4bXC#w8qsjG47u2XmC_$d~a zmfiRV#dhqt0t%l+G>m#^BgWn?E%N`9PC& zUb5I+eb1YQkVPp9Z)*K6qps3Hug)9vuaJ;3j6-SCa7~SoTggvFG0g5HwCiUw^$pG( z`JF!lK!$;yUI6_!hv4U!75-Se4-kNzpI;yGHWP6!amgwt-WfY+Cm?iB$LHz70ch** zPuaL9c$;O>6z>Nl)jPLu@52X5!%H1No3HJ#TvuHo*fA`F>B1`%lah*$-?ZldSTFJN zdRQj7p(Ma5)%Q-YOer7P{dLHx;1WT?IGy`3fB$xLG#o)89X%Ksv{(M#oZJ!l5tmKt z;K5dOIWOM484rgjye8AqK~(oJ+1Nnw1hfEMOv@0N(fUw57$4g|=FzM@+bfqqMK zg7|DK>(_^#SwS~~t~dPL*1;E+^bcGY#fQ4vAE8hckdj)5vVFa~qO>$KA`^un*zP%H zKwRUQq`q~1APzQV8BAQ2-#^ZN@){=VKtAgl;Yp=BO-Mb+3le;psE4* zjwE0mWgO|2At4Db7Qj~UA^Cu))12@F8A}-@Fgsn#ugs1uJv0?ZQ6EjaM(@soHE#aNX_?hmP!(X|K-3pkVU%ES-#JrYbT*^eb*E{T2N5TfyG_( zgPDT-yPqseqHI6%q=9@IV34v94{_2C#Tpy>v^34Zi<>pcUw0sb#N#$~{t9GJ2}xoF z1>JKm1lquFW67ja29a?Fqxow9UD;-f8Ggs*sRS;qCw-i2*Z9=c)!S<;{4GFexz(cE zb|oe2_U7R4rKPP(F@svzsXCnbDKkHxobX{(?@g$*iec3fIN!__6%)f+9e8E`bCJ&| z+$4NA=kfL{12fjw*Q2TAS3KtK?(T(q6!%H|q82q_Ev;x@r8I}Ly?rR``h~#e!i`T& zON$;(f6^+WHxYW^#Ks5)Z(DhRxE5uVVzl$cw!_jlhB6dyutrm7f7ZwLV-r0~I}QszU4 z4y8NR;BSzYML}Vqom)U8-&&J|MnxDO_Lm}4k|TzML>W7pZC2MLIOBPi3G5U*lQM~)so%ziDaT1)I;^D$RvNARtl zKfbub+(T=o1o$1a@3|a~;)yC}+#7UnM^PFzX?S!oen$Tq-;ER5llOg%D2i7&;WD9E zX%7C41e{zYJ%tu}(tzVyiVu3kvT;$wIT556JKznk!s(n~zUPs}Q(?+@J~@y4zb}db z&#a}Rqto{dmo?)N{uFjo+u6^Hz0xke;_3^gRp+a(>`u7jsILl01DmzA@a{Z=Uc8{R z^ubCguT!V+w%7xMf+R{$aQHCo%2fDr2bT|72P9))u}_j|o86@n-t=~zzB@FTwR!hh zde9m~2qK~=fND>ua!&o45UMa5P$-1v$8TTo_UU^t=*TdTGYlPkXSZF zz6ZOpdF@kI2Zu8Ry8(%NdU`g%MC{`A>)W>FPhI+jHBb(yh|KTVpP6$Rgv4MS&gQV; za7m)+7D7A$bezl06(fBNunj0W)?r6Mii)2cait}B4|=ahh(QQMZVI1LP^!7-f4qj+Ki`OX9hgO`F3MuP-e zgg#P1;)d0Jxb26OtAX@LY{tRAGKM#&rlzWIxhx2@CnzY0dT=XR4PImI9E2|gmB(gz z?9H1u;GE8ZP_qXM13*cX6J2S0S2$1{TFezLXvzDltZNCP+@5WS9i`|5uUVaT$biQA zwUsX1dBIUnfE*_g6Zj*>wjE*?3Ae{$dN%tK%1d3kXo0#V4(-EjqQePZzka=dUTjp< zdR}qBey@*%syOod+>1m__qhxwvv8P|$|^zSfytTXaa5rMT2hB}V;&^Ej6$&*1@Rg^ z(`JN2+!B`0S-d+>3S$jx6mDo^bF)V8v78*lUUU{6f#sXZrVs8}7EWz#Xi%FO3a_5z z*x54mf-&)zVh(TZZXqFcmz*{LUugU!{d@0aw7*)~#KIz&%E`h)16C8cB?I>l=onl7 z?##K|R3?E{JzWj@7?8$ylUkOH>U!yo&zoLMRpOq4neyY-RsgO7E%BsFfSrYQ4{Ws# zF`ZU~muX39sc@wFUT}4ZNl7A4(B5>#gEx5)9Ms*{w^c$y;uDw^kkSkbco^YG=+OHd z5x(4(gDWPF#g-2CEeN$e&@^gGB^%>MU4dqsR|nRH{N9a{BM#L%GG`n(;yb?gK3%x5 
z9^@ql4>z{~uyW)XUfoZi2ficEwDtBTqSf?mX{i%Zx|qKe5j<$saOA|UTuDr%LLfkj z;EN{WB1exNWmvoR4)jJ{A9JFS2@R0dA49f+->k7IaPCZtpQ$4KCtUMf^d2)@H5NwS zzE#7s&qmdD(OPONumAwS4Huq7IrE(tedA?+ga9aKgC9gMaXnt;1xW9Lu;w+Rv16aQ z`s1#yi%7nx!n#12#_}h_2<`$@-rB;fq#Di$+pEhZX-P%?zEf0m5Osc2L;qB~JJj4X zX(&}p27Up`B^%5+?(Nd_3C>?Va_Fnr2kaca10vxvx=1JzyHJxz^B%S-3lW*owQv?< z>B5gh{?+^X^>ySYO++h>?q?umkf=pgtXSdpqTs4bj{ESCyu5rOG&|td3~@2hJJWCe zGKP}77k)>V+72IfbW{gaVP$%1yL8<|%z6tei%ostRkF7kycsBnTog*|TwK}n^)6Q0 ztp-W&fi~R$6fDfmFW^Z7tfeaZ=ys=~>SX1j zlDllp6IW!-tE3El3%+dsbqh=Z>Z03=i(!YNA#l5pnJGf_MZ~unR0}$<*}gCFtu9i9 z(t*Q3(_NgqwB9EidZu?ZE-!85XKnF0aU9~X3W4x5V6BL*stYk#_jPh-rvVOUwgh)x0Wf!rUh(0{& zS&D2p8cY-D>)rtsW`G`=cDeyOUqLNef{Vy%hM+(HhJ~58x3{RWvgJ(h;DeA`xI#JL z8N8wA!$agGJQ*aKIyf3sDmyznIRosRocl4g#$1Q?)Exs+dESCr)W47`ijYlNZ6I~9 zcTgZzde5F)V6bs))fO{0`#=IjrMTes;B9E)<7ea7$E+TWM=)B=eDJ^l&edJBK$7H_ zdLu3kf9-w8ovff1h-MG?6T~+b)xaeKZkY!Y3cn)Ou;rk`Fht74mA)Juy#`3l(9`qYK*e|v zeC&Fm6uqRA`EFuD)I=z;^j1<=RIXd-GIlVujpLqg9BVd^fL;v63Rd> zyj)mV*txzIf9}Ykgbw*(S66LH#1K;A_IHz$+D5r1y>Crk$i8@|9|@SW1rqJD z2W*D(0Toa!eTR@LvgITo0Z6%m3%}$z3^NsezTZuaJ7t|8hJvIy{n-aXDAOm~1SFt@Ub9b`9tS$RA! zxX3B+NI`YoqF>VQwNZbqN<0{oJ!wuW5oa_t47ZeMEUNEk6a(`K?A*x% z_(C*0l09ax-^>0{*nR1)|M{t7>M1QmNt8~76|XPp9+3W&<nKR%vU5_6A$mi&p;y%t~eQ2?0?;dx2-T)Z}X9G>G>Fv&T(xiSQCF>*uNG5Z+R*UR^U}7pYPgoj3h)4<0m_6|jVPpWGpJ z3|M%l5{yD=o&KItFtnuq-~>o&Q8aLE-n=~R{zsx^dKub4 z0jx=e)<#_5ziDj?K4HqEbmBb#XvQ~Eh&6`OJ?!pg1#*QaUL(qZyps?r zCkectccLD=c6R#yQN~*nu07&qa&7=2q+~@7B@3K%bhWfXfjOXX*0Islp7%kd1+$f2 ze*dQ>{@On9%?R7pZ!B`zHTX`N;?NN*lR}T8J27>@4}wZc=F_hNj3-S`t%e@!8uX_C z%m!~>c1EeEhpMONK%}Fn22C*BpalYxJI0#FdC;AQ)YcQ4e@EAl<$Tj5Ku{wD1CYIfp_dcxjl9FlvMmf|8aNq#%r656F8U51^kO@e$Qg({Iq@ zDB0skS&DQ>WWH##IGi|P>kNHk{szCv$+XwjUGnXJYuNfLuXTIg{rw9{#q%Jxgp`!H zpFVv`7CCq38gCWckD6sQU>jSK<`~M~^uIsUo1s#6aB?!no1VFvnre26(^lg3X4Ghl z(2J)5@R2TSX(<(FhiKs`zlZAn0%e0w5rznc0h$Y!FK^yf1c8$Gq!FZ#d$(Lp2Lgfv zw;lCWrT=GfPE*+70xPoJ*CFGQ~##=0EcSo!Wj z$O!0Z!cf8tRCpwywL(8hjldOa930$el*^8qy%AMP|6Q{~RsW6`+zHcCeEf%?qc#OCvOsyXGVN_3 z85f|CFO55l%eKnE^C`Nnv|tv{y}ega(TX;X#w$F&9bkLu(fuU@KZyNF^$+l^z^684 zPsBA4z8Ksnv@;enpU9SEv|gjoKOw?;wk>*OlX5XM!>koo@svp|3r!rZ9#Pa(`{ZA+ zeNlDe)~y7{zQ_Z`kN}L2cxh>Asc%M}mz~}5U;Xnv2=9>Cu(!WN(i%q7K)rfO<#n+W z9WEIW0H9QRi;Ul-;>_Py@(4E%-QHq*u)<^LEM61%&jg~`LuZ$Yz69kh;4W#EU_*8| z)M=K1lHdDNpgsdHBWyjYtfXWQ2@6I8$cPg}DY&q)pXV?-A`i<>Mg%2fN%Wy?D|bk@ zq1izMKtRD*0J3S9?}zo7Feg_=H=L{~sS%FASg-467^$;3pU3awE zxOsRQaW+Bf?}hp!G;HHZR51J8-10i_pZfntML_7e(V_1?$Y=#ELPA92`Yn6zZ#P%; zQe`~fszEuj0SK3#nYk7F3j{28Z`-D)^#kosoF!XW>)2`8p=*Kl9q}FymN`N1rC>)F zY+JXPP*6p!1qY)J-GvwS`w(#tpr$mWbdf%KHsoe!B3q=`!afg88jsg521^pH*kxIi{AVfMWlJ{Ym1Z=>E zD*+Y`Y&T(*a7ZJe77%4;0K~O1ae!~aS@Z~@ZqZ|3iY0hE`1|D64N9IA^sgHM0A-In z;TBxR38tr~C(IWM56>DXZ^5t_R|>wtCJr(pf{-$TLaV>RV-xytLfiJh{(`8iK@>HC z>*z#phx~Infj#j0+`68N>m1&^J>ES1x^)DpV$eteuPLaq$_CI=vQ4tm*P$9|j@bh>E_I$sHciaou}4G<9eGoB;Q zu%@=}F42Ai<#M|Jz%r%=OecAwM8H6s6ZB=6)RmAe%42N1v^$PhLHw@}h-9Nt{iY-O zrtJJH%b@Fr90omg-P6sMSA!=antjrEvHV@ubnN7bB8-VNHZ~IXYSLZ9Jz@ulAA5Qf zj`XA-2-PGvvYsZ!dOEBAI!Z>gQnrGE0Tg_QJnsd#4DBT=Nlx9myY8l5G1f@()#G5$ zC~n=_Vbnbj?Ew!|2xLVgl}e53CJj*NA~K+^Bnn-ep<)E*lkQL_Pl>8*F?q54!|i=r zb;tvwmh-3z_d!yNnl38);H?p870+g8Zza?o#)N);4EZhb7k_5+As@WD6@b7<4=l)- z8;OQ=EsxwvT<(@gMlM7El6>g<20_ddhG+NBxqa4ECs$!es0Ak>_Or0$Hqf_-r48ha zqNh)~dZ4Vi4_sHGNlQy7`c(A(8z6xQ#oKVe&=f-MX+=sINLJh&;*SMm)u+biwb6)Qf4uUL%pjoVoJ$Y*L| z;%$CWy*dWyaCorPd}#%$^c55kf2;BDMj~_&oR^yuXGwB(x= zs^c|8{zSa8%n=}6ddgd*IxQU?@!~Fb)H4_lVo#f(2TM#RB3~D^lde1O*f)sSR)#`B+?z$WawSn>)SaGJ zPbtUb97@APtUU)wA)MSXi5S2tsR)LdSIJX=g+QKy~YfQ^3C}g6m&B|iLxv3x2Oj-5sP1eNkfN 
z6VQ12lnDeXD%E@B(HdGc(63D@fJ(wiI2_%x}VT1V=3Oj~Tsn3%=nlrT;zqja?V4tg;j7%Di`4y#vq9SRO ziF)p-Ti%x!7D^VH7prR>O4^AYRg*kEx#j&MaV~GbBxM4{v$AxdjrjP{qll7{66ykm z)jH8Yiq5gWHFk1mW@hG^Nrd@KvdAH-efQne53$Ob{5_~!yCKlB8Y$7xpi(cO&FK<= z=2A4Nojd4LM(+I^hQN|Eyko)V(izv*+0PIKI^JAZ#FHSaI zgsvzlG5M;+)Yj2K)oK}U(HomomVWQq6kW66Djl3mS1GtcLnu0;qb8uc*~-t)4@r?}%}*3c zZLeO{JX)5Nh-cxj&Zb+nDta92wNANU`hmLmyN?TU31Oj`H=Ecq+sYOJx-|NXkp~39 zu})5=+T|>xZ3<36EKu5ixgcY7k$Kv}*^9p;2x25WbQhS3~5cwH&o_j)?i zQRqLC4iNTA#>>2EYRY)h+NU!zHYW1Z4tiba#kUnJK;-s+rHA1J->5PD6fA_b_4TjLzsK5UT5oX0C}4I&=WYOL*S0*l`EwK8ns3La!zg23nsq?284Y!3x5s3*(-o zk%G|N+0*k74U%x&YGlvY@~by)FzM=c$Sq`b=NN8I=U}2BRm}s}Q9@soH-?f%VzpPm zzba~^&>~4d!XgOykE|wlB?XGx42VSC`gqvb!ckVBK~&Z29^miK@Vla{Y!yYtf03Sv zSKeS($nb-gF-z|*AB^~o4)#h+_)cK0*vQC;_#*ua016pg1Y-T%J7^B<)I!n+LUkHq z$^=|Y5x`fZefVn^xDIV??Tk-Ym-(b9jnD+IUIWV-vmx$Dq?NI)c zjjO7w-D)>wR-vSWMhhb})@a}9r`SFS4l=YDLZycBLedaYKUQXxOGN@HKHSDiS(PA& zZWISLy-A{0D800&rN6`Y$k9<0)Ze(aS=w@bJK*ORbyfCuuc`c-TAu2&XL;T4f<7bz z5_8M{y@&r3j6J4g_98)bBJp*l_;461-OTgP<($pEoB9PH1ETu3c%bO}$4of${f{=j zXgqBwy6K(9(^v&f{342vZBHUXA%F;u*AIUmTH>|-Tz9A9o$2`UUW>kN)LPBxMXu-J ziN-73jA9xUBJ5Vgy>7H7Y-+_XAw6bH(4)!9jAVroDT7^E&aP0cME>>_W?e(32`Y=C zakepz)%5C>RZ13yt3dX0)F_HiRzDjYthfI0rFu9<)XZsedYWjfpO@H>Ij<4Y_`@C> z&&3JD0=owr$%1w83lT>XbZ`6b`2HV45%DCQNI&#+;pP3wu>%8$BlwuSW2QnE*=wpg zB;^{Mon>TX)G$85!pa(k%1Z$Fj|f(7+V1Vb;xHl<(Vcl!1%^4|cQWVe_RRtUl!G@_ z8gV<`zI{u9Hdb0%dU&1}r4UH%5r69Q_pdfMV^}G)My+bKu=Mw@ zA{VtHbk@7O-|l`)bLY@#dD%N)W`ea75}18wU#~%L2gkgrwKcm}v+%Io)&0}bf#*$= z9P*qBh@m>BrE^(((70UB&tKS8PD9Ui6W$i$r!;y|cMeZalLdqJ5LsnDeRW%tnKV@) z5o|(LOE4wr0)tlXU3x8rly)H3`4DO*oZ;v3x6PKAR5EXzLd_X7t84N81Io;K$k4AS zKLZ>lKO2aR9x31;a~EjUaK1bNVTWr93*C!jOhHZ!cX4AfG6#2(5Gbhs8UQCS{B{n= z`;Sft%}Pv23?&botv+pOd0rnfK;#ALXh5DxsZm-1x1pK8g;#-4sZWRj3pky*&!3y| zs)paai@->Gvho$z|AVf&( zPG`=HbtGdDS8QeOHArI~04V=>Q%zgM)X26)~9#JuiJ*ynNnBm{MxI)qE0n zgF0mD6gG*g16l`F;Tlv2q=u)6?�W`yi&$5)NGmkRkSt67QD6F&_jesP(qV+Vr%0 zahNjZ<(VEZCgUZT8@eKGXItP?DGX^53OzM7H40dPSilxXFnqi>ULoe0l16C|1$pe;eS zuzcnfp8{w{3m7IVNpo^>ki%)qC!mcx@PN*zN8;r9=Y?FN%N_;_r-P`}i1phc+9j)a zVpl&uDl!UgTU;9=6g+=O=@mijZ-yNO>=`mnJOCcx#z+9<&5C z40obNh@?=s+E#tai?68($n+T1zX**0LMIXWR00P>EFoPh+X4y`-5c`smbpk@62RMott^O^$tx6St5o>e_5lKb`&5CT}&~1q>M-j0V!=n!o zC!h|1gg8E@!X7mBTL@G!8Z}6J0LX3$WEBIAhpD5EdeE{%9n@3}A|3n(UYdgbIg55!O4K$Q&8yR;mZH z<9X9P(Nv@HQ7dm{sSw1h`m=VJnsC#Ltgx#rH_K9!1{=)Ark2jjqC(NYlD zC_wpvSilNk(}>b;9%FF%HsyDg@|Ymi(Zs%GVx3Biv!QIR#Lt2xHXE^@B4--%b@J}f z#sz(|f=mHI!oH3|d^qs=&w*uFkMnjGW>0tR9;`!i42>}YG01ol$>C5631)qn?qdee zExtWI_a~|ZJR%V?QUuLB8~JKvE@;QIf$BX5*#*a3DI}AFJphFo0YO4y^=@?Q0c{*H zNrRk*!i-ux*nm+c=-@TY%~NCwqOGOQfpdj!^C0oXK?WjD1D&!`?U0v&T?VVR7t(!u z7Z-YzyIm^ymfV=bz)&Is<|WA(6ka26)#==0^@{Df31-da968b9>9#e#ae;w>kbcKN zCQ$`XCPe*6@8_YXpxzMF)zt-sz|PGr%4>2c*Q6VdBNw$5kxC$-zZ)Ih2kgQ=UWH*H z6tw5io7IAv0fulcSP$Ox_7dEOvQfWXwwB7qkM_iHNW?HLo&NnR2*bC0C}N=K;X{i| zYsP*I5~5y^(xei``R>LsF`V+4`~RdbG3R_P@MI`(f$4d!@j22?M)#i#v^d-(10G0P zZJnJLVBR$W(G@3C1Ue)-sF~Ig4GB6BWhdTP%2OQR3qX_v$nd054HT4%3*ZfvlTi2Z zd!I6X$l+GG4b|7Rys~#bh7&_p*CQB^svPX~BnnzCrpLUjtm=51U<_br6AQ}C?)cmG ze^;W=%>H*JN-1!^kkOtc=YNP%+Urrb8G-yHV`1cVRabXo6M5u9_MAsp*eT|i-H8kvRcJQpbi5)K^sD0C!TA+gLHe-A5VK`4-jI0{Ov zf%>2tQQ|B@2oEBq!bf#`T`9}2a@s%(&n+8G9ipT`uO2!omeo#u!tpd|V9`kM(Sa5L zhr$*3A#*l+$1HP9_d82VXN3K1bou*@$91{l6FKo=st!#)VTZHytY zQidxqqp;S`-~W<{cxKacUT1zM{nb%JNWtLrF5;|s78)-qt74}WdGvx`0s;bh@U0@f zeO7$516J`4GG%j>nAMGHmz~<`^jsLx74P2j62$LM!IqAg zNg)XwI%Oum1I!_Wj!x+IJioz_5B}+w;kFV3a}+fMN`?1-T#Ur?DhpzIz-VRX=QC2^Hj+DjU8W@^ zCC}O@$(W;o^7P^a3k4F(eK0Ngi@04DE3&Z&qE(8@E9~mkm%4Va5BYWL 
z=FN@hVPWnTgSpK7{Ik0nxbuU7z5vzW6w=ufP?aYfd~9RBY!p57rrnRwnD{Sa6MI-7 zK 0.7 -# 256 * 128 * 8192 -> 10 -if __name__ == '__main__': - torch.manual_seed(0) - - # hparams - repeat = 16 - dim=8192 - layers = 4 - - batch_size = 256 * 128 - - # simulate forward pass - x = torch.randn(batch_size, dim, dtype=torch.float16).cuda() - - for _ in range(repeat // 2): - quantize_rowwise_nogroup(x) - - torch.cuda.synchronize() - start = time.time() - for _ in range(repeat): - quantize_rowwise_nogroup(x) - torch.cuda.synchronize() - end = time.time() - - print(f"time: {(end - start) / repeat * 1000:.3f} ms") - - - - - - \ No newline at end of file From 2331212b35c6e974f734b240aca0e64e4ca5ea07 Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Sat, 1 Apr 2023 19:13:15 +0000 Subject: [PATCH 30/97] add readme for speed bench --- speed_benchmark/README.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 speed_benchmark/README.md diff --git a/speed_benchmark/README.md b/speed_benchmark/README.md new file mode 100644 index 0000000..80cbeee --- /dev/null +++ b/speed_benchmark/README.md @@ -0,0 +1,4 @@ +Steps: + +1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (feel free to change the name for your profiling). +2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. \ No newline at end of file From 51a21df7288a7e2f78c10778493f9ba554694e81 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 1 Apr 2023 16:10:18 -0700 Subject: [PATCH 31/97] Added 8-bit compression to quantization statistics. --- bitsandbytes/functional.py | 38 +++++++++++++++++++--------- bitsandbytes/nn/modules.py | 10 +++++--- tests/test_autograd.py | 13 +++++----- tests/test_functional.py | 52 +++++++++++++++++++++++++++++++++++--- tests/test_modules.py | 2 +- 5 files changed, 88 insertions(+), 27 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 8bfd668..8234c46 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -155,7 +155,7 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True): #return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist()) return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) -def custom_map(seed=0, scale=0.01): +def create_custom_map(seed=0, scale=0.01): v = [12, 10, 8, 6, 3, 2, 1] # 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45 # 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48 @@ -191,13 +191,13 @@ def custom_map(seed=0, scale=0.01): # 13B evo start #v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 0.4949093928311768, 0.30920472033044544, 0.15391602735952042] #v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283] - v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908] + #v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908] # mean evo 7B + 13B #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237] # theoretically optiomal (0.93333) - # v = [1.501085946044025, 
1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333 + v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333 @@ -599,7 +599,9 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra assert rand is None lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) - return out, (absmax, code) + state = (absmax, code, blocksize) + + return out, state def dequantize_blockwise( @@ -644,9 +646,9 @@ def dequantize_blockwise( if out is None: out = torch.zeros_like(A, dtype=torch.float32) if quant_state is None: - quant_state = (absmax, code) + quant_state = (absmax, code, blocksize) else: - absmax, code = quant_state + absmax, code, blocksize = quant_state if A.device.type != 'cpu': @@ -669,7 +671,7 @@ def dequantize_blockwise( return out -def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64) -> Tensor: +def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False) -> Tensor: """ Quantize tensor A in blocks of FP4 values. @@ -704,12 +706,11 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=A.device) - state = (absmax, input_shape, A.dtype, blocksize) if out is None: out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device) - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32] prev_device = pre_call(A.device) is_on_gpu([A, out, absmax]) @@ -722,6 +723,17 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) + if compress_statistics: + offset = absmax.mean() + absmax -= offset + #code = create_custom_map().to(absmax.device) + #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) + qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) + del absmax + state = (qabsmax, input_shape, A.dtype, blocksize, (offset, state2)) + else: + state = (absmax, input_shape, A.dtype, blocksize, None) + return out, state @@ -756,8 +768,12 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: shape = out.shape dtype = out.dtype else: - absmax, shape, dtype, blocksize = quant_state + absmax, shape, dtype, blocksize, compressed_stats = quant_state + if compressed_stats is not None: + offset, state2 = compressed_stats + absmax = dequantize_blockwise(absmax, state2) + absmax += offset if out is None: out = torch.empty(shape, dtype=dtype, device=A.device) @@ -1986,8 +2002,6 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): ccolsB = ct.c_int32(B.shape[1]) cldb = ct.c_int32(ldb) cldc = ct.c_int32(ldc) - # print(cooA.rowidx[:64]) - # print(cooA.colidx[:64].sort()[0]) is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats]) if B.dtype == torch.float16: diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index a550ec1..45eef42 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -134,15 +134,17 @@ class Embedding(torch.nn.Embedding): return emb class FP4Params(torch.nn.Parameter): - def 
__new__(cls, data=None, requires_grad=True, quant_state=None): + def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True): cls.quant_state = None + cls.blocksize = blocksize + cls.compress_statistics = compress_statistics if data is None: data = torch.empty(0) return torch.Tensor._make_subclass(cls, data, requires_grad) def cuda(self, device): w = self.data.contiguous().half().cuda(device) - w_fp4, quant_state = bnb.functional.quantize_fp4(w) + w_fp4, quant_state = bnb.functional.quantize_fp4(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics) self.data = w_fp4 self.quant_state = quant_state @@ -173,10 +175,10 @@ class FP4Params(torch.nn.Parameter): class LinearFP4(nn.Linear): - def __init__(self, input_features, output_features, bias=True, compute_dtype=None): + def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): super().__init__(input_features, output_features, bias) self.state = bnb.MatmulLtState() - self.weight = FP4Params(self.weight.data, requires_grad=False) + self.weight = FP4Params(self.weight.data, requires_grad=False, compress_statistics=compress_statistics) self.compute_dtype = compute_dtype def init_8bit_state(self): diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 436c6b1..4356c1d 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -454,14 +454,15 @@ for c in req_grad: transpose = [(False, True), (False, False)] str_transpose = ["NT", "NN"] dtype = [torch.float16, torch.float32] +compress_statistics = [False, True] has_fp16_weights = [True, False] has_bias = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}".format(*vals) for vals in str_values] +values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics)) +str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics)) +names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics".format(*vals) for vals in str_values] @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias", values, ids=names) -def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias): +@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics", values, ids=names) +def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: @@ -481,7 +482,7 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, bias2 = bias.clone() torch.nn.init.xavier_uniform_(B) - B2, quant_state = bnb.functional.quantize_fp4(B) + B2, quant_state = bnb.functional.quantize_fp4(B, compress_statistics=compress_statistics) if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) diff --git a/tests/test_functional.py b/tests/test_functional.py index 
cd4728e..a974701 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -167,8 +167,8 @@ def test_dynamic_blockwise_quantization(): relerr = sum(reldiffs)/len(reldiffs) assert abserr < 0.011 assert relerr < 0.018 - print('randn', blocksize, sum(diffs)/len(diffs)) - print('randn', blocksize, sum(reldiffs)/len(reldiffs)) + #print('randn', blocksize, sum(diffs)/len(diffs)) + #print('randn', blocksize, sum(reldiffs)/len(reldiffs)) diffs = [] for i in range(100): @@ -184,8 +184,8 @@ def test_dynamic_blockwise_quantization(): relerr = sum(reldiffs)/len(reldiffs) assert abserr < 0.0035 assert relerr < 0.015 - print('rand', blocksize, sum(diffs)/len(diffs)) - print('rand', blocksize, sum(reldiffs)/len(reldiffs)) + #print('rand', blocksize, sum(diffs)/len(diffs)) + #print('rand', blocksize, sum(reldiffs)/len(reldiffs)) def test_dynamic_blockwise_stochastic_quantization(): @@ -1806,6 +1806,7 @@ def test_bench_matmul(batch, seq, model, hidden): torch.nn.init.xavier_uniform_(B) B_fp4, state = F.quantize_fp4(B) + B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True) linear8bit = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() linear8bit.eval() @@ -1839,6 +1840,13 @@ def test_bench_matmul(batch, seq, model, hidden): torch.cuda.synchronize() print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + bnb.matmul_fp4(A, B_fp4.t(), quant_state=state_c) + torch.cuda.synchronize() + print( f"bnb fp4 + compressed stats: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + #torch.cuda.synchronize() #t0 = time.time() #for i in range(iters): @@ -2244,6 +2252,42 @@ def test_fp4_quant(): assert relerr.item() < 0.28 +def test_fp4_compressed_stats(): + for blocksize in [128, 64]: + errs1 = [] + errs2 = [] + for i in range(10): + A1 = torch.randn(1024, 1024, device='cuda').half() + q2, SA2 = F.quantize_fp4(A1, blocksize=blocksize) + q3, SA3= F.quantize_fp4(A1, blocksize=blocksize, compress_statistics=True) + A2 = F.dequantize_fp4(q2, SA2) + A3 = F.dequantize_fp4(q3, SA3) + + + err = (A1 - A2).abs().float() + relerr = (err/(A1.abs().float()+1e-15)).mean() + err = err.mean() + + errs1.append(err.item()) + + assert err.item() < 0.11 + assert relerr.item() < 0.28 + + err = (A1 - A3).abs().float() + relerr = (err/(A1.abs().float()+1e-15)).mean() + err = err.mean() + + errs2.append(err.item()) + + assert err.item() < 0.11 + assert relerr.item() < 0.28 + + #print(sum(errs1)/len(errs1), blocksize) + #print(sum(errs2)/len(errs2), blocksize) + + + + def test_bench_fp4_dequant(): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() diff --git a/tests/test_modules.py b/tests/test_modules.py index 41cc050..d0f5ca2 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -507,7 +507,7 @@ def test_linear_kbit_fp32_bias(module): assert l1.bias is None @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") -@pytest.mark.parametrize("module", [bnb.nn.Linear8bitLt, bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4']) +@pytest.mark.parametrize("module", [bnb.nn.Linear8bitLt, bnb.nn.LinearFP4, lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True)], ids=['Int8Lt', 'FP4', 'FP4+C']) def test_kbit_backprop(module): b = 17 dim1 = 37 From eb6c53cf557adda4e207669ed7b14ddca68a51c7 Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Sat, 1 Apr 2023 23:50:12 +0000 Subject: [PATCH 
32/97] clarify in readme --- speed_benchmark/README.md | 4 ++-- speed_benchmark/make_plot_with_jsonl.py | 1 + speed_benchmark/speed_benchmark.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/speed_benchmark/README.md b/speed_benchmark/README.md index 80cbeee..bb33b5b 100644 --- a/speed_benchmark/README.md +++ b/speed_benchmark/README.md @@ -1,4 +1,4 @@ Steps: -1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (feel free to change the name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. \ No newline at end of file +1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). +2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. \ No newline at end of file diff --git a/speed_benchmark/make_plot_with_jsonl.py b/speed_benchmark/make_plot_with_jsonl.py index 0920851..8897564 100644 --- a/speed_benchmark/make_plot_with_jsonl.py +++ b/speed_benchmark/make_plot_with_jsonl.py @@ -20,6 +20,7 @@ if __name__ == '__main__': ax = fig.add_subplot(gs[0, 0]) + # TODO: change this to what you want. rdf = pd.read_json('speed_benchmark/info_a100_py2.jsonl', lines=True) df = rdf[rdf.batch_size == batch_size_for_plot1] diff --git a/speed_benchmark/speed_benchmark.py b/speed_benchmark/speed_benchmark.py index bd8a6e5..eccc455 100644 --- a/speed_benchmark/speed_benchmark.py +++ b/speed_benchmark/speed_benchmark.py @@ -97,5 +97,6 @@ if __name__ == '__main__': info_json = json.dumps(info) - with open("speed_benchmark/info_a100_py2.jsonl", "a") as file: + # TODO: change this to what you want. + with open("speed_benchmark/info.jsonl", "a") as file: file.write(info_json + "\n") From 2dd5d69056e3b94f0462dd9ce6aaff7a89294d23 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 2 Apr 2023 12:42:01 -0700 Subject: [PATCH 33/97] Generalized FP4 data type. --- csrc/kernels.cu | 144 ++++++++++++++++++++++----------------- tests/test_functional.py | 10 +-- 2 files changed, 88 insertions(+), 66 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index e7e57d7..2e61297 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -64,6 +64,33 @@ __device__ float dDequantizeFP4(unsigned char val, float absmax) } } +__device__ float dDequantizeFP4Tree(unsigned char val, float absmax) +{ + float sign = (val & 0b1000) == 8 ? 
-1.0f : 1.0f; + if((val & 0b0100) == 4) // 0 + if((val & 0b0010) == 2) //01 + if((val & 0b0001) == 1) // 111 + return 0.25000000f*absmax*sign; // 1111 + else + return 0.16666667f*absmax*sign; // 1110 + else + if((val & 0b0001) == 1) // 110 + return 0.50000000f*absmax*sign; // 1101 + else + return 0.33333333f*absmax*sign; // 1100 + else + if((val & 0b0010) == 2) //10 + if((val & 0b0001) == 1) // 101 + return 1.00000000f*absmax*sign; // 1011 + else + return 0.66666667f*absmax*sign; // 1010 + else + if((val & 0b0001) == 1) // 100 + return 5.208333333e-03f*absmax*sign; // 1001 + else + return 0.00000000f*absmax*sign; // 1000 +} + __device__ unsigned char dQuantizeFP4(float x) { // FP4 with bias of 3 @@ -78,42 +105,79 @@ __device__ unsigned char dQuantizeFP4(float x) // 0b010 = 8 // 0b011 = 12 + + // we do a binary search + // the pivots are divided by 12 (the FP4 absmax) + // since we assum input data is in [-1.0, 1.0] + + // !be careful here, its easy to make a mistake + // that is difficult to noice if you add an extra + // zero somewhere! + + int sign = x < 0 ? 0b1000 : 0b0000; + x = fabsf(x); + if(x > 0.29166667f) + if( x > 0.583333f) + if( x > 0.8333333f) + return 0b0011+sign; + else + return 0b0010+sign; + else + if(x > 0.4166667f) + return 0b101+sign; + else + return 0b100+sign; + else + if(x > 0.0859375f) + if(x > 0.20833333f) + return 0b0111+sign; + else + return 0b0110+sign; + else + if(x > 0.00260417f) + return 0b0001+sign; + else + return 0b0000+sign; +} + +__device__ unsigned char dQuantizeNormal(float x) +{ + // FP4 with bias of 3 + // first bit is a sign + // subnormals + // 0b000 = 0 + // 0b001 = 0.0625 + // 0b110 = 2 + // 0b111 = 3 + // 0b100 = 4 + // 0b101 = 6 + // 0b010 = 8 + // 0b011 = 12 + int sign = x < 0 ? 0b1000 : 0b0000; x = fabsf(x); if(x > 3.5f) - { if( x > 7.0f) - { if( x > 10.0f) return 0b0011+sign; else return 0b0010+sign; - } else - { if(x > 5.0f) return 0b101+sign; else return 0b100+sign; - } - } else - { if(x > 1.03125f) - { if(x > 2.5f) return 0b0111+sign; else return 0b0110+sign; - } else - { if(x > 0.03125f) return 0b0001+sign; else return 0b0000+sign; - } - } } template @@ -575,8 +639,8 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float for(int j = 0; j < NUM_PER_TH/2; j++) { unsigned char packed_fp4 = 0; - packed_fp4 |= dQuantizeFP4(((float)vals[2*j])*local_abs_max*12.0f) << 4; - packed_fp4 |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max*12.0f); + packed_fp4 |= dQuantizeFP4(((float)vals[2*j])*local_abs_max) << 4; + packed_fp4 |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_fp4; } } @@ -639,8 +703,10 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH; j++) { - vals[j*2] = dDequantizeFP4(qvals[j] >> 4, local_abs_max*0.083333f); - vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*0.083333); + //vals[j*2] = dDequantizeFP4(qvals[j] >> 4, local_abs_max*0.083333f); + //vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*0.083333); + vals[j*2] = dDequantizeFP4Tree(qvals[j] >> 4, local_abs_max); + vals[j*2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F, local_abs_max); } } else @@ -656,52 +722,6 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs } } -//template -//__global__ void kDequantizeBlockwiseFP4(unsigned char * A, float * absmax, T *out, const int n_store) -//{ -// -// const int n_load = n_store/2; -// const int base_idx = (blockIdx.x * TILE_SIZE); -// -// T 
vals[NUM_PER_TH*2]; -// unsigned char qvals[NUM_PER_TH]; -// -// int valid_items = (base_idx + TILE_SIZE) > n_load ? ((base_idx+TILE_SIZE) - n_load) : TILE_SIZE; -// int idx = base_idx + (threadIdx.x*NUM_PER_TH); -// -// float local_abs_max = __ldg(&absmax[idx/BLOCK_SIZE]); -// -// if(valid_items == TILE_SIZE) -// { -// // we do 64 byte loads so we can 128 byte stores -// reinterpret_cast(qvals)[0] = reinterpret_cast(A)[idx/8]; -// } -// else -// { -// #pragma unroll -// for(int j = 0; j < NUM_PER_TH; j++) -// if(idx+j < n_load) -// qvals[j] = A[idx+j]; -// else -// qvals[j] = 0; -// } -// -// -// #pragma unroll NUM_PER_TH -// for(int j = 0; j < NUM_PER_TH; j++) -// { -// vals[j*2] = dDequantizeFP4(qvals[j] & 0xF0, local_abs_max*12.0f); -// vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*12.0f); -// } -// -// -// reinterpret_cast(qvals)[0] = reinterpret_cast(A)[idx/8]; -// reinterpret_cast(A)[idx/16] = reinterpret_cast(local_valC)[j/num_items]; -// -// -//} - - __global__ void kDequantize(float *code, unsigned char *A, float *out, const int n) { const unsigned int numThreads = blockDim.x * gridDim.x; diff --git a/tests/test_functional.py b/tests/test_functional.py index a974701..12411e3 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2246,8 +2246,10 @@ def test_fp4_quant(): err = (A1 - A2).abs().float() relerr = (err/A1.abs().float()).mean() + idx = err > 1.0 err = err.mean() + assert err.item() < 0.1 assert relerr.item() < 0.28 @@ -2256,7 +2258,7 @@ def test_fp4_compressed_stats(): for blocksize in [128, 64]: errs1 = [] errs2 = [] - for i in range(10): + for i in range(10000): A1 = torch.randn(1024, 1024, device='cuda').half() q2, SA2 = F.quantize_fp4(A1, blocksize=blocksize) q3, SA3= F.quantize_fp4(A1, blocksize=blocksize, compress_statistics=True) @@ -2268,7 +2270,7 @@ def test_fp4_compressed_stats(): relerr = (err/(A1.abs().float()+1e-15)).mean() err = err.mean() - errs1.append(err.item()) + errs1.append(relerr.item()) assert err.item() < 0.11 assert relerr.item() < 0.28 @@ -2277,7 +2279,7 @@ def test_fp4_compressed_stats(): relerr = (err/(A1.abs().float()+1e-15)).mean() err = err.mean() - errs2.append(err.item()) + errs2.append(relerr.item()) assert err.item() < 0.11 assert relerr.item() < 0.28 @@ -2301,7 +2303,7 @@ def test_bench_fp4_dequant(): #print(max_theoretical_s*1e6) b = torch.randn(128, 1024*12, device='cuda').half() - iters = 5 + iters = 500 torch.cuda.synchronize() t0 = time.time() for i in range(iters): From 0d332a641ff6b28e71b2a9ab5e641f8cf4a2ec99 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 2 Apr 2023 14:09:08 -0700 Subject: [PATCH 34/97] Added normal with extra value. 
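This makes the normal-float data type asymmetric: with
use_extra_value=True the map keeps 8 positive and 7 negative quantiles
plus zero (15 non-zero values instead of 14), so all 16 four-bit codes
carry information rather than duplicating zero.

A minimal sketch of what create_normal_map() computes and how the
resulting code can be fed to the existing blockwise functions; the
usage lines are illustrative only, not part of this commit:

    import torch
    from scipy.stats import norm

    offset = 0.966666
    # 8 positive quantiles (one extra) and 7 negative ones, padded with
    # zeros to the 256-entry code format and normalized into [-1, 1]
    v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist()
    v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
    code = torch.Tensor(v1 + [0]*(256 - 15) + v3).sort().values
    code /= code.max()

    # illustrative usage:
    # A = torch.randn(1024, 1024, device='cuda').half()
    # q, state = bnb.functional.quantize_blockwise(A, code=code.cuda(), blocksize=64)
    # A2 = bnb.functional.dequantize_blockwise(q, state)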
--- bitsandbytes/functional.py | 28 +++++++++++++++++++++++----- tests/test_functional.py | 3 --- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 8234c46..161f58f 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -9,7 +9,7 @@ import random import torch import itertools import math -import scipy.stats +from scipy.stats import norm import numpy as np from functools import reduce # Required in Python 3 @@ -181,7 +181,7 @@ def create_custom_map(seed=0, scale=0.01): #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207 #v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30 #v = [1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293 - #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88 + v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88 # 7B evo start #v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06 @@ -197,9 +197,7 @@ def create_custom_map(seed=0, scale=0.01): #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237] # theoretically optiomal (0.93333) - v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333 - - + #v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333 if seed > 0: v = np.array(v) @@ -220,6 +218,26 @@ def create_custom_map(seed=0, scale=0.01): assert values.numel() == 256 return values +def create_normal_map(offset=0.966666, use_extra_value=True): + + if use_extra_value: + # one more positive value, this is an asymmetric type + v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist() + v2 = [0]*(256-15) ## we have 15 non-zero values in this data type + v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() + v = v1 + v2 + v3 + else: + v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist() + v2 = [0]*(256-14) ## we have 14 non-zero values in this data type + v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() + v = v1 + v2 + v3 + + values = torch.Tensor(v) + values = values.sort().values + values /= values.max() + assert values.numel() == 256 + return values + def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8): e = exponent_bits p = precision_bits diff --git a/tests/test_functional.py b/tests/test_functional.py index 12411e3..47a30a6 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2318,6 +2318,3 @@ def test_bench_fp4_dequant(): # torch.matmul(b, a.t()) #torch.cuda.synchronize() #print((time.time()-t0)/iters*1e6) - - - From 4ad999d1440e896abec3f3c7029f292ce46cc820 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 2 Apr 2023 14:42:45 -0700 Subject: [PATCH 35/97] Added quantization tree generation. 
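The 4-bit kernels map a value to its quantization bucket with a
hard-coded binary search (see dQuantizeFP4 in csrc/kernels.cu), so the
comparison constants have to be the midpoints between adjacent values
of the data type. test_normal_map_tree prints these pivots level by
level for the normal map so they can be pasted into such a tree; its
root pivot, (0 + 0.0796)/2 = 0.0398, becomes the first comparison of
the resulting tree.

A self-contained sketch of the same pivot generation. tree_pivots is a
hypothetical helper name; values are the 16 code values of the map,
i.e. code[:8] + code[-8:] from create_normal_map() (7 negatives, zero,
and 8 positives):

    def tree_pivots(values):
        # values: the 16 sorted code values of a 4-bit data type
        levels = []
        num_pivots = 1
        while num_pivots < 16:
            # every comparison separates values[i-1] from values[i]
            idx = range(16 // num_pivots // 2, 16, 16 // num_pivots)
            levels.append([(values[i-1] + values[i]) / 2 for i in idx])
            num_pivots *= 2
        return levels  # 1 pivot at the root, then 2, 4, and 8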
--- bitsandbytes/functional.py | 2 +- tests/test_functional.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 161f58f..5198526 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -218,7 +218,7 @@ def create_custom_map(seed=0, scale=0.01): assert values.numel() == 256 return values -def create_normal_map(offset=0.966666, use_extra_value=True): +def create_normal_map(offset=0.9677083, use_extra_value=True): if use_extra_value: # one more positive value, this is an asymmetric type diff --git a/tests/test_functional.py b/tests/test_functional.py index 47a30a6..074135e 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2318,3 +2318,19 @@ def test_bench_fp4_dequant(): # torch.matmul(b, a.t()) #torch.cuda.synchronize() #print((time.time()-t0)/iters*1e6) + + + +def test_normal_map_tree(): + code = F.create_normal_map() + values =code[:8].tolist() + code[-8:].tolist() + num_pivots = 1 + while num_pivots <16: + idx = list(range(16//num_pivots//2, 16, 16//num_pivots)) + print(idx) + num_pivots *= 2 + pivots = [] + for i in idx: + pivots.append((values[i-1]+values[i])/2) + print(pivots) + From 64cc05920d0e506e41e814b9ef6053923d967a95 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 2 Apr 2023 16:10:35 -0700 Subject: [PATCH 36/97] First draft of NF4. --- bitsandbytes/functional.py | 44 +++++- csrc/kernels.cu | 271 +++++++++++++++++++++++++------------ csrc/kernels.cuh | 4 +- csrc/ops.cu | 50 +++---- csrc/ops.cuh | 11 +- csrc/pythonInterface.c | 28 ++-- tests/test_functional.py | 23 ++-- 7 files changed, 289 insertions(+), 142 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 5198526..83c2605 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -688,8 +688,13 @@ def dequantize_blockwise( return out +def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): + return quantize_4bit_packed(A, absmax, out, blocksize, compress_statistics, 'fp4') -def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False) -> Tensor: +def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): + return quantize_4bit_packed(A, absmax, out, blocksize, compress_statistics, 'nf4') + +def quantize_4bit_packed(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor: """ Quantize tensor A in blocks of FP4 values. @@ -705,6 +710,8 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize The output tensor (8-bit). blocksize : int The blocksize used in quantization. 
+ quant_type : str + The 4-bit quantization data type {fp4, nf4} Returns ------- @@ -715,6 +722,8 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize """ if A.device.type != 'cuda': raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}') + if quant_type not in ['fp4', 'nf4']: + raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') n = A.numel() input_shape = A.shape @@ -734,9 +743,15 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize is_on_gpu([A, out, absmax]) if A.dtype == torch.float32: - lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == 'fp4': + lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + else: + lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) elif A.dtype == torch.float16: - lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == 'fp4': + lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + else: + lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) @@ -754,8 +769,13 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize return out, state +def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: + return dequantize_4bit_packed(A, quant_state, absmax, out, blocksize, 'fp4') -def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: +def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: + return dequantize_4bit_packed(A, quant_state, absmax, out, blocksize, 'nf4') + +def dequantize_4bit_packed(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: """ Dequantizes FP4 blockwise quantized values. @@ -771,6 +791,10 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: The absmax values. out : torch.Tensor Dequantized output tensor. + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4} Returns @@ -780,6 +804,8 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: """ if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: raise ValueError(f"The blockwise of {blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") + if quant_type not in ['fp4', 'nf4']: + raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') if quant_state is None: assert absmax is not None and out is not None @@ -802,9 +828,15 @@ def dequantize_fp4(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: device = pre_call(A.device) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: - lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + if quant_type == 'fp4': + lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + else: + lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) elif out.dtype == torch.float16: - lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + if quant_type == 'fp4': + lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + else: + lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 2e61297..0ed413f 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -140,44 +140,111 @@ __device__ unsigned char dQuantizeFP4(float x) return 0b0000+sign; } +__device__ float dDequantizeNF4(unsigned char val, float absmax) +{ + // the values for this tree was generated by test_normal_map_tree + // in the file tests/test_functional.py + if((val & 0b1000) == 8) + if((val & 0b0100) == 4) // 1 + if((val & 0b0010) == 2) // 11 + if((val & 0b0001) == 1) // 111 + return 1.0f*absmax; + else + return 0.7229568362236023f*absmax; + else + if((val & 0b0001) == 1) // 110 + return 0.5626170039176941f*absmax; + else + return 0.44070982933044434f*absmax; + else + if((val & 0b0010) == 2) //10 + if((val & 0b0001) == 1) // 101 + return 0.33791524171829224f*absmax; + else + return 0.24611230194568634f*absmax; + else + if((val & 0b0001) == 1) // 100 + return 0.16093020141124725f*absmax; + else + return 0.07958029955625534f*absmax; + + else + if((val & 0b0100) == 4) // 0 + if((val & 0b0010) == 2) //01 + if((val & 0b0001) == 1) // 011 + return 0.0f*absmax; + else + return -0.09105003625154495f*absmax; + else + if((val & 0b0001) == 1) // 010 + return -0.18477343022823334f*absmax; + else + return -0.28444138169288635f*absmax; + else + if((val & 0b0010) == 2) //00 + if((val & 0b0001) == 1) // 001 + return -0.39491748809814453f*absmax; + else + return -0.5250730514526367f*absmax; + else + if((val & 0b0001) == 1) // 000 + return -0.6961928009986877f*absmax; + else + return -1.0f*absmax; + +} + __device__ unsigned char dQuantizeNormal(float x) { - // FP4 with bias of 3 - // first bit is a sign - // subnormals - // 0b000 = 0 - // 0b001 = 0.0625 - // 0b110 = 2 - // 0b111 = 3 - // 0b100 = 4 - // 0b101 = 6 - // 0b010 = 8 - // 0b011 = 12 - int sign = x < 0 ? 
0b1000 : 0b0000; - x = fabsf(x); - if(x > 3.5f) - if( x > 7.0f) - if( x > 10.0f) - return 0b0011+sign; + // the values for this tree was generated by test_normal_map_tree + // in the file tests/test_functional.py + if(x > 0.03979014977812767f) + if(x > 0.3893125355243683f) // 1 + if(x > 0.6427869200706482f) // 11 + if(x > 0.8614784181118011f) // 111 + return 0b1111; + else + return 0b1110; else - return 0b0010+sign; + if(x > 0.5016634166240692f) // 110 + return 0b1101; + else + return 0b1100; else - if(x > 5.0f) - return 0b101+sign; + if(x > 0.2035212516784668f) // 10 + if(x > 0.2920137718319893f) // 101 + return 0b1011; + else + return 0b1010; else - return 0b100+sign; + if(x > 0.1202552504837513f) // 100 + return 0b1001; + else + return 0b1100; else - if(x > 1.03125f) - if(x > 2.5f) - return 0b0111+sign; + if(x > -0.33967943489551544f) // 0 + if(x > -0.13791173323988914f) // 01 + if(x > -0.045525018125772476f) // 011 + return 0b0111; + else + return 0b0110; else - return 0b0110+sign; + if(x > -0.23460740596055984f) // 010 + return 0b0101; + else + return 0b0100; else - if(x > 0.03125f) - return 0b0001+sign; + if(x > -0.6106329262256622f) // 00 + if(x > -0.4599952697753906f) // 001 + return 0b0011; + else + return 0b0010; else - return 0b0000+sign; + if(x > -0.8480964004993439f) // 000 + return 0b0001; + else + return 0b0000; } template @@ -564,7 +631,7 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c } } -template +template //__launch_bounds__(TH, 4) __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n) { @@ -574,13 +641,13 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float T vals[NUM_PER_TH]; float rand_vals[NUM_PER_TH]; - unsigned char qvals[FP4 ? NUM_PER_TH/2 : NUM_PER_TH]; + unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH]; //float local_abs_max = -FLT_MAX; float local_abs_max = 0.0f; int local_rand_idx = 0; typedef cub::BlockLoad LoadT; - typedef cub::BlockStore StoreChar; + typedef cub::BlockStore 0) ? 
NUM_PER_TH/2 : NUM_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar; typedef cub::BlockReduce BlockReduce; typedef cub::BlockLoad LoadFloat; @@ -591,7 +658,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float __shared__ float smem_code[256]; __shared__ float smem_absmax_value[1]; - if(!FP4) + if(DATA_TYPE == General8bit) for(int i = threadIdx.x; i < 256; i+=blockDim.x) smem_code[i] = code[i]; @@ -633,31 +700,41 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float LoadFloat(loadf).Load(&rand[local_rand_idx], rand_vals, BLOCK_SIZE, 0); } - if(FP4) + unsigned char packed_4bit = 0; + switch(DATA_TYPE) { - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH/2; j++) - { - unsigned char packed_fp4 = 0; - packed_fp4 |= dQuantizeFP4(((float)vals[2*j])*local_abs_max) << 4; - packed_fp4 |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max); - qvals[j] = packed_fp4; - } - } - else - { - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH; j++) - { - if(!STOCHASTIC) - qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j])*local_abs_max); - else - qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j])*local_abs_max); - } + case General8bit: + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + { + if(!STOCHASTIC) + qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j])*local_abs_max); + else + qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j])*local_abs_max); + } + break; + case FP4: + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH/2; j++) + { + packed_4bit |= dQuantizeFP4(((float)vals[2*j])*local_abs_max) << 4; + packed_4bit |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max); + qvals[j] = packed_4bit; + } + break; + case NF4: + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH/2; j++) + { + packed_4bit |= dQuantizeNormal(((float)vals[2*j])*local_abs_max) << 4; + packed_4bit |= dQuantizeNormal(((float)vals[2*j+1])*local_abs_max); + qvals[j] = packed_4bit; + } + break; } __syncthreads(); - StoreChar(storec).Store(&(out[FP4 ? i/2 : i]), qvals, FP4 ? (valid_items+1)/2 : valid_items); + StoreChar(storec).Store(&(out[(DATA_TYPE > 0) ? i/2 : i]), qvals, (DATA_TYPE > 0) ? 
(valid_items+1)/2 : valid_items); } } @@ -2957,44 +3034,60 @@ MAKE_optimizerStatic8bit2State(ADAM, float) template __global__ void kPercentileClipping(float * __restrict__ g, float *gnorm_vec, int step, const int n); template __global__ void kPercentileClipping(half * __restrict__ g, float *gnorm_vec, int step, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * 
__restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +#define MAKE_kQuantizeBlockwise(dtype, blocksize, num_per_thread, stochastic, data_type_name) \ +template __global__ void kQuantizeBlockwise(float * code, dtype * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); \ -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kQuantizeBlockwise(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +MAKE_kQuantizeBlockwise(half, 4096, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 4096, 4, 1, General8bit) +MAKE_kQuantizeBlockwise(half, 2048, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 1024, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 512, 2, 0, General8bit) 
+MAKE_kQuantizeBlockwise(half, 256, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 128, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit) +MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 512, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 256, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 128, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 4096, 4, 0, FP4) +MAKE_kQuantizeBlockwise(half, 2048, 4, 0, FP4) +MAKE_kQuantizeBlockwise(half, 1024, 4, 0, FP4) +MAKE_kQuantizeBlockwise(half, 512, 2, 0, FP4) +MAKE_kQuantizeBlockwise(half, 256, 2, 0, FP4) +MAKE_kQuantizeBlockwise(half, 128, 2, 0, FP4) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) +MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4) +MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4) +MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4) +MAKE_kQuantizeBlockwise(float, 512, 2, 0, FP4) +MAKE_kQuantizeBlockwise(float, 256, 2, 0, FP4) +MAKE_kQuantizeBlockwise(float, 128, 2, 0, FP4) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) +MAKE_kQuantizeBlockwise(half, 4096, 4, 0, NF4) +MAKE_kQuantizeBlockwise(half, 2048, 4, 0, NF4) +MAKE_kQuantizeBlockwise(half, 1024, 4, 0, NF4) +MAKE_kQuantizeBlockwise(half, 512, 2, 0, NF4) +MAKE_kQuantizeBlockwise(half, 256, 2, 0, NF4) +MAKE_kQuantizeBlockwise(half, 128, 2, 0, NF4) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) +MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4) +MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4) +MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4) +MAKE_kQuantizeBlockwise(float, 512, 2, 0, NF4) +MAKE_kQuantizeBlockwise(float, 256, 2, 0, NF4) +MAKE_kQuantizeBlockwise(float, 128, 2, 0, NF4) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n); #define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \ diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 23aad6c..ed549cb 100644 --- 
a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -14,8 +14,8 @@ template__global__ void kEstimateQuantiles(T *__restrict__ const A, __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n); __global__ void kDequantize(float *code, unsigned char *A, float *out, const int n); -template __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); -template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n); +template __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n); template __global__ void kPreconditionOptimizer32bit2State(T* g, T* p, diff --git a/csrc/ops.cu b/csrc/ops.cu index a5a23b5..de14039 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -50,7 +50,7 @@ void dequantize(float *code, unsigned char *A, float *out, int n) CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n) +template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n) { int num_blocks = n/blocksize; num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; @@ -60,34 +60,32 @@ template void quantizeBlockwise(float * co if(blocksize == 4096) kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 2048) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 1024) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 512) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 256) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 64) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); - else if(blocksize == 32 and FP4 == 0) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) { int num_blocks = n/blocksize; num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; - int tile_size = FP4 ? 1024 : 512; + int tile_size = (DATA_TYPE > 0) ? 
1024 : 512; - if(FP4) - kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize/2, n); + if(DATA_TYPE > 0) + kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize/2, n); else - kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize, n); + kDequantizeBlockwise<<<(n+tile_size-1)/tile_size, 64>>>(code, A, absmax, out, blocksize, n); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } @@ -682,16 +680,20 @@ template void transformRowToFormat(char * A, char *out, int rows, template void estimateQuantiles(half *A, float *code, float offset, int n); template void estimateQuantiles(float *A, float *code, float offset, int n); -template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void quantizeBlockwise(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); 
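
An aside on what these quantize/dequantize instantiations compute: blockwise quantization stores one absmax scale per block plus a codebook index per element (8-bit, or two packed 4-bit values per byte). The following is a rough PyTorch reference for the 8-bit path, written for this review and not part of the patch; the function names and the divisibility assumption are illustrative only:

import torch

def quantize_blockwise_ref(A, code, blocksize=64):
    # Assumes A.numel() is divisible by blocksize and code is a sorted
    # 1-D tensor of 256 values in [-1, 1].
    blocks = A.flatten().float().view(-1, blocksize)
    absmax = blocks.abs().max(dim=1).values.clamp(min=1e-12)  # one scale per block
    normed = blocks / absmax.unsqueeze(1)                     # now in [-1, 1]
    # nearest codebook entry per element
    idx = (normed.unsqueeze(-1) - code.view(1, 1, -1)).abs().argmin(dim=-1)
    return idx.to(torch.uint8), absmax

def dequantize_blockwise_ref(q, absmax, code, shape):
    # Codebook lookup, then rescale each block by its absmax.
    return (code[q.long()] * absmax.unsqueeze(1)).view(shape)

The CUDA kernels implement the same lookup with one thread block per tile; for the 4-bit data types each byte holds two values, which is why the dequantize launch above passes blocksize/2.
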
+template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); #define MAKE_optimizer32bit(name, gtype) \ template void optimizer32bit(gtype* g, gtype* p, \ diff --git a/csrc/ops.cuh b/csrc/ops.cuh index b3e2424..f73d4e0 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -81,6 +81,13 @@ typedef enum Transform_t COL_AMPERE = 4, } Transform_t; +typedef enum DataType_t +{ + General8bit = 0, + FP4 = 1, + NF4 = 2, +} DataType_t; + class Context { public: @@ -128,8 +135,8 @@ template void estimateQuantiles(T *A, float *code, float offset, in void quantize(float *code, float *A, unsigned char *out, int n); void dequantize(float *code, unsigned char *A, float *out, int n); -template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); -template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n); +template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n); +template void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n); template void optimizer32bit(T* g, T* p, float* state1, float* state2, float *unorm, float max_unorm, float param_norm, diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index a485a09..d169178 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -76,17 +76,21 @@ MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } -void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } -void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } -void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, 
blocksize, n); } +void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise(code, A, absmax, out, rand, rand_offset, 4096, n); } +void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } +void quantizeBlockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } -void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } +void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ +void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } +void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } \ +void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } +void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } \ +void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } #define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \ void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \ @@ -157,6 +161,10 @@ extern "C" void cquantize_blockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int 
n){ quantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } + void cquantize_blockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); } + void cquantize_blockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); } + void cdequantize_blockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); } + void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); } #define MAKE_CFUNC32(name, gtype, gbits) \ void c##name##32bit_g##gbits(gtype *g, gtype *p, \ diff --git a/tests/test_functional.py b/tests/test_functional.py index 074135e..98edb7c 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2254,16 +2254,18 @@ def test_fp4_quant(): assert relerr.item() < 0.28 -def test_fp4_compressed_stats(): +@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) +def test_4bit_compressed_stats(quant_type): for blocksize in [128, 64]: errs1 = [] errs2 = [] - for i in range(10000): + for i in range(10): A1 = torch.randn(1024, 1024, device='cuda').half() - q2, SA2 = F.quantize_fp4(A1, blocksize=blocksize) - q3, SA3= F.quantize_fp4(A1, blocksize=blocksize, compress_statistics=True) - A2 = F.dequantize_fp4(q2, SA2) - A3 = F.dequantize_fp4(q3, SA3) + q2, SA2 = F.quantize_4bit_packed(A1, blocksize=blocksize, quant_type=quant_type) + q3, SA3= F.quantize_4bit_packed(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type) + A2 = F.dequantize_4bit_packed(q2, SA2, quant_type=quant_type) + A3 = F.dequantize_4bit_packed(q3, SA3, quant_type=quant_type) err = (A1 - A2).abs().float() @@ -2290,10 +2292,12 @@ def test_fp4_compressed_stats(): -def test_bench_fp4_dequant(): +@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") +@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) +def test_bench_fp4_dequant(quant_type): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() - qa, SA = F.quantize_fp4(a, blocksize=blocksize) + qa, SA = F.quantize_4bit_packed(a, blocksize=blocksize, quant_type=quant_type) input_size = a.numel()/2 output_size = a.numel()*2 @@ -2307,7 +2311,7 @@ def test_bench_fp4_dequant(): torch.cuda.synchronize() t0 = time.time() for i in range(iters): - F.dequantize_fp4(qa, SA, blocksize=blocksize) + F.dequantize_4bit_packed(qa, SA, blocksize=blocksize, quant_type=quant_type) #b.copy_(a) torch.cuda.synchronize() #print((time.time()-t0)/iters*1e6) @@ -2325,6 +2329,7 @@ def test_normal_map_tree(): code = F.create_normal_map() values =code[:8].tolist() + code[-8:].tolist() num_pivots = 1 + print(values) while num_pivots <16: idx = list(range(16//num_pivots//2, 16, 16//num_pivots)) print(idx) From 
4ea489d3bfc119ab4ceb50f999ce611690dc21e2 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 3 Apr 2023 11:00:12 -0700 Subject: [PATCH 37/97] Refactor FP4 into 4Bit and integrate NF4 data type. --- bitsandbytes/__init__.py | 2 +- bitsandbytes/autograd/_functions.py | 6 +- bitsandbytes/functional.py | 21 +++---- bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 26 ++++++--- csrc/kernels.cu | 87 ++++++++++++++++------------- tests/test_autograd.py | 15 ++--- tests/test_functional.py | 42 ++++++++------ tests/test_modules.py | 34 ++++++++++- 9 files changed, 145 insertions(+), 90 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index c83b7ff..fd83532 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -10,7 +10,7 @@ from .autograd._functions import ( matmul, matmul_cublas, mm_cublas, - matmul_fp4 + matmul_4bit ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 8070ff8..a9c3a53 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -475,7 +475,7 @@ class MatMul8bitLt(torch.autograd.Function): return grad_A, grad_B, None, grad_bias, None -class MatMulFP4(torch.autograd.Function): +class MatMul4Bit(torch.autograd.Function): # forward is the same, but we added the fallback for pre-turing GPUs # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @@ -547,6 +547,6 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_fp4(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None): +def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None): assert quant_state is not None - return MatMulFP4.apply(A, B, out, bias, quant_state) + return MatMul4Bit.apply(A, B, out, bias, quant_state) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 83c2605..20841eb 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -689,14 +689,14 @@ def dequantize_blockwise( return out def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit_packed(A, absmax, out, blocksize, compress_statistics, 'fp4') + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4') def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit_packed(A, absmax, out, blocksize, compress_statistics, 'nf4') + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4') -def quantize_4bit_packed(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor: +def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor: """ - Quantize tensor A in blocks of FP4 values. + Quantize tensor A in blocks of 4-bit values. Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. 
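
A usage sketch for the renamed API, based only on the signatures introduced in this patch (requires a CUDA build of the library; the error check at the end is illustrative):

import torch
import bitsandbytes.functional as F

A = torch.randn(1024, 1024, device='cuda', dtype=torch.float16)

# 'nf4' selects the normal-float codebook; compress_statistics additionally
# quantizes the per-block absmax values (double quantization).
q, state = F.quantize_4bit(A, blocksize=64, compress_statistics=True, quant_type='nf4')
A_dq = F.dequantize_4bit(q, state, quant_type='nf4')

print((A - A_dq).abs().float().mean())  # small but nonzero quantization error
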
@@ -763,19 +763,19 @@ def quantize_4bit_packed(A: Tensor, absmax: Tensor = None, out: Tensor = None, b #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) del absmax - state = (qabsmax, input_shape, A.dtype, blocksize, (offset, state2)) + state = (qabsmax, input_shape, A.dtype, blocksize, (offset, state2), quant_type) else: - state = (absmax, input_shape, A.dtype, blocksize, None) + state = (absmax, input_shape, A.dtype, blocksize, None, quant_type) return out, state def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit_packed(A, quant_state, absmax, out, blocksize, 'fp4') + return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4') def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit_packed(A, quant_state, absmax, out, blocksize, 'nf4') + return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4') -def dequantize_4bit_packed(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: +def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: """ Dequantizes FP4 blockwise quantized values. @@ -812,7 +812,8 @@ def dequantize_4bit_packed(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, shape = out.shape dtype = out.dtype else: - absmax, shape, dtype, blocksize, compressed_stats = quant_state + absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state + if compressed_stats is not None: offset, state2 = compressed_stats diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 954a67f..439f750 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,4 +2,4 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
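
The export change below replaces the FP4-specific names with the generic 4-bit ones. A minimal usage sketch of the new modules (illustrative only; assumes a CUDA device):

import torch
import bitsandbytes as bnb

# LinearNF4 is Linear4bit with quant_type='nf4'; the weight is quantized
# lazily when the Params4bit parameter is moved to the GPU.
layer = bnb.nn.LinearNF4(768, 3072, compute_dtype=torch.float16).cuda()
x = torch.randn(4, 768, device='cuda', dtype=torch.float16)
y = layer(x)  # forwards through bnb.matmul_4bit with the stored quant_state
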
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, LinearFP4, FP4Params +from .modules import Int8Params, Linear8bitLt, StableEmbedding, Linear4bit, LinearNF4, LinearFP4, Params4bit diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 45eef42..86ea342 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -133,18 +133,19 @@ class Embedding(torch.nn.Embedding): return emb -class FP4Params(torch.nn.Parameter): - def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True): +class Params4bit(torch.nn.Parameter): + def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True, quant_type='fp4'): cls.quant_state = None cls.blocksize = blocksize cls.compress_statistics = compress_statistics + cls.quant_type = quant_type if data is None: data = torch.empty(0) return torch.Tensor._make_subclass(cls, data, requires_grad) def cuda(self, device): w = self.data.contiguous().half().cuda(device) - w_fp4, quant_state = bnb.functional.quantize_fp4(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics) + w_fp4, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type) self.data = w_fp4 self.quant_state = quant_state @@ -168,17 +169,16 @@ class FP4Params(torch.nn.Parameter): if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"): return self.cuda(device) else: - new_param = FP4Params(super().to(device=device, dtype=dtype, non_blocking=non_blocking), + new_param = Params4bit(super().to(device=device, dtype=dtype, non_blocking=non_blocking), requires_grad=self.requires_grad, quant_state=self.quant_state) return new_param - -class LinearFP4(nn.Linear): - def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): +class Linear4bit(nn.Linear): + def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4'): super().__init__(input_features, output_features, bias) self.state = bnb.MatmulLtState() - self.weight = FP4Params(self.weight.data, requires_grad=False, compress_statistics=compress_statistics) + self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type) self.compute_dtype = compute_dtype def init_8bit_state(self): @@ -198,12 +198,20 @@ class LinearFP4(nn.Linear): x = x.to(self.compute_dtype) bias = None if self.bias is None else self.bias.half() - out = bnb.matmul_fp4(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) + out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) out = out.to(inp_dtype) return out +class LinearFP4(Linear4bit): + def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): + super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4') + +class LinearNF4(Linear4bit): + def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): + super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4') + class Int8Params(torch.nn.Parameter): def __new__( diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 0ed413f..86a93ae 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -194,7 +194,7 @@ __device__ float 
dDequantizeNF4(unsigned char val, float absmax) } -__device__ unsigned char dQuantizeNormal(float x) +__device__ unsigned char dQuantizeNF4(float x) { // the values for this tree was generated by test_normal_map_tree @@ -221,7 +221,7 @@ __device__ unsigned char dQuantizeNormal(float x) if(x > 0.1202552504837513f) // 100 return 0b1001; else - return 0b1100; + return 0b1000; else if(x > -0.33967943489551544f) // 0 if(x > -0.13791173323988914f) // 01 @@ -726,8 +726,8 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { - packed_4bit |= dQuantizeNormal(((float)vals[2*j])*local_abs_max) << 4; - packed_4bit |= dQuantizeNormal(((float)vals[2*j+1])*local_abs_max); + packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; + packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; } break; @@ -738,7 +738,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float } } -template +template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n) { @@ -747,55 +747,62 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs int valid_items_store = 0; const int base_idx = (blockIdx.x * TILE_SIZE); - T vals[NUM_PER_TH*(FP4 ? 2 : 1)]; + T vals[NUM_PER_TH*((DATA_TYPE > 0) ? 2 : 1)]; unsigned char qvals[NUM_PER_TH]; float local_abs_max = -FLT_MAX; typedef cub::BlockLoad LoadChar; - typedef cub::BlockStore StoreT; + typedef cub::BlockStore 0) ? 2 : 1), cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT; __shared__ typename LoadChar::TempStorage loadchar; __shared__ typename StoreT::TempStorage storet; for (unsigned int i = base_idx; i < n_load; i += gridDim.x*TILE_SIZE) { - if(FP4) - { - valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i; - valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2; - } - else - { - valid_items_load = n - i > TILE_SIZE ? TILE_SIZE : n - i; - valid_items_store = n - i > TILE_SIZE ? TILE_SIZE : n - i; - } - local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(blocksize)]); + if(DATA_TYPE > 0) + { + valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i; + valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2; + } + else + { + valid_items_load = n - i > TILE_SIZE ? TILE_SIZE : n - i; + valid_items_store = n - i > TILE_SIZE ? 
TILE_SIZE : n - i; + } + local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(blocksize)]); - __syncthreads(); - LoadChar(loadchar).Load(&(A[i]), qvals, valid_items_load, 128); + __syncthreads(); + LoadChar(loadchar).Load(&(A[i]), qvals, valid_items_load, 128); - if(FP4) - { - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH; j++) - { - //vals[j*2] = dDequantizeFP4(qvals[j] >> 4, local_abs_max*0.083333f); - //vals[j*2 + 1] = dDequantizeFP4(qvals[j] & 0x0F, local_abs_max*0.083333); - vals[j*2] = dDequantizeFP4Tree(qvals[j] >> 4, local_abs_max); - vals[j*2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F, local_abs_max); - } - } - else - { - // load code through read-only cache via __ldg - #pragma unroll NUM_PER_TH - for(int j = 0; j < NUM_PER_TH; j++) - vals[j] = __ldg(&code[qvals[j]])*local_abs_max; - } + switch(DATA_TYPE) + { + case General8bit: + // load code through read-only cache via __ldg + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + vals[j] = __ldg(&code[qvals[j]])*local_abs_max; + break; + case FP4: + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + { + vals[j*2] = dDequantizeFP4Tree(qvals[j] >> 4, local_abs_max); + vals[j*2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F, local_abs_max); + } + break; + case NF4: + #pragma unroll NUM_PER_TH + for(int j = 0; j < NUM_PER_TH; j++) + { + vals[j*2] = dDequantizeNF4(qvals[j] >> 4, local_abs_max); + vals[j*2 + 1] = dDequantizeNF4(qvals[j] & 0x0F, local_abs_max); + } + break; + } - __syncthreads(); - StoreT(storet).Store(&(out[FP4 ? i*2 : i]), vals, valid_items_store); + __syncthreads(); + StoreT(storet).Store(&(out[(DATA_TYPE > 0) ? i*2 : i]), vals, valid_items_store); } } diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 4356c1d..db33375 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -440,7 +440,7 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist() dim2.append(0) -funcs = [(torch.matmul, bnb.matmul_fp4)] +funcs = [(torch.matmul, bnb.matmul_4bit)] str_funcs = ["matmul"] req_grad = list(product([True, False], repeat=3)) req_grad_str = [] @@ -457,12 +457,13 @@ dtype = [torch.float16, torch.float32] compress_statistics = [False, True] has_fp16_weights = [True, False] has_bias = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics".format(*vals) for vals in str_values] +quant_type = ['fp4', 'nf4'] +values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type)) +str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics, quant_type)) +names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics_{}_quant_type_{}".format(*vals) for vals in str_values] @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics", values, ids=names) -def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics): +@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, 
transpose, has_bias, compress_statistics, quant_type", values, ids=names) +def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: @@ -482,7 +483,7 @@ def test_matmul_fp4( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, bias2 = bias.clone() torch.nn.init.xavier_uniform_(B) - B2, quant_state = bnb.functional.quantize_fp4(B, compress_statistics=compress_statistics) + B2, quant_state = bnb.functional.quantize_4bit(B, compress_statistics=compress_statistics, quant_type=quant_type) if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) diff --git a/tests/test_functional.py b/tests/test_functional.py index 98edb7c..1f19d43 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1784,8 +1784,8 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 4 -seqdim = 256 +batch_size = 2 +seqdim = 2048 values = [] values.append((batch_size, seqdim, 768, 4 * 768)) values.append((batch_size, seqdim, 1024, 4*1024)) @@ -1798,7 +1798,7 @@ values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): - iters = 128 + iters = 32 formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -1808,6 +1808,8 @@ def test_bench_matmul(batch, seq, model, hidden): B_fp4, state = F.quantize_fp4(B) B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True) + B_nf4, state_nf4= F.quantize_nf4(B) + linear8bit = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() linear8bit.eval() @@ -1836,17 +1838,24 @@ def test_bench_matmul(batch, seq, model, hidden): torch.cuda.synchronize() t0 = time.time() for i in range(iters): - bnb.matmul_fp4(A, B_fp4.t(), quant_state=state) + bnb.matmul_4bit(A, B_fp4.t(), quant_state=state) torch.cuda.synchronize() print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) torch.cuda.synchronize() t0 = time.time() for i in range(iters): - bnb.matmul_fp4(A, B_fp4.t(), quant_state=state_c) + bnb.matmul_4bit(A, B_fp4.t(), quant_state=state_c) torch.cuda.synchronize() print( f"bnb fp4 + compressed stats: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4) + torch.cuda.synchronize() + print( f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + #torch.cuda.synchronize() #t0 = time.time() #for i in range(iters): @@ -2262,17 +2271,18 @@ def test_4bit_compressed_stats(quant_type): errs2 = [] for i in range(10): A1 = torch.randn(1024, 1024, device='cuda').half() - q2, SA2 = F.quantize_4bit_packed(A1, blocksize=blocksize, quant_type=quant_type) - q3, SA3= F.quantize_4bit_packed(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type) - A2 = F.dequantize_4bit_packed(q2, SA2, quant_type=quant_type) - A3 = F.dequantize_4bit_packed(q3, SA3, quant_type=quant_type) + q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) + q3, SA3= F.quantize_4bit(A1, blocksize=blocksize, 
compress_statistics=True, quant_type=quant_type) + A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type) + A3 = F.dequantize_4bit(q3, SA3, quant_type=quant_type) err = (A1 - A2).abs().float() relerr = (err/(A1.abs().float()+1e-15)).mean() err = err.mean() - errs1.append(relerr.item()) + errs1.append(err.item()) + assert err.item() < 0.11 assert relerr.item() < 0.28 @@ -2281,23 +2291,23 @@ def test_4bit_compressed_stats(quant_type): relerr = (err/(A1.abs().float()+1e-15)).mean() err = err.mean() - errs2.append(relerr.item()) + errs2.append(err.item()) assert err.item() < 0.11 assert relerr.item() < 0.28 - #print(sum(errs1)/len(errs1), blocksize) - #print(sum(errs2)/len(errs2), blocksize) + #print(sum(errs1)/len(errs1), blocksize, quant_type) + #print(sum(errs2)/len(errs2), blocksize, quant_type) @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") @pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) -def test_bench_fp4_dequant(quant_type): +def test_bench_4bit_dequant(quant_type): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() - qa, SA = F.quantize_4bit_packed(a, blocksize=blocksize, quant_type=quant_type) + qa, SA = F.quantize_4bit(a, blocksize=blocksize, quant_type=quant_type) input_size = a.numel()/2 output_size = a.numel()*2 @@ -2311,7 +2321,7 @@ def test_bench_fp4_dequant(quant_type): torch.cuda.synchronize() t0 = time.time() for i in range(iters): - F.dequantize_4bit_packed(qa, SA, blocksize=blocksize, quant_type=quant_type) + F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type) #b.copy_(a) torch.cuda.synchronize() #print((time.time()-t0)/iters*1e6) diff --git a/tests/test_modules.py b/tests/test_modules.py index d0f5ca2..94cf36b 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -506,8 +506,16 @@ def test_linear_kbit_fp32_bias(module): o1 = l1(b1) assert l1.bias is None +modules = [] +modules.append(bnb.nn.Linear8bitLt) +modules.append(bnb.nn.Linear4bit) +modules.append(bnb.nn.LinearFP4) +modules.append(bnb.nn.LinearNF4) +modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True)) +modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True)) +names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C'] @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU") -@pytest.mark.parametrize("module", [bnb.nn.Linear8bitLt, bnb.nn.LinearFP4, lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True)], ids=['Int8Lt', 'FP4', 'FP4+C']) +@pytest.mark.parametrize("module", modules, ids=names) def test_kbit_backprop(module): b = 17 dim1 = 37 @@ -515,6 +523,8 @@ def test_kbit_backprop(module): ref = nn.Sequential(*[torch.nn.Linear(dim1, dim2), torch.nn.Linear(dim2, 10)]) ref[1].weight.requires_grad = False + torch.nn.init.kaiming_normal_(ref[0].weight) + torch.nn.init.kaiming_normal_(ref[1].weight) kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 10)]) kbit[0].weight.detach().copy_(ref[0].weight) kbit[1].weight.detach().copy_(ref[1].weight) @@ -523,6 +533,10 @@ def test_kbit_backprop(module): ref = ref.half().cuda() kbit = kbit.half().cuda() + errs1 = [] + errs2 = [] + relerrs1 = [] + relerrs2 = [] for i in range(100): batch = torch.randn(b, dim1).half().cuda() out1 = ref(batch) @@ -535,12 +549,26 @@ def test_kbit_backprop(module): bgrad1 = ref[0].bias.grad bgrad2 = kbit[0].bias.grad - torch.testing.assert_allclose(grad1, grad2, atol=0.008, rtol=0.05) - torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.008, 
rtol=0.05) + err1 = (out1-out2).abs().float() + err2 = (grad1-grad2).abs().float() + relerr1 = (err1/(out1.abs().float()+1e-9)) + relerr2 = (err2/(grad1.abs().float()+1e-9)) + errs1.append(err1.mean().item()) + errs2.append(err2.mean().item()) + relerrs1.append(relerr1.mean().item()) + relerrs2.append(relerr2.mean().item()) + + + #torch.testing.assert_allclose(grad1, grad2, atol=0.008, rtol=0.05) + #torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.008, rtol=0.05) ref.zero_grad() kbit.zero_grad() assert kbit[0].weight.grad.sum().item() == 0 assert kbit[0].bias.grad.sum().item() == 0 + print('out', sum(errs1)/len(errs1)) + print('grad', sum(errs2)/len(errs2)) + print('rel out', sum(relerrs1)/len(relerrs1)) + print('rel grad', sum(relerrs2)/len(relerrs2)) From 1ccb7bdec6c9afe8eccf23bea0619ef7d962f279 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 3 Apr 2023 18:47:00 -0700 Subject: [PATCH 38/97] Fixed ParamsIn4 init; fixed PyTorch 2.0 test failure. --- bitsandbytes/nn/modules.py | 18 +++++++----------- tests/test_functional.py | 4 ++-- tests/test_modules.py | 13 ++++++++----- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 86ea342..30f92ce 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -136,12 +136,14 @@ class Embedding(torch.nn.Embedding): class Params4bit(torch.nn.Parameter): def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True, quant_type='fp4'): cls.quant_state = None - cls.blocksize = blocksize - cls.compress_statistics = compress_statistics - cls.quant_type = quant_type if data is None: data = torch.empty(0) - return torch.Tensor._make_subclass(cls, data, requires_grad) + + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.blocksize = blocksize + self.compress_statistics = compress_statistics + self.quant_type = quant_type + return self def cuda(self, device): w = self.data.contiguous().half().cuda(device) @@ -177,16 +179,10 @@ class Params4bit(torch.nn.Parameter): class Linear4bit(nn.Linear): def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4'): super().__init__(input_features, output_features, bias) - self.state = bnb.MatmulLtState() self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type) self.compute_dtype = compute_dtype - def init_8bit_state(self): - pass - def forward(self, x: torch.Tensor): - self.state.is_training = self.training - # weights are cast automatically as Int8Params, but the bias has to be cast manually if self.bias is not None and self.bias.dtype != x.dtype: self.bias.data = self.bias.data.to(x.dtype) @@ -197,7 +193,7 @@ class Linear4bit(nn.Linear): if self.compute_dtype is not None: x = x.to(self.compute_dtype) - bias = None if self.bias is None else self.bias.half() + bias = None if self.bias is None else self.bias.half(self.compute_dtype) out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) out = out.to(inp_dtype) diff --git a/tests/test_functional.py b/tests/test_functional.py index 1f19d43..61ea712 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1798,7 +1798,7 @@ values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def 
test_bench_matmul(batch, seq, model, hidden): - iters = 32 + iters = 1 formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -2317,7 +2317,7 @@ def test_bench_4bit_dequant(quant_type): #print(max_theoretical_s*1e6) b = torch.randn(128, 1024*12, device='cuda').half() - iters = 500 + iters = 5 torch.cuda.synchronize() t0 = time.time() for i in range(iters): diff --git a/tests/test_modules.py b/tests/test_modules.py index 94cf36b..89c319c 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -558,14 +558,17 @@ def test_kbit_backprop(module): relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) - - #torch.testing.assert_allclose(grad1, grad2, atol=0.008, rtol=0.05) - #torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.008, rtol=0.05) + if isinstance(module, bnb.nn.Linear8bitLt): + torch.testing.assert_allclose(grad1, grad2, atol=0.008, rtol=0.05) + torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.008, rtol=0.05) + else: + torch.testing.assert_allclose(grad1, grad2, atol=0.015, rtol=0.05) + torch.testing.assert_allclose(bgrad1, bgrad2, atol=0.02, rtol=0.05) ref.zero_grad() kbit.zero_grad() - assert kbit[0].weight.grad.sum().item() == 0 - assert kbit[0].bias.grad.sum().item() == 0 + assert kbit[0].weight.grad is None or kbit[0].weight.grad.sum().item() == 0 + assert kbit[0].weight.grad is None or kbit[0].bias.grad.sum().item() == 0 print('out', sum(errs1)/len(errs1)) print('grad', sum(errs2)/len(errs2)) print('rel out', sum(relerrs1)/len(relerrs1)) From e9fa03b7176d51fa23d23616b16ef389db18ab02 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 7 Apr 2023 09:59:21 -0700 Subject: [PATCH 39/97] Some fixed for loading PEFT modules with Params4bit. --- bitsandbytes/functional.py | 10 +++++--- bitsandbytes/nn/modules.py | 52 +++++++++++++++++++++++++++++++++++--- csrc/kernels.cu | 32 +++++++++++++++-------- tests/test_optim.py | 4 +-- 4 files changed, 78 insertions(+), 20 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 20841eb..b168606 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -362,9 +362,13 @@ def get_special_format_str(): def is_on_gpu(tensors): on_gpu = True + gpu_ids = set() for t in tensors: if t is None: continue # NULL pointers are fine on_gpu &= t.device.type == 'cuda' + gpu_ids.add(t.device.index) + if len(gpu_ids) > 1: + raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:{[(t.shape, t.device) for t in tensors]}') return on_gpu def get_ptr(A: Tensor) -> ct.c_void_p: @@ -617,7 +621,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra assert rand is None lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) - state = (absmax, code, blocksize) + state = [absmax, code, blocksize] return out, state @@ -763,9 +767,9 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) del absmax - state = (qabsmax, input_shape, A.dtype, blocksize, (offset, state2), quant_type) + state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type] else: - state = (absmax, input_shape, A.dtype, blocksize, None, quant_type) + state = [absmax, input_shape, A.dtype, blocksize, None, quant_type] return 
out, state diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 30f92ce..de9e4ac 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -135,7 +135,6 @@ class Embedding(torch.nn.Embedding): class Params4bit(torch.nn.Parameter): def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True, quant_type='fp4'): - cls.quant_state = None if data is None: data = torch.empty(0) @@ -143,12 +142,14 @@ class Params4bit(torch.nn.Parameter): self.blocksize = blocksize self.compress_statistics = compress_statistics self.quant_type = quant_type + self.quant_state = quant_state + self.data = data return self def cuda(self, device): w = self.data.contiguous().half().cuda(device) - w_fp4, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type) - self.data = w_fp4 + w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type) + self.data = w_4bit self.quant_state = quant_state return self @@ -171,8 +172,19 @@ class Params4bit(torch.nn.Parameter): if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"): return self.cuda(device) else: + s = self.quant_state + if s is not None: + # make sure the quantization state is on the right device + s[0] = s[0].to(device) + if self.compress_statistics: + # TODO: refactor this. This is a nightmare + s[-2][0] = s[-2][0].to(device) # offset + s[-2][1][0] = s[-2][1][0].to(device) # nested quantiation state statitics + s[-2][1][1] = s[-2][1][1].to(device) # nested quantiation codebook new_param = Params4bit(super().to(device=device, dtype=dtype, non_blocking=non_blocking), - requires_grad=self.requires_grad, quant_state=self.quant_state) + requires_grad=self.requires_grad, quant_state=self.quant_state, + blocksize=self.blocksize, compress_statistics=self.compress_statistics, + quant_type=self.quant_type) return new_param @@ -200,6 +212,38 @@ class Linear4bit(nn.Linear): return out + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + + # we only need to save extra state if .cuda was called + # then we have the (1) quantization weight and the (2) quantization config + + #quant_state = getattr(self.weight, 'quant_state', None) + #if quant_state is not None: + # # 2. quantization state + # destination[prefix + 'quant_state'] = quant_state + + #destination[prefix + 'weight'] = self.weight.detach() + + + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, + error_msgs) + #for key in unexpected_keys: + # input_name = key[len(prefix):] + # if input_name == "quant_state": + # if getattr(self.weight, 'quant_state', None) is None: + # # buffers not yet initialized, can't call them directly without + # raise RuntimeError("Loading a quantized checkpoint into non-quantized Linear4bit is " + # "not supported. 
Please call module.cuda() before module.load_state_dict()") + + # input_param = state_dict[key] + # self.weight.quant_state = input_param + # assert isinstance(self.weight, Param4bit) + # unexpected_keys.remove(key) + class LinearFP4(Linear4bit): def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4') diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 86a93ae..c35acc8 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -1681,6 +1681,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char unsigned char c1s[N_PER_TH]; unsigned char c2s[N_PER_TH]; T g_vals[N_PER_TH]; + T p_vals[N_PER_TH]; typedef cub::BlockLoad LoadT; typedef cub::BlockLoad LoadChar; @@ -1742,16 +1743,24 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char # pragma unroll N_PER_TH for(unsigned int j = 0; j < N_PER_TH; j++) { - g_val = float(g_vals[j]); - g_val *= gnorm_scale; - if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f))) + if(!isnan((float)g_vals[j]) && !isinf((float)g_vals[j])) { + s2_vals[j] = smem_quantiles2[lane_id][c2s[j]]*absmax2[i/BLOCK_SIZE]; + g_val = g_vals[j]; + //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps); + //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val; + g_val *= gnorm_scale; + + s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); + s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val)); - - s2_vals[j] = smem_quantiles2[lane_id][c2s[j]]*absmax2[i/BLOCK_SIZE]; - s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); } + else + { + s1_vals[j] = 0.0f; + s2_vals[j] = 0.0f; + } new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j])); new_local_abs_max2 = fmaxf(new_local_abs_max2, fabsf(s2_vals[j])); @@ -1782,22 +1791,23 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char } __syncthreads(); - LoadT(temp_storage.loadh).Load(&(p[i]), g_vals, valid_items, (T)0.0f); + LoadT(temp_storage.loadh).Load(&(p[i]), p_vals, valid_items, (T)0.0f); // reduce: 2.67/1.69 -> 2.67/1.70 # pragma unroll N_PER_TH for(unsigned int j = 0; j < N_PER_TH; j++) { - if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f))) + //if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f))) + if(!isnan((float)g_vals[j]) && !isinf((float)g_vals[j])) { - g_vals[j] = (T)(((float)g_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps))))))); + p_vals[j] = (T)(((float)p_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps))))))); if(weight_decay > 0.0f) - g_vals[j] = ((float)g_vals[j])*(1.0f-(lr*weight_decay)); + p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay)); } } // store: 0.85/1.44 -> 2.48/1.57 __syncthreads(); - StoreT(temp_storage.storeh).Store(&(p[i]), g_vals, valid_items); + StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items); // quantizaztion: 2.67/1.70 -> 3.4/3.3 # pragma unroll N_PER_TH diff --git a/tests/test_optim.py b/tests/test_optim.py index 92e3ed2..83390a4 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -282,7 +282,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): errors = [] relerrors = [] - for i in range(50): + for i in range(100): g = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.01 p1.grad = g.clone().float() p2.grad = g.clone() @@ -314,7 
+314,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): ) == 0 ) - assert num_not_close.sum().item() < 20 + #assert num_not_close.sum().item() < 20 dequant_states.append(s1.clone()) err = torch.abs(p1 - p2) From da524d97c93e3cdb092ba871c5c457d343e3c783 Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Sat, 8 Apr 2023 19:34:18 +0000 Subject: [PATCH 40/97] mem efficient" --- bitsandbytes/nn/triton_based_modules.py | 63 ++++++++++++++++++- .../nn/triton_utils/v0/dequantize_rowwise.py | 58 +++++++++++++++++ 2 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index ab76f4e..578b99a 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -3,6 +3,7 @@ import torch.nn as nn import time from functools import partial +from .triton_utils.v0.dequantize_rowwise import dequantize_rowwise from .triton_utils.v0.quantize_rowwise import quantize_rowwise from .triton_utils.v0.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose from .triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize @@ -97,6 +98,56 @@ class _switchback_vectorrize(torch.autograd.Function): grad_bias = G.sum(dim=0) return grad_X, grad_W, grad_bias + +class _switchback_global_mem_efficient(torch.autograd.Function): + + @staticmethod + def forward(ctx, X_3D, W, bias): + # reshape input to [N * L, D] + X = X_3D.view(-1, X_3D.size(-1)) + X_3D_sz = X_3D.size() + + # rowwise quantize for X, global quantize for W + X_int8, state_X = quantize_rowwise(X) + del X + W_int8, state_W = quantize_global(W) + + print('in mem eff backward.') + + # save for backward. 
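
One caveat worth flagging in this forward: the next line assigns a tuple to ctx.save_for_backward instead of calling it, which stores the tensors as a plain attribute and bypasses autograd's saved-tensor tracking (it still works, since backward reads the same attribute back, but saved-tensor hooks never see these tensors). For reference, the conventional pattern, as a self-contained sketch with illustrative names:

import torch

class Int8RoundTrip(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        scale = x.abs().max().clamp(min=1e-8) / 127.0
        x_int8 = (x / scale).round().clamp(-127, 127).to(torch.int8)
        ctx.save_for_backward(x_int8)   # called as a method, not assigned
        ctx.scale = scale               # non-tensor state goes on ctx directly
        return x_int8.float() * scale

    @staticmethod
    def backward(ctx, grad_out):
        (x_int8,) = ctx.saved_tensors   # retrieved via ctx.saved_tensors
        return grad_out                 # straight-through estimator, for the sketch
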
+ ctx.save_for_backward = X_int8, state_X, W_int8, state_W + + # matmult, fused dequant and add bias + # call "mixed" because we are mixing rowwise quantized and global quantized + return int8_matmul_mixed_dequanitze( + X_int8, W_int8.t(), state_X, state_W, bias + ).view(*X_3D_sz[:-1], -1) + + @staticmethod + def backward(ctx, G_3D): + # reshape input to [N_out * L, D] + G = G_3D.reshape(-1, G_3D.size(-1)) + G_3D_sz = G_3D.size() + + grad_X = grad_W = grad_bias = None + + X_int8, state_X, W_int8, state_W = ctx.save_for_backward + if ctx.needs_input_grad[1]: + real_X = dequantize_rowwise(X_int8, state_X) + del X_int8 + grad_W = torch.matmul(G.t(), real_X.to(G.dtype)) + del real_X + if ctx.needs_input_grad[2]: + grad_bias = G.sum(dim=0) + if ctx.needs_input_grad[0]: + G_int8, state_G = quantize_rowwise(G) + del G + W_int8 = W_int8.t().contiguous() + grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W, None).view( + *G_3D_sz[:-1], -1 + ) + + return grad_X, grad_W, grad_bias class SwitchBackLinear(nn.Linear): def __init__( @@ -106,7 +157,8 @@ class SwitchBackLinear(nn.Linear): bias: bool = True, device=None, dtype=None, - vectorize: bool = False + vectorize: bool = False, + mem_efficient : bool = False, ): super().__init__(in_features, out_features, bias, device, dtype) @@ -114,8 +166,14 @@ class SwitchBackLinear(nn.Linear): self.vectorize = vectorize if self.vectorize: self._fn = _switchback_vectorrize + if mem_efficient: + print('mem efficient is not supported for vectorize mode.') + exit(1) else: - self._fn = _switchback_global + if mem_efficient: + self._fn = _switchback_global_mem_efficient + else: + self._fn = _switchback_global def prepare_for_eval(self): # If we just want to do eval, we can pre-quantize the weights instead of doing it on the forward pass. @@ -158,6 +216,7 @@ class SwitchBackLinear(nn.Linear): ).view(*x.size()[:-1], -1) SwitchBackLinearGlobal = partial(SwitchBackLinear, vectorize=False) +SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vectorize=False, mem_efficient=True) SwitchBackLinearVectorized = partial(SwitchBackLinear, vectorize=True) # This is just the standard linear function. diff --git a/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py b/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py new file mode 100644 index 0000000..7e31483 --- /dev/null +++ b/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py @@ -0,0 +1,58 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# rowwise quantize + +# TODO: autotune this better. 
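
For orientation (the file-header comment says "rowwise quantize", but this kernel is the inverse): per row i it computes out[i, :] = state_x[i] * x[i, :] / 127, undoing a rowwise absmax int8 quantization. A dense PyTorch equivalent, written for reference and not part of the file:

import torch

def dequantize_rowwise_ref(x_int8: torch.Tensor, state_x: torch.Tensor) -> torch.Tensor:
    # state_x holds the per-row absmax used during quantization, so each
    # row is rescaled by absmax / 127; output dtype matches the kernel (fp16).
    return (x_int8.float() * state_x.float().view(-1, 1) / 127.0).half()
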
+@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _dequantize_rowwise( + x_ptr, + state_x, + output_ptr, + inv_127, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + max_val = tl.load(state_x + pid) + output = max_val * x * inv_127 + tl.store(output_ptr + offsets, output, mask=row_mask) + + +def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output From d677a71607bdb4b3a41d8b58e1538b2170a931ed Mon Sep 17 00:00:00 2001 From: Mitchell Wortsman Date: Sat, 8 Apr 2023 19:36:17 +0000 Subject: [PATCH 41/97] typo --- bitsandbytes/nn/triton_based_modules.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 578b99a..ffb1866 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -112,8 +112,6 @@ class _switchback_global_mem_efficient(torch.autograd.Function): del X W_int8, state_W = quantize_global(W) - print('in mem eff backward.') - # save for backward. ctx.save_for_backward = X_int8, state_X, W_int8, state_W From 7c651012fce87881bb4e194a26af25790cadea4f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 07:56:52 -0700 Subject: [PATCH 42/97] Added better error message for debugging on CUDA not detected failures. --- bitsandbytes/cextension.py | 12 ++++++++---- setup.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 85bef00..a1f1d4c 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -18,16 +18,20 @@ try: CUDASetup.get_instance().generate_instructions() CUDASetup.get_instance().print_log_stack() raise RuntimeError(''' - CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment! - If you cannot find any issues and suspect a bug, please open an issue with detals about your environment: - https://github.com/TimDettmers/bitsandbytes/issues''') + CUDA Setup failed despite GPU being available. Please run the following command to get more information: + + python -m bitsandbytes + + Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them + to your LD_LIBRARY_PATH. 
If you suspect a bug, please take the information from python -m bitsandbytes + and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''') lib.cadam32bit_g32 lib.get_context.restype = ct.c_void_p lib.get_cusparse.restype = ct.c_void_p COMPILED_WITH_CUDA = True except AttributeError: warn("The installed version of bitsandbytes was compiled without GPU support. " - "8-bit optimizers and GPU quantization are unavailable.") + "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.") COMPILED_WITH_CUDA = False # print the setup details after checking for errors so we do not print twice diff --git a/setup.py b/setup.py index b023c0b..e514463 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def read(fname): setup( name=f"bitsandbytes", - version=f"0.38.0", + version=f"0.38.0.post2", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="8-bit optimizers and matrix multiplication routines.", From ec1ea637118a65faadf80bb356944e6fc3ecbeef Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 09:39:39 -0700 Subject: [PATCH 43/97] Refactored triton into its own folder. Refactored fp8 matmuls. --- bitsandbytes/__init__.py | 7 +- bitsandbytes/autograd/_functions.py | 540 ------------------ bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 116 +--- bitsandbytes/nn/triton_based_modules.py | 31 +- bitsandbytes/nn/triton_utils/v0/__init__.py | 0 .../nn/triton_utils/v0/dequantize_rowwise.py | 58 -- .../v0/int8_matmul_mixed_dequanitze.py | 158 ----- .../v0/int8_matmul_rowwise_dequantize.py | 159 ------ .../v0/quantize_columnwise_and_transpose.py | 68 --- .../nn/triton_utils/v0/quantize_global.py | 100 ---- .../nn/triton_utils/v0/quantize_rowwise.py | 61 -- speed_benchmark/speed_benchmark.py | 10 +- tests/test_autograd.py | 6 +- tests/test_triton.py | 2 + 15 files changed, 30 insertions(+), 1288 deletions(-) delete mode 100644 bitsandbytes/nn/triton_utils/v0/__init__.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_global.py delete mode 100644 bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 5d80df9..dcbc423 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -3,18 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import cuda_setup, utils +from . 
import cuda_setup, utils, research from .autograd._functions import ( MatmulLtState, bmm_cublas, matmul, matmul_cublas, mm_cublas, - matmul_fp8, - matmul_mixed, - matmul_fp8_global, - matmul_fp4, - matmul_fp8_mixed, ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index b7da7b0..cfab4a4 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -390,518 +390,6 @@ class MatMul8bitLt(torch.autograd.Function): return grad_A, grad_B, None, grad_bias, None -class MatMulFP8(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
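# Editor's note: the FP8 here is simulated ("fake") quantization: inputs are
# quantized against an 8-bit codebook and immediately dequantized, and the
# matmul itself runs in the original dtype. A hedged, library-free sketch of
# the blockwise round trip (assumes numel divides blocksize; names illustrative):
#   def blockwise_roundtrip(x, code, blocksize=1024):
#       blocks = x.reshape(-1, blocksize)
#       absmax = blocks.abs().max(dim=1, keepdim=True).values
#       idx = ((blocks / absmax).unsqueeze(-1) - code.view(1, 1, -1)).abs().argmin(-1)
#       return (code[idx] * absmax).reshape_as(x)   # quantize, then dequantize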
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - At = A.transpose(2, 1).contiguous() - cA, state = F.quantize(At.float(), code=ctx.fw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - -class MatMulFP8Mixed(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
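# Editor's note: "Mixed" means the grad_B path stays in full precision. The
# re-quantization of grad_output via F.quantize is commented out above, and
# below grad_B is computed from the unquantized A and grad_output, so only
# grad_A sees fake-quantization noise.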
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - At = A.transpose(2, 1).contiguous() - # cA, state = F.quantize(At.float(), code=ctx.fw_code) - # fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - -class MatMulFP4(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
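# Editor's note: despite the name, MatMulFP4 is structurally the same
# fake-quantize matmul as MatMulFP8 above; what makes it 4-bit is the codebook
# passed in as fw_code/bw_code. The one difference sits just below: A is
# re-quantized with bw_code rather than fw_code before the grad_B matmul.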
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - At = A.transpose(2, 1).contiguous() - cA, state = F.quantize(At.float(), code=ctx.bw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - - -class MatMulFP8Global(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize(A.float(), code=fw_code) - fp8A = F.dequantize(cA, state).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out = F.dequantize(cgrad_out, state).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
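# Editor's note: the "Global" variant replaces the blockwise quantization of A
# and grad_output with F.quantize/F.dequantize, i.e. a single absmax scale for
# the whole tensor. A hedged sketch of that global round trip (illustrative):
#   def global_roundtrip(x, code):
#       absmax = x.abs().max()                     # one tensor-wide scale
#       idx = ((x / absmax).view(-1, 1) - code.view(1, -1)).abs().argmin(1)
#       return (code[idx] * absmax).view_as(x)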
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - At = A.transpose(2, 1).contiguous() - cA, state = F.quantize(At.float(), code=ctx.fw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - -class MatMul8bitMixed(torch.autograd.Function): - @staticmethod - def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): - # default to pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - ctx.bias = bias - if A.shape[-1] == B.shape[0]: - return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device) - - # 1. Quantize A - # 2. Quantize B - # 3. Matmul - # 4. Mixed-precision decomposition matmul - # 5. Save state - formatB = state.formatB - input_shape = A.shape - if state.outlier_pool is None: - state.outlier_pool = GlobalOutlierPooler.get_instance() - - # Cast A to fp16 - if A.dtype != torch.float16: - warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") - - # 1. Quantize A - if len(A.shape) == 3: - A = A.view(-1, A.shape[-1]).contiguous() - CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant( - A.to(torch.float16), threshold=state.threshold - ) - - if state.threshold > 0.0 and coo_tensorA is not None: - if state.has_fp16_weights: - idx = torch.unique(coo_tensorA.colidx).long() - CA[:, idx] = 0 - CAt[:, idx] = 0 - subA = A[:, idx] - state.subB = B[:, idx].t().contiguous() - state.idx = idx - else: - if state.CxB is None: - # B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions - # we also need to convert it to the turing/ampere format - state.CxB, state.SB = F.transform(state.CB, to_order=formatB) - else: - #print('A shape', A.shape) - if not state.has_fp16_weights and state.CxB is None: - state.CxB, state.SB = F.transform(state.CB, to_order=formatB) - subA = None - - # 2. 
Quantize B - if state.has_fp16_weights: - #print('B shape', B.shape) - has_grad = True if (getattr(B, "grad", None) is not None) else False - is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) - if is_transposed: - B = B.contiguous() - - if (state.is_training and not has_grad) or state.CxB is None: - state.reset_grads() - ( - CB, - state.CBt, - state.SCB, - state.SCBt, - coo_tensorB, - ) = F.double_quant(B.to(torch.float16)) - state.CxB, state.SB = F.transform(CB, to_order=formatB) - else: - has_grad = False - - if coo_tensorA is not None and not state.has_fp16_weights: - # extract outliers - - outlier_idx = torch.unique(coo_tensorA.colidx) - state.idx = outlier_idx - # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1]) - # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]: - # # do not use pool for 2nd FFN layer - # state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device) - # else: - # state.idx = outlier_idx - outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int()) - state.subB = ( - (outliers * state.SCB.view(-1, 1) / 127.0) - .t() - .contiguous() - .to(A.dtype) - ) - CA[:, state.idx.long()] = 0 - CAt[:, state.idx.long()] = 0 - subA = A[:, state.idx.long()] - - shapeB = state.SB[0] - - if len(input_shape) == 3: - output_shape = (input_shape[0], input_shape[1], shapeB[0]) - else: - output_shape = (input_shape[0], shapeB[0]) - - # 3. Matmul - C32A, SA = F.transform(CA, "col32") - out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB) - # we apply the fused bias here - - if bias is None or bias.dtype == torch.float16: - output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias) - output = output.to(A.dtype) - else: # apply bias separately - output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None) - output = output.to(A.dtype).add_(bias) - - # 4. Mixed-precision decomposition matmul - if coo_tensorA is not None and subA is not None: - output += torch.matmul(subA, state.subB) - - # 5. 
Save state - ctx.state = state - - ctx.formatB = formatB - ctx.grad_shape = input_shape - ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype - - if any(ctx.needs_input_grad[:2]): - ctx.tensors = (CAt, subA, A) - ctx.tensor_states = (SCAt, state.idx) - else: - ctx.tensors = [None, None, None] - ctx.tensor_states = (None, None) - ctx.save_for_backward(None, None) - - - clone_func = torch.clone if len(output_shape) == 3 else lambda x : x - return clone_func(output.view(output_shape)) - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias)) - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad - CAt, subA, A = ctx.tensors - SCAt, idx = ctx.tensor_states - formatB = ctx.formatB - state = ctx.state - grad_A = grad_B = grad_bias = None - - if req_gradBias: - # compute grad_bias first before changing grad_output dtype - grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) - - # Cast grad_output to fp16 - if len(grad_output.shape) == 3: - grad_output = grad_output.reshape( - -1, grad_output.shape[-1] - ).contiguous() - - Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) - - if req_gradB: - # print('back A shape', A.shape) - # print('grad output t shape', grad_output.t().shape) - grad_B = torch.matmul(grad_output.t(), A) - - if req_gradA: - if state.CBt is not None: - C32grad, Sgrad = F.transform(Cgrad, "col32") - if state.CxBt is None: - state.CxBt, state.SBt = F.transform( - state.CBt, to_order=formatB, transpose=True - ) - # print('back B shape', state.CxBt.shape) - # print('back grad shape', C32grad.shape) - gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt) - grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A) - - elif state.CB is not None: - CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. 
/ 127.0)) - grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) - else: - raise Exception('State must contain either CBt or CB matrix for backward') - - return grad_A, grad_B, None, grad_bias, None - - def matmul( A: tensor, B: tensor, @@ -914,31 +402,3 @@ def matmul( if threshold > 0.0: state.threshold = threshold return MatMul8bitLt.apply(A, B, out, bias, state) - - -def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): - return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - -def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): - return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - -def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): - return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def matmul_fp4(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): - return MatMulFP4.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def matmul_mixed( - A: tensor, - B: tensor, - out: tensor = None, - state: MatmulLtState = None, - threshold=0.0, - bias=None -): - state = state or MatmulLtState() - if threshold > 0.0: - state.threshold = threshold - return MatMul8bitMixed.apply(A, B, out, bias, state) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index c6141ad..51bccbc 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,5 +2,5 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
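# Editor's note: after this refactor the experimental FP8/switchback matmuls
# are reached through the new bitsandbytes.research subpackage rather than the
# top-level namespace, e.g. (mirroring the module changes below):
#   import bitsandbytes as bnb
#   out = bnb.research.matmul_fp8(x, w.t(), fw_code=fw_code, bw_code=bw_code,
#                                 bsz=1024, bsz2=1024)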
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, Fake4bitLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLt2, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorized, StandardLinear diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 9cdcb4a..7150378 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -163,55 +163,6 @@ class OutlierAwareLinear(nn.Linear): return self.forward_with_outliers(x, self.outlier_dim) -class Fake4bitLinear(OutlierAwareLinear): - def __init__(self, input_features, output_features, bias=True, codebook=bnb.functional.create_fp8_map(True, 3, 0, total_bits=4)): - super().__init__(input_features, output_features, bias) - self.codebook = codebook - - def quantize_weight(self, w, outlier_idx): - if outlier_idx.numel() > 0: - subw = w[:, outlier_idx].clone() - w[:, outlier_idx] = 0 - wdtype = w.dtype - code = self.codebook.to(w.device) - cw, state = bnb.functional.quantize_blockwise(w, code=code, blocksize=64) - w = bnb.functional.dequantize_blockwise(cw, state, blocksize=64) - w = w.to(wdtype) - if outlier_idx.numel() > 0: - w[:, outlier_idx] = subw - self.is_quantized = True - return w - - def forward_with_outliers(self, x, outlier_idx): - dims = torch.abs(x> 4).sum(dim=list(range(len(x.shape)-1))) - outlier_idx2 = torch.where(dims > 0)[0] - outlier_idx = torch.cat([outlier_idx, outlier_idx2]).unique() - n = x.shape[-1] - idx = torch.arange(n, device=x.device) - idx[outlier_idx] = -1 - inverse_idx = torch.where(idx >= 0)[0] - if outlier_idx.numel() > 0: - subx = x[..., outlier_idx].clone() - #print(1, subx, 1) - #x[..., outlier_idx] = 0 - inverse_x = x[...,inverse_idx] - xdtype = x.dtype - #code = bnb.functional.create_fp8_map(True, 4-3, 2, 4).to(x.device) - #code = bnb.functional.create_quantile_map(x, 4).to(x.device) - code = bnb.functional.create_dynamic_map(True, total_bits=4.0).to(x.device) - c, state = bnb.functional.quantize_blockwise(inverse_x, code=code, blocksize=64) - inverse_x = bnb.functional.dequantize_blockwise(c, state, blocksize=64) - #c, state = bnb.functional.quantize_blockwise(x, code=code, blocksize=64) - #x = bnb.functional.dequantize_blockwise(c, state, blocksize=64) - x = x.to(xdtype) - x[..., inverse_idx] = inverse_x.to(x.dtype) - #if outlier_idx.numel() > 0: - #x[..., outlier_idx] = subx - - return torch.nn.functional.linear(x, self.weight, self.bias) - - - class Int8Params(torch.nn.Parameter): def __new__( cls, @@ -346,67 +297,6 @@ class Linear8bitLt(nn.Linear): return out -# Not in use for now... 
-class Linear8bitLt2(nn.Linear): - def __init__( - self, - input_features, - output_features, - bias=True, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - ): - super().__init__( - input_features, output_features, bias - ) - self.state = bnb.MatmulLtState() - self.index = index - - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params( - self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights - ) - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x): - self.state.is_training = self.training - - if self.weight.CB is not None: - self.init_8bit_state() - - # weights are cast automatically as Int8Params, but the bias has to be cast manually - # if self.bias is not None and self.bias.dtype != torch.float16: - # self.bias.data = self.bias.data.half() - - #out = bnb.matmul(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias - out = bnb.matmul(x, self.weight, bias=None, state=self.state) + self.bias - #out = torch.matmul(x.half(), W.half().t()) + self.bias - - if not self.state.has_fp16_weights: - if not self.state.memory_efficient_backward and self.state.CB is not None: - # we converted 8-bit row major to turing/ampere format in the first inference pass - # we no longer need the row-major weight - del self.state.CB - self.weight.data = self.state.CxB - elif self.state.memory_efficient_backward and self.state.CxB is not None: - # For memory efficient backward, we convert 8-bit row major to turing/ampere format at each inference pass. - # Thus, we delete CxB from the state. 
- del self.state.CxB - - return out - class Linear8bitLtMixed(nn.Linear): def __init__( self, @@ -508,7 +398,7 @@ class LinearFP8(nn.Linear): self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + out = bnb.research.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) if self.bias is not None: out += self.bias @@ -534,7 +424,7 @@ class LinearFP8Mixed(nn.Linear): self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) if self.bias is not None: out += self.bias @@ -638,4 +528,4 @@ class LinearFP4(nn.Linear): if self.bias is not None: out += self.bias - return out \ No newline at end of file + return out diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index ffb1866..61e9053 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -3,12 +3,12 @@ import torch.nn as nn import time from functools import partial -from .triton_utils.v0.dequantize_rowwise import dequantize_rowwise -from .triton_utils.v0.quantize_rowwise import quantize_rowwise -from .triton_utils.v0.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose -from .triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize -from .triton_utils.v0.quantize_global import quantize_global, quantize_global_transpose -from .triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze +from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise +from bitsandbytes.triton.quantize_rowwise import quantize_rowwise +from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose +from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize +from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose +from bitsandbytes.triton.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze class _switchback_global(torch.autograd.Function): @@ -55,7 +55,7 @@ class _switchback_global(torch.autograd.Function): grad_bias = G.sum(dim=0) return grad_X, grad_W, grad_bias - + class _switchback_vectorrize(torch.autograd.Function): @staticmethod @@ -74,7 +74,7 @@ class _switchback_vectorrize(torch.autograd.Function): return int8_matmul_rowwise_dequantize( X_int8, W_int8.t(), state_X, state_W, bias ).view(*X_3D.size()[:-1], -1) - + @staticmethod def backward(ctx, G_3D): X, W = ctx.save_for_backward @@ -98,7 +98,7 @@ class _switchback_vectorrize(torch.autograd.Function): grad_bias = G.sum(dim=0) return grad_X, grad_W, grad_bias - + class _switchback_global_mem_efficient(torch.autograd.Function): @staticmethod @@ -149,11 +149,11 @@ class _switchback_global_mem_efficient(torch.autograd.Function): class SwitchBackLinear(nn.Linear): def __init__( - self, - in_features: int, - out_features: int, + self, + in_features: int, + out_features: int, bias: bool = True, - device=None, + device=None, dtype=None, 
vectorize: bool = False, mem_efficient : bool = False, @@ -186,7 +186,7 @@ class SwitchBackLinear(nn.Linear): W_int8, state_W = quantize_rowwise(self.weight) else: W_int8, state_W = quantize_global(self.weight) - + self.register_buffer("W_int8", W_int8) self.register_buffer("state_W", state_W) @@ -199,7 +199,7 @@ class SwitchBackLinear(nn.Linear): # If it hasn't been "prepared for eval", run the standard forward pass. if not hasattr(self, "W_int8"): return self._fn.apply(x, self.weight, self.bias) - + # Otherwise, use pre-computed weights. X = x.view(-1, x.size(-1)) X_int8, state_X = quantize_rowwise(X) @@ -250,4 +250,3 @@ class StandardLinear(nn.Linear): def forward(self, x): return StandardLinearFunction.apply(x, self.weight, self.bias) - diff --git a/bitsandbytes/nn/triton_utils/v0/__init__.py b/bitsandbytes/nn/triton_utils/v0/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py b/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py deleted file mode 100644 index 7e31483..0000000 --- a/bitsandbytes/nn/triton_utils/v0/dequantize_rowwise.py +++ /dev/null @@ -1,58 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# rowwise quantize - -# TODO: autotune this better. -@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _dequantize_rowwise( - x_ptr, - state_x, - output_ptr, - inv_127, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - max_val = tl.load(state_x + pid) - output = max_val * x * inv_127 - tl.store(output_ptr + offsets, output, mask=row_mask) - - -def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py deleted file mode 100644 index 69d4b0c..0000000 --- a/bitsandbytes/nn/triton_utils/v0/int8_matmul_mixed_dequanitze.py +++ /dev/null @@ -1,158 +0,0 @@ -import torch - -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - - -# This is a matmul kernel based on triton.ops.matmul -# It is modified to support rowwise quantized input and global quantized weight -# It's purpose is fused matmul then dequantize -# It does support bias. 
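# Editor's note: a hedged pure-PyTorch reference for the fused kernel deleted
# below (its new import path is bitsandbytes.triton): state_x holds one fp16
# scale per row of a, state_w a single global scale for b; a float32 matmul
# stands in for the int8 tensor-core path.
import torch

def int8_matmul_mixed_dequantize_ref(a_int8, b_int8, state_x, state_w, bias=None):
    acc = a_int8.float() @ b_int8.float()
    out = acc * (1.0 / (127.0 * 127.0)) * state_x.float().unsqueeze(1) * state_w.float()
    if bias is not None:
        out = out + bias.float()
    return out.to(torch.float16)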
- -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better 
L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr) - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - # conditionally add bias - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): - device = a.device - divfactor = 1. / (127. * 127.) - has_bias = 0 if bias is None else 1 - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_mixed_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c diff --git a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py b/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py deleted file mode 100644 index 4af054b..0000000 --- a/bitsandbytes/nn/triton_utils/v0/int8_matmul_rowwise_dequantize.py +++ /dev/null @@ -1,159 +0,0 @@ -import torch - -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# This is a matmul kernel based on triton.ops.matmul -# It is modified to support rowwise quantized input and columnwise quantized weight -# It's purpose is fused matmul then dequantize -# It does support bias. 
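# Editor's note: this rowwise variant is identical to the mixed kernel above
# except that state_w holds one scale per output column (w_factor is loaded
# per rbn rather than as a scalar). Its SPLIT_K autotune configs partition the
# K dimension across programs and combine partial sums with tl.atomic_add,
# which is why those configs zero C first through the init_to_zero pre-hook:
#   c = torch.zeros(M, N)                          # init_to_zero('C')
#   for a_k, b_k in zip(a.chunk(split_k, 1), b.chunk(split_k, 0)):
#       c += a_k.float() @ b_k.float()             # one program's atomic_add each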
- -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs - - -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 
performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr + rbn)[None, :] - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - -def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - divfactor = 1. / (127. * 127.) - - has_bias = 0 if bias is None else 1 - - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_rowwise_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py b/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py deleted file mode 100644 index 4e53475..0000000 --- a/bitsandbytes/nn/triton_utils/v0/quantize_columnwise_and_transpose.py +++ /dev/null @@ -1,68 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# This kernel does fused columnwise quantization and transpose. - -# TODO: autotune this better. 
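# Editor's note: a hedged pure-PyTorch reference for the fused kernel below:
# it emits the int8 transpose in one pass, with one absmax scale per original
# column (names are illustrative).
import torch

def quantize_columnwise_and_transpose_ref(x: torch.Tensor):
    state = x.abs().max(dim=0).values              # (N,) per-column absmax
    x_int8 = torch.round(127.0 * (x / state.unsqueeze(0))).to(torch.int8)
    return x_int8.t().contiguous(), state.to(torch.float16)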
-@triton.autotune( - configs=[ - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_stages=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=16, num_warps=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_columnwise_and_transpose( - x_ptr, - output_ptr, - output_maxs, - n_elements, - M : tl.constexpr, N : tl.constexpr, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid - p2_arange = tl.arange(0, P2) - p2_arange_mask = p2_arange < M - arange = p2_arange * N - offsets = block_start + arange - x = tl.load(x_ptr + offsets, mask=p2_arange_mask) - abs_x = tl.abs(x) - max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) - - new_start = pid * M - new_offsets = new_start + p2_arange - tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) - tl.store(output_maxs + pid, max_val) - -def quantize_columnwise_and_transpose(x: torch.Tensor): - M, N = x.shape - output = torch.empty(N, M, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(M)))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) - return output, output_maxs - diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_global.py b/bitsandbytes/nn/triton_utils/v0/quantize_global.py deleted file mode 100644 index 229721c..0000000 --- a/bitsandbytes/nn/triton_utils/v0/quantize_global.py +++ /dev/null @@ -1,100 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# global quantize -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), - triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), - - ], - key=['n_elements'] -) -@triton.jit -def _quantize_global( - x_ptr, - absmax_inv_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - output = tl.libdevice.llrint(127. 
* (x * absmax_inv)) - tl.store(output_ptr + offsets, output, mask=mask) - -def quantize_global(x: torch.Tensor): - absmax = x.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax - output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _quantize_global[grid](x, absmax_inv, output, n_elements) - return output, absmax - - -# global quantize and transpose -@triton.autotune( - configs=[ - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - - # ... - ], - key=['M', 'N'] -) -@triton.jit -def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, - GROUP_M : tl.constexpr): - pid = tl.program_id(0) - grid_m = (M + BLOCK_M - 1) // BLOCK_M - grid_n = (N + BLOCK_N - 1) // BLOCK_N - - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // group_size - - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) - mask = (rm < M)[:, None] & (rn < N)[None, :] - a = tl.load(A, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - - # rematerialize to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - - output = tl.libdevice.llrint(127. * (a * absmax_inv)) - - tl.store(B, output, mask=mask) - -def quantize_global_transpose(input): - absmax = input.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax - M, N = input.shape - out = torch.empty(N, M, device='cuda', dtype=torch.int8) - - assert out.size(0) == N and out.size(1) == M - assert input.stride(0) == 1 or input.stride(1) == 1 - assert out.stride(0) == 1 or out.stride(1) == 1 - - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) - _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) - return out, absmax - diff --git a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py b/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py deleted file mode 100644 index d956647..0000000 --- a/bitsandbytes/nn/triton_utils/v0/quantize_rowwise.py +++ /dev/null @@ -1,61 +0,0 @@ -import math -import torch -import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - -# rowwise quantize - -# TODO: autotune this better. 
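# Editor's note: like dequantize_rowwise, this kernel launches one program per
# row and pads the lane count to the next power of two, because tl.arange
# requires a power-of-two extent; the tail lanes are disabled by the
# `arange < BLOCK_SIZE` mask. Illustrative arithmetic:
import math

def next_pow2(n: int) -> int:
    return 1 << math.ceil(math.log2(n))

# e.g. a row of 83 elements runs with P2 = next_pow2(83) = 128 lanes,
# of which the last 45 are masked out.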
-@triton.autotune(
-    configs=[
-        triton.Config({}, num_stages=1, num_warps=8),
-        triton.Config({}, num_stages=2, num_warps=8),
-        triton.Config({}, num_stages=4, num_warps=8),
-        triton.Config({}, num_stages=8, num_warps=8),
-        triton.Config({}, num_stages=1),
-        triton.Config({}, num_stages=2),
-        triton.Config({}, num_stages=4),
-        triton.Config({}, num_stages=8),
-        triton.Config({}, num_warps=1),
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-    ],
-    key=['n_elements']
-)
-@triton.jit
-def _quantize_rowwise(
-    x_ptr,
-    output_ptr,
-    output_maxs,
-    n_elements,
-    BLOCK_SIZE: tl.constexpr,
-    P2: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    block_start = pid * BLOCK_SIZE
-    arange = tl.arange(0, P2)
-    offsets = block_start + arange
-    row_mask = arange < BLOCK_SIZE
-    x = tl.load(x_ptr + offsets, mask=row_mask)
-
-    abs_x = tl.abs(x)
-    max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0)
-    output = tl.libdevice.llrint(127. * (x / max_val))
-    tl.store(output_ptr + offsets, output, mask=row_mask)
-    tl.store(output_maxs + pid, max_val)
-
-def quantize_rowwise(x: torch.Tensor):
-    output = torch.empty(*x.shape, device=x.device, dtype=torch.int8)
-    output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16)
-
-    P2 = int(2 ** (math.ceil(math.log2(x.shape[1]))))
-
-    assert x.is_cuda and output.is_cuda
-    n_elements = output.numel()
-    grid = lambda meta: (x.shape[0],)
-    _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2)
-    return output, output_maxs
-
diff --git a/speed_benchmark/speed_benchmark.py b/speed_benchmark/speed_benchmark.py
index eccc455..9ad9911 100644
--- a/speed_benchmark/speed_benchmark.py
+++ b/speed_benchmark/speed_benchmark.py
@@ -4,11 +4,11 @@ import time
 import torch
 import torch.nn as nn
-from bitsandbytes.nn.triton_utils.v0.quantize_rowwise import quantize_rowwise
-from bitsandbytes.nn.triton_utils.v0.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose
-from bitsandbytes.nn.triton_utils.v0.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize
-from bitsandbytes.nn.triton_utils.v0.quantize_global import quantize_global, quantize_global_transpose
-from bitsandbytes.nn.triton_utils.v0.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze
+from bitsandbytes.triton.quantize_rowwise import quantize_rowwise
+from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose
+from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize
+from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose
+from bitsandbytes.triton.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze
 # KNOWN ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large.
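The five imports above are everything a SwitchBack-style linear forward pass needs. As a minimal sketch of how they compose, roughly what the vectorized SwitchBack path does (illustrative shapes; fp16 CUDA tensors assumed; this is not code from the benchmark itself):

import torch
from bitsandbytes.triton.quantize_rowwise import quantize_rowwise
from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize

x = torch.randn(32, 1024, dtype=torch.float16, device='cuda')    # activations
w = torch.randn(4096, 1024, dtype=torch.float16, device='cuda')  # Linear(1024 -> 4096) weight

x_int8, state_x = quantize_rowwise(x)   # one absmax scale per activation row
w_int8, state_w = quantize_rowwise(w)   # one absmax scale per output feature
# The matmul kernel takes the weight transposed so that its per-row scales
# line up with the output columns being dequantized; bias is optional (None here).
out = int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x, state_w, None)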
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index d05b4a6..ac2ae05 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -239,8 +239,8 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist()
 dim2.append(0)
 decomp = [0.0, 6.0]
-funcs = [(torch.matmul, bnb.matmul_mixed)]
-str_funcs = ["matmul"]
+funcs = [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)]
+str_funcs = ["matmullt", 'switchback_bnb']
 req_grad = [(False, False), (True, False), (True, True), (False, True)]
 req_grad = list(product([True, False], repeat=3))
 req_grad_str = []
@@ -441,7 +441,7 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist()
 dim2.append(0)
-funcs = [(torch.matmul, bnb.matmul_fp8)]
+funcs = [(torch.matmul, bnb.research.matmul_fp8)]
 str_funcs = ["matmul"]
 req_grad = list(product([True, False], repeat=3))
 req_grad_str = []
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 2ec34fb..7f56a49 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -5,6 +5,7 @@ from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
 from bitsandbytes.nn import Linear8bitLt
+@pytest.mark.skipif(not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, reason="This test requires a GPU with compute capability 8.0 or higher.")
 @pytest.mark.parametrize("vectorrize", [False, True])
 def test_switchback(vectorrize):
     for dim in [83, 17, 128]:
@@ -26,6 +27,7 @@ def test_switchback(vectorrize):
         out_standard = standard(x1)
         (2**10 * out_standard.abs().mean()).backward()
+        print(x2.dtype)
         out_sb = switchback(x2)
         (2**10 * out_sb.abs().mean()).backward()

From e67bfccbcd9490f51628bb3a7fac9cfb9c31310d Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Wed, 12 Apr 2023 10:06:18 -0700
Subject: [PATCH 44/97] Added missing triton and fp8 files.
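The research entry points added here are thin wrappers around autograd functions that simulate FP8 matmuls by a quantize/dequantize round trip before a regular fp16 matmul, so only the rounding error is real. A sketch of the call pattern for the API as it stands in this patch (device and shapes are illustrative):

import torch
import bitsandbytes as bnb

# FP8 codebooks: E4M3 for the forward pass, E5M2 for the noisier gradients --
# the same split the simulated-fp8 linear modules in this series use.
fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).cuda()
bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).cuda()

x = torch.randn(128, 512, dtype=torch.float16, device='cuda', requires_grad=True)
w = torch.randn(1024, 512, dtype=torch.float16, device='cuda', requires_grad=True)

out = bnb.research.matmul_fp8(x, w.t(), fw_code=fw_code, bw_code=bw_code)
out.mean().backward()   # MatMulFP8 re-quantizes the incoming gradient with bw_code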
--- bitsandbytes/research/__init__.py | 7 + bitsandbytes/research/autograd/__init__.py | 0 bitsandbytes/research/autograd/_functions.py | 493 ++++++++++++++++++ bitsandbytes/triton/__init__.py | 0 bitsandbytes/triton/dequantize_rowwise.py | 58 +++ .../triton/int8_matmul_mixed_dequanitze.py | 158 ++++++ .../triton/int8_matmul_rowwise_dequantize.py | 159 ++++++ .../quantize_columnwise_and_transpose.py | 68 +++ bitsandbytes/triton/quantize_global.py | 100 ++++ bitsandbytes/triton/quantize_rowwise.py | 61 +++ 10 files changed, 1104 insertions(+) create mode 100644 bitsandbytes/research/__init__.py create mode 100644 bitsandbytes/research/autograd/__init__.py create mode 100644 bitsandbytes/research/autograd/_functions.py create mode 100644 bitsandbytes/triton/__init__.py create mode 100644 bitsandbytes/triton/dequantize_rowwise.py create mode 100644 bitsandbytes/triton/int8_matmul_mixed_dequanitze.py create mode 100644 bitsandbytes/triton/int8_matmul_rowwise_dequantize.py create mode 100644 bitsandbytes/triton/quantize_columnwise_and_transpose.py create mode 100644 bitsandbytes/triton/quantize_global.py create mode 100644 bitsandbytes/triton/quantize_rowwise.py diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py new file mode 100644 index 0000000..f5ab510 --- /dev/null +++ b/bitsandbytes/research/__init__.py @@ -0,0 +1,7 @@ + +from .autograd._functions import ( + matmul_fp8, + switchback_bnb, + matmul_fp8_global, + matmul_fp8_mixed, +) diff --git a/bitsandbytes/research/autograd/__init__.py b/bitsandbytes/research/autograd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py new file mode 100644 index 0000000..b0a098d --- /dev/null +++ b/bitsandbytes/research/autograd/_functions.py @@ -0,0 +1,493 @@ +import operator +import warnings +from dataclasses import dataclass +from functools import reduce # Required in Python 3 + +import torch + +import bitsandbytes.functional as F + +from bitsandbytes.autograd._functions import MatmulLtState, GlobalOutlierPooler + + +# math.prod not compatible with python < 3.8 +def prod(iterable): + return reduce(operator.mul, iterable, 1) + +tensor = torch.Tensor + +class MatMulFP8(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) + fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. 
+ ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) + + cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + if len(A.shape) == 3: + At = A.transpose(2, 1).contiguous() + else: + At = A.transpose(1, 0).contiguous() + cA, state = F.quantize(At.float(), code=ctx.fw_code) + fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None + +class MatMulFP8Mixed(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) + fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. 
+ ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) + fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) + + # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + At = A.transpose(2, 1).contiguous() + # cA, state = F.quantize(At.float(), code=ctx.fw_code) + # fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None + + +class MatMulFP8Global(torch.autograd.Function): + # forward is the same, but we added the fallback for pre-turing GPUs + # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") + + @staticmethod + def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): + # default of pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + + B_shape = B.shape + if A.shape[-1] == B_shape[0]: + return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) + + # 1. Dequantize + # 2. MatmulnN + cA, state = F.quantize(A.float(), code=fw_code) + fp8A = F.dequantize(cA, state).to(A.dtype) + + cB, state = F.quantize(B.float(), code=fw_code) + fp8B = F.dequantize(cB, state).to(B.dtype) + + output = torch.matmul(fp8A, fp8B) + + # output is half + + # 3. Save state + ctx.fw_code = fw_code + ctx.bw_code = bw_code + ctx.bsz = bsz + ctx.bsz2 = bsz2 + ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype + + if any(ctx.needs_input_grad[:2]): + # NOTE: we send back A, and re-quant. 
+ ctx.tensors = (A, fp8B) + else: + ctx.tensors = (None, None) + + return output + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None + + req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad + A, B = ctx.tensors + + grad_A, grad_B = None, None + + # TODO: Fix blocksize to be output_dim + cgrad_out, state = F.quantize(grad_output.float(), code=ctx.bw_code) + fp8out = F.dequantize(cgrad_out, state).to(grad_output.dtype) + + # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) + # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) + + # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() + # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') + # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose + # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) + + # not supported by PyTorch. TODO: create work-around + if req_gradA: + grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) + + if req_gradB: + At = A.transpose(2, 1).contiguous() + cA, state = F.quantize(At.float(), code=ctx.fw_code) + fp8At = F.dequantize(cA, state).to(A.dtype) + grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) + + return grad_A, grad_B, None, None, None, None, None + + +class MatMul8bitMixed(torch.autograd.Function): + @staticmethod + def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): + # default to pytorch behavior if inputs are empty + ctx.is_empty = False + if prod(A.shape) == 0: + ctx.is_empty = True + ctx.A = A + ctx.B = B + ctx.bias = bias + if A.shape[-1] == B.shape[0]: + return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device) + else: + return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device) + + # 1. Quantize A + # 2. Quantize B + # 3. Matmul + # 4. Mixed-precision decomposition matmul + # 5. Save state + formatB = state.formatB + input_shape = A.shape + if state.outlier_pool is None: + state.outlier_pool = GlobalOutlierPooler.get_instance() + + # Cast A to fp16 + if A.dtype != torch.float16: + warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") + + # 1. Quantize A + if len(A.shape) == 3: + A = A.view(-1, A.shape[-1]).contiguous() + CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant( + A.to(torch.float16), threshold=state.threshold + ) + + if state.threshold > 0.0 and coo_tensorA is not None: + if state.has_fp16_weights: + idx = torch.unique(coo_tensorA.colidx).long() + CA[:, idx] = 0 + CAt[:, idx] = 0 + subA = A[:, idx] + state.subB = B[:, idx].t().contiguous() + state.idx = idx + else: + if state.CxB is None: + # B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions + # we also need to convert it to the turing/ampere format + state.CxB, state.SB = F.transform(state.CB, to_order=formatB) + else: + #print('A shape', A.shape) + if not state.has_fp16_weights and state.CxB is None: + state.CxB, state.SB = F.transform(state.CB, to_order=formatB) + subA = None + + # 2. 
Quantize B + if state.has_fp16_weights: + #print('B shape', B.shape) + has_grad = True if (getattr(B, "grad", None) is not None) else False + is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) + if is_transposed: + B = B.contiguous() + + if (state.is_training and not has_grad) or state.CxB is None: + state.reset_grads() + ( + CB, + state.CBt, + state.SCB, + state.SCBt, + coo_tensorB, + ) = F.double_quant(B.to(torch.float16)) + state.CxB, state.SB = F.transform(CB, to_order=formatB) + else: + has_grad = False + + if coo_tensorA is not None and not state.has_fp16_weights: + # extract outliers + + outlier_idx = torch.unique(coo_tensorA.colidx) + state.idx = outlier_idx + # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1]) + # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]: + # # do not use pool for 2nd FFN layer + # state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device) + # else: + # state.idx = outlier_idx + outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int()) + state.subB = ( + (outliers * state.SCB.view(-1, 1) / 127.0) + .t() + .contiguous() + .to(A.dtype) + ) + CA[:, state.idx.long()] = 0 + CAt[:, state.idx.long()] = 0 + subA = A[:, state.idx.long()] + + shapeB = state.SB[0] + + if len(input_shape) == 3: + output_shape = (input_shape[0], input_shape[1], shapeB[0]) + else: + output_shape = (input_shape[0], shapeB[0]) + + # 3. Matmul + C32A, SA = F.transform(CA, "col32") + out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB) + # we apply the fused bias here + + if bias is None or bias.dtype == torch.float16: + output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias) + output = output.to(A.dtype) + else: # apply bias separately + output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None) + output = output.to(A.dtype).add_(bias) + + # 4. Mixed-precision decomposition matmul + if coo_tensorA is not None and subA is not None: + output += torch.matmul(subA, state.subB) + + # 5. 
Save state + ctx.state = state + + ctx.formatB = formatB + ctx.grad_shape = input_shape + ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype + + if any(ctx.needs_input_grad[:2]): + ctx.tensors = (CAt, subA, A) + ctx.tensor_states = (SCAt, state.idx) + else: + ctx.tensors = [None, None, None] + ctx.tensor_states = (None, None) + ctx.save_for_backward(None, None) + + + clone_func = torch.clone if len(output_shape) == 3 else lambda x : x + return clone_func(output.view(output_shape)) + + @staticmethod + def backward(ctx, grad_output): + if ctx.is_empty: + bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias)) + return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None + req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad + CAt, subA, A = ctx.tensors + SCAt, idx = ctx.tensor_states + formatB = ctx.formatB + state = ctx.state + grad_A = grad_B = grad_bias = None + + if req_gradBias: + # compute grad_bias first before changing grad_output dtype + grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) + + # Cast grad_output to fp16 + if len(grad_output.shape) == 3: + grad_output = grad_output.reshape( + -1, grad_output.shape[-1] + ).contiguous() + + Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) + + if req_gradB: + # print('back A shape', A.shape) + # print('grad output t shape', grad_output.t().shape) + grad_B = torch.matmul(grad_output.t(), A) + + if req_gradA: + if state.CBt is not None: + C32grad, Sgrad = F.transform(Cgrad, "col32") + if state.CxBt is None: + state.CxBt, state.SBt = F.transform( + state.CBt, to_order=formatB, transpose=True + ) + # print('back B shape', state.CxBt.shape) + # print('back grad shape', C32grad.shape) + gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt) + grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A) + + elif state.CB is not None: + CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. 
/ 127.0)) + grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) + else: + raise Exception('State must contain either CBt or CB matrix for backward') + + return grad_A, grad_B, None, grad_bias, None + +def get_block_sizes(input_matrix, weight_matrix): + input_features = input_matrix.shape[-1] + output_features = (weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1]) + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + bsz, bsz2 = 1024, 1024 + for i, k in enumerate(array): + if input_features > array[i + 1]: + bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + bsz2 = k + break + + return bsz, bsz2 + + +def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) + return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + +def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) + return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + +def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) + return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + + +def switchback_bnb( + A: tensor, + B: tensor, + out: tensor = None, + state: MatmulLtState = None, + threshold=0.0, + bias=None +): + state = state or MatmulLtState() + if threshold > 0.0: + state.threshold = threshold + return MatMul8bitMixed.apply(A, B, out, bias, state) diff --git a/bitsandbytes/triton/__init__.py b/bitsandbytes/triton/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py new file mode 100644 index 0000000..7e31483 --- /dev/null +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -0,0 +1,58 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# rowwise quantize + +# TODO: autotune this better. 
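The kernel below simply inverts quantize_rowwise: each program reloads one row's absmax and scales the int8 entries back by absmax / 127. An eager-mode reference for the same arithmetic (a sketch only; the function name is hypothetical):

def dequantize_rowwise_reference(x_int8: torch.Tensor, state_x: torch.Tensor) -> torch.Tensor:
    # x_int8: (rows, cols) int8; state_x: (rows,) fp16 per-row absmax values
    return (x_int8.float() * state_x.view(-1, 1).float() / 127.0).half()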
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_stages=1, num_warps=8),
+        triton.Config({}, num_stages=2, num_warps=8),
+        triton.Config({}, num_stages=4, num_warps=8),
+        triton.Config({}, num_stages=8, num_warps=8),
+        triton.Config({}, num_stages=1),
+        triton.Config({}, num_stages=2),
+        triton.Config({}, num_stages=4),
+        triton.Config({}, num_stages=8),
+        triton.Config({}, num_warps=1),
+        triton.Config({}, num_warps=2),
+        triton.Config({}, num_warps=4),
+        triton.Config({}, num_warps=8),
+    ],
+    key=['n_elements']
+)
+@triton.jit
+def _dequantize_rowwise(
+    x_ptr,
+    state_x,
+    output_ptr,
+    inv_127,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+    P2: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    arange = tl.arange(0, P2)
+    offsets = block_start + arange
+    row_mask = arange < BLOCK_SIZE
+    x = tl.load(x_ptr + offsets, mask=row_mask)
+    max_val = tl.load(state_x + pid)
+    output = max_val * x * inv_127
+    tl.store(output_ptr + offsets, output, mask=row_mask)
+
+
+def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor):
+    output = torch.empty(*x.shape, device=x.device, dtype=torch.float16)
+
+    P2 = int(2 ** (math.ceil(math.log2(x.shape[1]))))
+
+    assert x.is_cuda and output.is_cuda
+    n_elements = output.numel()
+    grid = lambda meta: (x.shape[0],)
+    _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2)
+    return output
diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py b/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py
new file mode 100644
index 0000000..69d4b0c
--- /dev/null
+++ b/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py
@@ -0,0 +1,158 @@
+import torch
+
+import triton
+import triton.language as tl
+from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time
+
+
+# This is a matmul kernel based on triton.ops.matmul
+# It is modified to support rowwise quantized input and global quantized weight
+# Its purpose is fused matmul then dequantize
+# It does support bias.
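The fused dequantize is nothing more than two scale factors applied to the int32 accumulator: divfactor undoes the 127x scaling of both operands, state_x holds one absmax per input row, and state_w is a single global absmax for the weight. In eager terms (a sketch with hypothetical names):

def int8_matmul_mixed_reference(a_int8, b_int8, state_x, state_w, bias=None):
    # a_int8: (M, K) rowwise-quantized input; state_x: (M,) per-row absmax
    # b_int8: (K, N) globally quantized weight; state_w: scalar absmax
    acc = (a_int8.float() @ b_int8.float()) / (127.0 * 127.0)
    out = (acc * state_x.view(-1, 1).float() * state_w.float()).half()
    return out if bias is None else out + bias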
+ +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better 
L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr) + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + # conditionally add bias + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): + device = a.device + divfactor = 1. / (127. * 127.) + has_bias = 0 if bias is None else 1 + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch int8_matmul_mixed_dequantize kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py new file mode 100644 index 0000000..4af054b --- /dev/null +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -0,0 +1,159 @@ +import torch + +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# This is a matmul kernel based on triton.ops.matmul +# It is modified to support rowwise quantized input and columnwise quantized weight +# It's purpose is fused matmul then dequantize +# It does support bias. 
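This file mirrors the mixed variant above; the only change in the epilogue is that the weight scale is a vector with one absmax per output column rather than a single global value, so the accumulator is rescaled by an outer product of the two state vectors (eager sketch, hypothetical names):

scale = state_x.view(-1, 1).float() * state_w.view(1, -1).float() / (127.0 * 127.0)
out = ((a_int8.float() @ b_int8.float()) * scale).half()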
+ +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, +) +@triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@triton.jit +def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 
performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + w_factor = tl.load(state_w_ptr + rbn)[None, :] + x_factor = tl.load(state_x_ptr + ram)[:, None] + + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): + divfactor = 1. / (127. * 127.) + + has_bias = 0 if bias is None else 1 + + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch int8_matmul_rowwise_dequantize kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py new file mode 100644 index 0000000..4e53475 --- /dev/null +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -0,0 +1,68 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# This kernel does fused columnwise quantization and transpose. + +# TODO: autotune this better. 
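In the kernel below, one program per output row walks a full input column: P2 rounds the column length M up to the next power of two so tl.arange can span it, and the mask switches the padding lanes off. An eager-mode reference for what the fusion computes (a sketch only; the name is hypothetical):

def quantize_columnwise_and_transpose_reference(x: torch.Tensor):
    absmax = x.abs().max(dim=0).values                  # one scale per column
    out = torch.round(127.0 * (x / absmax)).to(torch.int8)
    return out.t().contiguous(), absmax.half()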
+@triton.autotune( + configs=[ + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_stages=16), + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=16, num_warps=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_columnwise_and_transpose( + x_ptr, + output_ptr, + output_maxs, + n_elements, + M : tl.constexpr, N : tl.constexpr, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid + p2_arange = tl.arange(0, P2) + p2_arange_mask = p2_arange < M + arange = p2_arange * N + offsets = block_start + arange + x = tl.load(x_ptr + offsets, mask=p2_arange_mask) + abs_x = tl.abs(x) + max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. * (x / max_val)) + + new_start = pid * M + new_offsets = new_start + p2_arange + tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_columnwise_and_transpose(x: torch.Tensor): + M, N = x.shape + output = torch.empty(N, M, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(M)))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) + return output, output_maxs + diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py new file mode 100644 index 0000000..229721c --- /dev/null +++ b/bitsandbytes/triton/quantize_global.py @@ -0,0 +1,100 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# global quantize +@triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), + triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), + + ], + key=['n_elements'] +) +@triton.jit +def _quantize_global( + x_ptr, + absmax_inv_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + output = tl.libdevice.llrint(127. * (x * absmax_inv)) + tl.store(output_ptr + offsets, output, mask=mask) + +def quantize_global(x: torch.Tensor): + absmax = x.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_global[grid](x, absmax_inv, output, n_elements) + return output, absmax + + +# global quantize and transpose +@triton.autotune( + configs=[ + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + + # ... 
+ ], + key=['M', 'N'] +) +@triton.jit +def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, + GROUP_M : tl.constexpr): + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // group_size + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) + mask = (rm < M)[:, None] & (rn < N)[None, :] + a = tl.load(A, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + + # rematerialize to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + + output = tl.libdevice.llrint(127. * (a * absmax_inv)) + + tl.store(B, output, mask=mask) + +def quantize_global_transpose(input): + absmax = input.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + M, N = input.shape + out = torch.empty(N, M, device='cuda', dtype=torch.int8) + + assert out.size(0) == N and out.size(1) == M + assert input.stride(0) == 1 or input.stride(1) == 1 + assert out.stride(0) == 1 or out.stride(1) == 1 + + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) + _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) + return out, absmax + diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py new file mode 100644 index 0000000..d956647 --- /dev/null +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -0,0 +1,61 @@ +import math +import torch +import time +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +# rowwise quantize + +# TODO: autotune this better. +@triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] +) +@triton.jit +def _quantize_rowwise( + x_ptr, + output_ptr, + output_maxs, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. 
* (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + +def quantize_rowwise(x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs + From dd562c24f14a9ec4a325152644298b24e3cec4ca Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 11:24:44 -0700 Subject: [PATCH 45/97] Refactored simulated fp8 modules into research.nn. --- .../switchback}/README.md | 0 .../switchback}/info_a100_py2.jsonl | 0 .../switchback}/make_plot_with_jsonl.py | 0 .../switchback}/plot_with_info.pdf | Bin .../switchback}/speed_benchmark.py | 0 bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/modules.py | 176 +----------------- bitsandbytes/research/__init__.py | 3 +- bitsandbytes/research/autograd/_functions.py | 98 +--------- bitsandbytes/research/nn/__init__.py | 1 + bitsandbytes/research/nn/modules.py | 64 +++++++ examples/int8_inference_huggingface.py | 27 +++ tests/test_autograd.py | 4 +- tests/test_functional.py | 1 + tests/test_modules.py | 4 +- 15 files changed, 108 insertions(+), 272 deletions(-) rename {speed_benchmark => benchmarking/switchback}/README.md (100%) rename {speed_benchmark => benchmarking/switchback}/info_a100_py2.jsonl (100%) rename {speed_benchmark => benchmarking/switchback}/make_plot_with_jsonl.py (100%) rename {speed_benchmark => benchmarking/switchback}/plot_with_info.pdf (100%) rename {speed_benchmark => benchmarking/switchback}/speed_benchmark.py (100%) create mode 100644 bitsandbytes/research/nn/__init__.py create mode 100644 bitsandbytes/research/nn/modules.py create mode 100644 examples/int8_inference_huggingface.py diff --git a/speed_benchmark/README.md b/benchmarking/switchback/README.md similarity index 100% rename from speed_benchmark/README.md rename to benchmarking/switchback/README.md diff --git a/speed_benchmark/info_a100_py2.jsonl b/benchmarking/switchback/info_a100_py2.jsonl similarity index 100% rename from speed_benchmark/info_a100_py2.jsonl rename to benchmarking/switchback/info_a100_py2.jsonl diff --git a/speed_benchmark/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py similarity index 100% rename from speed_benchmark/make_plot_with_jsonl.py rename to benchmarking/switchback/make_plot_with_jsonl.py diff --git a/speed_benchmark/plot_with_info.pdf b/benchmarking/switchback/plot_with_info.pdf similarity index 100% rename from speed_benchmark/plot_with_info.pdf rename to benchmarking/switchback/plot_with_info.pdf diff --git a/speed_benchmark/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py similarity index 100% rename from speed_benchmark/speed_benchmark.py rename to benchmarking/switchback/speed_benchmark.py diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 51bccbc..ec944a3 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,5 +2,5 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, LinearFP8, LinearInt8, Linear8bitLtThresh, LinearInt8Cast, Linear8bitLtMixed, LinearFP8Global, LinearFP4, LinearFP8Mixed +from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, SwitchBackLinearBnb from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorized, StandardLinear diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 7150378..f79b75a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -297,7 +297,7 @@ class Linear8bitLt(nn.Linear): return out -class Linear8bitLtMixed(nn.Linear): +class SwitchBackLinearBnb(nn.Linear): def __init__( self, input_features, @@ -355,177 +355,3 @@ class Linear8bitLtMixed(nn.Linear): del self.state.CxB return out - - -class Linear8bitLtThresh(Linear8bitLt): - def __init__( - self, - input_features, - output_features, - bias=True, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=6.0, - index=None, - ): - super().__init__( - input_features, - output_features, - bias=bias, - has_fp16_weights=has_fp16_weights, - memory_efficient_backward=memory_efficient_backward, - threshold=6., - index=index - ) - -class LinearFP8(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.research.matmul_fp8(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) - if self.bias is not None: - out += self.bias - - return out - -class LinearFP8Mixed(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) - if self.bias is not None: - out += self.bias - - return out - -class LinearFP8Global(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - 
self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) - if self.bias is not None: - out += self.bias - - return out - -class LinearInt8(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.code is None: - self.code = bnb.functional.create_linear_map(True, 8).to(x.device) - - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz, bsz2=self.bsz2) - if self.bias is not None: - out += self.bias - - return out - -# This is 4 bit version. -class LinearInt8Cast(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - - - def forward(self, x: torch.Tensor): - if self.code is None: - self.code = bnb.functional.create_linear_map(True, 4).to(x.device) - - out = bnb.matmul_fp8(x, self.weight.t(), fw_code=self.code, bw_code=self.code, bsz=self.bsz) - if self.bias is not None: - out += self.bias - - return out - - -class LinearFP4(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - #self.bw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device) - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 3, 0, 4).to(x.device) - - out = bnb.matmul_fp4(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) - if self.bias is not None: - out += self.bias - - return out diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py index f5ab510..47b720d 100644 --- a/bitsandbytes/research/__init__.py +++ b/bitsandbytes/research/__init__.py @@ -1,6 +1,5 @@ - +from . 
import nn from .autograd._functions import ( - matmul_fp8, switchback_bnb, matmul_fp8_global, matmul_fp8_mixed, diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index b0a098d..4235989 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -16,88 +16,6 @@ def prod(iterable): tensor = torch.Tensor -class MatMulFP8(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - if len(A.shape) == 3: - At = A.transpose(2, 1).contiguous() - else: - At = A.transpose(1, 0).contiguous() - cA, state = F.quantize(At.float(), code=ctx.fw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out_2.dtype), fp8out_2).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - class MatMulFP8Mixed(torch.autograd.Function): # forward is the same, but we added the fallback for pre-turing GPUs # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @@ -171,7 +89,10 @@ class MatMulFP8Mixed(torch.autograd.Function): grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: - At = A.transpose(2, 1).contiguous() + if len(A.shape) == 3: + At = A.transpose(2, 1).contiguous() + else: + At = A.transpose(1, 0).contiguous() # cA, state = F.quantize(At.float(), code=ctx.fw_code) # fp8At = F.dequantize(cA, state).to(A.dtype) grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) @@ -252,7 +173,10 @@ class MatMulFP8Global(torch.autograd.Function): grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: - At = A.transpose(2, 1).contiguous() + if len(A.shape) == 3: + At = A.transpose(2, 1).contiguous() + else: + At = A.transpose(1, 0).contiguous() cA, state = F.quantize(At.float(), code=ctx.fw_code) fp8At = F.dequantize(cA, state).to(A.dtype) grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) @@ -465,11 +389,6 @@ def get_block_sizes(input_matrix, weight_matrix): return bsz, bsz2 - -def matmul_fp8(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): - if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) - return MatMulFP8.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) @@ -478,7 +397,6 @@ def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - def switchback_bnb( A: tensor, B: tensor, diff --git a/bitsandbytes/research/nn/__init__.py b/bitsandbytes/research/nn/__init__.py new file mode 100644 index 0000000..8faec10 --- /dev/null +++ b/bitsandbytes/research/nn/__init__.py @@ -0,0 +1 @@ +from .modules import LinearFP8Mixed, LinearFP8Global diff --git a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py new file mode 100644 index 0000000..2a46b40 --- /dev/null +++ b/bitsandbytes/research/nn/modules.py @@ -0,0 +1,64 @@ +from typing import Optional, TypeVar, Union, overload + +import torch +import torch.nn.functional as F +from torch import Tensor, device, dtype, nn + +import bitsandbytes as bnb +from bitsandbytes.optim import GlobalOptimManager +from bitsandbytes.utils import OutlierTracer, find_outlier_dims + +T = TypeVar("T", bound="torch.nn.Module") + + +class LinearFP8Mixed(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k 
in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) + + out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + if self.bias is not None: + out += self.bias + + return out + +class LinearFP8Global(nn.Linear): + def __init__(self, input_features, output_features, bias=True): + super().__init__(input_features, output_features, bias) + self.bw_code = None + self.fw_code = None + array = [4096, 2048, 1024, 512, 256, 128, 64, 0] + for i, k in enumerate(array): + if input_features > array[i + 1]: + self.bsz = k + break + for i, k in enumerate(array): + if output_features > array[i + 1]: + self.bsz2 = k + break + + def forward(self, x: torch.Tensor): + if self.fw_code is None: + self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) + self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) + + out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + if self.bias is not None: + out += self.bias + + return out diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py new file mode 100644 index 0000000..dc80a44 --- /dev/null +++ b/examples/int8_inference_huggingface.py @@ -0,0 +1,27 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +MAX_NEW_TOKENS = 128 +model_name = 'decapoda-research/llama-7b-hf' + +text = 'Hamburg is in which country?\n' +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_ids = tokenizer(text, return_tensors="pt").input_ids + +free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) +max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB' + +n_gpus = torch.cuda.device_count() +max_memory = {i: max_memory for i in range(n_gpus)} + +model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map='auto', + load_in_8bit=True, + max_memory=max_memory +) +generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS) +print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) + + + diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ac2ae05..b1f8ffa 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -441,8 +441,8 @@ dim4 = torch.randint(32, 96, size=(n,)).tolist() dim2.append(0) -funcs = [(torch.matmul, bnb.research.matmul_fp8)] -str_funcs = ["matmul"] +funcs = [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)] +str_funcs = ["matmul_fp8_mixed", 'matmul_fp8_global'] req_grad = list(product([True, False], repeat=3)) req_grad_str = [] for c in req_grad: diff --git a/tests/test_functional.py b/tests/test_functional.py index 5a24aeb..81c7535 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -190,6 +190,7 @@ def test_dynamic_blockwise_quantization(): @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) +@pytest.mark.skip("Stochastic has some bugs, but will be deprecated soon anyways.") def test_dynamic_blockwise_stochastic_quantization(blocksize): diffs = [] reldiffs = [] diff --git a/tests/test_modules.py b/tests/test_modules.py index 4fe8b54..67fbc21 100644 
--- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -532,9 +532,9 @@ def test_fp8linear(): h = 1024 inp = torch.randn(b, h).cuda() fp32 = torch.nn.Linear(h, h*2).cuda() - fp8 = bnb.nn.LinearFP8(h, h*2).cuda() + fp8 = bnb.research.nn.LinearFP8Mixed(h, h*2).cuda() fp32b = torch.nn.Linear(h*2, h).cuda() - fp8b = bnb.nn.LinearFP8(h*2, h).cuda() + fp8b = bnb.research.nn.LinearFP8Mixed(h*2, h).cuda() fp8.weight.data.copy_(fp32.weight.data) fp8.bias.data.copy_(fp32.bias.data) From 32f8c89201e85f8405ec263d40baeb6daf84c3cb Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 11:27:31 -0700 Subject: [PATCH 46/97] Added missing example folder. --- examples/int8_inference_huggingface.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 examples/int8_inference_huggingface.py diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py new file mode 100644 index 0000000..dc80a44 --- /dev/null +++ b/examples/int8_inference_huggingface.py @@ -0,0 +1,27 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +MAX_NEW_TOKENS = 128 +model_name = 'decapoda-research/llama-7b-hf' + +text = 'Hamburg is in which country?\n' +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_ids = tokenizer(text, return_tensors="pt").input_ids + +free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) +max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB' + +n_gpus = torch.cuda.device_count() +max_memory = {i: max_memory for i in range(n_gpus)} + +model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map='auto', + load_in_8bit=True, + max_memory=max_memory +) +generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS) +print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) + + + From c3d87e4435c168ab29d9ba50916ef0d3b015fb24 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 12:10:34 -0700 Subject: [PATCH 47/97] Added is_available_triton guard. 
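This guards every Triton kernel module behind an availability check so that importing
bitsandbytes no longer hard-requires Triton, and renames MatMul8bitMixed to SwitchBackBnb.
Each of the six kernel modules now follows the same pattern, sketched here (the
is_triton_available helper itself lands in bitsandbytes/triton/triton_utils.py later in
this series):

    import importlib

    def is_triton_available():
        # Probe for the package without importing it.
        return importlib.util.find_spec("triton") is not None

    if not is_triton_available():
        # Triton-less install: keep the import working, make the kernel a no-op stub.
        def dequantize_rowwise(x, state_x):
            return None
    else:
        import triton
        import triton.language as tl
        # ... real @triton.jit kernel and Python wrapper go here ...
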
--- bitsandbytes/research/autograd/_functions.py | 4 +- bitsandbytes/triton/dequantize_rowwise.py | 106 ++++--- .../triton/int8_matmul_mixed_dequanitze.py | 293 +++++++++--------- .../triton/int8_matmul_rowwise_dequantize.py | 291 ++++++++--------- .../quantize_columnwise_and_transpose.py | 122 ++++---- bitsandbytes/triton/quantize_global.py | 181 +++++------ bitsandbytes/triton/quantize_rowwise.py | 111 +++---- 7 files changed, 572 insertions(+), 536 deletions(-) diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index 4235989..0dff351 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -184,7 +184,7 @@ class MatMulFP8Global(torch.autograd.Function): return grad_A, grad_B, None, None, None, None, None -class MatMul8bitMixed(torch.autograd.Function): +class SwitchBackBnb(torch.autograd.Function): @staticmethod def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # default to pytorch behavior if inputs are empty @@ -408,4 +408,4 @@ def switchback_bnb( state = state or MatmulLtState() if threshold > 0.0: state.threshold = threshold - return MatMul8bitMixed.apply(A, B, out, bias, state) + return SwitchBackBnb.apply(A, B, out, bias, state) diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index 7e31483..e092680 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -1,58 +1,64 @@ import math import torch import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +from bitsandbytes.triton.triton_utils import is_triton_available -# rowwise quantize +if not is_triton_available(): + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): return None +else: -# TODO: autotune this better. -@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _dequantize_rowwise( - x_ptr, - state_x, - output_ptr, - inv_127, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - max_val = tl.load(state_x + pid) - output = max_val * x * inv_127 - tl.store(output_ptr + offsets, output, mask=row_mask) - + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) + # rowwise quantize - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + # TODO: autotune this better. 
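+    # What the kernel computes, as a pure-PyTorch sketch (for illustration,
+    # not part of the kernel): given int8 x of shape (M, N) and the per-row
+    # absmax values state_x of shape (M,),
+    #
+    #     def dequantize_rowwise_ref(x, state_x):
+    #         return state_x.unsqueeze(1) * x.to(torch.float16) * (1.0 / 127.0)
+    #
+    # One Triton program handles one row; P2 pads the row length to the next
+    # power of two because tl.arange only accepts power-of-two ranges.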
+ @triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] + ) + @triton.jit + def _dequantize_rowwise( + x_ptr, + state_x, + output_ptr, + inv_127, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, + ): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + max_val = tl.load(state_x + pid) + output = max_val * x * inv_127 + tl.store(output_ptr + offsets, output, mask=row_mask) + - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py b/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py index 69d4b0c..60a56e6 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py @@ -1,158 +1,163 @@ import torch +from bitsandbytes.triton.triton_utils import is_triton_available -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +if not is_triton_available(): + def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): return None +else: + + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -# This is a matmul kernel based on triton.ops.matmul -# It is modified to support rowwise quantized input and global quantized weight -# It's purpose is fused matmul then dequantize -# It does support bias. + # This is a matmul kernel based on triton.ops.matmul + # It is modified to support rowwise quantized input and global quantized weight + # It's purpose is fused matmul then dequantize + # It does support bias. 
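+    # Dequantize epilogue as a plain-PyTorch sketch (illustration only):
+    # state_x holds one absmax per row of a, state_w a single global absmax
+    # for b, so the int32 accumulator is rescaled as
+    #
+    #     def int8_matmul_mixed_dequantize_ref(a, b, state_x, state_w, bias):
+    #         c = (a.float() @ b.float()) * (1.0 / (127.0 * 127.0))
+    #         c = (c * state_w * state_x.unsqueeze(1)).half()
+    #         return c if bias is None else c + bias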
-def init_to_zero(name): - return lambda nargs: nargs[name].zero_() + def init_to_zero(name): + return lambda nargs: nargs[name].zero_() -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs + def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 
- }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + @triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, + ) + @triton.heuristics({ + 
'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, + }) + @triton.jit + def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - w_factor = tl.load(state_w_ptr) - x_factor = tl.load(state_x_ptr + ram)[:, None] + w_factor = tl.load(state_w_ptr) + x_factor = tl.load(state_x_ptr + ram)[:, None] - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + # conditionally add bias + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) 
- acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - # conditionally add bias - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) + tl.atomic_add(C, acc, mask=mask) -def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): - device = a.device - divfactor = 1. / (127. * 127.) - has_bias = 0 if bias is None else 1 - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_mixed_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c + def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): + device = a.device + divfactor = 1. / (127. * 127.) + has_bias = 0 if bias is None else 1 + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch int8_matmul_mixed_dequantize kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 4af054b..33f4d13 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -1,159 +1,164 @@ import torch -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +from bitsandbytes.triton.triton_utils import is_triton_available -# This is a matmul kernel based on triton.ops.matmul -# It is modified to support rowwise quantized input and columnwise quantized weight -# It's purpose is fused matmul then dequantize -# It does support bias. 
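+# Same fused matmul-then-dequantize as the mixed variant, except the weight
+# scale is per output column instead of a single global value. Epilogue
+# sketch (illustration only):
+#
+#     c = (a.float() @ b.float()) * (1.0 / (127.0 * 127.0))
+#     c = (c * state_w.unsqueeze(0) * state_x.unsqueeze(1)).half()
+#     out = c if bias is None else c + bias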
+if not is_triton_available(): + def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): return None +else: + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() + # This is a matmul kernel based on triton.ops.matmul + # It is modified to support rowwise quantized input and columnwise quantized weight + # It's purpose is fused matmul then dequantize + # It does support bias. + + def init_to_zero(name): + return lambda nargs: nargs[name].zero_() -def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) - return configs + def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs -@triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - 
triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 - }, -) -@triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, -}) -@triton.jit -def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + @triton.autotune( + configs=[ + # basic configs for compute-bound matmuls + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 
'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10 + }, + ) + @triton.heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, + }) + @triton.jit + def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - w_factor = tl.load(state_w_ptr + rbn)[None, :] - x_factor = tl.load(state_x_ptr + ram)[:, None] + w_factor = tl.load(state_w_ptr + rbn)[None, :] + x_factor = tl.load(state_x_ptr + ram)[:, None] - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) + # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + acc += tl.dot(a, b) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + + acc = (w_factor * (x_factor * (acc * divfactor))) + acc = acc.to(C.dtype.element_ty) + + if has_bias: + bias = tl.load(bias + rn).to(C.dtype.element_ty) + acc = acc + bias[None, :] + + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) 
- acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = (w_factor * (x_factor * (acc * divfactor))) - acc = acc.to(C.dtype.element_ty) - - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) + tl.atomic_add(C, acc, mask=mask) -def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - divfactor = 1. / (127. * 127.) + def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): + divfactor = 1. / (127. * 127.) - has_bias = 0 if bias is None else 1 + has_bias = 0 if bias is None else 1 - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_rowwise_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) - return c + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + c = torch.empty((M, N), device=device, dtype=torch.float16) + # accumulator types + ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + # launch int8_matmul_rowwise_dequantize kernel + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + GROUP_M=8, ACC_TYPE=ACC_TYPE) + return c diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index 4e53475..54220d9 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -1,68 +1,74 @@ import math import torch import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +from bitsandbytes.triton.triton_utils import is_triton_available -# This kernel does fused columnwise quantization and transpose. +if not is_triton_available(): + def quantize_columnwise_and_transpose(x: torch.Tensor): return None +else: -# TODO: autotune this better. 
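+    # Semantics sketch (illustration, not part of the kernel): per column j,
+    # scale by that column's absmax and write the quantized column out as
+    # row j, fusing int8 quantization with the transpose needed for the
+    # backward pass:
+    #
+    #     maxs = x.abs().max(dim=0).values.to(torch.float16)
+    #     out = torch.round(127.0 * x / maxs).to(torch.int8).t().contiguous()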
-@triton.autotune( - configs=[ - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_stages=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=16, num_warps=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_columnwise_and_transpose( - x_ptr, - output_ptr, - output_maxs, - n_elements, - M : tl.constexpr, N : tl.constexpr, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid - p2_arange = tl.arange(0, P2) - p2_arange_mask = p2_arange < M - arange = p2_arange * N - offsets = block_start + arange - x = tl.load(x_ptr + offsets, mask=p2_arange_mask) - abs_x = tl.abs(x) - max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - new_start = pid * M - new_offsets = new_start + p2_arange - tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) - tl.store(output_maxs + pid, max_val) + # This kernel does fused columnwise quantization and transpose. -def quantize_columnwise_and_transpose(x: torch.Tensor): - M, N = x.shape - output = torch.empty(N, M, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) + # TODO: autotune this better. + @triton.autotune( + configs=[ + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_stages=16), + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=16, num_warps=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] + ) + @triton.jit + def _quantize_columnwise_and_transpose( + x_ptr, + output_ptr, + output_maxs, + n_elements, + M : tl.constexpr, N : tl.constexpr, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, + ): + pid = tl.program_id(axis=0) + block_start = pid + p2_arange = tl.arange(0, P2) + p2_arange_mask = p2_arange < M + arange = p2_arange * N + offsets = block_start + arange + x = tl.load(x_ptr + offsets, mask=p2_arange_mask) + abs_x = tl.abs(x) + max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. 
* (x / max_val)) - P2 = int(2 ** (math.ceil(math.log2(M)))) + new_start = pid * M + new_offsets = new_start + p2_arange + tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) + tl.store(output_maxs + pid, max_val) - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) - return output, output_maxs + def quantize_columnwise_and_transpose(x: torch.Tensor): + M, N = x.shape + output = torch.empty(N, M, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(M)))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) + return output, output_maxs diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index 229721c..845db6e 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -1,100 +1,107 @@ import math import torch import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time +from bitsandbytes.triton.triton_utils import is_triton_available -# global quantize -@triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), - triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), +if not is_triton_available(): + def quantize_global_transpose(input): return None + def quantize_global(x: torch.Tensor): return None +else: - ], - key=['n_elements'] -) -@triton.jit -def _quantize_global( - x_ptr, - absmax_inv_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - output = tl.libdevice.llrint(127. * (x * absmax_inv)) - tl.store(output_ptr + offsets, output, mask=mask) + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -def quantize_global(x: torch.Tensor): - absmax = x.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax - output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - _quantize_global[grid](x, absmax_inv, output, n_elements) - return output, absmax + # global quantize + @triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), + triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), + + ], + key=['n_elements'] + ) + @triton.jit + def _quantize_global( + x_ptr, + absmax_inv_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, + ): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + output = tl.libdevice.llrint(127. 
* (x * absmax_inv)) + tl.store(output_ptr + offsets, output, mask=mask) + + def quantize_global(x: torch.Tensor): + absmax = x.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + _quantize_global[grid](x, absmax_inv, output, n_elements) + return output, absmax -# global quantize and transpose -@triton.autotune( - configs=[ - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + # global quantize and transpose + @triton.autotune( + configs=[ + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - # ... - ], - key=['M', 'N'] -) -@triton.jit -def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, - GROUP_M : tl.constexpr): - pid = tl.program_id(0) - grid_m = (M + BLOCK_M - 1) // BLOCK_M - grid_n = (N + BLOCK_N - 1) // BLOCK_N - - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // group_size - - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) - mask = (rm < M)[:, None] & (rn < N)[None, :] - a = tl.load(A, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - - # rematerialize to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) - mask = (rm < M)[:, None] & (rn < N)[None, :] + # ... + ], + key=['M', 'N'] + ) + @triton.jit + def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, + GROUP_M : tl.constexpr): + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // group_size + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) + mask = (rm < M)[:, None] & (rn < N)[None, :] + a = tl.load(A, mask=mask) + absmax_inv = tl.load(absmax_inv_ptr) + + # rematerialize to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) + mask = (rm < M)[:, None] & (rn < N)[None, :] - output = tl.libdevice.llrint(127. * (a * absmax_inv)) + output = tl.libdevice.llrint(127. 
* (a * absmax_inv)) - tl.store(B, output, mask=mask) + tl.store(B, output, mask=mask) -def quantize_global_transpose(input): - absmax = input.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax - M, N = input.shape - out = torch.empty(N, M, device='cuda', dtype=torch.int8) - - assert out.size(0) == N and out.size(1) == M - assert input.stride(0) == 1 or input.stride(1) == 1 - assert out.stride(0) == 1 or out.stride(1) == 1 - - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) - _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) - return out, absmax + def quantize_global_transpose(input): + absmax = input.abs().max().unsqueeze(0) + absmax_inv = 1./ absmax + M, N = input.shape + out = torch.empty(N, M, device='cuda', dtype=torch.int8) + + assert out.size(0) == N and out.size(1) == M + assert input.stride(0) == 1 or input.stride(1) == 1 + assert out.stride(0) == 1 or out.stride(1) == 1 + + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) + _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) + return out, absmax diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index d956647..26d2183 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -1,61 +1,68 @@ import math import torch import time -import triton -import triton.language as tl -from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time -# rowwise quantize +from bitsandbytes.triton.triton_utils import is_triton_available -# TODO: autotune this better. -@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] -) -@triton.jit -def _quantize_rowwise( - x_ptr, - output_ptr, - output_maxs, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. 
* (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) +if not is_triton_available(): + def quantize_rowwise(x: torch.Tensor): return None +else: -def quantize_rowwise(x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + import triton + import triton.language as tl + from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + # rowwise quantize - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs + # TODO: autotune this better. + @triton.autotune( + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=['n_elements'] + ) + @triton.jit + def _quantize_rowwise( + x_ptr, + output_ptr, + output_maxs, + n_elements, + BLOCK_SIZE: tl.constexpr, + P2: tl.constexpr, + ): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + arange = tl.arange(0, P2) + offsets = block_start + arange + row_mask = arange < BLOCK_SIZE + x = tl.load(x_ptr + offsets, mask=row_mask) + + abs_x = tl.abs(x) + max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) + output = tl.libdevice.llrint(127. * (x / max_val)) + tl.store(output_ptr + offsets, output, mask=row_mask) + tl.store(output_maxs + pid, max_val) + + def quantize_rowwise(x: torch.Tensor): + output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) + output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) + + P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) + + assert x.is_cuda and output.is_cuda + n_elements = output.numel() + grid = lambda meta: (x.shape[0],) + _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + return output, output_maxs From 5b612bc6dfa131fb0cb27dcae5fd863c15694328 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 12:16:55 -0700 Subject: [PATCH 48/97] Added is_available_triton guard to Triton SwitchBackLinear. --- bitsandbytes/nn/triton_based_modules.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 61e9053..7794fa0 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -3,6 +3,8 @@ import torch.nn as nn import time from functools import partial +from bitsandbytes.triton.triton_utils import is_triton_available + from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise from bitsandbytes.triton.quantize_rowwise import quantize_rowwise from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose @@ -160,6 +162,10 @@ class SwitchBackLinear(nn.Linear): ): super().__init__(in_features, out_features, bias, device, dtype) + if not is_triton_available: + raise ImportError('''Could not import triton. 
Please install triton to use SwitchBackLinear. + Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower''') + # By default, we use the global quantization. self.vectorize = vectorize if self.vectorize: From b8ea2b416d25130ed32a3cf436b8a9f8fd1d412f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 12:28:35 -0700 Subject: [PATCH 49/97] Fixed bias conversion in Linear4bit --- bitsandbytes/nn/modules.py | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index de9e4ac..ab16e01 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -205,45 +205,13 @@ class Linear4bit(nn.Linear): if self.compute_dtype is not None: x = x.to(self.compute_dtype) - bias = None if self.bias is None else self.bias.half(self.compute_dtype) + bias = None if self.bias is None else self.bias.to(self.compute_dtype) out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) out = out.to(inp_dtype) return out - def _save_to_state_dict(self, destination, prefix, keep_vars): - super()._save_to_state_dict(destination, prefix, keep_vars) - - # we only need to save extra state if .cuda was called - # then we have the (1) quantization weight and the (2) quantization config - - #quant_state = getattr(self.weight, 'quant_state', None) - #if quant_state is not None: - # # 2. quantization state - # destination[prefix + 'quant_state'] = quant_state - - #destination[prefix + 'weight'] = self.weight.detach() - - - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, - error_msgs) - #for key in unexpected_keys: - # input_name = key[len(prefix):] - # if input_name == "quant_state": - # if getattr(self.weight, 'quant_state', None) is None: - # # buffers not yet initialized, can't call them directly without - # raise RuntimeError("Loading a quantized checkpoint into non-quantized Linear4bit is " - # "not supported. Please call module.cuda() before module.load_state_dict()") - - # input_param = state_dict[key] - # self.weight.quant_state = input_param - # assert isinstance(self.weight, Param4bit) - # unexpected_keys.remove(key) - class LinearFP4(Linear4bit): def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True): super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4') From 008dfff9b4b25501b487f39af0332e6306ba1ebd Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 12:57:46 -0700 Subject: [PATCH 50/97] Added triton utils. --- bitsandbytes/triton/triton_utils.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 bitsandbytes/triton/triton_utils.py diff --git a/bitsandbytes/triton/triton_utils.py b/bitsandbytes/triton/triton_utils.py new file mode 100644 index 0000000..c74c239 --- /dev/null +++ b/bitsandbytes/triton/triton_utils.py @@ -0,0 +1,4 @@ +import importlib + +def is_triton_available(): + return importlib.util.find_spec("triton") is not None From 9e7cdc9ea95e9756d9f5621a0e2c7e2538363fae Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 12 Apr 2023 13:41:30 -0700 Subject: [PATCH 51/97] Added last SwitchBack refactors. All tests green. 
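This refactor renames the `vectorize` flag to `vector_wise_quantization` and the partial `SwitchBackLinearVectorized` to `SwitchBackLinearVectorwise`. For reference, a minimal usage sketch of the renamed API (illustrative only, not part of this patch; the layer sizes, batch shape, and dtype are arbitrary choices, and a CUDA device with triton installed is assumed):

    import torch
    import bitsandbytes as bnb

    # Global quantization of the weight is the default ...
    layer = bnb.nn.SwitchBackLinearGlobal(128, 512).cuda().half()

    # ... row-wise (vector-wise) quantization is selected with the renamed flag.
    layer_vw = bnb.nn.SwitchBackLinear(128, 512, vector_wise_quantization=True).cuda().half()

    x = torch.randn(13, 128, device='cuda', dtype=torch.float16)
    out = layer_vw(x)  # int8 matmul with row-wise quantized activations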
--- CHANGELOG.md | 7 +++++++ bitsandbytes/nn/__init__.py | 2 +- bitsandbytes/nn/triton_based_modules.py | 18 +++++++++--------- setup.py | 2 +- tests/test_triton.py | 16 ++++++++-------- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5399c02..2de70d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -221,3 +221,10 @@ Improvements: Deprecated: - Devices with compute capability 3.0 (GTX 700s, K10) and 3.2 (Tegra K1, Jetson TK1) are now deprecated and support will be removed in 0.39.0. - Support for CUDA 10.0 and 10.2 will be removed in bitsandbytes 0.39.0 + + +### 0.38.1 + +Features: + - Added Int8 SwitchBack layers + - Added Fake FP8 layers for research purposes (available under `bnb.research.nn. ...`) diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index ec944a3..f51f600 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .modules import Int8Params, Linear8bitLt, StableEmbedding, OutlierAwareLinear, SwitchBackLinearBnb -from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorized, StandardLinear +from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorwise, StandardLinear diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 7794fa0..6fbf583 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -157,7 +157,7 @@ class SwitchBackLinear(nn.Linear): bias: bool = True, device=None, dtype=None, - vectorize: bool = False, + vector_wise_quantization: bool = False, mem_efficient : bool = False, ): super().__init__(in_features, out_features, bias, device, dtype) @@ -167,11 +167,11 @@ class SwitchBackLinear(nn.Linear): Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower''') # By default, we use the global quantization. 
- self.vectorize = vectorize - if self.vectorize: + self.vector_wise_quantization = vector_wise_quantization + if self.vector_wise_quantization: self._fn = _switchback_vectorrize if mem_efficient: - print('mem efficient is not supported for vectorize mode.') + print('mem efficient is not supported for vector-wise quantization.') exit(1) else: if mem_efficient: @@ -188,7 +188,7 @@ class SwitchBackLinear(nn.Linear): # m.prepare_for_eval() # model.apply(cond_prepare) print('=> preparing for eval.') - if self.vectorize: + if self.vector_wise_quantization: W_int8, state_W = quantize_rowwise(self.weight) else: W_int8, state_W = quantize_global(self.weight) @@ -210,7 +210,7 @@ class SwitchBackLinear(nn.Linear): X = x.view(-1, x.size(-1)) X_int8, state_X = quantize_rowwise(X) - if self.vectorize: + if self.vector_wise_quantization: return int8_matmul_rowwise_dequantize( X_int8, self.W_int8.t(), state_X, self.state_W, self.bias ).view(*x.size()[:-1], -1) @@ -219,9 +219,9 @@ class SwitchBackLinear(nn.Linear): X_int8, self.W_int8.t(), state_X, self.state_W, self.bias ).view(*x.size()[:-1], -1) -SwitchBackLinearGlobal = partial(SwitchBackLinear, vectorize=False) -SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vectorize=False, mem_efficient=True) -SwitchBackLinearVectorized = partial(SwitchBackLinear, vectorize=True) +SwitchBackLinearGlobal = partial(SwitchBackLinear, vector_wise_quantization=False) +SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vector_wise_quantization=False, mem_efficient=True) +SwitchBackLinearVectorwise = partial(SwitchBackLinear, vector_wise_quantization=True) # This is just the standard linear function. class StandardLinearFunction(torch.autograd.Function): diff --git a/setup.py b/setup.py index e514463..009fd3d 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def read(fname): setup( name=f"bitsandbytes", - version=f"0.38.0.post2", + version=f"0.38.1", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="8-bit optimizers and matrix multiplication routines.", diff --git a/tests/test_triton.py b/tests/test_triton.py index 7f56a49..e18c7a9 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -1,19 +1,19 @@ import pytest import torch +from bitsandbytes.triton.triton_utils import is_triton_available from bitsandbytes.nn.triton_based_modules import SwitchBackLinear from bitsandbytes.nn import Linear8bitLt - -@pytest.mark.skipif(not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, reason="This test requires a GPU with compute capability 8.0 or higher.") -@pytest.mark.parametrize("vectorrize", [False, True]) -def test_switchback(vectorrize): - for dim in [83, 17, 128]: - for batch in [13, 128, 256]: +@pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, + reason="This test requires triton and a GPU with compute capability 8.0 or higher.") +@pytest.mark.parametrize("vector_wise_quantization", [False, True]) +def test_switchback(vector_wise_quantization): + for dim in [83]: + for batch in [13]: standard = torch.nn.Linear(dim, 4 * dim).cuda().half() - print('vectorrize', vectorrize) - switchback = SwitchBackLinear(dim, 4 * dim, vectorize=vectorrize).cuda().half() + switchback = SwitchBackLinear(dim, 4 * dim, vector_wise_quantization=vector_wise_quantization).cuda().half() baseline = Linear8bitLt(dim, 4 * dim).cuda().half() switchback.weight.data.copy_(standard.weight) switchback.bias.data.copy_(standard.bias) From 
7dc198feb7b68f08790823a06d42c7500ff446fa Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 17 Apr 2023 18:01:49 -0700 Subject: [PATCH 52/97] Added 32-bit optimizer for bfloat16 gradients. --- bitsandbytes/cextension.py | 2 +- bitsandbytes/functional.py | 89 +++++++++++++++----------------- bitsandbytes/nn/modules.py | 7 +++ csrc/kernels.cu | 7 ++- csrc/ops.cu | 1 + csrc/pythonInterface.c | 10 +++-- tests/test_optim.py | 35 +++++---------- 7 files changed, 65 insertions(+), 86 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index e2ca978..8adca93 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -23,7 +23,7 @@ try: CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment! If you cannot find any issues and suspect a bug, please open an issue with details about your environment: https://github.com/TimDettmers/bitsandbytes/issues''') - lib.cadam32bit_g32 + lib.cadam_8bit_blockwise_fp32 lib.get_context.restype = ct.c_void_p lib.get_cusparse.restype = ct.c_void_p COMPILED_WITH_CUDA = True diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index b168606..ff0eb7e 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -28,7 +28,7 @@ name2qmap = {} if COMPILED_WITH_CUDA: """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = {} - str2optimizer32bit["adam"] = (lib.cadam32bit_g32, lib.cadam32bit_g16) + str2optimizer32bit["adam"] = (lib.cadam32bit_gfp32, lib.cadam32bit_gfp16, lib.cadam32bit_gbf16) str2optimizer32bit["momentum"] = ( lib.cmomentum32bit_g32, lib.cmomentum32bit_g16, @@ -41,11 +41,6 @@ if COMPILED_WITH_CUDA: lib.cadagrad32bit_g32, lib.cadagrad32bit_g16, ) - str2optimizer32bit["lars"] = ( - lib.cmomentum32bit_g32, - lib.cmomentum32bit_g16, - ) - str2optimizer32bit["lamb"] = (lib.cadam32bit_g32, lib.cadam32bit_g16) str2optimizer8bit = {} str2optimizer8bit["adam"] = ( @@ -998,53 +993,37 @@ def optimizer_update_32bit( if max_unorm > 0.0: param_norm = torch.norm(p.data.float()) - if optimizer_name not in str2optimizer32bit: - raise NotImplementedError( - f'Optimizer not implemented: {optimizer_name}.
Choices: {",".join(str2optimizer32bit.keys())}' - ) - if g.dtype == torch.float32 and state1.dtype == torch.float32: - str2optimizer32bit[optimizer_name][0]( - get_ptr(g), - get_ptr(p), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_float(weight_decay), - ct.c_int32(step), - ct.c_float(lr), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), - ) - elif g.dtype == torch.float16 and state1.dtype == torch.float32: - str2optimizer32bit[optimizer_name][1]( - get_ptr(g), - get_ptr(p), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_float(weight_decay), - ct.c_int32(step), - ct.c_float(lr), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), - ) + optim_func = None + if g.dtype == torch.float32: + optim_func = str2optimizer32bit[optimizer_name][0] + elif g.dtype == torch.float16: + optim_func = str2optimizer32bit[optimizer_name][1] + elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3): + optim_func = str2optimizer32bit[optimizer_name][2] else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" - ) + raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}") + + is_on_gpu([g, p, state1, state2, unorm_vec]) + prev_device = pre_call(g.device) + optim_func( + get_ptr(g), + get_ptr(p), + get_ptr(state1), + get_ptr(state2), + get_ptr(unorm_vec), + ct.c_float(max_unorm), + ct.c_float(param_norm), + ct.c_float(beta1), + ct.c_float(beta2), + ct.c_float(eps), + ct.c_float(weight_decay), + ct.c_int32(step), + ct.c_float(lr), + ct.c_float(gnorm_scale), + ct.c_bool(skip_zeros), + ct.c_int32(g.numel())) + post_call(prev_device) def optimizer_update_8bit( @@ -1199,12 +1178,12 @@ def optimizer_update_8bit_blockwise( optim_func = None if g.dtype == torch.float32 and state1.dtype == torch.uint8: - optimizer_func = str2optimizer8bit_blockwise[optimizer_name][0] + optim_func = str2optimizer8bit_blockwise[optimizer_name][0] elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - optimizer_func = str2optimizer8bit_blockwise[optimizer_name][1] + optim_func = str2optimizer8bit_blockwise[optimizer_name][1] elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and len(str2optimizer8bit_blockwise[optimizer_name])==3): - optimizer_func = str2optimizer8bit_blockwise[optimizer_name][2] + optim_func = str2optimizer8bit_blockwise[optimizer_name][2] else: raise ValueError( f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" @@ -1213,7 +1192,7 @@ def optimizer_update_8bit_blockwise( is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2]) prev_device = pre_call(g.device) - optimizer_func( + optim_func( get_ptr(p), get_ptr(g), get_ptr(state1), diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index ab16e01..24f5070 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -178,6 +178,13 @@ class Params4bit(torch.nn.Parameter): s[0] = s[0].to(device) if self.compress_statistics: # TODO: refactor this. 
This is a nightmare + # for 4-bit: + # state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type] + # state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type] + #s[-2][0] = s[-2][0].to(device) # offset + #s[-2][1][0] = s[-2][1][0].to(device) # nested absmax + + # for 8-bit s[-2][0] = s[-2][0].to(device) # offset s[-2][1][0] = s[-2][1][0].to(device) # nested quantization state statistics s[-2][1][1] = s[-2][1][1].to(device) # nested quantization codebook diff --git a/csrc/kernels.cu b/csrc/kernels.cu index c35acc8..2d940be 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2981,12 +2981,15 @@ template __global__ void kPreconditionOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm, + const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); template __global__ void kOptimizer32bit2State<half, ADAM>(half* g, half* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm, const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); -template __global__ void kOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm, +template __global__ void kOptimizer32bit2State<__nv_bfloat16, ADAM>(__nv_bfloat16* g, __nv_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm, const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); #define MAKE_PreconditionStatic8bit1State(oname, gtype) \ diff --git a/csrc/ops.cu b/csrc/ops.cu index de14039..76777ae 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -703,6 +703,7 @@ template void optimizer32bit(gtype* g, gtype* p, \ MAKE_optimizer32bit(ADAM, half) MAKE_optimizer32bit(ADAM, float) +MAKE_optimizer32bit(ADAM, __nv_bfloat16) MAKE_optimizer32bit(MOMENTUM, half) MAKE_optimizer32bit(MOMENTUM, float) MAKE_optimizer32bit(RMSPROP, half) diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index d169178..0e9106c 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -29,8 +29,9 @@ void fname##32bit_g##gbits(gtype *g, gtype *p, \ MAKE_FUNC32(momentum, MOMENTUM, float, 32) MAKE_FUNC32(momentum, MOMENTUM, half, 16) -MAKE_FUNC32(adam, ADAM, float, 32) -MAKE_FUNC32(adam, ADAM, half, 16) +MAKE_FUNC32(adam, ADAM, float, fp32) +MAKE_FUNC32(adam, ADAM, half, fp16) +MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16) MAKE_FUNC32(rmsprop, RMSPROP, float, 32) MAKE_FUNC32(rmsprop, RMSPROP, half, 16) MAKE_FUNC32(adagrad, ADAGRAD, float, 32) @@ -173,8 +174,9 @@ extern "C" const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) \ { name##32bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \ - MAKE_CFUNC32(adam, float, 32) - MAKE_CFUNC32(adam, half, 16) + MAKE_CFUNC32(adam, float, fp32) + MAKE_CFUNC32(adam, half, fp16) + MAKE_CFUNC32(adam, __nv_bfloat16, bf16) MAKE_CFUNC32(momentum, float, 32) MAKE_CFUNC32(momentum, half, 16) MAKE_CFUNC32(rmsprop, float, 32) diff --git a/tests/test_optim.py b/tests/test_optim.py index 83390a4..a13b332 100644 ---
a/tests/test_optim.py +++ b/tests/test_optim.py @@ -44,10 +44,6 @@ str2optimizers["momentum"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9, block_wise=False), ) -str2optimizers["lars"] = ( - lambda pxx: bnb.optim.PytorchLARS(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.LARS(pxx, 0.01, 0.9), -) str2optimizers["rmsprop"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9, block_wise=False), @@ -64,10 +60,6 @@ str2optimizers["rmsprop8bit"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9, block_wise=False), ) -str2optimizers["lars8bit"] = ( - lambda pxx: bnb.optim.PytorchLARS(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.LARS8bit(pxx, 0.01, 0.9), -) str2optimizers["adam8bit_blockwise"] = ( torch.optim.Adam, @@ -85,7 +77,6 @@ str2optimizers["rmsprop8bit_blockwise"] = ( str2statenames = {} str2statenames["adam"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["momentum"] = [("momentum_buffer", "state1")] -str2statenames["lars"] = [("momentum_buffer", "state1")] str2statenames["lamb"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["rmsprop"] = [("square_avg", "state1")] str2statenames["adam8bit"] = [ @@ -106,7 +97,6 @@ str2statenames["momentum8bit"] = [ str2statenames["momentum8bit_blockwise"] = [ ("momentum_buffer", "state1", "qmap1", "absmax1") ] -str2statenames["lars8bit"] = [("momentum_buffer", "state1", "qmap1", "max1")] str2statenames["rmsprop8bit"] = [("square_avg", "state1", "qmap1", "max1")] str2statenames["rmsprop8bit_blockwise"] = [ ("square_avg", "state1", "qmap1", "absmax1") @@ -114,14 +104,10 @@ str2statenames["rmsprop8bit_blockwise"] = [ dim1 = [1024] dim2 = [32, 1024, 4097, 1] -gtype = [torch.float32, torch.float16] -optimizer_names = ["adam", "momentum", "rmsprop", "lars"] +gtype = [torch.float32, torch.float16, torch.bfloat16] +optimizer_names = ["adam", "momentum", "rmsprop"] values = list(product(dim1, dim2, gtype, optimizer_names)) -names = [ - "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values -] - - +names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer32bit(dim1, dim2, gtype, optim_name): if dim1 == 1 and dim2 == 1: @@ -135,6 +121,8 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): if gtype == torch.float32: atol, rtol = 1e-6, 1e-5 + elif gtype == torch.bfloat16: + atol, rtol = 1e-3, 1e-2 else: atol, rtol = 1e-4, 1e-3 @@ -173,14 +161,14 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): rtol=rtol, ) - if gtype == torch.float16: + if gtype != torch.float32: # the adam buffers should also be close because they are 32-bit # but the parameters can diverge because they are 16-bit # the difference grows larger and larger with each update # --> copy the state to keep weights close - p1.data = p1.data.half().float() + p1.data = p1.data.to(p2.dtype).float() p2.copy_(p1.data) - torch.testing.assert_allclose(p1.half(), p2) + torch.testing.assert_allclose(p1.to(p2.dtype), p2) if optim_name in ["lars", "lamb"]: assert bnb_optimizer.state[p2]["unorm_vec"] > 0.0 @@ -246,7 +234,6 @@ optimizer_names = [ "momentum8bit", "rmsprop8bit", "adam8bit_blockwise", - "lars8bit", "momentum8bit_blockwise", "rmsprop8bit_blockwise", ] @@ -321,10 +308,10 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): relerr = err / torch.abs(p1) if g.dtype == torch.bfloat16: assert
err.mean() < 0.00015 - assert relerr.mean() < 0.0015 + assert relerr.mean() < 0.0016 else: - assert err.mean() < 0.0001 - assert relerr.mean() < 0.001 + assert err.mean() < 0.00012 + assert relerr.mean() < 0.0012 errors.append(err.mean().item()) relerrors.append(relerr.mean().item()) From 0f9d30207f7a86c6be17f8fd897f0716db32cdfd Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 19 Apr 2023 11:48:47 -0700 Subject: [PATCH 53/97] Added nested quantization for blockwise quantization. --- bitsandbytes/functional.py | 25 +++++++++---- tests/test_functional.py | 72 ++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 42 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index ff0eb7e..eb49800 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -541,7 +541,7 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n return out -def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, rand=None, out: Tensor = None, blocksize=4096) -> Tensor: +def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, rand=None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor: """ Quantize tensor A in blocks of size 4096 values. @@ -586,7 +586,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra out = torch.zeros_like(A, dtype=torch.uint8) if A.device.type != 'cpu': - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32] + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] cblocksize = ct.c_int32(blocksize) prev_device = pre_call(A.device) code = code.to(A.device) @@ -616,7 +616,15 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra assert rand is None lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) - state = [absmax, code, blocksize] + if nested: + offset = absmax.mean() + absmax -= offset + qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False) + state = [qabsmax, code, blocksize, nested, offset, state2] + else: + state = [absmax, code, blocksize, nested, None, None] + + return out, state @@ -628,6 +636,7 @@ def dequantize_blockwise( code: Tensor = None, out: Tensor = None, blocksize: int = 4096, + nested=False ) -> Tensor: """ Dequantizes blockwise quantized values. @@ -665,13 +674,15 @@ def dequantize_blockwise( if quant_state is None: quant_state = (absmax, code, blocksize) else: - absmax, code, blocksize = quant_state - + absmax, code, blocksize, nested, offset, state2 = quant_state + if nested: + absmax = dequantize_blockwise(absmax, state2) + absmax += offset if A.device.type != 'cpu': device = pre_call(A.device) code = code.to(A.device) - if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64, 32]: + if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: raise ValueError(f"The blocksize of {blocksize} is not supported.
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: @@ -736,7 +747,7 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz if out is None: out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device) - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32] + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] prev_device = pre_call(A.device) is_on_gpu([A, out, absmax]) diff --git a/tests/test_functional.py b/tests/test_functional.py index 61ea712..82f6a71 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -150,42 +150,44 @@ def test_dynamic_quantization(): assert diff < 0.004 -def test_dynamic_blockwise_quantization(): - #print('') - for blocksize in [4096, 2048, 1024, 512, 256, 128, 64, 32]: - diffs = [] - reldiffs = [] - for i in range(100): - A1 = torch.randn(1024, 1024, device="cuda") - C, S = F.quantize_blockwise(A1, blocksize=blocksize) - A2 = F.dequantize_blockwise(C, S, blocksize=blocksize) - diff = torch.abs(A1 - A2) - reldiff = diff / torch.abs(A1 + 1e-8) - diffs.append(diff.mean().item()) - reldiffs.append(reldiff.mean().item()) - abserr = sum(diffs)/len(diffs) - relerr = sum(reldiffs)/len(reldiffs) - assert abserr < 0.011 - assert relerr < 0.018 - #print('randn', blocksize, sum(diffs)/len(diffs)) - #print('randn', blocksize, sum(reldiffs)/len(reldiffs)) - diffs = [] - for i in range(100): - A1 = torch.rand(1024, 1024, device="cuda") - C, S = F.quantize_blockwise(A1, blocksize=blocksize) - A2 = F.dequantize_blockwise(C, S, blocksize=blocksize) - diff = torch.abs(A1 - A2) - reldiff = diff / torch.abs(A1 + 1e-8) - diffs.append(diff.mean().item()) - reldiffs.append(reldiff.mean().item()) - #torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0) - abserr = sum(diffs)/len(diffs) - relerr = sum(reldiffs)/len(reldiffs) - assert abserr < 0.0035 - assert relerr < 0.015 - #print('rand', blocksize, sum(diffs)/len(diffs)) - #print('rand', blocksize, sum(reldiffs)/len(reldiffs)) +@pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) +@pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) +def test_dynamic_blockwise_quantization(nested, blocksize): + #print('') + diffs = [] + reldiffs = [] + for i in range(100): + A1 = torch.randn(1024, 1024, device="cuda") + C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested) + A2 = F.dequantize_blockwise(C, S) + diff = torch.abs(A1 - A2) + reldiff = diff / torch.abs(A1 + 1e-8) + diffs.append(diff.mean().item()) + reldiffs.append(reldiff.mean().item()) + abserr = sum(diffs)/len(diffs) + relerr = sum(reldiffs)/len(reldiffs) + assert abserr < 0.011 + assert relerr < 0.018 + print('nested=', nested, 'randn', blocksize, sum(diffs)/len(diffs)) + print('nested=', nested, 'randn', blocksize, sum(reldiffs)/len(reldiffs)) + + diffs = [] + reldiffs = [] + for i in range(100): + A1 = torch.rand(1024, 1024, device="cuda") + C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested) + A2 = F.dequantize_blockwise(C, S) + diff = torch.abs(A1 - A2) + reldiff = diff / torch.abs(A1 + 1e-8) + diffs.append(diff.mean().item()) + reldiffs.append(reldiff.mean().item()) + #torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0) + abserr = sum(diffs)/len(diffs) + relerr = sum(reldiffs)/len(reldiffs) + assert abserr < 0.0035 + assert relerr < 0.015 + print('nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) + print('nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) def
test_dynamic_blockwise_stochastic_quantization(): From 6bfd7a405f7ccea4c40fb54c8fd0c179984ac506 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 25 Apr 2023 16:13:43 -0700 Subject: [PATCH 54/97] Initial template. --- Makefile | 13 ++++++++++++- csrc/kernels.cu | 25 +++++++++++++++++++++++++ csrc/kernels.cuh | 2 ++ csrc/ops.cu | 12 ++++++++++++ csrc/ops.cuh | 2 ++ 5 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e114160..a377f65 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include +INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib # NVIDIA NVCC compilation flags @@ -61,7 +62,7 @@ CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 all: $(BUILD_DIR) env - $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) @@ -100,6 +101,11 @@ cuda11x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) +cuda11x_cutlass: $(BUILD_DIR) env cutlass + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o @@ -121,6 +127,11 @@ env: @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" @echo "============================" +cutlass: + if [ ! 
-d "$(ROOT_DIR)/dependencies/cutlass" ]; then \ + git clone https://github.com/NVIDIA/cutlass.git $(ROOT_DIR)/dependencies/cutlass; \ + fi \ + $(BUILD_DIR): mkdir -p build mkdir -p dependencies diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 2d940be..5d2a58e 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2919,10 +2919,35 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * } } + +template __global__ void kMatmul_inference_4bit(INPT *A, unsigned char *B, OUTT *out, int lda, int ldb, int rowsA, int colsA, int colsB) +{ +// element-wise kernel +// 1. Load batch x k into registers +// 2. Load k x k into registers +// 3. dequantize and store in second pair of k x k +// 4. matmul +// 5. sum with cub +// 6. store outputs +// TC kernel +// use k warps per thread block +// 1. threadblock use read-only cache to read in register tile for A into shared memory +// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments +// 3. each warp reads a segment of values 16x32 from B +// 4. do dequantization from register of B into second pair of registers +// 5. store (4) into fragment +// 6. matmul aggregate into fragment C +// 7. aggreecate files of C into shared memroy block C +// 8. sum (7) +// 9. write outputs to matmul output matrix +} + + //============================================================== // TEMPLATE DEFINITIONS //============================================================== +template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index ed549cb..ecf3a09 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -9,6 +9,8 @@ #ifndef kernels #define kernels +template __global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB); + template__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n); __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n); diff --git a/csrc/ops.cu b/csrc/ops.cu index 76777ae..022f397 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -90,6 +90,17 @@ template void dequantizeBlockwise(float *code, unsign CUDA_CHECK_RETURN(cudaPeekAtLastError()); } + +void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB) +{ + int num_blocks = (colsB+32-1)/32; + kMatmul_inference_4bit<<>>(A, B, out, lda, ldb, rowsA, colsA, colsB); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + +template __global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *C, int lda, int ldb, int rowsA, int colsA, int colsB); + + template void optimizer32bit(T* g, T* p, float* state1, float* state2, float *unorm, float max_unorm, float param_norm, const float beta1, const float beta2, const float eps, const float weight_decay, @@ -653,6 +664,7 @@ template void extractOutliers(char * A, int *idx, char *out, int id CUDA_CHECK_RETURN(cudaPeekAtLastError()); } + //============================================================== // TEMPLATE DEFINITIONS 
//============================================================== diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f73d4e0..137320b 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -183,4 +183,6 @@ template void spmm_coo_very_sparse_naive(int *max_count, template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); +void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); + #endif From 6e2544da251ccf281d5d88611d2cb5c13bcf42a6 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 25 Apr 2023 16:15:44 -0700 Subject: [PATCH 55/97] Added cutlass example. --- csrc/kernels.cu | 134 ++++++++++++++++++++++++++++++++++++++++++++++++ csrc/ops.cu | 57 ++++++++++++++++++++ 2 files changed, 191 insertions(+) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 5d2a58e..a108772 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2942,6 +2942,140 @@ template __global // 9. write outputs to matmul output matrix } +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 +# include "cutlass/util/cublas_wrappers.hpp" +#endif +#include "cutlass/util/helper_cuda.hpp" + +template +__global__ static +__launch_bounds__(decltype(size(CThreadLayout{}))::value) +void +gemm_device(MShape M, NShape N, KShape K, + TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, + TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, + TC * C, CStride dC, CBlockLayout , CThreadLayout tC, + Alpha alpha, Beta beta) +{ + using namespace cute; + using X = Underscore; + + // Preconditions + CUTE_STATIC_ASSERT(is_static::value); + CUTE_STATIC_ASSERT(is_static::value); + CUTE_STATIC_ASSERT(is_static::value); + + CUTE_STATIC_ASSERT(is_static::value); + CUTE_STATIC_ASSERT(is_static::value); + CUTE_STATIC_ASSERT(is_static::value); + + CUTE_STATIC_ASSERT_V(size(tA) == size(tC)); + CUTE_STATIC_ASSERT_V(size(tB) == size(tC)); + + //CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC)); // BLK_M + //CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC)); // BLK_N + CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K + + // Shared memory buffers + __shared__ TA smemA[cosize_v]; + __shared__ TB smemB[cosize_v]; + auto sA = make_tensor(make_smem_ptr(smemA), blockA); // (BLK_M,BLK_K) + auto sB = make_tensor(make_smem_ptr(smemB), blockB); // (BLK_N,BLK_K) + + // Represent the full tensors + auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K) + auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB); // (N,K) + auto mC = make_tensor(make_gmem_ptr(C), make_shape(M,N), dC); // (M,N) + + // Get the appropriate blocks for this thread block -- + // potential for thread block locality + auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB));// (BLK_M,BLK_N,BLK_K) + auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + + auto gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + auto gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + auto gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // + // Partition the copying of A and B tiles across the threads + // + + // TUTORIAL: Example of simple partitioning of A|B tiles over tA|tB + // Default is a raked partition, but can be changed with Step parameter + + auto tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k) + auto tAsA = 
local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) + + auto tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k) + auto tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) + + // + // Define C accumulators and A/B partitioning + // + + // TUTORIAL: Example of partitioning via projections of tC + + // Partition sA (M,K) by the rows of tC + auto tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K) + // Partition sB (N,K) by the cols of tC + auto tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K) + // Partition gC (M,N) by the tile of tC + auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) + + // Allocate the accumulators -- same size as the projected data + auto tCrC = make_fragment_like(tCgC); // (THR_M,THR_N) + + // Clear the accumulators + clear(tCrC); + +#if 1 + + // TUTORIAL: Example of a very simple compute loop + // Data is read from global to shared memory via the tA|tB partitioning + // gemm(.) operates on the shared memory directly via the tC partitioning + + auto k_max = size<2>(tAgA); + + for (int k = 0; k < k_max; ++k) + { + // Copy gmem to smem + copy(tAgA(_,_,k), tAsA); + copy(tBgB(_,_,k), tBsB); + + // In case copy uses cp.async, make sure that the cp.async + // instructions are ordered with respect to other cp.async + // instructions (fence), then wait on all the outstanding copy + // operations (wait<0>()). __syncthreads() alone does not do + // this. + // + // NOTE: cp_async_wait<0>() currently issues cp.async.wait_all. + // This is equivalent to cp.async.commit_group followed by + // cp.async_wait_group 0. This should make the first + // cp_async_fence() (which also issues cp.async.commit_group) + // redundant. The tutorial works as-is, so we'll leave the + // redundant fence in for now and study its removal later. 
+ cp_async_fence(); + cp_async_wait<0>(); + + __syncthreads(); + + // Compute gemm on smem + gemm(tCsA, tCsB, tCrC); + + __syncthreads(); + } + +#endif + + axpby(alpha, tCrC, beta, tCgC); +} + //============================================================== // TEMPLATE DEFINITIONS diff --git a/csrc/ops.cu b/csrc/ops.cu index 022f397..1204cbd 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -665,6 +665,63 @@ template void extractOutliers(char * A, int *idx, char *out, int id } + +#include +#include + +#include + +template +void +gemm(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + + // Define strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); + auto dB = make_stride(Int<1>{}, ldB); + auto dC = make_stride(Int<1>{}, ldC); + + // Define block sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + + // Define the block layouts (static) + auto sA = make_layout(make_shape(bM,bK)); + auto sB = make_layout(make_shape(bN,bK)); + auto sC = make_layout(make_shape(bM,bN)); + + // Define the thread layouts (static) + auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); + auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); + auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); + + dim3 dimBlock(size(tC)); + dim3 dimGrid(ceil_div(size(M), size(bM)), + ceil_div(size(N), size(bN))); + gemm_device + <<< dimGrid, dimBlock, 0, stream >>> + (M, N, K, + A, dA, sA, tA, + B, dB, sB, tB, + C, dC, sC, tC, + alpha, beta); +} + + //============================================================== // TEMPLATE DEFINITIONS //============================================================== From 84964db93789c66fbe8b2c150fb1f9f953781137 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 25 Apr 2023 17:15:51 -0700 Subject: [PATCH 56/97] CUTLASS compiles. 
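The gemm() wrapper above fixes its tile shapes at compile time: 128x128 output tiles, staged through 128x8 shared-memory tiles of A and B, computed by a 16x16 thread layout. A small Python sketch of the resulting launch geometry (illustrative only, not part of this patch; `launch_geometry` is a hypothetical helper mirroring the static sizes chosen in gemm()):

    import math

    def launch_geometry(M, N, bM=128, bN=128, tC=(16, 16)):
        # One thread block per (bM x bN) tile of C; each block runs
        # tC[0] * tC[1] threads that stage A/B tiles through shared
        # memory and accumulate their C partition in registers.
        dim_grid = (math.ceil(M / bM), math.ceil(N / bN))
        dim_block = tC[0] * tC[1]
        return dim_grid, dim_block

    # e.g. a 4096x4096 output: a 32x32 grid of blocks, 256 threads each
    print(launch_geometry(4096, 4096))  # ((32, 32), 256)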
--- Makefile | 7 ++++--- bitsandbytes/functional.py | 4 ++-- bitsandbytes/nn/modules.py | 1 + csrc/kernels.cu | 18 ++++++++++++------ csrc/ops.cu | 4 +--- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index a377f65..7e8be41 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,8 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) -GPP:= /usr/bin/g++ +#GPP:= /usr/bin/g++ +GPP:= /sw/gcc/11.2.0/bin/g++ ifeq ($(CUDA_HOME),) CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) endif @@ -25,7 +26,7 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include -INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include +INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include -I $(ROOT_DIR)/dependencies/cutlass/tools/util/include/ -I $(ROOT_DIR)/dependencies/cutlass/include/cute/util/ LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib # NVIDIA NVCC compilation flags @@ -104,7 +105,7 @@ cuda11x: $(BUILD_DIR) env cuda11x_cutlass: $(BUILD_DIR) env cutlass $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index eb49800..80725b1 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -176,7 +176,7 @@ def create_custom_map(seed=0, scale=0.01): #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207 #v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30 #v = [1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293 - v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88 + #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88 # 7B evo start #v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06 @@ -186,7 +186,7 @@ def create_custom_map(seed=0, scale=0.01): # 13B evo start #v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 
0.4949093928311768, 0.30920472033044544, 0.15391602735952042] #v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283] - #v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908] + v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908] # mean evo 7B + 13B #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237] diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 24f5070..287a467 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -228,6 +228,7 @@ class LinearNF4(Linear4bit): super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4') + class Int8Params(torch.nn.Parameter): def __new__( cls, diff --git a/csrc/kernels.cu b/csrc/kernels.cu index a108772..ed7d6b2 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -12,6 +12,14 @@ #include #include #include +#include +#include + +#include +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/cublas_wrappers.hpp" +#include "cutlass/util/helper_cuda.hpp" #define HLF_MAX 65504 #define TH 1024 @@ -2709,7 +2717,7 @@ template @@ -2813,7 +2821,7 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o float valB = local_valsB[k]; float valA = local_valA[i]; if(valB != 0.0 && valA != 0.0) - local_valC[j+k] = (float)local_valC[j+k] + ((float)smem_dequant_stats[idx+k-local_idx_col_B_offset])*C*valB*valA; + local_valC[j+k] = (float)local_valC[j+k] + ((float)smem_dequant_stats[idx+k-local_idx_col_B_offset])*DENORM*valB*valA; } else local_valC[j+k] = (float)local_valC[j+k] + (float)local_valsB[k]*(float)local_valA[i]; @@ -2960,7 +2968,7 @@ void gemm_device(MShape M, NShape N, KShape K, TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, - TC * C, CStride dC, CBlockLayout , CThreadLayout tC, + TC * out, CStride dC, CBlockLayout , CThreadLayout tC, Alpha alpha, Beta beta) { using namespace cute; @@ -2991,7 +2999,7 @@ gemm_device(MShape M, NShape N, KShape K, // Represent the full tensors auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K) auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB); // (N,K) - auto mC = make_tensor(make_gmem_ptr(C), make_shape(M,N), dC); // (M,N) + auto mC = make_tensor(make_gmem_ptr(out), make_shape(M,N), dC); // (M,N) // Get the appropriate blocks for this thread block -- // potential for thread block locality @@ -3034,7 +3042,6 @@ gemm_device(MShape M, NShape N, KShape K, // Clear the accumulators clear(tCrC); -#if 1 // TUTORIAL: Example of a very simple compute loop // Data is read from global to shared memory via the tA|tB partitioning @@ -3071,7 +3078,6 @@ gemm_device(MShape M, NShape N, KShape K, __syncthreads(); } -#endif axpby(alpha, tCrC, beta, tCgC); } diff --git a/csrc/ops.cu b/csrc/ops.cu index 1204cbd..a3a7c29 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -666,11 +666,9 @@ template void extractOutliers(char * A, int *idx, char *out, int id -#include -#include - #include + template void From 0afc8e9e2f2a0a2ca707057fe6523bed98451bb6 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Wed, 26 Apr 2023 17:12:34 
-0700 Subject: [PATCH 57/97] Best attempt at cutlass3. --- Makefile | 8 +-- bitsandbytes/functional.py | 98 +++++++++++++++++++++++++++++ csrc/kernels.cu | 126 ++++++++++++++++++++++--------------- csrc/kernels.cuh | 22 ++++++- csrc/ops.cu | 73 +++++++++------------ csrc/ops.cuh | 12 ++++ csrc/pythonInterface.c | 18 ++++++ tests/test_functional.py | 21 +++++++ 8 files changed, 279 insertions(+), 99 deletions(-) diff --git a/Makefile b/Makefile index 7e8be41..059545c 100644 --- a/Makefile +++ b/Makefile @@ -55,8 +55,8 @@ CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 +#CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 +#CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 @@ -103,9 +103,9 @@ cuda11x: $(BUILD_DIR) env $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) cuda11x_cutlass: $(BUILD_DIR) env cutlass - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math --expt-relaxed-constexpr -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + $(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(INCLUDE_cutlass) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 80725b1..7e4874a 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1374,6 +1374,104 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 return sout +def cutlass3_gemm( + A: Tensor, + B: Tensor, + out: Tensor = None, + transposed_A=False, + transposed_B=False, +): + sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.float32) + if out is None: + out = torch.zeros(size=sout, dtype=torch.float32, device=A.device) + + sA = A.shape + sB = B.shape + if transposed_A and len(sA) == 2: + sA = (sA[1], sA[0]) + elif transposed_A and len(sA) == 3: + sA = (sA[0], sA[2], sA[1]) + if transposed_B and len(sB) == 2: + sB = (sB[1], sB[0]) + elif transposed_B and len(sB) == 3: + sB = (sB[0], sB[2], sB[1]) + # this is a mess: cuBLAS expects column major, but PyTorch is row major.
+ # So to perform the matrix multiplication, we have to treat the A, B, and C matrices as transposed + # (the transpose of a row-major matrix is column major). + # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these + # matrices in the input arguments for cuBLAS + # column major: A @ B = C: [m, k] @ [k, n] = [m, n] + # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n] + # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m] + if len(sB) == 2: + if B.stride()[0] == B.shape[1]: + transposed_B = False + elif B.stride()[1] == B.shape[0]: + transposed_B = True + if len(A.shape) == 2: + if A.stride()[0] == A.shape[1]: + transposed_A = False + elif A.stride()[1] == A.shape[0]: + transposed_A = True + else: + if A.stride()[1] == A.shape[2]: + transposed_A = False + elif A.stride()[2] == A.shape[1]: + transposed_A = True + + if len(sA) == 2: + n = sA[0] + ldb = A.stride()[1 if transposed_A else 0] + elif len(sA) == 3 and len(sB) == 2: + n = sA[0] * sA[1] + ldb = sA[2] + + m = sB[1] + k = sB[0] + lda = B.stride()[(1 if transposed_B else 0)] + ldc = sB[1] + elif len(sB) == 3: + # special case + assert len(sA) == 3 + if not (sA[0] == sB[0] and sA[1] == sB[1]): + raise ValueError( + f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}" + ) + + transposed_A = True + transposed_B = False + + m = sB[2] + n = sA[2] + k = sB[0] * sB[1] + + lda = m + ldb = sA[2] + ldc = m + + ptr = CUBLAS_Context.get_instance().get_context(A.device) + + # B^T @ A^T = C^T + # [km, nk -> mn] + lda = ldb = ldc = 1 + #lda = 1 + print(m, n, k, lda, ldb, ldc) + is_on_gpu([B, A, out]) + m = ct.c_int32(m) + n = ct.c_int32(n) + k = ct.c_int32(k) + lda = ct.c_int32(lda) + ldb = ct.c_int32(ldb) + ldc = ct.c_int32(ldc) + alpha = ct.c_float(1.0) + beta = ct.c_float(0.0) + lib.ccutlass_gemm(m, n, k, alpha, get_ptr(B), lda, get_ptr(A), ldb, beta, get_ptr(out), ldc) + + return out + + + def igemm( A: Tensor, diff --git a/csrc/kernels.cu b/csrc/kernels.cu index ed7d6b2..4c83573 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -19,7 +19,6 @@ #include "cutlass/util/print_error.hpp" #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/cublas_wrappers.hpp" -#include "cutlass/util/helper_cuda.hpp" #define HLF_MAX 65504 #define TH 1024 @@ -2928,73 +2927,84 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * } -template __global__ void kMatmul_inference_4bit(INPT *A, unsigned char *B, OUTT *out, int lda, int ldb, int rowsA, int colsA, int colsB) -{ -// element-wise kernel -// 1. Load batch x k into registers -// 2. Load k x k into registers -// 3. dequantize and store in second pair of k x k -// 4. matmul -// 5. sum with cub -// 6. store outputs -// TC kernel -// use k warps per thread block -// 1. threadblock uses read-only cache to read in register tile for A into shared memory -// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments -// 3. each warp reads a segment of values 16x32 from B -// 4. do dequantization from register of B into second pair of registers -// 5. store (4) into fragment -// 6. matmul aggregate into fragment C -// 7. aggregate tiles of C into shared memory block C -// 8. sum (7) -// 9. write outputs to matmul output matrix -} +//template __global__ void kMatmul_inference_4bit(INPT *A, unsigned char *B, OUTT *out, int lda, int ldb, int rowsA, int colsA, int colsB) +//{ +//// element-wise kernel +//// 1. Load batch x k into registers +//// 2. Load k x k into registers +//// 3.
dequantize and store in second pair of k x k +//// 4. matmul +//// 5. sum with cub +//// 6. store outputs +//// TC kernel +//// use k warps per thread block +//// 1. threadblock uses read-only cache to read in register tile for A into shared memory +//// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments +//// 3. each warp reads a segment of values 16x32 from B +//// 4. do dequantization from register of B into second pair of registers +//// 5. store (4) into fragment +//// 6. matmul aggregate into fragment C +//// 7. aggregate tiles of C into shared memory block C +//// 8. sum (7) +//// 9. write outputs to matmul output matrix +//} #include "cutlass/util/print_error.hpp" #include "cutlass/util/GPU_Clock.hpp" #if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 # include "cutlass/util/cublas_wrappers.hpp" #endif -#include "cutlass/util/helper_cuda.hpp" +//#include "cutlass/util/helper_cuda.hpp" -template -__global__ static -__launch_bounds__(decltype(size(CThreadLayout{}))::value) -void -gemm_device(MShape M, NShape N, KShape K, - TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, - TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, - TC * out, CStride dC, CBlockLayout , CThreadLayout tC, - Alpha alpha, Beta beta) +__global__ void gemm_device(int M, int N, int K, + float const* A, + float const* B, + float * out, int lda, int ldb, int ldc, + float alpha, float beta) { using namespace cute; using X = Underscore; // Preconditions - CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); + //CUTE_STATIC_ASSERT(is_static::value); - CUTE_STATIC_ASSERT_V(size(tA) == size(tC)); - CUTE_STATIC_ASSERT_V(size(tB) == size(tC)); + //CUTE_STATIC_ASSERT_V(size(tA) == size(tC)); + //CUTE_STATIC_ASSERT_V(size(tB) == size(tC)); + + // Define block sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + + // Define the block layouts (static) + auto bA = make_layout(make_shape(bM,bK)); + auto bB = make_layout(make_shape(bN,bK)); + auto bC = make_layout(make_shape(bM,bN)); + + // Define the thread layouts (static) + auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); + auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); + auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); //CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC)); // BLK_M //CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC)); // BLK_N - CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K + //CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K // Shared memory buffers - __shared__ TA smemA[cosize_v]; - __shared__ TB smemB[cosize_v]; - auto sA = make_tensor(make_smem_ptr(smemA), blockA); // (BLK_M,BLK_K) - auto sB = make_tensor(make_smem_ptr(smemB), blockB); // (BLK_N,BLK_K) + __shared__ float smemA[128*8]; + __shared__ float smemB[128*8]; + auto sA = make_tensor(make_smem_ptr(smemA), bA); // (BLK_M,BLK_K) + auto sB = make_tensor(make_smem_ptr(smemB), bB); // (BLK_N,BLK_K) + + auto dA = make_stride(Int<1>{}, lda); + auto dB = make_stride(Int<1>{}, ldb); + auto dC = make_stride(Int<1>{}, ldc);
// Represent the full tensors auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K) @@ -3083,11 +3093,27 @@ gemm_device(MShape M, NShape N, KShape K, } + //============================================================== // TEMPLATE DEFINITIONS //============================================================== -template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); +//template +//__global__ static +//__launch_bounds__(decltype(size(CThreadLayout{}))::value) +//void +//gemm_device(MShape M, NShape N, KShape K, +// TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, +// TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, +// TC * out, CStride dC, CBlockLayout , CThreadLayout tC, +// half alpha, half beta); + + +//template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index ecf3a09..ba6de59 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -9,7 +9,7 @@ #ifndef kernels #define kernels -template __global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB); +//template __global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB); template__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n); @@ -122,4 +122,24 @@ template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); +//template +//__global__ static +//__launch_bounds__(decltype(size(CThreadLayout{}))::value) +//void +//gemm_device(MShape M, NShape N, KShape K, +// TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, +// TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, +// TC * out, CStride dC, CBlockLayout , CThreadLayout tC, +// Alpha alpha, Beta beta); + +__global__ void gemm_device(int M, int N, int K, + float const* A, + float const* B, + float * out, int lda, int ldb, int ldc, + float alpha, float beta); + #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index a3a7c29..ca56fae 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -91,14 +91,12 @@ template void dequantizeBlockwise(float *code, unsign } -void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB) -{ - int num_blocks = (colsB+32-1)/32; - kMatmul_inference_4bit<<>>(A, B, out, lda, ldb, rowsA, colsA, colsB); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); -} - -template __global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *C, int lda, int ldb, int rowsA, int colsA, int colsB); +//void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB) +//{ +// int num_blocks = (colsB+32-1)/32; +// kMatmul_inference_4bit<<>>(A, B, out, lda, ldb, rowsA, colsA, colsB); +// CUDA_CHECK_RETURN(cudaPeekAtLastError()); +//} template void optimizer32bit(T* g, T* p, @@ -666,60 +664,47 @@ template void extractOutliers(char * A, int *idx, char *out, 
int id + #include +#include "cutlass/util/helper_cuda.hpp" -template -void -gemm(int m, int n, int k, - Alpha alpha, - TA const* A, int ldA, - TB const* B, int ldB, - Beta beta, - TC * C, int ldC, - cudaStream_t stream = 0) +void gemm_host(int m, int n, int k, + float alpha, + float const* A, int lda, + float const* B, int ldb, + float beta, + float * C, int ldc) { + cute::device_init(0); using namespace cute; + + // Define shapes (dynamic) auto M = int(m); auto N = int(n); auto K = int(k); - // Define strides (mixed) - auto dA = make_stride(Int<1>{}, ldA); - auto dB = make_stride(Int<1>{}, ldB); - auto dC = make_stride(Int<1>{}, ldC); - // Define block sizes (static) - auto bM = Int<128>{}; - auto bN = Int<128>{}; - auto bK = Int< 8>{}; + printf("%i %i %i %i %i %i\n", m, n, k, lda, ldb, ldc); - // Define the block layouts (static) - auto sA = make_layout(make_shape(bM,bK)); - auto sB = make_layout(make_shape(bN,bK)); - auto sC = make_layout(make_shape(bM,bN)); - - // Define the thread layouts (static) - auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); - - dim3 dimBlock(size(tC)); - dim3 dimGrid(ceil_div(size(M), size(bM)), - ceil_div(size(N), size(bN))); + dim3 dimBlock(16, 16); + dim3 dimGrid((M+127)/128, (N+127)/128); +// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); +//- +//- dim3 dimBlock(size(tC)); +//- dim3 dimGrid(ceil_div(size(M), size(bM)), +//- ceil_div(size(N), size(bN))); gemm_device - <<< dimGrid, dimBlock, 0, stream >>> + <<< dimGrid, dimBlock, 0, 0 >>> (M, N, K, - A, dA, sA, tA, - B, dB, sB, tB, - C, dC, sC, tC, + A, + B, + C, lda, ldb, ldc, alpha, beta); } - //============================================================== // TEMPLATE DEFINITIONS //============================================================== diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 137320b..843a9bb 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -20,6 +20,11 @@ #include #include +#include +#include + + + #define CUDA_CHECK_RETURN(value) { \ cudaError_t _m_cudaStat = value; \ if (_m_cudaStat != cudaSuccess) { \ @@ -185,4 +190,11 @@ template void extractOutliers(char * A, int *idx, char *out, int id void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); +void gemm_host(int m, int n, int k, + float alpha, + float const* A, int ldA, + float const* B, int ldB, + float beta, + float * C, int ldC); + #endif diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 0e9106c..c6de62d 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -20,6 +20,16 @@ void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimat void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } +void +cppgemm(int m, int n, int k, + float alpha, + float const* A, int ldA, + float const* B, int ldB, + float beta, + float * C, int ldC) +{ gemm_host(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} + + #define MAKE_FUNC32(fname, oname, gtype, gbits) \ void fname##32bit_g##gbits(gtype *g, gtype *p, \ float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \ @@ -306,6 +316,14 @@ extern "C" void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); } void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ 
extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } + void ccutlass_gemm(int m, int n, int k, + float alpha, + float const* A, int ldA, + float const* B, int ldB, + float beta, + float * C, int ldC) + { cppgemm(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} + #endif void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } diff --git a/tests/test_functional.py b/tests/test_functional.py index 82f6a71..128c803 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2351,3 +2351,24 @@ def test_normal_map_tree(): pivots.append((values[i-1]+values[i])/2) print(pivots) + +def test_cutlass3_gemm(): + #A = torch.rand(2, 2).cuda() + #B = torch.rand(2, 2).cuda() + A = torch.arange(4).reshape(2, 2).float().cuda().contiguous() + B = torch.ones(2, 2).float().cuda() + + print('') + print(A) + print(B) + + C1 = torch.matmul(A, B) + print(C1) + C2 = F.cutlass3_gemm(A, B.t()) + print(C2) + C2 = F.cutlass3_gemm(A, B) + print(C2) + C2 = F.cutlass3_gemm(B.t(), A.t().contiguous()) + print(C2) + + From d1c4c2056893c35a7ca8e55a1b2beebeeeaee679 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Thu, 27 Apr 2023 15:11:26 -0700 Subject: [PATCH 58/97] Added non-cutlass template. --- Makefile | 14 +--- bitsandbytes/functional.py | 4 +- csrc/kernels.cu | 158 ++++++------------------------------- csrc/ops.cu | 28 ++----- tests/test_functional.py | 6 -- 5 files changed, 35 insertions(+), 175 deletions(-) diff --git a/Makefile b/Makefile index 059545c..ea6ee87 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) -#GPP:= /usr/bin/g++ -GPP:= /sw/gcc/11.2.0/bin/g++ +GPP:= /usr/bin/g++ +#GPP:= /sw/gcc/11.2.0/bin/g++ ifeq ($(CUDA_HOME),) CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) endif @@ -26,7 +26,6 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include -INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include -I $(ROOT_DIR)/dependencies/cutlass/tools/util/include/ -I $(ROOT_DIR)/dependencies/cutlass/include/cute/util/ LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib # NVIDIA NVCC compilation flags @@ -63,8 +62,8 @@ CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 all: $(BUILD_DIR) env - $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so 
$(LIB) cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env @@ -102,11 +101,6 @@ cuda11x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) -cuda11x_cutlass: $(BUILD_DIR) env cutlass - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math --expt-relaxed-constexpr -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(INCLUDE_cutlass) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 7e4874a..54a08a1 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1456,7 +1456,7 @@ def cutlass3_gemm( # [km, nk -> mn] lda = ldb = ldc = 1 #lda = 1 - print(m, n, k, lda, ldb, ldc) + #print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) m = ct.c_int32(m) n = ct.c_int32(n) @@ -1466,7 +1466,7 @@ def cutlass3_gemm( ldc = ct.c_int32(ldc) alpha = ct.c_float(1.0) beta = ct.c_float(0.0) - lib.ccutlass_gemm(m, n, k, alpha, get_ptr(B), lda, get_ptr(A), ldb, beta, get_ptr(out), ldc) + lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), ldb, get_ptr(B), lda, beta, get_ptr(out), ldc) return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 4c83573..ed87c69 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -15,11 +15,6 @@ #include #include -#include -#include "cutlass/util/print_error.hpp" -#include "cutlass/util/GPU_Clock.hpp" -#include "cutlass/util/cublas_wrappers.hpp" - #define HLF_MAX 65504 #define TH 1024 #define NUM 4 @@ -2949,147 +2944,42 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} -#include "cutlass/util/print_error.hpp" -#include "cutlass/util/GPU_Clock.hpp" -#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 -# include "cutlass/util/cublas_wrappers.hpp" -#endif -//#include "cutlass/util/helper_cuda.hpp" - __global__ void gemm_device(int M, int N, int K, float const* A, float const* B, float * out, int lda, int ldb, int ldc, float alpha, float beta) { - using namespace cute; - using X = Underscore; +// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp +// 1. Load dataB into register +// 2. Dequantize B +// 3. 
Fetch data from A and multiply - // Preconditions - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); + typedef cub::BlockLoad LoadA; + __shared__ typename LoadA::TempStorage loada; + float dataA[1]; + int valid_items = 0; - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - - //CUTE_STATIC_ASSERT_V(size(tA) == size(tC)); - //CUTE_STATIC_ASSERT_V(size(tB) == size(tC)); - - // Define block sizes (static) - auto bM = Int<128>{}; - auto bN = Int<128>{}; - auto bK = Int< 8>{}; - - // Define the block layouts (static) - auto bA = make_layout(make_shape(bM,bK)); - auto bB = make_layout(make_shape(bN,bK)); - auto bC = make_layout(make_shape(bM,bN)); - - // Define the thread layouts (static) - auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); - - //CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC)); // BLK_M - //CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC)); // BLK_N - //CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K - - // Shared memory buffers - __shared__ float smemA[128*8]; - __shared__ float smemB[128*8]; - auto sA = make_tensor(make_smem_ptr(smemA), bA); // (BLK_M,BLK_K) - auto sB = make_tensor(make_smem_ptr(smemB), bB); // (BLK_N,BLK_K) - - auto dA = make_stride(Int<1>{}, lda); - auto dB = make_stride(Int<1>{}, ldb); - auto dC = make_stride(Int<1>{}, ldc); - - // Represent the full tensors - auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K) - auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB); // (N,K) - auto mC = make_tensor(make_gmem_ptr(out), make_shape(M,N), dC); // (M,N) - - // Get the appropriate blocks for this thread block -- - // potential for thread block locality - auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB));// (BLK_M,BLK_N,BLK_K) - auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) - - auto gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) - auto gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) - auto gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) - - // - // Partition the copying of A and B tiles across the threads - // - - // TUTORIAL: Example of simple partitioning of A|B tiles over tA|tB - // Default is a raked partition, but can be changed with Step parameter - - auto tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k) - auto tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) - - auto tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k) - auto tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) - - // - // Define C accumulators and A/B partitioning - // - - // TUTORIAL: Example of partitioning via projections of tC - - // Partition sA (M,K) by the rows of tC - auto tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K) - // Partition sB (N,K) by the cols of tC - auto tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K) - // Partition gC (M,N) by the tile of tC - auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) - - // Allocate the accumulators -- same size as the projected data - auto tCrC = make_fragment_like(tCgC); // (THR_M,THR_N) - - // Clear the accumulators - clear(tCrC); + 
__shared__ float tileA[16*256]; - // TUTORIAL: Example of a very simple compute loop - // Data is read from global to shared memory via the tA|tB partitioning - // gemm(.) operates on the shared memory directly via the tC partitioning - - auto k_max = size<2>(tAgA); - - for (int k = 0; k < k_max; ++k) - { - // Copy gmem to smem - copy(tAgA(_,_,k), tAsA); - copy(tBgB(_,_,k), tBsB); - - // In case copy uses cp.async, make sure that the cp.async - // instructions are ordered with respect to other cp.async - // instructions (fence), then wait on all the outstanding copy - // operations (wait<0>()). __syncthreads() alone does not do - // this. - // - // NOTE: cp_async_wait<0>() currently issues cp.async.wait_all. - // This is equivalent to cp.async.commit_group followed by - // cp.async_wait_group 0. This should make the first - // cp_async_fence() (which also issues cp.async.commit_group) - // redundant. The tutorial works as-is, so we'll leave the - // redundant fence in for now and study its removal later. - cp_async_fence(); - cp_async_wait<0>(); - - __syncthreads(); - - // Compute gemm on smem - gemm(tCsA, tCsB, tCrC); - - __syncthreads(); - } + for(int idxA = 0; idxA < M*K; idxA+= 256) + { + valid_items = M*K - idxA > 256 ? 256 : M*K - idxA; + int baserow = 0; + for(int row = baserow; row < baserow+16 && row < M; row++) + { + LoadA(loada).Load(&(A[(row*lda) + idxA]), dataA, valid_items, 0.0f); + tileA[row*256 + threadIdx.x] = dataA[0]; + __syncthreads(); + } + baserow += 16; + + + } + - axpby(alpha, tCrC, beta, tCgC); } diff --git a/csrc/ops.cu b/csrc/ops.cu index ca56fae..8933927 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -665,9 +665,6 @@ template void extractOutliers(char * A, int *idx, char *out, int id } -#include -#include "cutlass/util/helper_cuda.hpp" - void gemm_host(int m, int n, int k, float alpha, @@ -676,29 +673,14 @@ void gemm_host(int m, int n, int k, float beta, float * C, int ldc) { - cute::device_init(0); - using namespace cute; + dim3 dimBlock(256); + int num_blocks = (n+31)/32; - - - // Define shapes (dynamic) - auto M = int(m); - auto N = int(n); - auto K = int(k); - - - printf("%i %i %i %i %i %i\n", m, n, k, lda, ldb, ldc); - - dim3 dimBlock(16, 16); - dim3 dimGrid((M+127)/128, (N+127)/128); -// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); -//- -//- dim3 dimBlock(size(tC)); -//- dim3 dimGrid(ceil_div(size(M), size(bM)), -//- ceil_div(size(N), size(bN))); + cout << num_blocks << endl; gemm_device - <<< dimGrid, dimBlock, 0, 0 >>> - (M, N, K, + <<< num_blocks, dimBlock, 0, 0 >>> + (m, n, k, A, B, C, lda, ldb, ldc, diff --git a/tests/test_functional.py b/tests/test_functional.py index 128c803..dd41972 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2363,12 +2363,6 @@ def test_cutlass3_gemm(): print(B) C1 = torch.matmul(A, B) - print(C1) - C2 = F.cutlass3_gemm(A, B.t()) - print(C2) C2 = F.cutlass3_gemm(A, B) - print(C2) - C2 = F.cutlass3_gemm(B.t(), A.t().contiguous()) - print(C2) From 9cab14a3ff920a153fb450e299329a473f1416a4 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Thu, 27 Apr 2023 15:12:49 -0700 Subject: [PATCH 59/97] Added pipeline draft.
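[editor's sketch, not part of the original commit message] The draft below follows the multi-stage cuda::memcpy_async staging pattern from libcu++. A minimal single-stage version of the same producer/consumer idea, assuming n is a multiple of the block size and a launch with blockDim.x*sizeof(float) bytes of dynamic shared memory (kernel and names here are illustrative only):

    #include <cooperative_groups.h>
    #include <cuda/pipeline>

    __global__ void staged_copy(float const* in, float* out, size_t n)
    {
        auto block = cooperative_groups::this_thread_block();
        extern __shared__ float stage[]; // one block-sized staging buffer

        __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 1> state;
        auto pipe = cuda::make_pipeline(block, &state);

        for (size_t base = (size_t)blockIdx.x * block.size(); base < n; base += (size_t)gridDim.x * block.size())
        {
            pipe.producer_acquire();   // reserve the stage
            cuda::memcpy_async(block, stage, in + base, sizeof(float) * block.size(), pipe);
            pipe.producer_commit();    // kick off the asynchronous copy
            pipe.consumer_wait();      // block until the tile has landed in shared memory
            out[base + block.thread_rank()] = stage[block.thread_rank()];
            pipe.consumer_release();   // recycle the stage for the next tile
        }
    }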
--- bitsandbytes/functional.py | 5 ++++ csrc/kernels.cu | 49 ++++++++++++++++++++++++++++++++++++++ csrc/kernels.cuh | 2 ++ csrc/ops.cu | 11 +++++++++ csrc/ops.cuh | 2 ++ csrc/pythonInterface.c | 1 + tests/test_functional.py | 5 ++++ 7 files changed, 75 insertions(+) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 54a08a1..bb3cde3 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -2341,3 +2341,8 @@ def extract_outliers(A, SA, idx): post_call(prev_device) return out + +def pipeline_test(A, batch_size): + out = torch.zeros_like(A) + lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size)) + return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index ed87c69..775716f 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -15,6 +15,9 @@ #include #include +#include +#include + #define HLF_MAX 65504 #define TH 1024 #define NUM 4 @@ -2983,6 +2986,51 @@ __global__ void gemm_device(int M, int N, int K, } +__device__ void compute(float* global_out, float const* shared_in) +{ + +} +template +__global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz) { + auto grid = cooperative_groups::this_grid(); + auto block = cooperative_groups::this_thread_block(); + assert(size == batch_sz * grid.size()); // Assume input size fits batch_sz * grid_size + + extern __shared__ float shared[]; // stages_count * block.size() * sizeof(int) bytes + size_t shared_offset[stages_count]; + for (int s = 0; s < stages_count; ++s) shared_offset[s] = s * block.size(); + + __shared__ cuda::pipeline_shared_state< + cuda::thread_scope::thread_scope_block, + stages_count + > shared_state; + auto pipeline = cuda::make_pipeline(block, &shared_state); + + auto block_batch = [&](size_t batch) -> int { + return block.group_index().x * block.size() + grid.size() * batch; + }; + + // compute_batch: next batch to process + // fetch_batch: next batch to fetch from global memory + for (size_t compute_batch = 0, fetch_batch = 0; compute_batch < batch_sz; ++compute_batch) { + // The outer loop iterates over the computation of the batches + for (; fetch_batch < batch_sz && fetch_batch < (compute_batch + stages_count); ++fetch_batch) { + // This inner loop iterates over the memory transfers, making sure that the pipeline is always full + pipeline.producer_acquire(); + size_t shared_idx = fetch_batch % stages_count; + size_t batch_idx = fetch_batch; + size_t block_batch_idx = block_batch(batch_idx); + cuda::memcpy_async(block, shared + shared_offset[shared_idx], global_in + block_batch_idx, sizeof(float) * block.size(), pipeline); + pipeline.producer_commit(); + } + pipeline.consumer_wait(); + int shared_idx = compute_batch % stages_count; + int batch_idx = compute_batch; + compute(global_out + block_batch(batch_idx), shared + shared_offset[shared_idx]); + pipeline.consumer_release(); + } +} + //============================================================== // TEMPLATE DEFINITIONS @@ -3004,6 +3052,7 @@ __global__ void gemm_device(int M, int N, int K, //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); +template __global__ void with_staging_unified<2>(float const* global_in, float * global_out, size_t size, size_t batch_sz); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int 
*idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index ba6de59..37e214a 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -135,6 +135,8 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * // TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, // TC * out, CStride dC, CBlockLayout , CThreadLayout tC, // Alpha alpha, Beta beta); +template +__global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); __global__ void gemm_device(int M, int N, int K, float const* A, diff --git a/csrc/ops.cu b/csrc/ops.cu index 8933927..ee585bb 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -663,6 +663,17 @@ template void extractOutliers(char * A, int *idx, char *out, int id } +void pipeline_test(float *A, float *B, size_t n, size_t batch_size) +{ + + int threads = 256; + int num_blocks = (n+(256*batch_size)-1)/(batch_size*256); + + printf("%i %i\n", num_blocks, batch_size); + + with_staging_unified<2><<>>(A, B, n, batch_size); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 843a9bb..83dd4e5 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -197,4 +197,6 @@ void gemm_host(int m, int n, int k, float beta, float * C, int ldC); + +void pipeline_test(float *A, float *B, size_t n, size_t batch_size); #endif diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index c6de62d..170093f 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -315,6 +315,7 @@ extern "C" void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); } void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } + void cpipeline_test(float *A, float *B, size_t n, size_t batch_size){ pipeline_test(A, B, n, batch_size); } void ccutlass_gemm(int m, int n, int k, float alpha, diff --git a/tests/test_functional.py b/tests/test_functional.py index dd41972..7dec375 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2366,3 +2366,8 @@ def test_cutlass3_gemm(): C2 = F.cutlass3_gemm(A, B) +def test_pipeline_func(): + a = torch.rand(2, 4).cuda() + out = F.pipeline_test(a, 2) + print(a) + print(out) From c1bfb210c59dc56559b571a927714ca13cea80c5 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 28 Apr 2023 17:19:02 -0700 Subject: [PATCH 60/97] First baseline kernel.
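[editor's sketch, not part of the original commit message] A plain CPU reference for the indexing the baseline kernel below targets, i.e. C = A * B^T over row-major storage (an untested sketch; it assumes A is row-major N x K, B holds the already-transposed operand as M rows of length K with leading dimension ldb, and C is row-major N x M with leading dimension ldc):

    void gemm_ref(int M, int N, int K, float const* A, float const* B, float* C, int ldb, int ldc)
    {
        for (int row = 0; row < N; ++row)       // rows of A and of C
            for (int col = 0; col < M; ++col)   // rows of the transposed B, i.e. columns of C
            {
                float acc = 0.0f;
                for (int k = 0; k < K; ++k)
                    acc += A[row * K + k] * B[col * ldb + k];
                C[row * ldc + col] = acc;
            }
    }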
--- bitsandbytes/functional.py | 8 +-- csrc/kernels.cu | 101 +++++++++++++++++++++++++++++++++---- csrc/kernels.cuh | 2 +- csrc/ops.cu | 13 +++-- csrc/ops.cuh | 2 +- csrc/pythonInterface.c | 4 +- tests/test_functional.py | 20 ++++---- 7 files changed, 118 insertions(+), 32 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index bb3cde3..774e954 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1429,7 +1429,7 @@ def cutlass3_gemm( m = sB[1] k = sB[0] - lda = B.stride()[(1 if transposed_B else 0)] + lda = B.stride()[0] ldc = sB[1] elif len(sB) == 3: # special case @@ -1446,7 +1446,7 @@ def cutlass3_gemm( n = sA[2] k = sB[0] * sB[1] - lda = m + lda = n ldb = sA[2] ldc = m @@ -1454,7 +1454,7 @@ def cutlass3_gemm( # B^T @ A^T = C^T # [km, nk -> mn] - lda = ldb = ldc = 1 + #lda = ldb = ldc = 1 #lda = 1 #print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) @@ -1466,7 +1466,7 @@ def cutlass3_gemm( ldc = ct.c_int32(ldc) alpha = ct.c_float(1.0) beta = ct.c_float(0.0) - lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), ldb, get_ptr(B), lda, beta, get_ptr(out), ldc) + lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), lda, get_ptr(B), ldb, beta, get_ptr(out), ldc) return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 775716f..91169dd 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2947,9 +2947,11 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} + +#define ROWS 2 __global__ void gemm_device(int M, int N, int K, float const* A, - float const* B, + float* B, float * out, int lda, int ldb, int ldc, float alpha, float beta) { @@ -2958,29 +2960,106 @@ __global__ void gemm_device(int M, int N, int K, // 2. Dequantize B // 3. Fetch data from A and multiply - typedef cub::BlockLoad LoadA; - __shared__ typename LoadA::TempStorage loada; - float dataA[1]; + typedef cub::BlockLoad LoadA; + //__shared__ typename LoadA::TempStorage loada; + typedef cub::BlockLoad LoadB; + //__shared__ typename LoadB::TempStorage loadb; + typedef cub::BlockReduce BlockReduce; + // Allocate shared memory for BlockReduce + //__shared__ typename BlockReduce::TempStorage reduce; + + __shared__ union { + typename BlockReduce::TempStorage reduce; + typename LoadB::TempStorage loadb; + typename LoadA::TempStorage loada; + } temp_storage; + + + float dataA[4]; + float local_B[4]; + float local_accC[ROWS]; int valid_items = 0; + const int warp_id = threadIdx.x/32; + const int warp_lane = threadIdx.x % 32; + const int col_offset = blockIdx.x * 8; - __shared__ float[16*256] tileA; + __shared__ float tileA[ROWS*1024]; + __shared__ float accumulatorC[ROWS*8]; + + //#pragma unroll 8 + //for(int i = 0; i < 8; i++) + // tileA[threadIdx.x + (i*256)] = 0.0f; + //__syncthreads(); + if(threadIdx.x < 64) + accumulatorC[threadIdx.x] = 0.0f; + __syncthreads(); - for(int idxA = 0; idxA < M*K; idxA+= 256) + for(int inner_idx = 0; inner_idx < K; inner_idx+= 1024) { - valid_items = M*K - idxA > 256 ? 256 : M*K - idxA; + valid_items = K - inner_idx > 1024 ? 
1024 : K - inner_idx; int baserow = 0; for(int row = baserow; row < (baserow+ROWS) && row < N; row++) { - LoadA(loada).Load(&(A[(row*lda) + idxA]), dataA, valid_items, 0.0f); - tileA[row*256 + threadIdx.x] = dataA[0]; + LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f); + + #pragma unroll 4 + for(int k = 0; k < 4; k++) + tileA[row*1024 + threadIdx.x + (k*blockDim.x)] = dataA[k]; __syncthreads(); } - baserow += 16; + baserow += ROWS; + // load 16 columns from B at a time. B is transposed, so it's like loading rows + // each warp loads one row + // each thread loads 128 bytes + + // col: inner_idx + warp_lane + // row: ldb*(offset + warp_id) + for(int col = 0; col < 8 && (col_offset + col) < M; col++) + { + int colB = col_offset + col; + + for(int k = 0; k < ROWS; k++) + local_accC[k] = 0.0f; + + int base_idxB = ldb*colB; + valid_items = K - inner_idx > 1024 ? 1024 : K - inner_idx; + LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f); + __syncthreads(); + + for(int row = 0; row < ROWS && row < N; row++) + { + #pragma unroll 4 + for(int k = 0; k < 4; k++) + { + int idxA = row*1024 + threadIdx.x + (blockDim.x*k); + local_accC[row] += tileA[idxA]*local_B[k]; + } + + local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); + if(threadIdx.x == 0) + atomicAdd(&accumulatorC[row*8 + col], local_accC[row]); + } + } } + for(int row = 0; row < ROWS && row < N; row++) + { + int out_idx = ldc*row + col_offset; + + //if(threadIdx.x < 8) + // if(accumulatorC[row*8 + threadIdx.x] != 0.0) + // printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x); + + if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M) + { + //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx); + out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x]; + } + } } diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 37e214a..55397e7 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -140,7 +140,7 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, __global__ void gemm_device(int M, int N, int K, float const* A, - float const* B, + float * B, float * out, int lda, int ldb, int ldc, float alpha, float beta); diff --git a/csrc/ops.cu b/csrc/ops.cu index ee585bb..dd8fade 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -669,8 +669,6 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) int threads = 256; int num_blocks = (n+(256*batch_size)-1)/(batch_size*256); - printf("%i %i\n", num_blocks, batch_size); - with_staging_unified<2><<>>(A, B, n, batch_size); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } @@ -680,15 +678,22 @@ void gemm_host(int m, int n, int k, float alpha, float const* A, int lda, - float const* B, int ldb, + float * B, int ldb, float beta, float * C, int ldc) { dim3 dimBlock(256); - int num_blocks = (n+31)/32; + int num_blocks = (m+7)/8; cout << num_blocks << endl; + cout << lda << endl; + cout << ldb << endl; + cout << ldc << endl; + + cout << m << endl; + cout << n << endl; + cout << k << endl; gemm_device <<< num_blocks, dimBlock, 0, 0 >>> (m, n, k, A, B, C, lda, ldb, ldc, diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 83dd4e5..2f71966 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -193,7 +193,7 @@ void
matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rows void gemm_host(int m, int n, int k, float alpha, float const* A, int ldA, - float const* B, int ldB, + float * B, int ldB, float beta, float * C, int ldC); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 170093f..6ec5501 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -24,7 +24,7 @@ void cppgemm(int m, int n, int k, float alpha, float const* A, int ldA, - float const* B, int ldB, + float * B, int ldB, float beta, float * C, int ldC) { gemm_host(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} @@ -320,7 +320,7 @@ extern "C" void ccutlass_gemm(int m, int n, int k, float alpha, float const* A, int ldA, - float const* B, int ldB, + float * B, int ldB, float beta, float * C, int ldC) { cppgemm(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} diff --git a/tests/test_functional.py b/tests/test_functional.py index 7dec375..087bc84 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2353,17 +2353,19 @@ def test_normal_map_tree(): def test_cutlass3_gemm(): - #A = torch.rand(2, 2).cuda() - #B = torch.rand(2, 2).cuda() - A = torch.arange(4).reshape(2, 2).float().cuda().contiguous() - B = torch.ones(2, 2).float().cuda() + A = torch.rand(2, 4092).cuda() + B = torch.rand(4*4092, 4092).cuda() - print('') - print(A) - print(B) + #print('') + #print(A) + #print(B.t()) - C1 = torch.matmul(A, B) - C2 = F.cutlass3_gemm(A, B) + C1 = torch.matmul(A, B.t()) + C2 = F.cutlass3_gemm(A, B.t()) + #print(C1) + #print(C2) + + torch.testing.assert_close(C1, C2) def test_pipeline_func(): From 3aef78342aec4fff1922c0c2cdd83bdda928b536 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 28 Apr 2023 17:34:08 -0700 Subject: [PATCH 61/97] Added template refactor. --- bitsandbytes/functional.py | 4 +--- csrc/kernels.cu | 23 ++++++++++------------- csrc/kernels.cuh | 6 +----- csrc/ops.cu | 11 +++-------- csrc/ops.cuh | 7 +------ csrc/pythonInterface.c | 19 ++++--------------- 6 files changed, 20 insertions(+), 50 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 774e954..da4e66c 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1464,9 +1464,7 @@ def cutlass3_gemm( lda = ct.c_int32(lda) ldb = ct.c_int32(ldb) ldc = ct.c_int32(ldc) - alpha = ct.c_float(1.0) - beta = ct.c_float(0.0) - lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), lda, get_ptr(B), ldb, beta, get_ptr(out), ldc) + lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 91169dd..45db448 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2949,22 +2949,18 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * #define ROWS 2 -__global__ void gemm_device(int M, int N, int K, - float const* A, - float* B, - float * out, int lda, int ldb, int ldc, - float alpha, float beta) +template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) { // 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp // 1. Load dataB into register // 2. Dequantize B // 3. 
Fetch data from A and multiply - typedef cub::BlockLoad LoadA; + typedef cub::BlockLoad LoadA; //__shared__ typename LoadA::TempStorage loada; - typedef cub::BlockLoad LoadB; + typedef cub::BlockLoad LoadB; //__shared__ typename LoadB::TempStorage loadb; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; // Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; @@ -2975,16 +2971,16 @@ __global__ void gemm_device(int M, int N, int K, } temp_storage; - float dataA[4]; - float local_B[4]; - float local_accC[ROWS]; + T dataA[4]; + T local_B[4]; + T local_accC[ROWS]; int valid_items = 0; const int warp_id = threadIdx.x/32; const int warp_lane = threadIdx.x % 32; const int col_offset = blockIdx.x * 8; - __shared__ float tileA[ROWS*1024]; - __shared__ float accumulatorC[ROWS*8]; + __shared__ T tileA[ROWS*1024]; + __shared__ T accumulatorC[ROWS*8]; //#pragma unroll 8 //for(int i = 0; i < 8; i++) @@ -3128,6 +3124,7 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, // TC * out, CStride dC, CBlockLayout , CThreadLayout tC, // half alpha, half beta); +template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 55397e7..900af90 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -138,10 +138,6 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * template __global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); -__global__ void gemm_device(int M, int N, int K, - float const* A, - float * B, - float * out, int lda, int ldb, int ldc, - float alpha, float beta); +template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc); #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index dd8fade..6aaa241 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -675,12 +675,7 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) -void gemm_host(int m, int n, int k, - float alpha, - float const* A, int lda, - float * B, int ldb, - float beta, - float * C, int ldc) +template void gemm_host(int m, int n, int k, T const* A, T* B, T * out, int lda, int ldb, int ldc) { dim3 dimBlock(256); @@ -699,14 +694,14 @@ void gemm_host(int m, int n, int k, (m, n, k, A, B, - C, lda, ldb, ldc, - alpha, beta); + out, lda, ldb, ldc); } //============================================================== // TEMPLATE DEFINITIONS //============================================================== +template void gemm_host(int m, int n, int k, float const* A, float* B, float * out, int lda, int ldb, int ldc); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 2f71966..b7ef9a3 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -190,12 +190,7 @@ template void extractOutliers(char * A, int *idx, char *out, int id void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); -void gemm_host(int m, int n, int k, - float alpha, - float const* A, int ldA, - float * B, int ldB, - 
float beta, - float * C, int ldC); +template void gemm_host(int m, int n, int k, T const* A, T* B, T * out, int lda, int ldb, int ldc); void pipeline_test(float *A, float *B, size_t n, size_t batch_size); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 6ec5501..a7c4787 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -20,14 +20,8 @@ void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimat void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } -void -cppgemm(int m, int n, int k, - float alpha, - float const* A, int ldA, - float * B, int ldB, - float beta, - float * C, int ldC) -{ gemm_host(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} +void gemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) +{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } #define MAKE_FUNC32(fname, oname, gtype, gbits) \ @@ -317,13 +311,8 @@ extern "C" void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } void cpipeline_test(float *A, float *B, size_t n, size_t batch_size){ pipeline_test(A, B, n, batch_size); } - void ccutlass_gemm(int m, int n, int k, - float alpha, - float const* A, int ldA, - float * B, int ldB, - float beta, - float * C, int ldC) - { cppgemm(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);} + void cgemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) + { gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } #endif void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } From f6df4aef6a7b9c4636061c2701de0a9c3ab10098 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 28 Apr 2023 18:26:52 -0700 Subject: [PATCH 62/97] Added fp16 and thread/item template. 
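[editor's sketch, not part of the original commit message] The THREADS/ITEMS template pair introduced below sizes each thread's per-iteration chunk. For 128-bit (16-byte) vectorized memory traffic the natural per-thread load width is 16/sizeof(T) elements, i.e. 4 floats or 8 halves; a small compile-time illustration of that relationship (the helper name is illustrative only, not part of the patch):

    #include <cuda_fp16.h>

    template <typename T>
    constexpr int items_per_128bit_load() { return 16 / (int)sizeof(T); }

    static_assert(items_per_128bit_load<float>() == 4, "a 16-byte load moves 4 floats");
    static_assert(items_per_128bit_load<half>() == 8, "a 16-byte load moves 8 halves");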
--- bitsandbytes/functional.py | 11 ++++++++--- csrc/kernels.cu | 39 +++++++++++++++++++------------------- csrc/kernels.cuh | 2 +- csrc/ops.cu | 3 ++- csrc/pythonInterface.c | 5 +++++ tests/test_functional.py | 28 ++++++++++++++++----------- 6 files changed, 53 insertions(+), 35 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index da4e66c..b5c622b 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1381,9 +1381,9 @@ def cutlass3_gemm( transposed_A=False, transposed_B=False, ): - sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.float32) + sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) if out is None: - out = torch.zeros(size=sout, dtype=torch.float32, device=A.device) + out = torch.zeros(size=sout, dtype=A.dtype, device=A.device) sA = A.shape sB = B.shape @@ -1464,7 +1464,12 @@ def cutlass3_gemm( lda = ct.c_int32(lda) ldb = ct.c_int32(ldb) ldc = ct.c_int32(ldc) - lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) + if A.dtype == torch.float32: + lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) + elif A.dtype == torch.float16: + lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) + else: + raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}') return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 45db448..67f9a3c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2949,18 +2949,18 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * #define ROWS 2 -template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) +template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) { // 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp // 1. Load dataB into register // 2. Dequantize B // 3. Fetch data from A and multiply - typedef cub::BlockLoad LoadA; + typedef cub::BlockLoad LoadA; //__shared__ typename LoadA::TempStorage loada; - typedef cub::BlockLoad LoadB; + typedef cub::BlockLoad LoadB; //__shared__ typename LoadB::TempStorage loadb; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; // Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; @@ -2971,15 +2971,13 @@ template __global__ void gemm_device(int M, int N, int K, T const* } temp_storage; - T dataA[4]; - T local_B[4]; + T dataA[ITEMS]; + T local_B[ITEMS]; T local_accC[ROWS]; int valid_items = 0; - const int warp_id = threadIdx.x/32; - const int warp_lane = threadIdx.x % 32; const int col_offset = blockIdx.x * 8; - __shared__ T tileA[ROWS*1024]; + __shared__ T tileA[ROWS*THREADS*ITEMS]; __shared__ T accumulatorC[ROWS*8]; //#pragma unroll 8 @@ -2991,17 +2989,17 @@ template __global__ void gemm_device(int M, int N, int K, T const* __syncthreads(); - for(int inner_idx = 0; inner_idx < K; inner_idx+= 1024) + for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS) { - valid_items = K - inner_idx > 1024 ? 1024 : K - inner_idx; + valid_items = K - inner_idx > THREADS*ITEMS ? 
THREADS*ITEMS : K - inner_idx; int baserow = 0; for(int row = baserow; row < (baserow+ROWS) && row < N; row++) { LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f); - #pragma unroll 4 - for(int k = 0; k < 4; k++) - tileA[row*1024 + threadIdx.x + (k*blockDim.x)] = dataA[k]; + #pragma unroll ITEMS + for(int k = 0; k < ITEMS; k++) + tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k]; __syncthreads(); } @@ -3021,16 +3019,16 @@ template __global__ void gemm_device(int M, int N, int K, T const* local_accC[k] = 0.0f; int base_idxB = ldb*colB; - valid_items = K - inner_idx > 1024 ? 1024 : K - inner_idx; + valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx; LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f); __syncthreads(); for(int row = 0; row < ROWS && row < N; row++) { - #pragma unroll 4 - for(int k = 0; k < 4; k++) + #pragma unroll ITEMS + for(int k = 0; k < ITEMS; k++) { - int idxA = row*1024 + threadIdx.x + (blockDim.x*k); + int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k); local_accC[row] += tileA[idxA]*local_B[k]; } @@ -3124,7 +3122,10 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, // TC * out, CStride dC, CBlockLayout , CThreadLayout tC, // half alpha, half beta); -template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 900af90..9603e93 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -138,6 +138,6 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * template __global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); -template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc); #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index 6aaa241..aa3dacf 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -689,7 +689,7 @@ template void gemm_host(int m, int n, int k, T const* A, T* B, T cout << m << endl; cout << n << endl; cout << k << endl; - gemm_device + gemm_device <<< num_blocks, dimBlock, 0, 0 >>> (m, n, k, A, @@ -702,6 +702,7 @@ template void gemm_host(int m, int n, int k, T const* A, T* B, T //============================================================== template void gemm_host(int m, int n, int k, float const* A, float* B, float * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, half const* A, half* B, half * out, int lda, int ldb, int ldc); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); template void 
extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index a7c4787..3dd0b05 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -22,6 +22,8 @@ void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimate void gemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) { gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } +void gemm_host_fp16(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc) +{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } #define MAKE_FUNC32(fname, oname, gtype, gbits) \ @@ -314,6 +316,9 @@ extern "C" void cgemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) { gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } + void cgemm_host_fp16(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc) + { gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); } + #endif void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } diff --git a/tests/test_functional.py b/tests/test_functional.py index 087bc84..1564306 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2352,20 +2352,26 @@ def test_normal_map_tree(): print(pivots) -def test_cutlass3_gemm(): - A = torch.rand(2, 4092).cuda() - B = torch.rand(4*4092, 4092).cuda() +#@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) +@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) +def test_cutlass3_gemm(dtype): + for i in range(2): + A = torch.rand(2, 4092, dtype=dtype, device='cuda') + B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + #A = torch.rand(2, 4, dtype=dtype, device='cuda') + #B = torch.rand(4, 4, dtype=dtype, device='cuda') - #print('') - #print(A) - #print(B.t()) + #print('') + #print(A) + #print(B.t()) - C1 = torch.matmul(A, B.t()) - C2 = F.cutlass3_gemm(A, B.t()) - #print(C1) - #print(C2) - torch.testing.assert_close(C1, C2) + C1 = torch.matmul(A, B.t()) + C2 = F.cutlass3_gemm(A, B.t()) + #print(C1) + #print(C2) + + #torch.testing.assert_close(C1, C2) def test_pipeline_func(): From f3e97ccbd2cdc1f40fe32e027fb3b5c22a92f09a Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 28 Apr 2023 21:29:40 -0700 Subject: [PATCH 63/97] New implementation for batch size 1. --- csrc/kernels.cu | 271 ++++++++++++++++++++++++++------------- csrc/kernels.cuh | 2 +- csrc/ops.cu | 10 +- csrc/ops.cuh | 2 +- csrc/pythonInterface.c | 8 +- tests/test_functional.py | 12 +- 6 files changed, 199 insertions(+), 106 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 67f9a3c..3310285 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2947,117 +2947,212 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} - #define ROWS 2 -template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) +template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { -// 0. 
We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp -// 1. Load dataB into register -// 2. Dequantize B -// 3. Fetch data from A and multiply - typedef cub::BlockLoad LoadA; - //__shared__ typename LoadA::TempStorage loada; - typedef cub::BlockLoad LoadB; - //__shared__ typename LoadB::TempStorage loadb; typedef cub::BlockReduce BlockReduce; - // Allocate shared memory for BlockReduce - //__shared__ typename BlockReduce::TempStorage reduce; + __shared__ typename BlockReduce::TempStorage reduce; + int col_offset = blockIdx.x *8; - __shared__ union { - typename BlockReduce::TempStorage reduce; - typename LoadB::TempStorage loadb; - typename LoadA::TempStorage loada; - } temp_storage; + T local_A[8]; + T local_B[8]; + T local_C[8]; + __shared__ T smem_C[8]; - T dataA[ITEMS]; - T local_B[ITEMS]; - T local_accC[ROWS]; - int valid_items = 0; - const int col_offset = blockIdx.x * 8; - - __shared__ T tileA[ROWS*THREADS*ITEMS]; - __shared__ T accumulatorC[ROWS*8]; - - //#pragma unroll 8 - //for(int i = 0; i < 8; i++) - // tileA[threadIdx.x + (i*256)] = 0.0f; - //__syncthreads(); - if(threadIdx.x < 64) - accumulatorC[threadIdx.x] = 0.0f; + if(threadIdx.x < 8) + smem_C[threadIdx.x] = T(0); __syncthreads(); + #pragma unroll 8 + for(int k = 0; k < 8; k++) + local_C[k] = T(0); - for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS) - { - valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx; - int baserow = 0; - for(int row = baserow; row < (baserow+ROWS) && row < N; row++) - { - LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f); - #pragma unroll ITEMS - for(int k = 0; k < ITEMS; k++) - tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k]; + for(int idx = threadIdx.x*8; idx < K; idx+=blockDim.x*8) + { - __syncthreads(); - } - baserow += ROWS; - - // load 16 columns from B at a time. B is transposed, so its like loading rows - // each warp loads one row - // each thread loads 128 byte - - // col: inner_idx + warp_lane - // row: ldb*(offset + warp_id) - for(int col = 0; col < 8 && (col_offset + col) < M; col++) + if(idx + 8 <= K) + reinterpret_cast(local_A)[0] = reinterpret_cast(A)[idx/8]; + else { - int colB = col_offset + col; - - for(int k = 0; k < ROWS; k++) - local_accC[k] = 0.0f; - - int base_idxB = ldb*colB; - valid_items = K - inner_idx > THREADS*ITEMS ? 
THREADS*ITEMS : K - inner_idx; - LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f); - __syncthreads(); - - for(int row = 0; row < ROWS && row < N; row++) + for(int k = 0; k < 8; k++) { - #pragma unroll ITEMS - for(int k = 0; k < ITEMS; k++) - { - int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k); - local_accC[row] += tileA[idxA]*local_B[k]; - } - - local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); - if(threadIdx.x == 0) - atomicAdd(&accumulatorC[row*8 + col], local_accC[row]); + if(idx + k < K) + local_A[k] = A[idx+k]; + else + local_A[k] = 0.0f; } } - } - for(int row = 0; row < ROWS && row < N; row++) - { - int out_idx = ldc*row + col_offset; - //if(threadIdx.x < 8) - // if(accumulatorC[row*8 + threadIdx.x] != 0.0) - // printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x); - - if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M) + for(int col = 0; col < 8; col++) { - //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx); - out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x]; + int offset_B = (col_offset+col)*ldb; + if(idx + 8 <= K) + reinterpret_cast(local_B)[0] = reinterpret_cast(B)[(offset_B+idx)/8]; + else + { + for(int k = 0; k < 8; k++) + { + if(idx + k < K) + local_B[k] = B[(offset_B+idx)+k]; + else + local_B[k] = 0.0f; + } + } + + #pragma unroll 8 + for(int k = 0; k < 8; k++) + { + local_C[col] += local_A[k]*local_B[k]; + //if((float)local_A[k] != 0.0 && (float)local_B[k] != 0.0) + // printf("%i %i %f %f %f\n", k, threadIdx.x, (float)local_A[k], (float)local_B[k], (float)local_C[col]); + } + } } + #pragma unroll 8 + for(int k = 0; k < 8; k++) + { + local_C[k] = BlockReduce(reduce).Reduce(local_C[k], cub::Sum()); + __syncthreads(); + } + + if(threadIdx.x == 0) + #pragma unroll 8 + for(int k = 0; k < 8; k++) + smem_C[k] = local_C[k]; + else if(threadIdx.x >= 32) + // early return for unused warps + return; + + __syncwarp(); + + + //for(int k = 0; k < 8; k++) + // if((float)local_C[k] != 0.0f) + // printf("%i %f\n", threadIdx.x, (float)local_C[k]); + + if(threadIdx.x < 8 && col_offset + threadIdx.x < M) + out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; + } +//#define ROWS 2 +//template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) +//{ +//// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp +//// 1. Load dataB into register +//// 2. Dequantize B +//// 3. 
Fetch data from A and multiply +// +// typedef cub::BlockLoad LoadA; +// //__shared__ typename LoadA::TempStorage loada; +// typedef cub::BlockLoad LoadB; +// //__shared__ typename LoadB::TempStorage loadb; +// typedef cub::BlockReduce BlockReduce; +// // Allocate shared memory for BlockReduce +// //__shared__ typename BlockReduce::TempStorage reduce; +// +// __shared__ union { +// typename BlockReduce::TempStorage reduce; +// typename LoadB::TempStorage loadb; +// typename LoadA::TempStorage loada; +// } temp_storage; +// +// +// T dataA[ITEMS]; +// T local_B[ITEMS]; +// T local_accC[ROWS]; +// int valid_items = 0; +// const int col_offset = blockIdx.x * 8; +// +// __shared__ T tileA[ROWS*THREADS*ITEMS]; +// __shared__ T accumulatorC[ROWS*8]; +// +// //#pragma unroll 8 +// //for(int i = 0; i < 8; i++) +// // tileA[threadIdx.x + (i*256)] = 0.0f; +// //__syncthreads(); +// if(threadIdx.x < 64) +// accumulatorC[threadIdx.x] = 0.0f; +// __syncthreads(); +// +// +// for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS) +// { +// valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx; +// int baserow = 0; +// for(int row = baserow; row < (baserow+ROWS) && row < N; row++) +// { +// LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f); +// +// #pragma unroll ITEMS +// for(int k = 0; k < ITEMS; k++) +// tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k]; +// +// __syncthreads(); +// } +// baserow += ROWS; +// +// // load 16 columns from B at a time. B is transposed, so its like loading rows +// // each warp loads one row +// // each thread loads 128 byte +// +// // col: inner_idx + warp_lane +// // row: ldb*(offset + warp_id) +// for(int col = 0; col < 8 && (col_offset + col) < M; col++) +// { +// int colB = col_offset + col; +// +// for(int k = 0; k < ROWS; k++) +// local_accC[k] = 0.0f; +// +// int base_idxB = ldb*colB; +// valid_items = K - inner_idx > THREADS*ITEMS ? 
THREADS*ITEMS : K - inner_idx; +// LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f); +// __syncthreads(); +// +// for(int row = 0; row < ROWS && row < N; row++) +// { +// #pragma unroll ITEMS +// for(int k = 0; k < ITEMS; k++) +// { +// int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k); +// local_accC[row] += tileA[idxA]*local_B[k]; +// } +// +// local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); +// if(threadIdx.x == 0) +// atomicAdd(&accumulatorC[row*8 + col], local_accC[row]); +// } +// } +// } +// +// for(int row = 0; row < ROWS && row < N; row++) +// { +// int out_idx = ldc*row + col_offset; +// +// //if(threadIdx.x < 8) +// // if(accumulatorC[row*8 + threadIdx.x] != 0.0) +// // printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x); +// +// if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M) +// { +// //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx); +// out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x]; +// } +// } +// +// +// +//} + __device__ void compute(float* global_out, float const* shared_in) { @@ -3122,10 +3217,8 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, // TC * out, CStride dC, CBlockLayout , CThreadLayout tC, // half alpha, half beta); -template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); -template __global__ void gemm_device(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc); -template __global__ void gemm_device(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc); -template __global__ void gemm_device(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 9603e93..23ecf45 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -138,6 +138,6 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * template __global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); -template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc); #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index aa3dacf..c0c2658 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -675,10 +675,10 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) -template void gemm_host(int m, int n, int k, T const* A, T* B, T * out, int lda, int ldb, int ldc) +template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc) { - dim3 dimBlock(256); + dim3 dimBlock(128); int num_blocks = (m+7)/8; cout << num_blocks << endl; @@ -689,7 +689,7 @@ template void 
gemm_host(int m, int n, int k, T const* A, T* B, T cout << m << endl; cout << n << endl; cout << k << endl; - gemm_device + gemm_device <<< num_blocks, dimBlock, 0, 0 >>> (m, n, k, A, @@ -701,8 +701,8 @@ template void gemm_host(int m, int n, int k, T const* A, T* B, T // TEMPLATE DEFINITIONS //============================================================== -template void gemm_host(int m, int n, int k, float const* A, float* B, float * out, int lda, int ldb, int ldc); -template void gemm_host(int m, int n, int k, half const* A, half* B, half * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index b7ef9a3..8822640 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -190,7 +190,7 @@ template void extractOutliers(char * A, int *idx, char *out, int id void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); -template void gemm_host(int m, int n, int k, T const* A, T* B, T * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc); void pipeline_test(float *A, float *B, size_t n, size_t batch_size); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 3dd0b05..f92b52f 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -20,9 +20,9 @@ void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimat void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } -void gemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) +void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) { gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } -void gemm_host_fp16(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc) +void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } @@ -313,10 +313,10 @@ extern "C" void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } void cpipeline_test(float *A, float *B, size_t n, size_t batch_size){ pipeline_test(A, B, n, batch_size); } - void cgemm_host_fp32(int M, int N, int K, float const* A, float* B, float * out, int lda, int ldb, int ldc) + void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) { gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } - void cgemm_host_fp16(int M, int N, int K, half const* A, half* B, half * out, int lda, int ldb, int ldc) + void cgemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); } #endif diff --git a/tests/test_functional.py b/tests/test_functional.py index 1564306..f08c4a2 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2355,11 +2355,11 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], 
ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - for i in range(2): - A = torch.rand(2, 4092, dtype=dtype, device='cuda') - B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') - #A = torch.rand(2, 4, dtype=dtype, device='cuda') - #B = torch.rand(4, 4, dtype=dtype, device='cuda') + for i in range(1): + #A = torch.rand(2, 4092, dtype=dtype, device='cuda') + #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + A = torch.rand(1, 4096, dtype=dtype, device='cuda') + B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') #print('') #print(A) @@ -2371,7 +2371,7 @@ def test_cutlass3_gemm(dtype): #print(C1) #print(C2) - #torch.testing.assert_close(C1, C2) + torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.005) def test_pipeline_func(): From cad839941b2c0a013525be339f6e9c157caa925d Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Fri, 28 Apr 2023 22:10:42 -0700 Subject: [PATCH 64/97] Added bit template. --- csrc/kernels.cu | 77 +++++++++++++++++----------------------- csrc/kernels.cuh | 2 +- csrc/ops.cu | 16 ++++----- csrc/ops.cuh | 2 +- csrc/pythonInterface.c | 4 +-- tests/test_functional.py | 4 +-- 6 files changed, 45 insertions(+), 60 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 3310285..a5697ee 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -2947,16 +2947,31 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} -#define ROWS 2 -template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) +template __device__ inline void vector_load(T *local, T * __restrict__ const buffer, int idx, int limit_base, int limit) +{ + if(limit_base + ITEMS <= limit) + reinterpret_cast(local)[0] = reinterpret_cast(buffer)[idx/ITEMS]; + else + { + for(int k = 0; k < ITEMS; k++) + { + if(limit_base + k < limit) + local[k] = buffer[idx+k]; + else + local[k] = 0.0f; + } + } +} + +template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage reduce; int col_offset = blockIdx.x *8; - T local_A[8]; - T local_B[8]; + T local_A[128/BITS]; + T local_B[128/BITS]; T local_C[8]; __shared__ T smem_C[8]; @@ -2970,47 +2985,18 @@ template __global__ void gemm_device(int M, local_C[k] = T(0); - for(int idx = threadIdx.x*8; idx < K; idx+=blockDim.x*8) + for(int idx = threadIdx.x*128/BITS; idx < K; idx+=blockDim.x*128/BITS) { - - if(idx + 8 <= K) - reinterpret_cast(local_A)[0] = reinterpret_cast(A)[idx/8]; - else - { - for(int k = 0; k < 8; k++) - { - if(idx + k < K) - local_A[k] = A[idx+k]; - else - local_A[k] = 0.0f; - } - } - + vector_load(local_A, A, idx, idx, K); for(int col = 0; col < 8; col++) { int offset_B = (col_offset+col)*ldb; - if(idx + 8 <= K) - reinterpret_cast(local_B)[0] = reinterpret_cast(B)[(offset_B+idx)/8]; - else - { - for(int k = 0; k < 8; k++) - { - if(idx + k < K) - local_B[k] = B[(offset_B+idx)+k]; - else - local_B[k] = 0.0f; - } - } + vector_load(local_B, B, offset_B+idx, idx, K); - #pragma unroll 8 - for(int k = 0; k < 8; k++) - { + #pragma unroll 128/BITS + for(int k = 0; k < 128/BITS; k++) local_C[col] += local_A[k]*local_B[k]; - //if((float)local_A[k] != 0.0 && (float)local_B[k] != 0.0) - // printf("%i %i %f %f %f\n", k, threadIdx.x, (float)local_A[k], (float)local_B[k], (float)local_C[col]); - } - } } @@ -3022,9 
+3008,11 @@ template __global__ void gemm_device(int M, } if(threadIdx.x == 0) + { #pragma unroll 8 for(int k = 0; k < 8; k++) smem_C[k] = local_C[k]; + } else if(threadIdx.x >= 32) // early return for unused warps return; @@ -3032,15 +3020,8 @@ template __global__ void gemm_device(int M, __syncwarp(); - //for(int k = 0; k < 8; k++) - // if((float)local_C[k] != 0.0f) - // printf("%i %f\n", threadIdx.x, (float)local_C[k]); - if(threadIdx.x < 8 && col_offset + threadIdx.x < M) out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; - - - } //#define ROWS 2 @@ -3217,7 +3198,13 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, // TC * out, CStride dC, CBlockLayout , CThreadLayout tC, // half alpha, half beta); + +// these are not used and make no sense, but the compiler needs them template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +// these are not used and make no sense, but the compiler needs them + +template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 23ecf45..aab7b95 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -138,6 +138,6 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * template __global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); -template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc); #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index c0c2658..2219690 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -675,7 +675,7 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) -template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc) +template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits) { dim3 dimBlock(128); @@ -689,20 +689,18 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out cout << m << endl; cout << n << endl; cout << k << endl; - gemm_device - <<< num_blocks, dimBlock, 0, 0 >>> - (m, n, k, - A, - B, - out, lda, ldb, ldc); + if(bits == 32) + gemm_device<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + else if(bits == 16) + gemm_device<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } //============================================================== // TEMPLATE DEFINITIONS //============================================================== -template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc); -template void gemm_host(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits); +template void gemm_host(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, 
int ldc, int bits); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 8822640..ffc9e87 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -190,7 +190,7 @@ template void extractOutliers(char * A, int *idx, char *out, int id void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); -template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc); +template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits); void pipeline_test(float *A, float *B, size_t n, size_t batch_size); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index f92b52f..1ece3e6 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -21,9 +21,9 @@ void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimate void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) -{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } +{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 32); } void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) -{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc); } +{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 16); } #define MAKE_FUNC32(fname, oname, gtype, gbits) \ diff --git a/tests/test_functional.py b/tests/test_functional.py index f08c4a2..b256af9 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2352,8 +2352,8 @@ def test_normal_map_tree(): print(pivots) -#@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) -@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) +#@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): for i in range(1): #A = torch.rand(2, 4092, dtype=dtype, device='cuda') From 21723f796a3951e56b77460e7d572c76619b773f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 29 Apr 2023 21:52:47 -0700 Subject: [PATCH 65/97] 4-bit draft. 
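This draft packs two 4-bit codes per byte (high nibble first) and rescales every `blocksize` run of values by a per-block absmax stored in a separate array. For reference, here is a minimal host-side sketch of the layout the new 4-bit inference kernel consumes — an editorial sketch, not part of the patch, using a placeholder codebook rather than the exact NF4 table:

    #include <cstdint>
    #include <vector>

    // Dequantize n packed 4-bit values: byte i/2 holds elements 2*(i/2) and
    // 2*(i/2)+1 in its high and low nibbles; absmax[i/blocksize] rescales each
    // block; `codebook` maps the 16 codes back to floats (placeholder here).
    std::vector<float> dequantize_4bit_reference(const std::vector<uint8_t>& packed,
                                                 const std::vector<float>& absmax,
                                                 const float codebook[16],
                                                 int n, int blocksize)
    {
        std::vector<float> out(n);
        for (int i = 0; i < n; i++)
        {
            uint8_t byte = packed[i / 2];
            uint8_t code = (i % 2 == 0) ? (byte >> 4) : (byte & 0x0F);
            out[i] = codebook[code] * absmax[i / blocksize];
        }
        return out;
    }

The CUDA code in this patch mirrors that unpacking: the high nibble (val >> 4) becomes the even output element and the low nibble (val & 0x0F) the odd one, with a single absmax load covering each block.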
--- bitsandbytes/functional.py | 22 +++- csrc/kernels.cu | 222 +++++++++++++++++++++++++++++++++---- csrc/kernels.cuh | 1 + csrc/ops.cu | 18 +++ csrc/ops.cuh | 1 + csrc/pythonInterface.c | 6 + tests/test_functional.py | 30 ++++- 7 files changed, 273 insertions(+), 27 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index b5c622b..f725c1c 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1380,10 +1380,15 @@ def cutlass3_gemm( out: Tensor = None, transposed_A=False, transposed_B=False, + state=None ): - sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) + #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) + if state is None: + Bshape = B.shape + else: + Bshape = state[1] if out is None: - out = torch.zeros(size=sout, dtype=A.dtype, device=A.device) + out = torch.zeros(size=(A.shape[0], Bshape[1]), dtype=A.dtype, device=A.device) sA = A.shape sB = B.shape @@ -1456,7 +1461,13 @@ def cutlass3_gemm( # [km, nk -> mn] #lda = ldb = ldc = 1 #lda = 1 - #print(m, n, k, lda, ldb, ldc) + if state is not None: + m = Bshape[0] + k = Bshape[1] + lda = Bshape[1] + ldc = Bshape[0] + ldb = (ldb+1)//2 + print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) m = ct.c_int32(m) n = ct.c_int32(n) @@ -1464,7 +1475,10 @@ def cutlass3_gemm( lda = ct.c_int32(lda) ldb = ct.c_int32(ldb) ldc = ct.c_int32(ldc) - if A.dtype == torch.float32: + + if B.dtype == torch.uint8: + lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])) + elif A.dtype == torch.float32: lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) elif A.dtype == torch.float16: lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index a5697ee..53a183d 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -69,6 +69,27 @@ __device__ float dDequantizeFP4(unsigned char val, float absmax) } } +__device__ float d2DequantizeFP4(unsigned char val) +{ + float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f; + if((val & 0b0110) == 0) + { + // subnormal + if((val & 0b0001) == 0) + return 0.0f; + else + return sign*0.0625f; + } + else + { + // normal + float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f); + float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f; + + return sign*exponent*fraction; + } +} + __device__ float dDequantizeFP4Tree(unsigned char val, float absmax) { float sign = (val & 0b1000) == 8 ? 
-1.0f : 1.0f; @@ -145,7 +166,7 @@ __device__ unsigned char dQuantizeFP4(float x) return 0b0000+sign; } -__device__ float dDequantizeNF4(unsigned char val, float absmax) +__device__ half dhDequantizeNF4(unsigned char val) { // the values for this tree was generated by test_normal_map_tree // in the file tests/test_functional.py @@ -153,49 +174,103 @@ __device__ float dDequantizeNF4(unsigned char val, float absmax) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f*absmax; + return 1.0f; else - return 0.7229568362236023f*absmax; + return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f*absmax; + return 0.5626170039176941f; else - return 0.44070982933044434f*absmax; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f*absmax; + return 0.33791524171829224f; else - return 0.24611230194568634f*absmax; + return 0.24611230194568634f; else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f*absmax; + return 0.16093020141124725f; else - return 0.07958029955625534f*absmax; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f*absmax; + return 0.0f; else - return -0.09105003625154495f*absmax; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f*absmax; + return -0.18477343022823334f; else - return -0.28444138169288635f*absmax; + return -0.28444138169288635f; else if((val & 0b0010) == 2) //00 if((val & 0b0001) == 1) // 001 - return -0.39491748809814453f*absmax; + return -0.39491748809814453f; else - return -0.5250730514526367f*absmax; + return -0.5250730514526367f; else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f*absmax; + return -0.6961928009986877f; else - return -1.0f*absmax; + return -1.0f; + +} + +__device__ float dDequantizeNF4(unsigned char val) +{ + // the values for this tree was generated by test_normal_map_tree + // in the file tests/test_functional.py + if((val & 0b1000) == 8) + if((val & 0b0100) == 4) // 1 + if((val & 0b0010) == 2) // 11 + if((val & 0b0001) == 1) // 111 + return 1.0f; + else + return 0.7229568362236023f; + else + if((val & 0b0001) == 1) // 110 + return 0.5626170039176941f; + else + return 0.44070982933044434f; + else + if((val & 0b0010) == 2) //10 + if((val & 0b0001) == 1) // 101 + return 0.33791524171829224f; + else + return 0.24611230194568634f; + else + if((val & 0b0001) == 1) // 100 + return 0.16093020141124725f; + else + return 0.07958029955625534f; + + else + if((val & 0b0100) == 4) // 0 + if((val & 0b0010) == 2) //01 + if((val & 0b0001) == 1) // 011 + return 0.0f; + else + return -0.09105003625154495f; + else + if((val & 0b0001) == 1) // 010 + return -0.18477343022823334f; + else + return -0.28444138169288635f; + else + if((val & 0b0010) == 2) //00 + if((val & 0b0001) == 1) // 001 + return -0.39491748809814453f; + else + return -0.5250730514526367f; + else + if((val & 0b0001) == 1) // 000 + return -0.6961928009986877f; + else + return -1.0f; } @@ -800,8 +875,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH; j++) { - vals[j*2] = dDequantizeNF4(qvals[j] >> 4, local_abs_max); - vals[j*2 + 1] = dDequantizeNF4(qvals[j] & 0x0F, local_abs_max); + vals[j*2] = dDequantizeNF4(qvals[j] >> 4)* local_abs_max; + vals[j*2 + 1] = dDequantizeNF4(qvals[j] & 0x0F)* local_abs_max; 
} break; } @@ -2947,7 +3022,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} -template __device__ inline void vector_load(T *local, T * __restrict__ const buffer, int idx, int limit_base, int limit) +template __device__ inline void vector_load(T *local, T * __restrict__ const buffer, int idx, int limit_base, int limit, float zero_value = 0.0f) { if(limit_base + ITEMS <= limit) reinterpret_cast(local)[0] = reinterpret_cast(buffer)[idx/ITEMS]; @@ -2958,7 +3033,7 @@ template __device__ inline void vector_l if(limit_base + k < limit) local[k] = buffer[idx+k]; else - local[k] = 0.0f; + local[k] = (T)zero_value; } } } @@ -3024,6 +3099,109 @@ template __global__ void gemm_device(int M, out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; } +template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) +{ + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce; + int col_offset = blockIdx.x *8; + + T local_A[32]; + unsigned char local_B_4bit[16]; + T local_B[32]; + T local_C[8]; + + __shared__ T smem_C[8]; + + if(threadIdx.x < 8) + smem_C[threadIdx.x] = T(0); + __syncthreads(); + + #pragma unroll 8 + for(int k = 0; k < 8; k++) + local_C[k] = T(0); + + + for(int idx = threadIdx.x*32; idx < K; idx+=blockDim.x*32) + { + + // we load only 8 values per iteration from A, so we + // need to do 4 loads for every single load from B + // for B, we have packed values, so the 16 8-bit values + // turn into 32 4-bit values to 4x 4 loads turns into 4x 8 loads + vector_load(local_A, A, idx, idx, K); + vector_load(&(local_A[8]), A, idx+8, idx+8, K); + vector_load(&(local_A[16]), A, idx+16, idx+16, K); + vector_load(&(local_A[24]), A, idx+24, idx+24, K); + + for(int col = 0; col < 8; col++) + { + if((col + col_offset) >= M){ break; } + + int offset_B = (col_offset+col)*ldb; + // 0111 -> 0.0f in NF4 + // since we have packed 8-bits, we need cat(0b0111, 0b0111) = 0b01110111 + vector_load(local_B_4bit, B, (offset_B+idx+1)/2, (idx+1)/2, (K+1)/2, 0b01110111); + + int absidx = (idx + offset_B)/blocksize; + half local_absmax = __ldg(&(absmax[absidx])); + //for(int k = 0; k < 16; k++) + //printf("%i %i ", local_B_4bit[k] >> 4, local_B_4bit[k] & 0x0F); + //printf("\n"); + + //vector_load(local_A, A, idx, idx, K); + + #pragma unroll 16 + for(int k = 0; k < 16; k++) + { + + //if(local_B_4bit[k ] != 0b01110111) + //printf("(%i %i %i) %i -> %f, %i -> %f\n", threadIdx.x , k, K, local_B_4bit[k ] >> 4, dDequantizeNF4(local_B_4bit[k ] >> 4, local_absmax), + //local_B_4bit[k ] & 0x0F, dDequantizeNF4(local_B_4bit[k ] & 0x0F, local_absmax)); + //local_B[k*2] = d2DequantizeFP4(local_B_4bit[k] >> 4);//*local_absmax; + //local_B[k*2 + 1] = d2DequantizeFP4(local_B_4bit[k] & 0x0F);//*local_absmax; + local_B[k*2] = (half)(local_B_4bit[k] >> 4)*local_absmax; + local_B[k*2 + 1] = (half)(local_B_4bit[k] & 0x0F)*local_absmax; + //local_B[k*2] = (half)dDequantizeNF4(local_B_4bit[k ] >> 4);//*local_absmax; + //local_B[k*2 + 1] = (half)dDequantizeNF4(local_B_4bit[k ] & 0x0F);//*local_absmax; + } + + #pragma unroll 32 + //for(int k = 0; k < 8; k++) + for(int k = 0; k < 32; k++) + { + local_C[col] += local_A[k]*local_B[k]; + //if((float)local_A[k] != 0.0 && (float)local_B[k] != 0.0) + //if((float)local_B[k] != 0.0) + //printf("%i %i %i %i %f*%f\n", threadIdx.x, k, col, (float)local_A[k], (float)local_B[k]); + } + } + } + + 
#pragma unroll 8 + for(int k = 0; k < 8; k++) + { + local_C[k] = BlockReduce(reduce).Reduce(local_C[k], cub::Sum()); + __syncthreads(); + } + + if(threadIdx.x == 0) + { + #pragma unroll 8 + for(int k = 0; k < 8; k++) + smem_C[k] = local_C[k]; + } + else if(threadIdx.x >= 32) + // early return for unused warps + return; + + __syncwarp(); + + + if(threadIdx.x < 8 && col_offset + threadIdx.x < M) + out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; +} + //#define ROWS 2 //template __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc) //{ @@ -3207,6 +3385,8 @@ template __global__ void gemm_device(int M, int N, int K, half * template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); + //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); template __global__ void with_staging_unified<2>(float const* global_in, float * global_out, size_t size, size_t batch_sz); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index aab7b95..4951031 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -139,5 +139,6 @@ template __global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc); +template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize); #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index 2219690..07e7107 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -695,10 +695,28 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out gemm_device<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } +template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) +{ + + dim3 dimBlock(128); + int num_blocks = (m+7)/8; + + cout << num_blocks << endl; + cout << lda << endl; + cout << ldb << endl; + cout << ldc << endl; + + cout << m << endl; + cout << n << endl; + cout << k << endl; + kgemm_4bit_inference<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); +} + //============================================================== // TEMPLATE DEFINITIONS //============================================================== +template void gemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits); template void gemm_host(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc, int bits); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index ffc9e87..8919c60 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -191,6 +191,7 @@ template void extractOutliers(char * A, int 
*idx, char *out, int id void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB); template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits); +template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize); void pipeline_test(float *A, float *B, size_t n, size_t batch_size); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 1ece3e6..bdf821c 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -25,6 +25,9 @@ void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, in void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 16); } +void gemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) +{ gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } + #define MAKE_FUNC32(fname, oname, gtype, gbits) \ void fname##32bit_g##gbits(gtype *g, gtype *p, \ @@ -319,6 +322,9 @@ extern "C" void cgemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); } + void cgemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) + { gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } + #endif void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } diff --git a/tests/test_functional.py b/tests/test_functional.py index b256af9..f58cd43 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2352,8 +2352,8 @@ def test_normal_map_tree(): print(pivots) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) -#@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) +#@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) +@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): for i in range(1): #A = torch.rand(2, 4092, dtype=dtype, device='cuda') @@ -2373,6 +2373,32 @@ def test_cutlass3_gemm(dtype): torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.005) +#@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) +@pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) +def test_gemm_4bit(dtype): + for i in range(1): + #A = torch.rand(2, 4092, dtype=dtype, device='cuda') + #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + #torch.random.manual_seed(17) + A = torch.rand(1, 4096, dtype=dtype, device='cuda') + B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') + + #print('') + #print(A) + #print(B) + + qB, state = F.quantize_nf4(B) + F.dequantize_nf4(qB, state) + + + C1 = torch.matmul(A, B.t()) + #C1 = bnb.matmul_4bit(A, qB.t(), state) + C2 = F.cutlass3_gemm(A, qB.t(), state=state) + #print(C1) + #print(C2) + + #torch.testing.assert_close(C1, C2, atol=1e-5, rtol=0.005) + def test_pipeline_func(): a = torch.rand(2, 4).cuda() From 
ad07d254fb5cefadf8dcb6020b24fb0baee4e936 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 17:43:02 -0700 Subject: [PATCH 66/97] Slow tensor core solution. --- csrc/kernels.cu | 181 +++++++++++++++++++++++++++++++-------- csrc/ops.cu | 17 ++-- csrc/pythonInterface.c | 8 +- tests/test_functional.py | 2 + 4 files changed, 160 insertions(+), 48 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 53a183d..24b004b 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,8 @@ #define NUM 4 #define NUM_BLOCK 4096 +using namespace nvcuda; + // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda __device__ float atomicMax(float* address, float val) { int* address_as_i = reinterpret_cast(address); @@ -3041,62 +3044,164 @@ template __device__ inline void vector_l template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce; - int col_offset = blockIdx.x *8; + typedef cub::WarpReduce WarpReduce; + // Allocate WarpReduce shared memory for one warp + //__shared__ typename WarpReduce::TempStorage temp_storage; - T local_A[128/BITS]; - T local_B[128/BITS]; + //typedef cub::BlockReduce BlockReduce; + //// Allocate shared memory for BlockReduce + //__shared__ typename BlockReduce::TempStorage reduce; + int col_offset = blockIdx.x *8; + const int warp_id = threadIdx.x / 32; + const int warp_lane = threadIdx.x % 32; + + T local_A[64/BITS]; + T local_B[64/BITS]; T local_C[8]; - __shared__ T smem_C[8]; + __shared__ T smem_A[4*32*16]; + __shared__ T smem_B[4*16*8]; + __shared__ T smem_C[4*32*8]; - if(threadIdx.x < 8) - smem_C[threadIdx.x] = T(0); + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(c_frag, 0.0f); + + + for(int i = threadIdx.x; i < 32*16*4; i+=blockDim.x) + smem_A[i] = T(0); + + for(int i = threadIdx.x; i < 32*8*4; i+=blockDim.x) + smem_B[i] = T(0); + + for(int i = threadIdx.x; i < 32*8*THREADS/32; i+=blockDim.x) + smem_C[i] = T(0); __syncthreads(); #pragma unroll 8 for(int k = 0; k < 8; k++) local_C[k] = T(0); - - for(int idx = threadIdx.x*128/BITS; idx < K; idx+=blockDim.x*128/BITS) + int block_idx = 0; + //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) + for(int base_idx = 0; base_idx < K; base_idx+=64) { - vector_load(local_A, A, idx, idx, K); - for(int col = 0; col < 8; col++) + int tidx = threadIdx.x*4; + + if(base_idx % (4*blockDim.x) == 0) { - int offset_B = (col_offset+col)*ldb; - vector_load(local_B, B, offset_B+idx, idx, K); - - #pragma unroll 128/BITS - for(int k = 0; k < 128/BITS; k++) - local_C[col] += local_A[k]*local_B[k]; + vector_load(local_A, A, base_idx+tidx, base_idx+tidx, K); // 54 mu + block_idx = 0; } - } - #pragma unroll 8 - for(int k = 0; k < 8; k++) - { - local_C[k] = BlockReduce(reduce).Reduce(local_C[k], cub::Sum()); + for(int k = 0; k < 4; k++) + { + if((threadIdx.x >= block_idx*16) && (threadIdx.x < (block_idx+1)*16)) + smem_A[(threadIdx.x % 16) + (32*16*k)] = local_A[k]; // 54 mu + } + block_idx += 1; + + // 4 warps, 1 warps loads in total 4*32=64 values -> 4 columns at a time + // we need 8 columns, so 2 loads and smem stores + // we need a half-warp to load one column at a time + for(int j = 0; j < 2; j++) + { + int col = warp_id + (j*4); + int offset_B = (col_offset+col)*ldb; + 
vector_load(local_B, B, offset_B+base_idx+warp_lane*4, base_idx+warp_lane*4, K); // 171 mu + + + //#pragma unroll 4 + //for(int k = 0; k < 4; k++) + // if((float)local_B[k] != 0.0) + // printf("%i %i %i %i %f\n", j, warp_id, warp_lane, k, (float)local_B[k]); + + // load and store is different + // we wnat to load 64 consequitive values with one warp + // but we need to store those across 4 fragments since + // the max column width is 16. + + // each 16 values a new tile for each warp + //int tile_idx = warp_lane/16; + #pragma unroll 4 + for(int k = 0; k < 4; k++) + smem_B[(warp_lane % 16) + (col*16) + (k*16*8)] = local_B[k]; // 171 mu + } + + + __syncthreads(); + + //if(threadIdx.x == 0) + // for(int w = 0; w < 4; w++) + // for(int trow = 0; trow < 32; trow++) + // for(int tcol = 0; tcol < 16; tcol++) + // if((float)smem_A[trow + tcol*32 + (w*32*16)] != 0.0) + // printf("A %i %i %i = %f\n", w, trow, tcol, (float) smem_B[trow + tcol*16]); + + //if(threadIdx.x == 0) + // for(int w = 0; w < 4; w++) + // for(int trow = 0; trow < 16; trow++) + // for(int tcol = 0; tcol < 8; tcol++) + // if((float)smem_B[trow + tcol*16 + (w*16*8)] != 0.0) + // printf("B %i %i %i = %f\n", w, trow, tcol, (float) smem_B[trow + tcol*16]); + + + //__syncthreads(); + + wmma::load_matrix_sync(a_frag, &(smem_A[warp_id*32*16]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[warp_id*16*8]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); } - if(threadIdx.x == 0) - { - #pragma unroll 8 - for(int k = 0; k < 8; k++) - smem_C[k] = local_C[k]; - } - else if(threadIdx.x >= 32) - // early return for unused warps - return; + // 129 mu + wmma::store_matrix_sync(&(smem_C[warp_id*32*8]), c_frag, 8, wmma::mem_row_major); + __syncthreads(); - __syncwarp(); + //if(threadIdx.x >= 16){ return; } + //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); + //if(threadIdx.x < 32) + if(warp_lane < 8 && warp_id > 0) + //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; + atomicAdd(&(smem_C[warp_lane]), smem_C[warp_lane + (warp_id*32*8)]); + __syncthreads(); + //local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); + //if(threadIdx.x == 0) + // for(int row = 0; row < 32; row++) + // { + // printf("row %i ", row); + // for(int id = 0; id < 4; id++) + // { + // printf(" id %i: ", id); + // for(int k = 0; k < 8; k++) + // printf("%f ", (float)smem_C[k + (row*8) + (id*32*8)]); + // printf("\n"); + // } + // } + + //__syncthreads(); + + //if((float)local_C[0] !=0.0f) + // printf("%i %i %f\n", warp_lane, warp_id, (float)local_C[0]); + //local_C[0] = WarpReduce(temp_storage).Sum(local_C[0]); + + //__syncwarp(); + + ////for(int i = threadIdx.x; i < 32*8; i+=blockDim.x) + ////{ + // if((float)local_C[0] !=0.0f) + // printf("%i %f\n", 0, (float)local_C[0]); + //} + + //if(threadIdx.x < 8 && col_offset + threadIdx.x < M) + //out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; if(threadIdx.x < 8 && col_offset + threadIdx.x < M) - out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; + out[col_offset + threadIdx.x] = smem_C[threadIdx.x]; } template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) @@ -3378,12 +3483,16 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // half alpha, half beta); // these are not used and make no sense, but the compiler needs them -template __global__ void gemm_device(int M, int N, int K, float * 
__restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +//template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +//template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); // these are not used and make no sense, but the compiler needs them -template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +//template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +//template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); diff --git a/csrc/ops.cu b/csrc/ops.cu index 07e7107..d83fc6e 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -678,7 +678,6 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits) { - dim3 dimBlock(128); int num_blocks = (m+7)/8; cout << num_blocks << endl; @@ -689,16 +688,17 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out cout << m << endl; cout << n << endl; cout << k << endl; - if(bits == 32) - gemm_device<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - else if(bits == 16) - gemm_device<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //if(bits == 32) + //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + if(bits == 16) + gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) { - dim3 dimBlock(128); int num_blocks = (m+7)/8; cout << num_blocks << endl; @@ -709,7 +709,8 @@ template void gemm_4bit_inference(int m, int n, int k, T * A, unsi cout << m << endl; cout << n << endl; cout << k << endl; - kgemm_4bit_inference<<< num_blocks, dimBlock, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); + kgemm_4bit_inference<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); + //kgemm_4bit_inference<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } //============================================================== @@ -717,7 +718,7 @@ template void gemm_4bit_inference(int m, int n, int k, T * A, unsi 
//============================================================== template void gemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); -template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits); +//template void gemm_host(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits); template void gemm_host(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc, int bits); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); template void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index bdf821c..26f16f2 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -20,8 +20,8 @@ void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimat void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } -void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) -{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 32); } +//void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) +//{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 32); } void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 16); } @@ -316,8 +316,8 @@ extern "C" void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } void cpipeline_test(float *A, float *B, size_t n, size_t batch_size){ pipeline_test(A, B, n, batch_size); } - void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) - { gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } + //void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) + //{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } void cgemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) { gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); } diff --git a/tests/test_functional.py b/tests/test_functional.py index f58cd43..e2ecdcb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2358,6 +2358,8 @@ def test_cutlass3_gemm(dtype): for i in range(1): #A = torch.rand(2, 4092, dtype=dtype, device='cuda') #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + #A = torch.rand(1, 4096, dtype=dtype, device='cuda') + #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') A = torch.rand(1, 4096, dtype=dtype, device='cuda') B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') From 604bb3fb573eee2437c2ed51efbd0e3c1382e060 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 18:06:01 -0700 Subject: [PATCH 67/97] Slow non-vector 530. 
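Like the previous commit, this one drives the tensor cores through the wmma pipeline: fill an accumulator fragment, stage A and B tiles in shared memory, load fragments, and issue one mma_sync per K-slice; what changes between these commits is mainly the staging strategy (vectorized versus per-thread scalar loads), which is what the timing numbers in the subjects appear to track. For orientation, the canonical call sequence with the common 16x16x16 shape — an editorial sketch, not part of the patch, assuming a single-warp launch and K a multiple of 16:

    #include <mma.h>
    #include <cuda_fp16.h>
    using namespace nvcuda;

    // One warp computes a 16x16 tile of C = A * B^T, with A row-major 16 x K
    // and B stored 16 x K row-major (equivalently, col-major K x 16 for wmma).
    __global__ void wmma_tile_16x16x16(const half* A, const half* B, float* C, int K)
    {
        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

        wmma::fill_fragment(c_frag, 0.0f);

        // march over the shared K dimension one 16-wide slice at a time
        for (int k = 0; k < K; k += 16)
        {
            wmma::load_matrix_sync(a_frag, A + k, K);  // 16x16 slice of A
            wmma::load_matrix_sync(b_frag, B + k, K);  // matching slice of B
            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        }
        wmma::store_matrix_sync(C, c_frag, 16, wmma::mem_row_major);
    }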
--- csrc/kernels.cu | 106 +++++++++++++++--------------------------------- csrc/ops.cu | 4 +- 2 files changed, 35 insertions(+), 75 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 24b004b..5a6db7d 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3041,6 +3041,7 @@ template __device__ inline void vector_l } } +#define WARPS 1 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { @@ -3059,9 +3060,9 @@ template __global__ void gemm_device(int M, T local_B[64/BITS]; T local_C[8]; - __shared__ T smem_A[4*32*16]; - __shared__ T smem_B[4*16*8]; - __shared__ T smem_C[4*32*8]; + __shared__ T smem_A[WARPS*32*16]; + __shared__ T smem_B[WARPS*16*8]; + __shared__ T smem_C[WARPS*32*8]; wmma::fragment a_frag; wmma::fragment b_frag; @@ -3070,13 +3071,13 @@ template __global__ void gemm_device(int M, wmma::fill_fragment(c_frag, 0.0f); - for(int i = threadIdx.x; i < 32*16*4; i+=blockDim.x) + for(int i = threadIdx.x; i < 32*16*WARPS; i+=blockDim.x) smem_A[i] = T(0); - for(int i = threadIdx.x; i < 32*8*4; i+=blockDim.x) + for(int i = threadIdx.x; i < 32*8*WARPS; i+=blockDim.x) smem_B[i] = T(0); - for(int i = threadIdx.x; i < 32*8*THREADS/32; i+=blockDim.x) + for(int i = threadIdx.x; i < 32*8*WARPS; i+=blockDim.x) smem_C[i] = T(0); __syncthreads(); @@ -3084,91 +3085,48 @@ template __global__ void gemm_device(int M, for(int k = 0; k < 8; k++) local_C[k] = T(0); - int block_idx = 0; + //int block_idx = 0; //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) - for(int base_idx = 0; base_idx < K; base_idx+=64) + for(int base_idx = 0; base_idx < K; base_idx+=16) { + int idx = base_idx + threadIdx.x; - int tidx = threadIdx.x*4; - - if(base_idx % (4*blockDim.x) == 0) + if(threadIdx.x < 16) { - vector_load(local_A, A, base_idx+tidx, base_idx+tidx, K); // 54 mu - block_idx = 0; + if(idx >= K) + { + smem_A[threadIdx.x] = 0.0f; + smem_B[threadIdx.x] = 0.0f; + } + else + { + + smem_A[threadIdx.x] = A[idx]; + + for(int col = 0; col < 8; col++) + smem_B[threadIdx.x + (col*16)] = B[(col_offset+col)*ldb+idx]; + } } - for(int k = 0; k < 4; k++) - { - if((threadIdx.x >= block_idx*16) && (threadIdx.x < (block_idx+1)*16)) - smem_A[(threadIdx.x % 16) + (32*16*k)] = local_A[k]; // 54 mu - } - block_idx += 1; - - // 4 warps, 1 warps loads in total 4*32=64 values -> 4 columns at a time - // we need 8 columns, so 2 loads and smem stores - // we need a half-warp to load one column at a time - for(int j = 0; j < 2; j++) - { - int col = warp_id + (j*4); - int offset_B = (col_offset+col)*ldb; - vector_load(local_B, B, offset_B+base_idx+warp_lane*4, base_idx+warp_lane*4, K); // 171 mu - - - //#pragma unroll 4 - //for(int k = 0; k < 4; k++) - // if((float)local_B[k] != 0.0) - // printf("%i %i %i %i %f\n", j, warp_id, warp_lane, k, (float)local_B[k]); - - // load and store is different - // we wnat to load 64 consequitive values with one warp - // but we need to store those across 4 fragments since - // the max column width is 16. 
- - // each 16 values a new tile for each warp - //int tile_idx = warp_lane/16; - #pragma unroll 4 - for(int k = 0; k < 4; k++) - smem_B[(warp_lane % 16) + (col*16) + (k*16*8)] = local_B[k]; // 171 mu - } - - - __syncthreads(); - //if(threadIdx.x == 0) - // for(int w = 0; w < 4; w++) - // for(int trow = 0; trow < 32; trow++) - // for(int tcol = 0; tcol < 16; tcol++) - // if((float)smem_A[trow + tcol*32 + (w*32*16)] != 0.0) - // printf("A %i %i %i = %f\n", w, trow, tcol, (float) smem_B[trow + tcol*16]); - - //if(threadIdx.x == 0) - // for(int w = 0; w < 4; w++) - // for(int trow = 0; trow < 16; trow++) - // for(int tcol = 0; tcol < 8; tcol++) - // if((float)smem_B[trow + tcol*16 + (w*16*8)] != 0.0) - // printf("B %i %i %i = %f\n", w, trow, tcol, (float) smem_B[trow + tcol*16]); - - - //__syncthreads(); - - wmma::load_matrix_sync(a_frag, &(smem_A[warp_id*32*16]), 16); // 111 mu - wmma::load_matrix_sync(b_frag, &(smem_B[warp_id*16*8]), 16); // 35 mu + wmma::load_matrix_sync(a_frag, &(smem_A[0]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[0]), 16); // 35 mu wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); } // 129 mu - wmma::store_matrix_sync(&(smem_C[warp_id*32*8]), c_frag, 8, wmma::mem_row_major); + wmma::store_matrix_sync(&(smem_C[0]), c_frag, 8, wmma::mem_row_major); __syncthreads(); //if(threadIdx.x >= 16){ return; } //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); //if(threadIdx.x < 32) - if(warp_lane < 8 && warp_id > 0) - //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; - atomicAdd(&(smem_C[warp_lane]), smem_C[warp_lane + (warp_id*32*8)]); - __syncthreads(); + //if(warp_lane < 8 && warp_id > 0) + // //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; + // atomicAdd(&(smem_C[warp_lane]), smem_C[warp_lane + (warp_id*32*8)]); + //__syncthreads(); //local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); //if(threadIdx.x == 0) @@ -3487,12 +3445,14 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); diff --git 
a/csrc/ops.cu b/csrc/ops.cu index d83fc6e..5c4f9c0 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -692,8 +692,8 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); if(bits == 16) - gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) From c35ed09b668db43da967ddeff88c13d92a5cb02a Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 18:19:30 -0700 Subject: [PATCH 68/97] Double frag 440. --- csrc/kernels.cu | 27 ++++++++++++++++----------- tests/test_functional.py | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 5a6db7d..5d1982d 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3053,19 +3053,24 @@ template __global__ void gemm_device(int M, //// Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; int col_offset = blockIdx.x *8; - const int warp_id = threadIdx.x / 32; - const int warp_lane = threadIdx.x % 32; + const int half_warp_id = threadIdx.x / 16; + const int half_warp_lane = threadIdx.x % 16; T local_A[64/BITS]; T local_B[64/BITS]; T local_C[8]; - __shared__ T smem_A[WARPS*32*16]; - __shared__ T smem_B[WARPS*16*8]; + const int a_tile_offset = 32*16; + const int b_tile_offset = 16*8; + + __shared__ T smem_A[WARPS*32*16*2]; + __shared__ T smem_B[WARPS*16*8*2]; __shared__ T smem_C[WARPS*32*8]; wmma::fragment a_frag; wmma::fragment b_frag; + wmma::fragment a2_frag; + wmma::fragment b2_frag; wmma::fragment c_frag; wmma::fill_fragment(c_frag, 0.0f); @@ -3087,32 +3092,32 @@ template __global__ void gemm_device(int M, //int block_idx = 0; //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) - for(int base_idx = 0; base_idx < K; base_idx+=16) + for(int base_idx = 0; base_idx < K; base_idx+=32) { int idx = base_idx + threadIdx.x; - if(threadIdx.x < 16) - { if(idx >= K) { smem_A[threadIdx.x] = 0.0f; - smem_B[threadIdx.x] = 0.0f; + //smem_B[threadIdx.x] = 0.0f; } else { - smem_A[threadIdx.x] = A[idx]; + smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = A[idx]; for(int col = 0; col < 8; col++) - smem_B[threadIdx.x + (col*16)] = B[(col_offset+col)*ldb+idx]; + smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = B[(col_offset+col)*ldb+idx]; } - } __syncthreads(); wmma::load_matrix_sync(a_frag, &(smem_A[0]), 16); // 111 mu wmma::load_matrix_sync(b_frag, &(smem_B[0]), 16); // 35 mu + wmma::load_matrix_sync(a2_frag, &(smem_A[32*16]), 16); // 111 mu + wmma::load_matrix_sync(b2_frag, &(smem_B[16*8]), 16); // 35 mu wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + wmma::mma_sync(c_frag, a2_frag, b2_frag, c_frag); } // 129 mu diff --git a/tests/test_functional.py b/tests/test_functional.py index e2ecdcb..f31e9b4 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2373,7 +2373,7 @@ def test_cutlass3_gemm(dtype): #print(C1) #print(C2) - torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.005) + torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.05) 
#@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) From e01d4e033df8f94b28ae4e38608c621653673338 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 18:28:52 -0700 Subject: [PATCH 69/97] Fixed bank conflicts in non-vector load 422. --- csrc/kernels.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 5d1982d..dffd40c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3060,11 +3060,11 @@ template __global__ void gemm_device(int M, T local_B[64/BITS]; T local_C[8]; - const int a_tile_offset = 32*16; - const int b_tile_offset = 16*8; + const int a_tile_offset = 32*16 + 16; + const int b_tile_offset = 16*8 + 16; - __shared__ T smem_A[WARPS*32*16*2]; - __shared__ T smem_B[WARPS*16*8*2]; + __shared__ T smem_A[WARPS*32*16*2 + (16*1)]; + __shared__ T smem_B[WARPS*16*8*2 + (16*1)]; __shared__ T smem_C[WARPS*32*8]; wmma::fragment a_frag; @@ -3114,8 +3114,8 @@ template __global__ void gemm_device(int M, wmma::load_matrix_sync(a_frag, &(smem_A[0]), 16); // 111 mu wmma::load_matrix_sync(b_frag, &(smem_B[0]), 16); // 35 mu - wmma::load_matrix_sync(a2_frag, &(smem_A[32*16]), 16); // 111 mu - wmma::load_matrix_sync(b2_frag, &(smem_B[16*8]), 16); // 35 mu + wmma::load_matrix_sync(a2_frag, &(smem_A[a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b2_frag, &(smem_B[b_tile_offset]), 16); // 35 mu wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); wmma::mma_sync(c_frag, a2_frag, b2_frag, c_frag); } From 30d03e0254f9868f29392f318787667d5bdff891 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 18:55:12 -0700 Subject: [PATCH 70/97] 64 threads, high smem, 434. --- csrc/kernels.cu | 48 ++++++++++++++++++++++++------------------------ csrc/ops.cu | 3 ++- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index dffd40c..4002117 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3041,7 +3041,7 @@ template __device__ inline void vector_l } } -#define WARPS 1 +#define WARPS 2 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { @@ -3062,10 +3062,11 @@ template __global__ void gemm_device(int M, const int a_tile_offset = 32*16 + 16; const int b_tile_offset = 16*8 + 16; + const int c_tile_offset = 32*8 + 24; - __shared__ T smem_A[WARPS*32*16*2 + (16*1)]; - __shared__ T smem_B[WARPS*16*8*2 + (16*1)]; - __shared__ T smem_C[WARPS*32*8]; + __shared__ T smem_A[WARPS*32*16*2 + (16*(WARPS-1))]; + __shared__ T smem_B[WARPS*16*8*2 + (16*(WARPS-1))]; + __shared__ T smem_C[WARPS*32*8 + (24*(WARPS-1))]; wmma::fragment a_frag; wmma::fragment b_frag; @@ -3092,46 +3093,45 @@ template __global__ void gemm_device(int M, //int block_idx = 0; //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) - for(int base_idx = 0; base_idx < K; base_idx+=32) + for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) { int idx = base_idx + threadIdx.x; - if(idx >= K) - { - smem_A[threadIdx.x] = 0.0f; - //smem_B[threadIdx.x] = 0.0f; - } - else - { + if(idx >= K) + { + smem_A[threadIdx.x] = 0.0f; + //smem_B[threadIdx.x] = 0.0f; + } + else + { + smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = A[idx]; - smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = A[idx]; - - for(int col = 0; col < 8; col++) - smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = B[(col_offset+col)*ldb+idx]; - } + 
for(int col = 0; col < 8; col++) + smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = B[(col_offset+col)*ldb+idx]; + } __syncthreads(); wmma::load_matrix_sync(a_frag, &(smem_A[0]), 16); // 111 mu wmma::load_matrix_sync(b_frag, &(smem_B[0]), 16); // 35 mu - wmma::load_matrix_sync(a2_frag, &(smem_A[a_tile_offset]), 16); // 111 mu - wmma::load_matrix_sync(b2_frag, &(smem_B[b_tile_offset]), 16); // 35 mu + wmma::load_matrix_sync(a2_frag, &(smem_A[half_warp_id*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b2_frag, &(smem_B[half_warp_id*b_tile_offset]), 16); // 35 mu wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); wmma::mma_sync(c_frag, a2_frag, b2_frag, c_frag); } // 129 mu - wmma::store_matrix_sync(&(smem_C[0]), c_frag, 8, wmma::mem_row_major); + wmma::store_matrix_sync(&(smem_C[half_warp_id*c_tile_offset]), c_frag, 8, wmma::mem_row_major); __syncthreads(); //if(threadIdx.x >= 16){ return; } //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); //if(threadIdx.x < 32) - //if(warp_lane < 8 && warp_id > 0) - // //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; - // atomicAdd(&(smem_C[warp_lane]), smem_C[warp_lane + (warp_id*32*8)]); - //__syncthreads(); + if(half_warp_lane < 8 && half_warp_id > 0) + //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; + atomicAdd(&(smem_C[half_warp_lane]), smem_C[half_warp_lane + (half_warp_id*c_tile_offset)]); + __syncthreads(); //local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); //if(threadIdx.x == 0) diff --git a/csrc/ops.cu b/csrc/ops.cu index 5c4f9c0..57d5cca 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -693,7 +693,8 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); if(bits == 16) //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) From cabcd9b9d5c986b5c3c58318f9c1185ea8d8eff5 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sun, 30 Apr 2023 19:12:42 -0700 Subject: [PATCH 71/97] Halved shared memory 466. 
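The idea, as read from the hunk below: instead of double-buffering both half-warp tiles, each thread first pulls its A/B values into registers, and the two half-warps of a warp then commit their slices to a single shared tile on alternating inner passes (pass 0 for even half-warp ids, pass 1 for odd), so only half the staging memory is live at a time. A minimal sketch of the parity-gated commit; stage_slice and its parameters are illustrative names, not part of the diff:

    #include <cuda_fp16.h>

    // Sketch only: two inner passes share one shared-memory tile per warp.
    // On pass 0 the even half-warps commit their register-held value, on
    // pass 1 the odd half-warps do, halving the live staging footprint.
    __device__ void stage_slice(half *smem_A, half value, int pass,
                                int warp_id, int half_warp_id,
                                int half_warp_lane, int a_tile_offset)
    {
        if((half_warp_id % 2) == pass)                     // parity gate
            smem_A[half_warp_lane + (warp_id*a_tile_offset)] = value;
        __syncthreads();                                   // tile complete
    }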
--- csrc/kernels.cu | 74 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 4002117..301221c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3053,25 +3053,23 @@ template __global__ void gemm_device(int M, //// Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; int col_offset = blockIdx.x *8; + const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; const int half_warp_lane = threadIdx.x % 16; - T local_A[64/BITS]; - T local_B[64/BITS]; - T local_C[8]; + T local_A[1]; + T local_B[8]; const int a_tile_offset = 32*16 + 16; const int b_tile_offset = 16*8 + 16; const int c_tile_offset = 32*8 + 24; - __shared__ T smem_A[WARPS*32*16*2 + (16*(WARPS-1))]; - __shared__ T smem_B[WARPS*16*8*2 + (16*(WARPS-1))]; + __shared__ T smem_A[WARPS*32*16 + (16*(WARPS-1))]; + __shared__ T smem_B[WARPS*16*8 + (16*(WARPS-1))]; __shared__ T smem_C[WARPS*32*8 + (24*(WARPS-1))]; wmma::fragment a_frag; wmma::fragment b_frag; - wmma::fragment a2_frag; - wmma::fragment b2_frag; wmma::fragment c_frag; wmma::fill_fragment(c_frag, 0.0f); @@ -3087,9 +3085,9 @@ template __global__ void gemm_device(int M, smem_C[i] = T(0); __syncthreads(); - #pragma unroll 8 - for(int k = 0; k < 8; k++) - local_C[k] = T(0); + //#pragma unroll 8 + //for(int k = 0; k < 8; k++) + //local_C[k] = T(0); //int block_idx = 0; //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) @@ -3097,27 +3095,45 @@ template __global__ void gemm_device(int M, { int idx = base_idx + threadIdx.x; - if(idx >= K) + for(int k = 0; k < 2; k++) { - smem_A[threadIdx.x] = 0.0f; - //smem_B[threadIdx.x] = 0.0f; + if(k == 0) + { + if(idx < K) + { + local_A[0] = A[idx]; + + #pragma unroll 8 + for(int col = 0; col < 8; col++) + local_B[col] = B[(col_offset+col)*ldb+idx]; + } + + } + + if(idx >= K) + { + smem_A[threadIdx.x] = 0.0f; + //smem_B[threadIdx.x] = 0.0f; + } + else + { + if((k == 0 && half_warp_id % 2 == 0) || + (k == 1 && half_warp_id % 2 == 1)) + { + smem_A[half_warp_lane + (warp_id*a_tile_offset)] = local_A[0]; + + #pragma unroll 8 + for(int col = 0; col < 8; col++) + smem_B[half_warp_lane + (warp_id*b_tile_offset) + (col*16)] = local_B[col]; + } + } + + __syncthreads(); + + wmma::load_matrix_sync(a_frag, &(smem_A[warp_id*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[warp_id*b_tile_offset]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); } - else - { - smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = A[idx]; - - for(int col = 0; col < 8; col++) - smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = B[(col_offset+col)*ldb+idx]; - } - - __syncthreads(); - - wmma::load_matrix_sync(a_frag, &(smem_A[0]), 16); // 111 mu - wmma::load_matrix_sync(b_frag, &(smem_B[0]), 16); // 35 mu - wmma::load_matrix_sync(a2_frag, &(smem_A[half_warp_id*a_tile_offset]), 16); // 111 mu - wmma::load_matrix_sync(b2_frag, &(smem_B[half_warp_id*b_tile_offset]), 16); // 35 mu - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); - wmma::mma_sync(c_frag, a2_frag, b2_frag, c_frag); } // 129 mu From 7cc8ff4727e9e1094937b59aef96777c4818ae8a Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 1 May 2023 08:21:12 -0700 Subject: [PATCH 72/97] Warp specialization 362. 
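The hunk below splits the thread block into producer warps (0 .. WARPS-2), which only stream A and B slices from global memory into one of two shared-memory buffers, and a single consumer warp (WARPS-1), which runs the tensor-core MACs on the other buffer; ticktock flips the two buffers every iteration and a prefetch fills the first one before the loop. A sketch of the consumer side, assuming the m32n8k16 shapes and the row-major A / col-major B layouts this kernel appears to use (the angle-bracketed template arguments were stripped from this log); consume_tiles is an illustrative name:

    #include <mma.h>
    #include <cuda_fp16.h>
    using namespace nvcuda;

    // Sketch: the compute warp drains the batch of K-slices that the
    // producer warps staged into buffer `ticktock` on the previous round.
    __device__ void consume_tiles(const half *smem_A, const half *smem_B,
                                  int ticktock, int batch_size_warps,
                                  int a_tile_offset, int b_tile_offset,
                                  wmma::fragment<wmma::accumulator, 32, 8, 16, half> &c_frag)
    {
        wmma::fragment<wmma::matrix_a, 32, 8, 16, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 32, 8, 16, half, wmma::col_major> b_frag;
        for(int k = 0; k < batch_size_warps; k++)
        {
            wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16);
            wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16);
            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        }
    }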
--- csrc/kernels.cu | 102 +++++++++++++++++++++------------------ csrc/ops.cu | 5 +- tests/test_functional.py | 6 +-- 3 files changed, 61 insertions(+), 52 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 301221c..2c0737d 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3041,7 +3041,7 @@ template __device__ inline void vector_l } } -#define WARPS 2 +#define WARPS 4 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { @@ -3056,17 +3056,18 @@ template __global__ void gemm_device(int M, const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; const int half_warp_lane = threadIdx.x % 16; + const int batch_size_warps = (WARPS-1)*2; T local_A[1]; T local_B[8]; - const int a_tile_offset = 32*16 + 16; - const int b_tile_offset = 16*8 + 16; + const int a_tile_offset = (32*16 + 16); + const int b_tile_offset = (16*8 + 16); const int c_tile_offset = 32*8 + 24; - __shared__ T smem_A[WARPS*32*16 + (16*(WARPS-1))]; - __shared__ T smem_B[WARPS*16*8 + (16*(WARPS-1))]; - __shared__ T smem_C[WARPS*32*8 + (24*(WARPS-1))]; + __shared__ T smem_A[2*batch_size_warps*32*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_B[2*batch_size_warps*16*8 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_C[32*8]; wmma::fragment a_frag; wmma::fragment b_frag; @@ -3091,63 +3092,68 @@ template __global__ void gemm_device(int M, //int block_idx = 0; //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) - for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) + int ticktock = 0; + int idx = 0 + threadIdx.x; + // prefetch + if(idx < K && warp_id < (WARPS-1)) { - int idx = base_idx + threadIdx.x; + local_A[0] = A[idx]; - for(int k = 0; k < 2; k++) + #pragma unroll 8 + for(int col = 0; col < 8; col++) + local_B[col] = B[(col_offset+col)*ldb+idx]; + + smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = local_A[0]; + + #pragma unroll 8 + for(int col = 0; col < 8; col++) + smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = local_B[col]; + } + ticktock = ticktock == 0 ? 1 : 0; + + for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x-32) + { + idx = base_idx + threadIdx.x; + + __syncthreads(); + if(idx < K && warp_id < (WARPS-1)) { - if(k == 0) - { - if(idx < K) - { - local_A[0] = A[idx]; + local_A[0] = A[idx]; - #pragma unroll 8 - for(int col = 0; col < 8; col++) - local_B[col] = B[(col_offset+col)*ldb+idx]; - } + #pragma unroll 8 + for(int col = 0; col < 8; col++) + local_B[col] = B[(col_offset+col)*ldb+idx]; - } - - if(idx >= K) - { - smem_A[threadIdx.x] = 0.0f; - //smem_B[threadIdx.x] = 0.0f; - } - else - { - if((k == 0 && half_warp_id % 2 == 0) || - (k == 1 && half_warp_id % 2 == 1)) - { - smem_A[half_warp_lane + (warp_id*a_tile_offset)] = local_A[0]; + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; - #pragma unroll 8 - for(int col = 0; col < 8; col++) - smem_B[half_warp_lane + (warp_id*b_tile_offset) + (col*16)] = local_B[col]; - } - } - - __syncthreads(); - - wmma::load_matrix_sync(a_frag, &(smem_A[warp_id*a_tile_offset]), 16); // 111 mu - wmma::load_matrix_sync(b_frag, &(smem_B[warp_id*b_tile_offset]), 16); // 35 mu - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + #pragma unroll 8 + for(int col = 0; col < 8; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; } + ticktock = ticktock == 0 ? 
1 : 0; + + if(warp_id == (WARPS-1)) + for(int k = 0; k < batch_size_warps; k++) + { + wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + } } // 129 mu - wmma::store_matrix_sync(&(smem_C[half_warp_id*c_tile_offset]), c_frag, 8, wmma::mem_row_major); + if(warp_id == (WARPS-1)) + wmma::store_matrix_sync(&(smem_C[0]), c_frag, 8, wmma::mem_row_major); __syncthreads(); //if(threadIdx.x >= 16){ return; } //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); //if(threadIdx.x < 32) - if(half_warp_lane < 8 && half_warp_id > 0) - //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; - atomicAdd(&(smem_C[half_warp_lane]), smem_C[half_warp_lane + (half_warp_id*c_tile_offset)]); - __syncthreads(); + //if(half_warp_lane < 8 && half_warp_id > 0) + // //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; + // atomicAdd(&(smem_C[half_warp_lane]), smem_C[half_warp_lane + (half_warp_id*c_tile_offset)]); + //__syncthreads(); //local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); //if(threadIdx.x == 0) @@ -3463,6 +3469,7 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); @@ -3470,6 +3477,7 @@ template __global__ void gemm_device(int M, int N, int K, half * _ // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); diff --git a/csrc/ops.cu b/csrc/ops.cu index 57d5cca..c1c27b8 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -692,9 +692,10 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); if(bits == 16) - //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, 
out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 64, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) diff --git a/tests/test_functional.py b/tests/test_functional.py index f31e9b4..5f90f69 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2370,10 +2370,10 @@ def test_cutlass3_gemm(dtype): C1 = torch.matmul(A, B.t()) C2 = F.cutlass3_gemm(A, B.t()) - #print(C1) - #print(C2) + print(C1) + print(C2) - torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.05) + torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.06) #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) From 3d4a2eadd3c1481447b8e885018ed24341ea91a5 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 1 May 2023 16:23:45 -0700 Subject: [PATCH 73/97] 16x16 240. --- csrc/kernels.cu | 52 ++++++++++++++++++++++++------------------------- csrc/ops.cu | 2 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 2c0737d..4e3a4a3 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3052,37 +3052,37 @@ template __global__ void gemm_device(int M, //typedef cub::BlockReduce BlockReduce; //// Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; - int col_offset = blockIdx.x *8; + int col_offset = blockIdx.x *16; const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; const int half_warp_lane = threadIdx.x % 16; const int batch_size_warps = (WARPS-1)*2; T local_A[1]; - T local_B[8]; + T local_B[16]; - const int a_tile_offset = (32*16 + 16); - const int b_tile_offset = (16*8 + 16); - const int c_tile_offset = 32*8 + 24; + const int a_tile_offset = (16*16 + 16); + const int b_tile_offset = (16*16 + 16); + const int c_tile_offset = 16*16 + 24; - __shared__ T smem_A[2*batch_size_warps*32*16 + (2*16*(batch_size_warps-1))]; - __shared__ T smem_B[2*batch_size_warps*16*8 + (2*16*(batch_size_warps-1))]; - __shared__ T smem_C[32*8]; + __shared__ T smem_A[2*batch_size_warps*16*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_B[2*batch_size_warps*16*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_C[16*16]; - wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment c_frag; + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment c_frag; wmma::fill_fragment(c_frag, 0.0f); - for(int i = threadIdx.x; i < 32*16*WARPS; i+=blockDim.x) - smem_A[i] = T(0); + //for(int i = threadIdx.x; i < 16*16*WARPS; i+=blockDim.x) + // smem_A[i] = T(0); - for(int i = threadIdx.x; i < 32*8*WARPS; i+=blockDim.x) - smem_B[i] = T(0); + //for(int i = threadIdx.x; i < 16*16*WARPS; i+=blockDim.x) + // smem_B[i] = T(0); - for(int i = threadIdx.x; i < 32*8*WARPS; i+=blockDim.x) + for(int i = threadIdx.x; i < 16*16; i+=blockDim.x) smem_C[i] = T(0); __syncthreads(); @@ -3099,14 +3099,14 @@ template __global__ void gemm_device(int M, { local_A[0] = A[idx]; - #pragma unroll 8 - for(int col = 0; col < 8; col++) + #pragma unroll 16 + for(int col = 0; col < 16; col++) local_B[col] = B[(col_offset+col)*ldb+idx]; smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = local_A[0]; - #pragma unroll 8 - for(int col = 0; col < 8; 
col++) + #pragma unroll 16 + for(int col = 0; col < 16; col++) smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = local_B[col]; } ticktock = ticktock == 0 ? 1 : 0; @@ -3120,14 +3120,14 @@ template __global__ void gemm_device(int M, { local_A[0] = A[idx]; - #pragma unroll 8 - for(int col = 0; col < 8; col++) + #pragma unroll 16 + for(int col = 0; col < 16; col++) local_B[col] = B[(col_offset+col)*ldb+idx]; smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; - #pragma unroll 8 - for(int col = 0; col < 8; col++) + #pragma unroll 16 + for(int col = 0; col < 16; col++) smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; } ticktock = ticktock == 0 ? 1 : 0; @@ -3143,7 +3143,7 @@ template __global__ void gemm_device(int M, // 129 mu if(warp_id == (WARPS-1)) - wmma::store_matrix_sync(&(smem_C[0]), c_frag, 8, wmma::mem_row_major); + wmma::store_matrix_sync(&(smem_C[0]), c_frag, 16, wmma::mem_row_major); __syncthreads(); //if(threadIdx.x >= 16){ return; } @@ -3185,7 +3185,7 @@ template __global__ void gemm_device(int M, //if(threadIdx.x < 8 && col_offset + threadIdx.x < M) //out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; - if(threadIdx.x < 8 && col_offset + threadIdx.x < M) + if(threadIdx.x < 16 && col_offset + threadIdx.x < M) out[col_offset + threadIdx.x] = smem_C[threadIdx.x]; } diff --git a/csrc/ops.cu b/csrc/ops.cu index c1c27b8..d0e903f 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -678,7 +678,7 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits) { - int num_blocks = (m+7)/8; + int num_blocks = (m+15)/16; cout << num_blocks << endl; cout << lda << endl; From 7bfa09d0fcaa524863bcc8ea71436f99423bbd3f Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 1 May 2023 16:38:09 -0700 Subject: [PATCH 74/97] 8x32 240 6 warps. 
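fp16 WMMA supports exactly three tile geometries — m16n16k16, m32n8k16, and m8n32k16 — and this patch moves from the 16x16 shape of the previous commit to m8n32k16, so one block now covers 32 output values. A compile-time sketch of the new shape (the fragment layouts are an assumption, since the template arguments were stripped from this log):

    #include <mma.h>
    #include <cuda_fp16.h>
    using namespace nvcuda;

    // m8n32k16: 8 rows of A, 32 columns of B, 16-deep K slice per mma_sync.
    using FragA = wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major>;
    using FragB = wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major>;
    using FragC = wmma::fragment<wmma::accumulator, 8, 32, 16, half>;

    __global__ void shape_check()
    {
        FragA a; FragB b; FragC c;       // operands left unfilled on purpose;
        wmma::fill_fragment(c, 0.0f);    // this only verifies the shape triple
        wmma::mma_sync(c, a, b, c);      // compiles only for a legal geometry
    }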
--- csrc/kernels.cu | 50 ++++++++++++++++++++++++++----------------------- csrc/ops.cu | 6 ++++-- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 4e3a4a3..b03c6ca 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3041,7 +3041,7 @@ template __device__ inline void vector_l } } -#define WARPS 4 +#define WARPS 6 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { @@ -3052,26 +3052,26 @@ template __global__ void gemm_device(int M, //typedef cub::BlockReduce BlockReduce; //// Allocate shared memory for BlockReduce //__shared__ typename BlockReduce::TempStorage reduce; - int col_offset = blockIdx.x *16; + int col_offset = blockIdx.x *32; const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; const int half_warp_lane = threadIdx.x % 16; const int batch_size_warps = (WARPS-1)*2; T local_A[1]; - T local_B[16]; + T local_B[32]; - const int a_tile_offset = (16*16 + 16); - const int b_tile_offset = (16*16 + 16); - const int c_tile_offset = 16*16 + 24; + const int a_tile_offset = (8*16 + 16); + const int b_tile_offset = (16*32 + 16); + const int c_tile_offset = 8*32 + 24; - __shared__ T smem_A[2*batch_size_warps*16*16 + (2*16*(batch_size_warps-1))]; - __shared__ T smem_B[2*batch_size_warps*16*16 + (2*16*(batch_size_warps-1))]; - __shared__ T smem_C[16*16]; + __shared__ T smem_A[2*batch_size_warps*8*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_C[8*32]; - wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment c_frag; + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment c_frag; wmma::fill_fragment(c_frag, 0.0f); @@ -3082,7 +3082,7 @@ template __global__ void gemm_device(int M, //for(int i = threadIdx.x; i < 16*16*WARPS; i+=blockDim.x) // smem_B[i] = T(0); - for(int i = threadIdx.x; i < 16*16; i+=blockDim.x) + for(int i = threadIdx.x; i < 8*32; i+=blockDim.x) smem_C[i] = T(0); __syncthreads(); @@ -3099,14 +3099,14 @@ template __global__ void gemm_device(int M, { local_A[0] = A[idx]; - #pragma unroll 16 - for(int col = 0; col < 16; col++) + #pragma unroll 32 + for(int col = 0; col < 32; col++) local_B[col] = B[(col_offset+col)*ldb+idx]; smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = local_A[0]; - #pragma unroll 16 - for(int col = 0; col < 16; col++) + #pragma unroll 32 + for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = local_B[col]; } ticktock = ticktock == 0 ? 1 : 0; @@ -3120,14 +3120,14 @@ template __global__ void gemm_device(int M, { local_A[0] = A[idx]; - #pragma unroll 16 - for(int col = 0; col < 16; col++) + #pragma unroll 32 + for(int col = 0; col < 32; col++) local_B[col] = B[(col_offset+col)*ldb+idx]; smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; - #pragma unroll 16 - for(int col = 0; col < 16; col++) + #pragma unroll 32 + for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; } ticktock = ticktock == 0 ? 
1 : 0; @@ -3143,7 +3143,7 @@ template __global__ void gemm_device(int M, // 129 mu if(warp_id == (WARPS-1)) - wmma::store_matrix_sync(&(smem_C[0]), c_frag, 16, wmma::mem_row_major); + wmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major); __syncthreads(); //if(threadIdx.x >= 16){ return; } @@ -3185,7 +3185,7 @@ template __global__ void gemm_device(int M, //if(threadIdx.x < 8 && col_offset + threadIdx.x < M) //out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; - if(threadIdx.x < 16 && col_offset + threadIdx.x < M) + if(threadIdx.x < 32 && col_offset + threadIdx.x < M) out[col_offset + threadIdx.x] = smem_C[threadIdx.x]; } @@ -3470,18 +3470,22 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); diff --git a/csrc/ops.cu b/csrc/ops.cu index d0e903f..2ccb418 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -678,7 +678,7 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size) template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, 
int ldc, int bits) { - int num_blocks = (m+15)/16; + int num_blocks = (m+31)/32; cout << num_blocks << endl; cout << lda << endl; @@ -693,7 +693,9 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); if(bits == 16) //gemm_device<<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + gemm_device<<< num_blocks, 192, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + //gemm_device<<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 64, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); } From f9bfea8f2335a63fbb7b24175e1fa2951ee55bf1 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 07:24:12 -0700 Subject: [PATCH 75/97] Baseline for debugging. --- bitsandbytes/functional.py | 2 +- csrc/kernels.cu | 31 ++++++++++++++++++++++++++++--- csrc/ops.cu | 14 +++++++------- tests/test_functional.py | 34 ++++++++++++++++++++++++++++------ 4 files changed, 64 insertions(+), 17 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index f725c1c..b4cbd28 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1467,7 +1467,7 @@ def cutlass3_gemm( lda = Bshape[1] ldc = Bshape[0] ldb = (ldb+1)//2 - print(m, n, k, lda, ldb, ldc) + #print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) m = ct.c_int32(m) n = ct.c_int32(n) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index b03c6ca..477904c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3061,9 +3061,8 @@ template __global__ void gemm_device(int M, T local_A[1]; T local_B[32]; - const int a_tile_offset = (8*16 + 16); - const int b_tile_offset = (16*32 + 16); - const int c_tile_offset = 8*32 + 24; + const int a_tile_offset = (8*16); + const int b_tile_offset = (16*32); __shared__ T smem_A[2*batch_size_warps*8*16 + (2*16*(batch_size_warps-1))]; __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; @@ -3109,6 +3108,19 @@ template __global__ void gemm_device(int M, for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = local_B[col]; } + else if(warp_id < (WARPS-1)) + { + local_A[0] = T(0.0); + smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = T(0.0); + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = T(0.0f); + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = T(0.0f); + } ticktock = ticktock == 0 ? 1 : 0; for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x-32) @@ -3130,6 +3142,19 @@ template __global__ void gemm_device(int M, for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; } + else if(warp_id < (WARPS-1)) + { + local_A[0] = T(0.0); + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; + } ticktock = ticktock == 0 ? 
1 : 0; if(warp_id == (WARPS-1)) diff --git a/csrc/ops.cu b/csrc/ops.cu index 2ccb418..6bf1e89 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -680,14 +680,14 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out int num_blocks = (m+31)/32; - cout << num_blocks << endl; - cout << lda << endl; - cout << ldb << endl; - cout << ldc << endl; + //cout << num_blocks << endl; + //cout << lda << endl; + //cout << ldb << endl; + //cout << ldc << endl; - cout << m << endl; - cout << n << endl; - cout << k << endl; + //cout << m << endl; + //cout << n << endl; + //cout << k << endl; //if(bits == 32) //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); diff --git a/tests/test_functional.py b/tests/test_functional.py index 5f90f69..25fbb5b 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2355,25 +2355,47 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - for i in range(1): + for i in range(100): #A = torch.rand(2, 4092, dtype=dtype, device='cuda') #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') #A = torch.rand(1, 4096, dtype=dtype, device='cuda') #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') - A = torch.rand(1, 4096, dtype=dtype, device='cuda') - B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') + A = torch.randn(1, 128+32, dtype=dtype, device='cuda') + B = torch.randn(4096, 128+32, dtype=dtype, device='cuda')/math.sqrt(128) #print('') #print(A) #print(B.t()) + #A[:, :-3] = 0 + #B[:, :-3] = 0 C1 = torch.matmul(A, B.t()) C2 = F.cutlass3_gemm(A, B.t()) - print(C1) - print(C2) + err = C1-C2 - torch.testing.assert_close(C1, C2, atol=1e-05, rtol=0.06) + # tensor cores are non-deterministic + # so we need to analyze errors around the mean + # to test our implementation + err = torch.abs(err.mean()).item() + mag = torch.abs(C1).mean() + relerr = err/mag + + if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: + print('') + print(i, err, mag.item(), relerr.item()) + print(A.flatten()[-6:]) + print(B.flatten()[-6:]) + out = A.flatten()[-6:]*B.flatten()[-6:] + print(out) + print(out[:-1].sum()) + print('='*80) + print(C1.flatten()[-6:]) + print(C2.flatten()[-6:]) + #assert False, 'ERROR' + + c = int(C1.numel()*0.001) + assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c) #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) From 9192c9de648338dd9281368ed0bff20dc123490b Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 07:50:32 -0700 Subject: [PATCH 76/97] Tighter and scaled error analysis. --- csrc/kernels.cu | 15 ++++++- tests/test_functional.py | 85 +++++++++++++++++++++++----------------- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 477904c..2fa288f 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3123,6 +3123,7 @@ template __global__ void gemm_device(int M, } ticktock = ticktock == 0 ? 
1 : 0; + //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32) for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x-32) { idx = base_idx + threadIdx.x; @@ -3155,8 +3156,9 @@ template __global__ void gemm_device(int M, for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; } - ticktock = ticktock == 0 ? 1 : 0; + //ticktock = ticktock == 0 ? 1 : 0; + __syncthreads(); if(warp_id == (WARPS-1)) for(int k = 0; k < batch_size_warps; k++) { @@ -3166,11 +3168,22 @@ template __global__ void gemm_device(int M, } } + //__syncthreads(); + //if(warp_id == (WARPS-1)) + // for(int k = 0; k < batch_size_warps; k++) + // { + // wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu + // wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu + // wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + // } + __syncthreads(); + // 129 mu if(warp_id == (WARPS-1)) wmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major); __syncthreads(); + //if(threadIdx.x >= 16){ return; } //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); diff --git a/tests/test_functional.py b/tests/test_functional.py index 25fbb5b..0500984 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2355,47 +2355,62 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - for i in range(100): - #A = torch.rand(2, 4092, dtype=dtype, device='cuda') - #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') - #A = torch.rand(1, 4096, dtype=dtype, device='cuda') - #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') - A = torch.randn(1, 128+32, dtype=dtype, device='cuda') - B = torch.randn(4096, 128+32, dtype=dtype, device='cuda')/math.sqrt(128) + for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + errs = [] + relerrs = [] + max_err = 0 + max_relerr = 0 + for i in range(100): + #A = torch.rand(2, 4092, dtype=dtype, device='cuda') + #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + #A = torch.rand(1, 4096, dtype=dtype, device='cuda') + #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') + A = torch.randn(1, dim+0, dtype=dtype, device='cuda') + B = torch.randn(4*496, dim+0, dtype=dtype, device='cuda')/math.sqrt(dim) - #print('') - #print(A) - #print(B.t()) - #A[:, :-3] = 0 - #B[:, :-3] = 0 + #print('') + #print(A) + #print(B.t()) + #A[:, :-3] = 0 + #B[:, :-3] = 0 - C1 = torch.matmul(A, B.t()) - C2 = F.cutlass3_gemm(A, B.t()) - err = C1-C2 + C1 = torch.matmul(A, B.t()) + C2 = F.cutlass3_gemm(A, B.t()) - # tensor cores are non-deterministic - # so we need to analyze errors around the mean - # to test our implementation - err = torch.abs(err.mean()).item() - mag = torch.abs(C1).mean() - relerr = err/mag + # tensor cores are non-deterministic + # so we need to analyze errors around the mean + # to test our implementation + err = torch.abs(C1-C2) + mag = torch.abs(C1)+1e-8 + relerr = err/mag + max_err = max(err.max(), max_err) + max_relerr = max(relerr.max(), max_relerr) + err = err.mean().item() + relerr = relerr.mean().item() - if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: - print('') - print(i, err, mag.item(), relerr.item()) - print(A.flatten()[-6:]) - print(B.flatten()[-6:]) - out = A.flatten()[-6:]*B.flatten()[-6:] - 
print(out) - print(out[:-1].sum()) - print('='*80) - print(C1.flatten()[-6:]) - print(C2.flatten()[-6:]) - #assert False, 'ERROR' + errs.append(err) + relerrs.append(relerr) - c = int(C1.numel()*0.001) - assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c) + #if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: + # print('') + # print(i, err, mag.item(), relerr.item()) + # print(A.flatten()[-6:]) + # print(B.flatten()[-6:]) + # out = A.flatten()[-6:]*B.flatten()[-6:] + # print(out) + # print(out[:-1].sum()) + # print('='*80) + # print(C1.flatten()[-6:]) + # print(C2.flatten()[-6:]) + # #assert False, 'ERROR' + + c = int(C1.numel()*0.00125*(dim/256))+1 + assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c) + print('') + print(dim, sum(errs)/len(errs)/math.sqrt(dim)) + print(dim, sum(relerrs)/len(relerrs)/math.sqrt(dim)) + print(dim, (max_err.item(), max_relerr.item())) #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) From 9aa232cc3918ef722791c2a6775aaa807ad72109 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 07:53:29 -0700 Subject: [PATCH 77/97] Initial. --- tests/test_functional.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_functional.py b/tests/test_functional.py index 0500984..808c1ce 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2406,6 +2406,7 @@ def test_cutlass3_gemm(dtype): # #assert False, 'ERROR' c = int(C1.numel()*0.00125*(dim/256))+1 + assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c) print('') print(dim, sum(errs)/len(errs)/math.sqrt(dim)) From 394749db718526aa7810333f0f90caa2b6af8554 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 08:58:59 -0700 Subject: [PATCH 78/97] Correct implementation 240. 
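What the hunk below fixes, as I read it: the earlier drafts could consume the same buffer they had just staged. This version restores the intended three-phase software pipeline — a prologue prefetches slice 0, the steady-state loop stages slice i+1 while the consumer warp multiplies slice i, and an epilogue drains the last staged buffer. A schematic of the control flow with the loads and MACs elided; stage and consume are stand-in names:

    // Schematic only: `stage` stands for the producer warps' shared-memory
    // stores, `consume` for the last warp's wmma calls.
    __device__ void stage(int base_idx, int buf)  { /* smem stores elided */ }
    __device__ void consume(int buf)              { /* wmma calls elided  */ }

    __device__ void pipelined_gemm(int K, int slice, bool producer, bool consumer)
    {
        int ticktock = 0;
        if(producer) stage(0, ticktock);          // prologue: prefetch slice 0
        ticktock ^= 1;

        for(int base = slice; base < K; base += slice)
        {
            __syncthreads();                      // previous stage is visible
            if(producer) stage(base, ticktock);   // fill the free buffer
            ticktock ^= 1;
            if(consumer) consume(ticktock);       // MACs on the full buffer
        }

        __syncthreads();
        ticktock ^= 1;
        if(consumer) consume(ticktock);           // epilogue: drain last slice
    }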
--- csrc/kernels.cu | 48 +++++++++++++++------------------------- tests/test_functional.py | 20 +++++++++++------ 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 2fa288f..8ce881c 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3061,8 +3061,8 @@ template __global__ void gemm_device(int M, T local_A[1]; T local_B[32]; - const int a_tile_offset = (8*16); - const int b_tile_offset = (16*32); + const int a_tile_offset = (8*16 + 16); + const int b_tile_offset = (16*32 + 16); __shared__ T smem_A[2*batch_size_warps*8*16 + (2*16*(batch_size_warps-1))]; __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; @@ -3074,23 +3074,10 @@ template __global__ void gemm_device(int M, wmma::fill_fragment(c_frag, 0.0f); - - //for(int i = threadIdx.x; i < 16*16*WARPS; i+=blockDim.x) - // smem_A[i] = T(0); - - //for(int i = threadIdx.x; i < 16*16*WARPS; i+=blockDim.x) - // smem_B[i] = T(0); - for(int i = threadIdx.x; i < 8*32; i+=blockDim.x) smem_C[i] = T(0); __syncthreads(); - //#pragma unroll 8 - //for(int k = 0; k < 8; k++) - //local_C[k] = T(0); - - //int block_idx = 0; - //for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x) int ticktock = 0; int idx = 0 + threadIdx.x; // prefetch @@ -3102,29 +3089,29 @@ template __global__ void gemm_device(int M, for(int col = 0; col < 32; col++) local_B[col] = B[(col_offset+col)*ldb+idx]; - smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = local_A[0]; + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; #pragma unroll 32 for(int col = 0; col < 32; col++) - smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = local_B[col]; + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; } else if(warp_id < (WARPS-1)) { local_A[0] = T(0.0); - smem_A[half_warp_lane + (half_warp_id*a_tile_offset)] = T(0.0); + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = 0.0f; #pragma unroll 32 for(int col = 0; col < 32; col++) - local_B[col] = T(0.0f); + local_B[col] = 0.0f; #pragma unroll 32 for(int col = 0; col < 32; col++) - smem_B[half_warp_lane + (half_warp_id*b_tile_offset) + (col*16)] = T(0.0f); + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; } ticktock = ticktock == 0 ? 1 : 0; //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32) - for(int base_idx = 0; base_idx < K; base_idx+=blockDim.x-32) + for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32) { idx = base_idx + threadIdx.x; @@ -3156,7 +3143,7 @@ template __global__ void gemm_device(int M, for(int col = 0; col < 32; col++) smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; } - //ticktock = ticktock == 0 ? 1 : 0; + ticktock = ticktock == 0 ? 1 : 0; __syncthreads(); if(warp_id == (WARPS-1)) @@ -3168,14 +3155,15 @@ template __global__ void gemm_device(int M, } } - //__syncthreads(); - //if(warp_id == (WARPS-1)) - // for(int k = 0; k < batch_size_warps; k++) - // { - // wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu - // wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu - // wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); - // } + __syncthreads(); + ticktock = ticktock == 0 ? 
1 : 0; if(warp_id == (WARPS-1)) for(int k = 0; k < batch_size_warps; k++) { wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); } __syncthreads(); // 129 mu diff --git a/tests/test_functional.py b/tests/test_functional.py index 808c1ce..4c86d83 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -18,12 +18,15 @@ torch.set_printoptions( k = 20 -def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0): +def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0, throw=True): idx = torch.isclose(a, b, rtol, atol) sumval = (idx == 0).sum().item() if sumval > count: - print(f"Too many values not close: assert {sumval} < {count}") - torch.testing.assert_allclose(a, b, rtol, atol) + if throw: + print(f"Too many values not close: assert {sumval} < {count}") + torch.testing.assert_allclose(a, b, rtol, atol) + + return sumval class FFN(torch.nn.Module): @@ -2355,7 +2358,9 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + #for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + #for dim in [4096, 5120, 6656, 8192]: + for dim in [4096]: errs = [] relerrs = [] max_err = 0 @@ -2366,7 +2371,7 @@ def test_cutlass3_gemm(dtype): #A = torch.rand(1, 4096, dtype=dtype, device='cuda') #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') A = torch.randn(1, dim+0, dtype=dtype, device='cuda') - B = torch.randn(4*496, dim+0, dtype=dtype, device='cuda')/math.sqrt(dim) + B = torch.randn(4*dim, dim+0, dtype=dtype, device='cuda')/math.sqrt(dim) #print('') #print(A) @@ -2405,9 +2410,10 @@ def test_cutlass3_gemm(dtype): # print(C2.flatten()[-6:]) # #assert False, 'ERROR' - c = int(C1.numel()*0.00125*(dim/256))+1 + c = int(C1.numel()*0.0014*(dim/256))+1 - assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c) + c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False) + #print(c/math.sqrt(dim)) print('') print(dim, sum(errs)/len(errs)/math.sqrt(dim)) print(dim, sum(relerrs)/len(relerrs)/math.sqrt(dim)) From 4decb3cc6878a7d51e92dd5f48ec0fb25ec8ba19 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 09:38:14 -0700 Subject: [PATCH 79/97] Removed unnecessary sync. --- csrc/kernels.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 8ce881c..d09f78a 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3145,7 +3145,6 @@ template __global__ void gemm_device(int M, } ticktock = ticktock == 0 ? 1 : 0; - __syncthreads(); if(warp_id == (WARPS-1)) for(int k = 0; k < batch_size_warps; k++) { From 89cccd8196b885de777cc6f627bd05c96c700300 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 09:40:31 -0700 Subject: [PATCH 80/97] A tile multi-tiling. 
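My reading of the two-line change below: since only the first output row of the 8x32 accumulator is ever written back, the staged A tiles no longer need to be disjoint. Each K-slice contributes only 16 fresh A values, so slices are packed at a 32-element stride and the 8x16 fragment load simply overlaps into neighbouring slices; the stale rows it picks up only pollute accumulator rows that are discarded. Illustrative offset arithmetic, not from the diff:

    // Packed A staging: slice s starts at s*(16 + 16) half values.  The
    // 8x16 fragment load (ldm = 16) of one slice overlaps the following
    // slices; harmless here, because only accumulator row 0 is kept.
    __device__ inline int a_slice_offset(int slice) { return slice * (16 + 16); }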
--- csrc/kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index d09f78a..a528d16 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3061,10 +3061,10 @@ template __global__ void gemm_device(int M, T local_A[1]; T local_B[32]; - const int a_tile_offset = (8*16 + 16); + const int a_tile_offset = (16 + 16); const int b_tile_offset = (16*32 + 16); - __shared__ T smem_A[2*batch_size_warps*8*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_A[8*16 + (4*16*(batch_size_warps-1))]; __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; __shared__ T smem_C[8*32]; From 77f15fdce9f11324f6616e4fccc03d16f61347e6 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 11:38:11 -0700 Subject: [PATCH 81/97] Shared memory efficient 240. --- csrc/kernels.cu | 80 ++++++++++------------------------------ csrc/ops.cu | 2 +- tests/test_functional.py | 4 +- 3 files changed, 22 insertions(+), 64 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index a528d16..8b5544a 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3041,7 +3041,7 @@ template __device__ inline void vector_l } } -#define WARPS 6 +#define WARPS 5 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { @@ -3061,23 +3061,18 @@ template __global__ void gemm_device(int M, T local_A[1]; T local_B[32]; - const int a_tile_offset = (16 + 16); + const int a_tile_offset = 16; const int b_tile_offset = (16*32 + 16); - __shared__ T smem_A[8*16 + (4*16*(batch_size_warps-1))]; + __shared__ T smem_A[8*16 + (2*16*(batch_size_warps-1))]; __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; - __shared__ T smem_C[8*32]; + //__shared__ T smem_C[8*32]; wmma::fragment a_frag; wmma::fragment b_frag; wmma::fragment c_frag; - wmma::fill_fragment(c_frag, 0.0f); - for(int i = threadIdx.x; i < 8*32; i+=blockDim.x) - smem_C[i] = T(0); - __syncthreads(); - int ticktock = 0; int idx = 0 + threadIdx.x; // prefetch @@ -3155,63 +3150,24 @@ template __global__ void gemm_device(int M, } __syncthreads(); + if(warp_id != (WARPS-1)){ return; } + // only warp_id == (WARPS-1) from here + int warp_lane = threadIdx.x % 32; + ticktock = ticktock == 0 ? 
1 : 0; - if(warp_id == (WARPS-1)) - for(int k = 0; k < batch_size_warps; k++) - { - wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu - wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); - } - __syncthreads(); + for(int k = 0; k < batch_size_warps; k++) + { + wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + } // 129 mu if(warp_id == (WARPS-1)) - wmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major); - __syncthreads(); + wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major); - - //if(threadIdx.x >= 16){ return; } - //printf("%i %f\n", threadIdx.x, (float)smem_C[threadIdx.x]); - - //if(threadIdx.x < 32) - //if(half_warp_lane < 8 && half_warp_id > 0) - // //local_C[warp_lane] = smem_C[warp_lane + (warp_id*32*8)]; - // atomicAdd(&(smem_C[half_warp_lane]), smem_C[half_warp_lane + (half_warp_id*c_tile_offset)]); - //__syncthreads(); - - //local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum()); - //if(threadIdx.x == 0) - // for(int row = 0; row < 32; row++) - // { - // printf("row %i ", row); - // for(int id = 0; id < 4; id++) - // { - // printf(" id %i: ", id); - // for(int k = 0; k < 8; k++) - // printf("%f ", (float)smem_C[k + (row*8) + (id*32*8)]); - // printf("\n"); - // } - // } - - //__syncthreads(); - - //if((float)local_C[0] !=0.0f) - // printf("%i %i %f\n", warp_lane, warp_id, (float)local_C[0]); - //local_C[0] = WarpReduce(temp_storage).Sum(local_C[0]); - - //__syncwarp(); - - ////for(int i = threadIdx.x; i < 32*8; i+=blockDim.x) - ////{ - // if((float)local_C[0] !=0.0f) - // printf("%i %f\n", 0, (float)local_C[0]); - //} - - //if(threadIdx.x < 8 && col_offset + threadIdx.x < M) - //out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; - if(threadIdx.x < 32 && col_offset + threadIdx.x < M) - out[col_offset + threadIdx.x] = smem_C[threadIdx.x]; + if(col_offset + warp_lane < M) + out[col_offset + warp_lane] = smem_A[warp_lane]; } template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) @@ -3496,6 +3452,7 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); @@ -3506,6 
+3463,7 @@ template __global__ void gemm_device(int M, int N, int K, half * _ //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); +template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); diff --git a/csrc/ops.cu b/csrc/ops.cu index 6bf1e89..16d82f9 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -693,7 +693,7 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); if(bits == 16) //gemm_device<<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); - gemm_device<<< num_blocks, 192, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); + gemm_device<<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); //gemm_device<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc); diff --git a/tests/test_functional.py b/tests/test_functional.py index 4c86d83..62dd1cb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2358,9 +2358,9 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - #for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: #for dim in [4096, 5120, 6656, 8192]: - for dim in [4096]: + #for dim in [4096]: errs = [] relerrs = [] max_err = 0 From 869b7e83b506cdb7e342e4939580104b486ed9ba Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 12:10:32 -0700 Subject: [PATCH 82/97] Warp multi-specialization 240. 
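Producer threads now issue one wide global read that serves two staging rounds: local_A and local_B hold both this round's values and the next round's, strided by blockDim.x-32, and the loaded_values countdown decides whether to load fresh data or shift the prefetched half into place. A minimal sketch of the pattern for the A operand; the names and the stride parameter are illustrative:

    #include <cuda_fp16.h>

    // One global read feeds two staging rounds; `loaded` mirrors the
    // kernel's `loaded_values` countdown.
    __device__ void load_or_reuse(const half *A, int idx, int stride,
                                  half (&local_A)[2], int &loaded)
    {
        if(loaded == 0)
        {
            local_A[0] = A[idx];              // value for this round
            local_A[1] = A[idx + stride];     // prefetched for the next round
            loaded = 1;
        }
        else
        {
            local_A[0] = local_A[1];          // consume the prefetched value
            loaded--;
        }
    }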
--- csrc/kernels.cu | 62 +++++++++++++++++++++++++++++++++------- tests/test_functional.py | 8 +++--- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 8b5544a..65ed19e 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3058,8 +3058,8 @@ template __global__ void gemm_device(int M, const int half_warp_lane = threadIdx.x % 16; const int batch_size_warps = (WARPS-1)*2; - T local_A[1]; - T local_B[32]; + T local_A[2]; + T local_B[64]; const int a_tile_offset = 16; const int b_tile_offset = (16*32 + 16); @@ -3075,14 +3075,32 @@ template __global__ void gemm_device(int M, int ticktock = 0; int idx = 0 + threadIdx.x; + int loaded_values = 0; // prefetch if(idx < K && warp_id < (WARPS-1)) { - local_A[0] = A[idx]; + if(loaded_values == 0) + { + local_A[0] = A[idx]; + local_A[1] = A[idx+blockDim.x-32]; - #pragma unroll 32 - for(int col = 0; col < 32; col++) - local_B[col] = B[(col_offset+col)*ldb+idx]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + { + local_B[col] = B[(col_offset+col)*ldb+idx]; + local_B[col+32] = B[(col_offset+col)*ldb+idx+blockDim.x-32]; + } + loaded_values = 1; + } + else + { + local_A[0] = local_A[1]; + loaded_values--; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+32]; + } smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; @@ -3113,11 +3131,35 @@ template __global__ void gemm_device(int M, __syncthreads(); if(idx < K && warp_id < (WARPS-1)) { - local_A[0] = A[idx]; + //local_A[0] = A[idx]; - #pragma unroll 32 - for(int col = 0; col < 32; col++) - local_B[col] = B[(col_offset+col)*ldb+idx]; + //#pragma unroll 32 + //for(int col = 0; col < 32; col++) + // local_B[col] = B[(col_offset+col)*ldb+idx]; + if(loaded_values == 0) + { + local_A[0] = A[idx]; + local_A[1] = A[idx+blockDim.x-32]; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + { + local_B[col] = B[(col_offset+col)*ldb+idx]; + local_B[col+32] = B[(col_offset+col)*ldb+idx+blockDim.x-32]; + } + loaded_values = 1; + } + else + { + local_A[0] = local_A[1]; + loaded_values--; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+32]; + + + } smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; diff --git a/tests/test_functional.py b/tests/test_functional.py index 62dd1cb..e9a67f5 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2376,8 +2376,8 @@ def test_cutlass3_gemm(dtype): #print('') #print(A) #print(B.t()) - #A[:, :-3] = 0 - #B[:, :-3] = 0 + #A[:, :-1] = 0 + #B[:, :-1] = 0 C1 = torch.matmul(A, B.t()) @@ -2399,7 +2399,7 @@ def test_cutlass3_gemm(dtype): #if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: # print('') - # print(i, err, mag.item(), relerr.item()) + # print(i, err, relerr) # print(A.flatten()[-6:]) # print(B.flatten()[-6:]) # out = A.flatten()[-6:]*B.flatten()[-6:] @@ -2412,7 +2412,7 @@ def test_cutlass3_gemm(dtype): c = int(C1.numel()*0.0014*(dim/256))+1 - c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False) + c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=True) #print(c/math.sqrt(dim)) print('') print(dim, sum(errs)/len(errs)/math.sqrt(dim)) From 264a948539d219e6b9a8fc8b9d92120d76b8878b Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 2 May 2023 16:15:38 -0700 Subject: [PATCH 83/97] 4-bit draft; 128 vector load 240. 
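Two things land here: the register buffer grows to four values per thread per wide read (strided by val_per_iter), and a first draft of kgemm_4bit_inference rebuilds the 4-bit path on the same producer/consumer skeleton, with B held as packed 4-bit codes plus a per-block absmax scale (0b0111 is the code for 0.0f, hence the 0b01110111 default fill byte). A sketch of the dequantization step this implies; the codebook table and the nibble order are assumptions, not taken from the diff:

    #include <cuda_fp16.h>

    // Blockwise 4-bit dequantization: each byte packs two codebook indices;
    // both go through a 16-entry table and are scaled by the block's absmax
    // before entering the fp16 tensor-core path.
    __device__ void dequant_pair(unsigned char packed, const float *codebook,
                                 float absmax, half &lo, half &hi)
    {
        lo = __float2half(codebook[packed & 0x0F] * absmax);   // low nibble
        hi = __float2half(codebook[packed >> 4] * absmax);     // high nibble
    }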
--- bitsandbytes/functional.py | 6 +- csrc/kernels.cu | 307 ++++++++++++++++++++++++------------- csrc/ops.cu | 18 +-- tests/test_functional.py | 95 ++++++++---- 4 files changed, 284 insertions(+), 142 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index b4cbd28..e5b1bf7 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1385,10 +1385,12 @@ def cutlass3_gemm( #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) if state is None: Bshape = B.shape + bout = Bshape[1] else: Bshape = state[1] + bout = Bshape[0] if out is None: - out = torch.zeros(size=(A.shape[0], Bshape[1]), dtype=A.dtype, device=A.device) + out = torch.zeros(size=(A.shape[0], bout), dtype=A.dtype, device=A.device) sA = A.shape sB = B.shape @@ -1464,7 +1466,7 @@ def cutlass3_gemm( if state is not None: m = Bshape[0] k = Bshape[1] - lda = Bshape[1] + lda = Bshape[0] ldc = Bshape[0] ldb = (ldb+1)//2 #print(m, n, k, lda, ldb, ldc) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 65ed19e..2373b91 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3044,22 +3044,15 @@ template __device__ inline void vector_l #define WARPS 5 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { - - typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for one warp - //__shared__ typename WarpReduce::TempStorage temp_storage; - - //typedef cub::BlockReduce BlockReduce; - //// Allocate shared memory for BlockReduce - //__shared__ typename BlockReduce::TempStorage reduce; int col_offset = blockIdx.x *32; const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; const int half_warp_lane = threadIdx.x % 16; const int batch_size_warps = (WARPS-1)*2; + const int val_per_iter = blockDim.x-32; - T local_A[2]; - T local_B[64]; + T local_A[4]; + T local_B[128]; const int a_tile_offset = 16; const int b_tile_offset = (16*32 + 16); @@ -3082,24 +3075,45 @@ template __global__ void gemm_device(int M, if(loaded_values == 0) { local_A[0] = A[idx]; - local_A[1] = A[idx+blockDim.x-32]; + local_A[1] = A[idx+(1*val_per_iter)]; + local_A[2] = A[idx+(2*val_per_iter)]; + local_A[3] = A[idx+(3*val_per_iter)]; #pragma unroll 32 for(int col = 0; col < 32; col++) { local_B[col] = B[(col_offset+col)*ldb+idx]; - local_B[col+32] = B[(col_offset+col)*ldb+idx+blockDim.x-32]; + local_B[col+32] = B[(col_offset+col)*ldb+idx+(1*val_per_iter)]; + local_B[col+64] = B[(col_offset+col)*ldb+idx+(2*val_per_iter)]; + local_B[col+96] = B[(col_offset+col)*ldb+idx+(3*val_per_iter)]; } - loaded_values = 1; + loaded_values = 3; } else { - local_A[0] = local_A[1]; - loaded_values--; - #pragma unroll 32 - for(int col = 0; col < 32; col++) - local_B[col] = local_B[col+32]; + if(loaded_values == 3) + { + local_A[0] = local_A[1]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(32)]; + } + else if(loaded_values == 2) + { + local_A[0] = local_A[2]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(64)]; + } + else + { + local_A[0] = local_A[3]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(96)]; + } + loaded_values--; } smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; @@ -3139,26 +3153,46 @@ template __global__ void gemm_device(int M, if(loaded_values == 0) { local_A[0] = A[idx]; - local_A[1] = A[idx+blockDim.x-32]; + local_A[1] = 
A[idx+(1*val_per_iter)]; + local_A[2] = A[idx+(2*val_per_iter)]; + local_A[3] = A[idx+(3*val_per_iter)]; #pragma unroll 32 for(int col = 0; col < 32; col++) { local_B[col] = B[(col_offset+col)*ldb+idx]; - local_B[col+32] = B[(col_offset+col)*ldb+idx+blockDim.x-32]; + local_B[col+32] = B[(col_offset+col)*ldb+idx+(1*val_per_iter)]; + local_B[col+64] = B[(col_offset+col)*ldb+idx+(2*val_per_iter)]; + local_B[col+96] = B[(col_offset+col)*ldb+idx+(3*val_per_iter)]; } - loaded_values = 1; + loaded_values = 3; + } else { - local_A[0] = local_A[1]; + + if(loaded_values == 3) + { + local_A[0] = local_A[1]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(32)]; + } + else if(loaded_values == 2) + { + local_A[0] = local_A[2]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(64)]; + } + else + { + local_A[0] = local_A[3]; + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = local_B[col+(96)]; + } loaded_values--; - - #pragma unroll 32 - for(int col = 0; col < 32; col++) - local_B[col] = local_B[col+32]; - - } smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; @@ -3215,104 +3249,166 @@ template __global__ void gemm_device(int M, template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce; - int col_offset = blockIdx.x *8; + int col_offset = blockIdx.x *32; + const int warp_id = threadIdx.x / 32; + const int half_warp_id = threadIdx.x / 16; + const int half_warp_lane = threadIdx.x % 16; + const int batch_size_warps = (WARPS-1)*2; - T local_A[32]; - unsigned char local_B_4bit[16]; - T local_B[32]; - T local_C[8]; + T local_A[2]; + T local_B[64]; + unsigned char local_B_4bit[32]; - __shared__ T smem_C[8]; + const int a_tile_offset = 16; + const int b_tile_offset = (16*32 + 16); - if(threadIdx.x < 8) - smem_C[threadIdx.x] = T(0); - __syncthreads(); + __shared__ T smem_A[8*16 + (2*16*(batch_size_warps-1))]; + __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))]; + //__shared__ T smem_C[8*32]; - #pragma unroll 8 - for(int k = 0; k < 8; k++) - local_C[k] = T(0); + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment c_frag; + wmma::fill_fragment(c_frag, 0.0f); - - for(int idx = threadIdx.x*32; idx < K; idx+=blockDim.x*32) + int ticktock = 0; + int idx = 0 + threadIdx.x; + int loaded_values = 0; + // prefetch + if(idx < K && warp_id < (WARPS-1)) { - - // we load only 8 values per iteration from A, so we - // need to do 4 loads for every single load from B - // for B, we have packed values, so the 16 8-bit values - // turn into 32 4-bit values to 4x 4 loads turns into 4x 8 loads - vector_load(local_A, A, idx, idx, K); - vector_load(&(local_A[8]), A, idx+8, idx+8, K); - vector_load(&(local_A[16]), A, idx+16, idx+16, K); - vector_load(&(local_A[24]), A, idx+24, idx+24, K); - - for(int col = 0; col < 8; col++) + if(loaded_values == 0) { - if((col + col_offset) >= M){ break; } - - int offset_B = (col_offset+col)*ldb; - // 0111 -> 0.0f in NF4 - // since we have packed 8-bits, we need cat(0b0111, 0b0111) = 0b01110111 - vector_load(local_B_4bit, B, (offset_B+idx+1)/2, (idx+1)/2, (K+1)/2, 0b01110111); - - int absidx = (idx + offset_B)/blocksize; - half local_absmax = __ldg(&(absmax[absidx])); - //for(int k = 0; k < 16; k++) - //printf("%i 
%i ", local_B_4bit[k] >> 4, local_B_4bit[k] & 0x0F); - //printf("\n"); - - //vector_load(local_A, A, idx, idx, K); - - #pragma unroll 16 - for(int k = 0; k < 16; k++) - { - - //if(local_B_4bit[k ] != 0b01110111) - //printf("(%i %i %i) %i -> %f, %i -> %f\n", threadIdx.x , k, K, local_B_4bit[k ] >> 4, dDequantizeNF4(local_B_4bit[k ] >> 4, local_absmax), - //local_B_4bit[k ] & 0x0F, dDequantizeNF4(local_B_4bit[k ] & 0x0F, local_absmax)); - //local_B[k*2] = d2DequantizeFP4(local_B_4bit[k] >> 4);//*local_absmax; - //local_B[k*2 + 1] = d2DequantizeFP4(local_B_4bit[k] & 0x0F);//*local_absmax; - local_B[k*2] = (half)(local_B_4bit[k] >> 4)*local_absmax; - local_B[k*2 + 1] = (half)(local_B_4bit[k] & 0x0F)*local_absmax; - //local_B[k*2] = (half)dDequantizeNF4(local_B_4bit[k ] >> 4);//*local_absmax; - //local_B[k*2 + 1] = (half)dDequantizeNF4(local_B_4bit[k ] & 0x0F);//*local_absmax; - } + local_A[0] = A[idx]; + local_A[1] = A[idx+blockDim.x-32]; #pragma unroll 32 - //for(int k = 0; k < 8; k++) - for(int k = 0; k < 32; k++) + for(int col = 0; col < 32; col++) + local_B_4bit[col] = B[(col_offset+col)*ldb+idx]; + + loaded_values = 1; + } + else + { + local_A[0] = local_A[1]; + loaded_values--; + + #pragma unroll 64 + for(int col = 0; col < 64; col+=2) { - local_C[col] += local_A[k]*local_B[k]; - //if((float)local_A[k] != 0.0 && (float)local_B[k] != 0.0) - //if((float)local_B[k] != 0.0) - //printf("%i %i %i %i %f*%f\n", threadIdx.x, k, col, (float)local_A[k], (float)local_B[k]); + local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(1.0f); + local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(1.0f); } } - } - #pragma unroll 8 - for(int k = 0; k < 8; k++) + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; + } + else if(warp_id < (WARPS-1)) { - local_C[k] = BlockReduce(reduce).Reduce(local_C[k], cub::Sum()); + local_A[0] = T(0.0); + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; + } + ticktock = ticktock == 0 ? 
1 : 0; + + //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32) + for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32) + { + idx = base_idx + threadIdx.x; + __syncthreads(); + if(idx < K && warp_id < (WARPS-1)) + { + if(loaded_values == 0) + { + local_A[0] = A[idx]; + local_A[1] = A[idx+blockDim.x-32]; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + { + local_B_4bit[col] = B[(col_offset+col)*ldb+idx]; + local_B_4bit[col+16] = B[(col_offset+col)*ldb+idx]; + } + + loaded_values = 1; + } + else + { + local_A[0] = local_A[1]; + loaded_values--; + + int absidx = (idx + col_offset)/blocksize; + half local_absmax = __ldg(&(absmax[absidx])); + + #pragma unroll 64 + for(int col = 0; col < 64; col+=2) + { + local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(absidx); + local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(absidx); + } + } + + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0]; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col]; + } + else if(warp_id < (WARPS-1)) + { + local_A[0] = T(0.0); + smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + local_B[col] = 0.0f; + + #pragma unroll 32 + for(int col = 0; col < 32; col++) + smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f; + } + ticktock = ticktock == 0 ? 1 : 0; + + if(warp_id == (WARPS-1)) + for(int k = 0; k < batch_size_warps; k++) + { + wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + } } - if(threadIdx.x == 0) + __syncthreads(); + if(warp_id != (WARPS-1)){ return; } + // only warp_id == (WARPS-1) from here + int warp_lane = threadIdx.x % 32; + + ticktock = ticktock == 0 ? 
1 : 0; + for(int k = 0; k < batch_size_warps; k++) { - #pragma unroll 8 - for(int k = 0; k < 8; k++) - smem_C[k] = local_C[k]; + wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); // 111 mu + wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); } - else if(threadIdx.x >= 32) - // early return for unused warps - return; - __syncwarp(); + // 129 mu + if(warp_id == (WARPS-1)) + wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major); - - if(threadIdx.x < 8 && col_offset + threadIdx.x < M) - out[col_offset + threadIdx.x ] = smem_C[threadIdx.x]; + if(col_offset + warp_lane < M) + out[col_offset + warp_lane] = smem_A[warp_lane]; } //#define ROWS 2 @@ -3513,6 +3609,7 @@ template __global__ void gemm_device(int M, int N, int K, half * _ template __global__ void gemm_device(int M, int N, int K, half * __restrict__ const A, half* B, half * out, int lda, int ldb, int ldc); template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); +template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); //template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); diff --git a/csrc/ops.cu b/csrc/ops.cu index 16d82f9..4d68436 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -703,17 +703,17 @@ template void gemm_host(int m, int n, int k, T * A, T* B, T * out template void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) { - int num_blocks = (m+7)/8; + int num_blocks = (m+31)/32; - cout << num_blocks << endl; - cout << lda << endl; - cout << ldb << endl; - cout << ldc << endl; + //cout << num_blocks << endl; + //cout << lda << endl; + //cout << ldb << endl; + //cout << ldc << endl; - cout << m << endl; - cout << n << endl; - cout << k << endl; - kgemm_4bit_inference<<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); + //cout << m << endl; + //cout << n << endl; + //cout << k << endl; + kgemm_4bit_inference<<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); //kgemm_4bit_inference<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } diff --git a/tests/test_functional.py b/tests/test_functional.py index e9a67f5..dc4e40d 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2358,20 +2358,19 @@ def test_normal_map_tree(): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_cutlass3_gemm(dtype): - for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + debug = True + #for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: #for dim in [4096, 5120, 6656, 8192]: - #for dim in [4096]: + for dim in [4096]: + #for dim in [128+1]: errs = [] relerrs = [] max_err = 0 max_relerr = 0 for i in range(100): - #A = torch.rand(2, 4092, dtype=dtype, device='cuda') - #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') - #A = torch.rand(1, 4096, dtype=dtype, device='cuda') - #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') - A = torch.randn(1, 
dim+0, dtype=dtype, device='cuda') + A = torch.randn(1, dim, dtype=dtype, device='cuda') B = torch.randn(4*dim, dim+0, dtype=dtype, device='cuda')/math.sqrt(dim) + #B = torch.randn(1, dim, dtype=dtype, device='cuda')/math.sqrt(dim) #print('') #print(A) @@ -2397,7 +2396,7 @@ def test_cutlass3_gemm(dtype): errs.append(err) relerrs.append(relerr) - #if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: + #if not debug and err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: # print('') # print(i, err, relerr) # print(A.flatten()[-6:]) @@ -2412,7 +2411,7 @@ def test_cutlass3_gemm(dtype): c = int(C1.numel()*0.0014*(dim/256))+1 - c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=True) + c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=not debug) #print(c/math.sqrt(dim)) print('') print(dim, sum(errs)/len(errs)/math.sqrt(dim)) @@ -2422,29 +2421,73 @@ def test_cutlass3_gemm(dtype): #@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=['fp32', 'fp16']) @pytest.mark.parametrize("dtype", [torch.float16], ids=['fp16']) def test_gemm_4bit(dtype): - for i in range(1): - #A = torch.rand(2, 4092, dtype=dtype, device='cuda') - #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') - #torch.random.manual_seed(17) - A = torch.rand(1, 4096, dtype=dtype, device='cuda') - B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') + #for dim in [32, 64, 128, 256, 512, 1024, 2048, 4096]: + #for dim in [4096, 5120, 6656, 8192]: + #for dim in [32]: + for dim in [4096]: + errs = [] + relerrs = [] + max_err = 0 + max_relerr = 0 + for i in range(1): + #A = torch.rand(2, 4092, dtype=dtype, device='cuda') + #B = torch.rand(4*4092, 4092, dtype=dtype, device='cuda') + #A = torch.rand(1, 4096, dtype=dtype, device='cuda') + #B = torch.rand(4*4096, 4096, dtype=dtype, device='cuda') + A = torch.randn(1, dim+0, dtype=dtype, device='cuda') + B = torch.randn(4*dim, dim+0, dtype=dtype, device='cuda')/math.sqrt(dim) - #print('') - #print(A) - #print(B) + #print('') + #print(A) + #print(B.t()) + #A[:, :-1] = 0 + #B[:, :-1] = 0 - qB, state = F.quantize_nf4(B) - F.dequantize_nf4(qB, state) + qB, state = F.quantize_nf4(B) + F.dequantize_nf4(qB, state) + C3 = torch.matmul(A, B.t()) + C2 = F.cutlass3_gemm(A, qB.t(), state=state) + C1 = bnb.matmul_4bit(A, qB.t(), state) + C2 = F.cutlass3_gemm(A, qB.t(), state=state) - C1 = torch.matmul(A, B.t()) - #C1 = bnb.matmul_4bit(A, qB.t(), state) - C2 = F.cutlass3_gemm(A, qB.t(), state=state) - #print(C1) - #print(C2) + print(C1.shape, C2.shape) - #torch.testing.assert_close(C1, C2, atol=1e-5, rtol=0.005) + # tensor cores are non-deterministic + # so we need to analyze errors around the mean + # to test our implementation + err = torch.abs(C1-C2) + mag = torch.abs(C1)+1e-8 + relerr = err/mag + max_err = max(err.max(), max_err) + max_relerr = max(relerr.max(), max_relerr) + err = err.mean().item() + relerr = relerr.mean().item() + errs.append(err) + relerrs.append(relerr) + + if err/torch.abs(C1).mean() > 5e-5 or err > 3.2e-5: + print('') + print(i, err, relerr) + print(A.flatten()[-6:]) + print(B.flatten()[-6:]) + out = A.flatten()[-6:]*B.flatten()[-6:] + print(out) + print(out[:-1].sum()) + print('='*80) + print(C1.flatten()[-6:]) + print(C2.flatten()[-6:]) + #assert False, 'ERROR' + + c = int(C1.numel()*0.0014*(dim/256))+1 + + c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False) + #print(c/math.sqrt(dim)) + print('') + print(dim, sum(errs)/len(errs)/math.sqrt(dim)) + print(dim, sum(relerrs)/len(relerrs)/math.sqrt(dim)) + print(dim, 
(max_err.item(), max_relerr.item())) def test_pipeline_func(): a = torch.rand(2, 4).cuda() out = F.pipeline_test(a, 2) print(a) print(out) From ec38ba95b0cd6bf3dadfccf366cd8917acf59c4b Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 6 May 2023 11:14:06 -0700 Subject: [PATCH 84/97] Added paging. --- bitsandbytes/cextension.py | 2 + bitsandbytes/functional.py | 55 +++++++++++++++++++++++++++ csrc/kernels.cu | 76 ++++++++++---------------------------- csrc/kernels.cuh | 18 +-------- csrc/ops.cu | 25 ++++++++----- csrc/ops.cuh | 9 ++++- csrc/pythonInterface.c | 32 +++++++++++++++- tests/test_functional.py | 40 +++++++++++++++++--- 8 files changed, 167 insertions(+), 90 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 8adca93..17c2a46 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -26,6 +26,8 @@ try: lib.cadam_8bit_blockwise_fp32 lib.get_context.restype = ct.c_void_p lib.get_cusparse.restype = ct.c_void_p + lib.cget_managed_ptr.restype = ct.c_void_p + lib.cget_stream.restype = ct.c_void_p COMPILED_WITH_CUDA = True except AttributeError: warn("The installed version of bitsandbytes was compiled without GPU support. " diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index e5b1bf7..f548475 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -130,6 +130,61 @@ class Cusparse_Context: cls._instance.initialize() return cls._instance +dtype2bytes = {} +dtype2bytes[torch.float32] = 4 +dtype2bytes[torch.float16] = 2 +dtype2bytes[torch.bfloat16] = 2 +dtype2bytes[torch.uint8] = 1 +dtype2bytes[torch.int8] = 1 + +def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)): + num_bytes = dtype2bytes[dtype]*prod(shape) + cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) + c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) + new_array = np.ctypeslib.as_array(c_ptr, shape=shape) + out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)) + out.is_paged = True + out.page_deviceid = device.index + return out + +def prefetch_tensor(A, to_cpu=False): + assert A.is_paged, 'Only paged tensors can be prefetched!' + if to_cpu: + deviceid = -1 + else: + deviceid = A.page_deviceid + + num_bytes = dtype2bytes[A.dtype]*A.numel() + lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid)) + +def elementwise_func(func_name, A, B, value, prefetch=True): + func = None + if A.dtype == torch.float32: + func = getattr(lib, f'c{func_name}_fp32', None) + cvalue = ct.c_float(value) + elif A.dtype == torch.uint8: + func = getattr(lib, f'c{func_name}_uint8', None) + cvalue = ct.c_uint8(value) + + if func is None: raise NotImplementedError(f'Function not implemented: {func_name}') + + is_managed = getattr(A, 'is_managed', False) + if is_managed and prefetch: + prefetch_tensor(A) + if B is not None: prefetch_tensor(B) + + func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel())) + if A.is_paged or B.is_paged: + # paged functions are fully asynchronous + # if we return from this function, we want the tensor + # to be in the correct state, that is, the final state after the + # operation occurred. So we synchronize. 
+ torch.cuda.synchronize() + +def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value) +def arange(A, device=None): elementwise_func('arange', A, None, 0) +def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0) + def create_linear_map(signed=True, total_bits=8, add_zero=True): sign = (-1.0 if signed else 0.0) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 2373b91..e1a3155 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3522,49 +3522,23 @@ template __global__ void kgemm_4bit_inference(int M, i //} -__device__ void compute(float* global_out, float const* shared_in) +template __global__ void kfunc(T *A, T *B, T value, long n) { - -} -template -__global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz) { - auto grid = cooperative_groups::this_grid(); - auto block = cooperative_groups::this_thread_block(); - assert(size == batch_sz * grid.size()); // Assume input size fits batch_sz * grid_size - - extern __shared__ float shared[]; // stages_count * block.size() * sizeof(int) bytes - size_t shared_offset[stages_count]; - for (int s = 0; s < stages_count; ++s) shared_offset[s] = s * block.size(); - - __shared__ cuda::pipeline_shared_state< - cuda::thread_scope::thread_scope_block, - stages_count - > shared_state; - auto pipeline = cuda::make_pipeline(block, &shared_state); - - auto block_batch = [&](size_t batch) -> int { - return block.group_index().x * block.size() + grid.size() * batch; - }; - - // compute_batch: next batch to process - // fetch_batch: next batch to fetch from global memory - for (size_t compute_batch = 0, fetch_batch = 0; compute_batch < batch_sz; ++compute_batch) { - // The outer loop iterates over the computation of the batches - for (; fetch_batch < batch_sz && fetch_batch < (compute_batch + stages_count); ++fetch_batch) { - // This inner loop iterates over the memory transfers, making sure that the pipeline is always full - pipeline.producer_acquire(); - size_t shared_idx = fetch_batch % stages_count; - size_t batch_idx = fetch_batch; - size_t block_batch_idx = block_batch(batch_idx); - cuda::memcpy_async(block, shared + shared_offset[shared_idx], global_in + block_batch_idx, sizeof(float) * block.size(), pipeline); - pipeline.producer_commit(); - } - pipeline.consumer_wait(); - int shared_idx = compute_batch % stages_count; - int batch_idx = compute_batch; - compute(global_out + block_batch(batch_idx), shared + shared_offset[shared_idx]); - pipeline.consumer_release(); + for(long i = (blockDim.x*blockIdx.x) + threadIdx.x; i < n; i+=(blockDim.x*gridDim.x)) + { + switch(FUNC) + { + case FILL: + A[i] = (T)value; + break; + case ARANGE: + A[i] = (T)i; + break; + case _MUL: + A[i] = A[i]*B[i]; + break; } + } } @@ -3572,19 +3546,10 @@ __global__ void with_staging_unified(float const* global_in, float * global_out, // TEMPLATE DEFINITIONS //============================================================== -//template -//__global__ static -//__launch_bounds__(decltype(size(CThreadLayout{}))::value) -//void -//gemm_device(MShape M, NShape N, KShape K, -// TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, -// TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, -// TC * out, CStride dC, CBlockLayout , CThreadLayout tC, -// half alpha, half beta); +template __global__ void kfunc(float *A, float *B, float value, long n); +template __global__ void kfunc(unsigned char *A, unsigned char *B, unsigned char value, long n); +template __global__ void kfunc(float *A, 
float *B, float value, long n); +template __global__ void kfunc(float *A, float *B, float value, long n); // these are not used and make no sense, but the compiler needs them //template __global__ void gemm_device(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc); @@ -3611,9 +3576,6 @@ template __global__ void gemm_device(int M, int N, int K, half * _ template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); template __global__ void kgemm_4bit_inference(int M, int N, int K, half * __restrict__ const A, unsigned char *B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); - -//template __global__ void kMatmul_inference_4bit(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB); -template __global__ void with_staging_unified<2>(float const* global_in, float * global_out, size_t size, size_t batch_sz); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh index 4951031..29c6683 100644 --- a/csrc/kernels.cuh +++ b/csrc/kernels.cuh @@ -122,23 +122,9 @@ template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -//template -//__global__ static -//__launch_bounds__(decltype(size(CThreadLayout{}))::value) -//void -//gemm_device(MShape M, NShape N, KShape K, -// TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA, -// TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB, -// TC * out, CStride dC, CBlockLayout , CThreadLayout tC, -// Alpha alpha, Beta beta); -template -__global__ void with_staging_unified(float const* global_in, float * global_out, size_t size, size_t batch_sz); - template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc); template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize); +template __global__ void kfunc(T *A, T *B, T value, long n); + #endif diff --git a/csrc/ops.cu b/csrc/ops.cu index 4d68436..7d13b71 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -663,16 +663,6 @@ template void extractOutliers(char * A, int *idx, char *out, int id } -void pipeline_test(float *A, float *B, size_t n, size_t batch_size) -{ - - int threads = 256; - int num_blocks = (n+(256*batch_size)+1)/(batch_size*256); - - with_staging_unified<2><<>>(A, B, n, batch_size); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); -} - template void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits) @@ -717,10 +707,25 @@ template void gemm_4bit_inference(int m, int n, int k, T * A, unsi //kgemm_4bit_inference<<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } +template void func(T *A, T *B, T value, long n) +{ + int threads = 512; + int blocks = n/threads; + blocks = n % threads == 0 ? blocks : blocks + 1; + blocks = blocks > 65535 ? 
65535 : blocks; + kfunc<T, FUNC><<<blocks, threads, 0, 0>>>(A, B, value, n); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); +} + //============================================================== // TEMPLATE DEFINITIONS //============================================================== +template void func<float, FILL>(float *A, float *B, float value, long n); +template void func<unsigned char, FILL>(unsigned char *A, unsigned char *B, unsigned char value, long n); +template void func<float, ARANGE>(float *A, float *B, float value, long n); +template void func<float, _MUL>(float *A, float *B, float value, long n); + template void gemm_4bit_inference<half>(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize); //template void gemm_host<float>(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits); template void gemm_host<half>(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc, int bits); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index 8919c60..e9d2e22 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -93,6 +93,13 @@ typedef enum DataType_t NF4 = 2, } DataType_t; +typedef enum Funcs_t +{ + FILL = 0, + ARANGE = 1, + _MUL = 2, +} Funcs_t; + class Context { public: @@ -193,6 +200,6 @@ void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rows template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits); template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize); +template <typename T, int FUNC> void func(T *A, T *B, T value, long n); -void pipeline_test(float *A, float *B, size_t n, size_t batch_size); #endif diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 26f16f2..7271430 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -28,6 +28,14 @@ void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int l void gemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) { gemm_4bit_inference<half>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } +#define MAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \ +void fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ func<ctype, FUNC>(A, B, value, n); } \ + +MAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL) +MAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL) +MAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE) +MAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL) + #define MAKE_FUNC32(fname, oname, gtype, gbits) \ void fname##32bit_g##gbits(gtype *g, gtype *p, \ @@ -314,7 +322,6 @@ extern "C" void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); } void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } - void cpipeline_test(float *A, float *B, size_t n, size_t batch_size){ pipeline_test(A, B, n, batch_size); } //void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) //{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } @@ -325,6 +332,29 @@ extern "C" void cgemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) { gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } + void *cget_managed_ptr(size_t bytes) + { + void *ptr; 
CUDA_CHECK_RETURN(cudaMallocManaged(&ptr, bytes, cudaMemAttachHost)); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); + + return ptr; + } + + void cprefetch(void *ptr, size_t bytes, int device) + { + CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0)); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); + } + + #define CMAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \ + void c##fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ fname##_##type_name(A, B, value, n); } \ + + CMAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL) + CMAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL) + CMAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE) + CMAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL) + #endif void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } diff --git a/tests/test_functional.py b/tests/test_functional.py index dc4e40d..145c267 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2489,8 +2489,38 @@ def test_gemm_4bit(dtype): print(dim, sum(relerrs)/len(relerrs)/math.sqrt(dim)) print(dim, (max_err.item(), max_relerr.item())) -def test_pipeline_func(): - a = torch.rand(2, 4).cuda() - out = F.pipeline_test(a, 2) - print(a) - print(out) +def test_managed(): + n = 32*10 + A = F.get_paged(n, n, dtype=torch.float32) + B = F.get_paged(n, n, dtype=torch.uint8) + B2 = F.get_paged(n, n, dtype=torch.float32) + assert A.is_paged + assert B.is_paged + assert A.page_deviceid==0 + assert B.page_deviceid==0 + F.fill(A, 17.0) + F.fill(B, 17) + F.fill(B2, 2) + assert (A==17).sum().item() == n*n + assert (B==17).sum().item() == n*n + C = A*B.float() + assert (C==289).sum().item() == n*n + F._mul(A, B2) + F._mul(A, B2) + F._mul(A, B2) + assert (A==17*(2**3)).sum().item() == n*n + # F.prefetch_tensor(A) + # F.prefetch_tensor(B) + + + # F.fill(B2, 17.0) + # F._mul(A, B2) + + # F.prefetch_tensor(A, to_cpu=True) + # F.prefetch_tensor(B, to_cpu=True) + # F.prefetch_tensor(B2, to_cpu=True) + # torch.cuda.synchronize() + + # assert (A==17).sum().item() == n*n + + # torch.testing.assert_allclose(A, torch.ones(A.shape)*289) From 44d68ff29cc19e54db13242e7f8cff3c7e4c5196 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 6 May 2023 14:59:29 -0700 Subject: [PATCH 85/97] Added paged optimizers. --- bitsandbytes/cextension.py | 1 - bitsandbytes/functional.py | 33 ++++++++-- bitsandbytes/optim/__init__.py | 4 +- bitsandbytes/optim/adam.py | 104 +++++++----------------------- bitsandbytes/optim/adamw.py | 108 ++++++++------------------------ bitsandbytes/optim/optimizer.py | 72 ++++++++++----------- tests/test_functional.py | 14 ++--- tests/test_optim.py | 87 +++++++++++-------------- 8 files changed, 157 insertions(+), 266 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 17c2a46..29621c9 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -27,7 +27,6 @@ try: lib.get_context.restype = ct.c_void_p lib.get_cusparse.restype = ct.c_void_p lib.cget_managed_ptr.restype = ct.c_void_p - lib.cget_stream.restype = ct.c_void_p COMPILED_WITH_CUDA = True except AttributeError: warn("The installed version of bitsandbytes was compiled without GPU support. 
" diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index f548475..a6ed675 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -83,6 +83,27 @@ if COMPILED_WITH_CUDA: lib.cadagrad_8bit_blockwise_fp16, ) +class GlobalPageManager: + _instance = None + + def __init__(self): + raise RuntimeError("Call get_instance() instead") + + def initialize(self): + self.paged_tensors = [] + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = cls.__new__(cls) + cls._instance.initialize() + return cls._instance + + def prefetch_all(self, to_cpu=False): + for t in self.paged_tensors: + prefetch_tensor(t, to_cpu) + + class CUBLAS_Context: _instance = None @@ -142,7 +163,7 @@ def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)) cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) new_array = np.ctypeslib.as_array(c_ptr, shape=shape) - out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)) + out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)).view(shape) out.is_paged = True out.page_deviceid = device.index return out @@ -415,10 +436,14 @@ def is_on_gpu(tensors): gpu_ids = set() for t in tensors: if t is None: continue # NULL pointers are fine - on_gpu &= t.device.type == 'cuda' - gpu_ids.add(t.device.index) + is_paged = getattr(t, 'is_paged', False) + on_gpu &= (t.device.type == 'cuda' or is_paged) + if not is_paged: + gpu_ids.add(t.device.index) + if not on_gpu: + raise TypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}') if len(gpu_ids) > 1: - raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:{[(t.shape, t.device) for t in tensors]}') + raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}') return on_gpu def get_ptr(A: Tensor) -> ct.c_void_p: diff --git a/bitsandbytes/optim/__init__.py b/bitsandbytes/optim/__init__.py index 8c8a8f4..994dae5 100644 --- a/bitsandbytes/optim/__init__.py +++ b/bitsandbytes/optim/__init__.py @@ -6,8 +6,8 @@ from bitsandbytes.cextension import COMPILED_WITH_CUDA from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit -from .adam import Adam, Adam8bit, Adam32bit -from .adamw import AdamW, AdamW8bit, AdamW32bit +from .adam import Adam, Adam8bit, Adam32bit, PagedAdam, PagedAdam8bit, PagedAdam32bit +from .adamw import AdamW, AdamW8bit, AdamW32bit, PagedAdamW, PagedAdamW8bit, PagedAdamW32bit from .lamb import LAMB, LAMB8bit, LAMB32bit from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS from .optimizer import GlobalOptimManager diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 396aeb8..86981eb 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -14,92 +14,34 @@ from bitsandbytes.optim.optimizer import Optimizer2State class Adam(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - amsgrad=False, - optim_bits=32, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - optim_bits, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) - + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, 
amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam8bit(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - amsgrad=False, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - 8, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) - + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam32bit(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - amsgrad=False, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - 32, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) +class PagedAdam(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedAdam8bit(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedAdam32bit(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class AnalysisAdam(torch.optim.Optimizer): """Adam that performs 8-bit vs 32-bit error analysis. 
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 022e64c..21077f1 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -5,89 +5,35 @@ from bitsandbytes.optim.optimizer import Optimizer2State -class AdamW(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=1e-2, - amsgrad=False, - optim_bits=32, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - optim_bits, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) +class AdamW(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW8bit(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=1e-2, - amsgrad=False, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - 8, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) - + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW32bit(Optimizer2State): - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=1e-2, - amsgrad=False, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "adam", - params, - lr, - betas, - eps, - weight_decay, - 32, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + + +class PagedAdamW(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedAdamW8bit(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedAdamW32bit(Optimizer2State): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, + args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__( "adam", params, lr, 
betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index 867ad3d..4f8dcc7 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -92,10 +92,12 @@ class GlobalOptimManager: class Optimizer8bit(torch.optim.Optimizer): - def __init__(self, params, defaults, optim_bits=32): + def __init__(self, params, defaults, optim_bits=32, is_paged=False): super().__init__(params, defaults) self.initialized = False self.name2qmap = {} + self.is_paged = is_paged + self.page_mng = F.GlobalPageManager.get_instance() self.mng = GlobalOptimManager.get_instance() self.non_castable_tensor_keys = { @@ -207,7 +209,9 @@ class Optimizer8bit(torch.optim.Optimizer): values = self.state[p] for k, v in values.items(): if isinstance(v, torch.Tensor): - self.state[p][k] = v.to(p.device) + is_paged = getattr(v, 'is_paged', False) + if not is_paged: + self.state[p][k] = v.to(p.device) def check_overrides(self): for module, attr, config in self.mng.module_weight_config_triple: @@ -252,6 +256,7 @@ class Optimizer8bit(torch.optim.Optimizer): self.to_gpu() # needed for fairseq pure fp16 training self.initialized = True + if self.is_paged: self.page_mng.prefetch_all() for gindex, group in enumerate(self.param_groups): for pindex, p in enumerate(group["params"]): if p.grad is None: @@ -261,6 +266,11 @@ class Optimizer8bit(torch.optim.Optimizer): self.init_state(group, p, gindex, pindex) self.update_step(group, p, gindex, pindex) + if self.is_paged: + # all paged operations are asynchronous; we need + # to sync to make sure all tensors are in the right state + torch.cuda.synchronize() + return loss @@ -289,6 +299,16 @@ class Optimizer8bit(torch.optim.Optimizer): "The update_step method needs to be overridden" ) + def get_state_buffer(self, p, dtype=torch.float32): + if not self.is_paged or p.numel() < 1e5: + return torch.zeros_like(p, dtype=dtype, device=p.device) + else: + # > 1 MB + buff = F.get_paged(*p.shape, dtype=dtype, device=p.device) + F.fill(buff, 0) + self.page_mng.paged_tensors.append(buff) + return buff + class Optimizer2State(Optimizer8bit): def __init__( @@ -306,6 +326,7 @@ class Optimizer2State(Optimizer8bit): block_wise=True, max_unorm=0.0, skip_zeros=False, + is_paged=False ): if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -325,7 +346,7 @@ class Optimizer2State(Optimizer8bit): f"Invalid weight_decay value: {weight_decay}" ) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - super().__init__(params, defaults, optim_bits) + super().__init__(params, defaults, optim_bits, is_paged) if args is None: args = {} @@ -365,18 +386,8 @@ class Optimizer2State(Optimizer8bit): if dtype == torch.float32 or ( dtype == torch.uint8 and p.numel() < 4096 ): - state["state1"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.float32, - device=p.device, - ) - state["state2"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.float32, - device=p.device, - ) + state["state1"] = self.get_state_buffer(p, dtype=torch.float32) + state["state2"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: if "dynamic" not in self.name2qmap: @@ -388,20 +399,10 @@ class Optimizer2State(Optimizer8bit): p.device ) - state["state1"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.uint8, - device=p.device, - ) + state["state1"] 
= self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] - state["state2"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.uint8, - device=p.device, - ) + state["state2"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap2"] = self.name2qmap["udynamic"] if config["block_wise"]: @@ -538,6 +539,7 @@ class Optimizer1State(Optimizer8bit): block_wise=True, max_unorm=0.0, skip_zeros=False, + is_paged=False ): if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -553,7 +555,7 @@ class Optimizer1State(Optimizer8bit): f"Invalid weight_decay value: {weight_decay}" ) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - super().__init__(params, defaults, optim_bits) + super().__init__(params, defaults, optim_bits, is_paged) if args is None: args = {} @@ -593,12 +595,7 @@ class Optimizer1State(Optimizer8bit): if dtype == torch.float32 or ( dtype == torch.uint8 and p.numel() < 4096 ): - state["state1"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.float32, - device=p.device, - ) + state["state1"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: if "dynamic" not in self.name2qmap: @@ -607,12 +604,7 @@ class Optimizer1State(Optimizer8bit): p.device ) - state["state1"] = torch.zeros_like( - p, - memory_format=torch.preserve_format, - dtype=torch.uint8, - device=p.device, - ) + state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] if config["block_wise"]: diff --git a/tests/test_functional.py b/tests/test_functional.py index 145c267..6bda1a8 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -172,8 +172,8 @@ def test_dynamic_blockwise_quantization(nested, blocksize): relerr = sum(reldiffs)/len(reldiffs) assert abserr < 0.011 assert relerr < 0.018 - print('nested=', nested, 'randn', blocksize, sum(diffs)/len(diffs)) - print('nested=', nested, 'randn', blocksize, sum(reldiffs)/len(reldiffs)) + #print('nested=', nested, 'randn', blocksize, sum(diffs)/len(diffs)) + #print('nested=', nested, 'randn', blocksize, sum(reldiffs)/len(reldiffs)) diffs = [] for i in range(100): @@ -189,8 +189,8 @@ def test_dynamic_blockwise_quantization(nested, blocksize): relerr = sum(reldiffs)/len(reldiffs) assert abserr < 0.0035 assert relerr < 0.015 - print('nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) - print('nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) + #print('nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) + #print('nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) def test_dynamic_blockwise_stochastic_quantization(): @@ -320,7 +320,7 @@ def test_approx_igemm(dim1, dim2, quant_methods, batched): dim2 = dim2 - (dim2 % 32) errors = [] relerrors = [] - print("") + #print("") for i in range(5): if batched: A = torch.normal(0, 0.5, size=(32, dim1, dim2 // 32), device="cuda") @@ -349,8 +349,8 @@ def test_approx_igemm(dim1, dim2, quant_methods, batched): relerr = err / torch.abs(out2) errors.append(err.mean().item()) relerrors.append(relerr.mean().item()) - print(mean(errors)) - print(mean(relerrors)) + #print(mean(errors)) + #print(mean(relerrors)) def test_stable_embedding(): diff --git a/tests/test_optim.py b/tests/test_optim.py index a13b332..a5ecb6e 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -39,6 +39,8 @@ str2optimizers["momentum_pytorch"] = ( bnb.optim.Adam, ) str2optimizers["adam"] = 
(torch.optim.Adam, bnb.optim.Adam) +str2optimizers["paged_adamw"] = (torch.optim.AdamW, bnb.optim.PagedAdamW) +str2optimizers["paged_adam"] = (torch.optim.Adam, bnb.optim.PagedAdam) # str2optimizers['fused_adam'] = (apex.optimizers.FusedAdam, bnb.optim.Adam) str2optimizers["momentum"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), @@ -48,10 +50,7 @@ str2optimizers["rmsprop"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9, block_wise=False), ) -str2optimizers["adam8bit"] = ( - torch.optim.Adam, - lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=False), -) +str2optimizers["adam8bit"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=False)) str2optimizers["momentum8bit"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9, block_wise=False), @@ -61,10 +60,9 @@ str2optimizers["rmsprop8bit"] = ( lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9, block_wise=False), ) -str2optimizers["adam8bit_blockwise"] = ( - torch.optim.Adam, - lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=True), -) +str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=True)) +str2optimizers["paged_adamw8bit_blockwise"] = (torch.optim.AdamW, lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True)) +str2optimizers["paged_adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True)) str2optimizers["momentum8bit_blockwise"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9, block_wise=True), @@ -76,36 +74,25 @@ str2optimizers["rmsprop8bit_blockwise"] = ( str2statenames = {} str2statenames["adam"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] +str2statenames["paged_adamw"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] +str2statenames["paged_adam"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["momentum"] = [("momentum_buffer", "state1")] str2statenames["lamb"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["rmsprop"] = [("square_avg", "state1")] -str2statenames["adam8bit"] = [ - ("exp_avg", "state1", "qmap1", "max1"), - ("exp_avg_sq", "state2", "qmap2", "max2"), -] -str2statenames["lamb8bit"] = [ - ("exp_avg", "state1", "qmap1", "max1"), - ("exp_avg_sq", "state2", "qmap2", "max2"), -] -str2statenames["adam8bit_blockwise"] = [ - ("exp_avg", "state1", "qmap1", "absmax1"), - ("exp_avg_sq", "state2", "qmap2", "absmax2"), -] -str2statenames["momentum8bit"] = [ - ("momentum_buffer", "state1", "qmap1", "max1") -] -str2statenames["momentum8bit_blockwise"] = [ - ("momentum_buffer", "state1", "qmap1", "absmax1") -] +str2statenames["adam8bit"] = [("exp_avg", "state1", "qmap1", "max1"), ("exp_avg_sq", "state2", "qmap2", "max2")] +str2statenames["lamb8bit"] = [("exp_avg", "state1", "qmap1", "max1"), ("exp_avg_sq", "state2", "qmap2", "max2")] +str2statenames["adam8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] +str2statenames["paged_adam8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] +str2statenames["paged_adamw8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] +str2statenames["momentum8bit"] = [("momentum_buffer", "state1", "qmap1", "max1")] +str2statenames["momentum8bit_blockwise"] = [("momentum_buffer", "state1", "qmap1", "absmax1")] str2statenames["rmsprop8bit"] = 
[("square_avg", "state1", "qmap1", "max1")] -str2statenames["rmsprop8bit_blockwise"] = [ - ("square_avg", "state1", "qmap1", "absmax1") -] +str2statenames["rmsprop8bit_blockwise"] = [("square_avg", "state1", "qmap1", "absmax1")] dim1 = [1024] dim2 = [32, 1024, 4097, 1] -gtype = [torch.float32, torch.float16, torch.bfloat16] -optimizer_names = ["adam", "momentum", "rmsprop"] +gtype = [torch.float32, torch.float16] +optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) @@ -135,14 +122,14 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): torch_optimizer.step() for name1, name2 in str2statenames[optim_name]: - torch.testing.assert_allclose( + torch.testing.assert_close( torch_optimizer.state[p1][name1], - bnb_optimizer.state[p2][name2], + bnb_optimizer.state[p2][name2].cuda(), atol=atol, rtol=rtol, ) - torch.testing.assert_allclose(p1, p2.float(), atol=atol, rtol=rtol) + torch.testing.assert_close(p1, p2.float(), atol=atol, rtol=rtol) if i % (k // 5) == 0 and i > 0: path = get_temp_dir() @@ -152,9 +139,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): bnb_optimizer = str2optimizers[optim_name][1]([p2]) bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt"))) rm_path(path) - torch.testing.assert_allclose(p1, p2.float(), atol=atol, rtol=rtol) + torch.testing.assert_close(p1, p2.float(), atol=atol, rtol=rtol) for name1, name2 in str2statenames[optim_name]: - torch.testing.assert_allclose( + torch.testing.assert_close( torch_optimizer.state[p1][name1], bnb_optimizer.state[p2][name2], atol=atol, @@ -168,7 +155,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): # --> copy the state to keep weights close p1.data = p1.data.to(p2.dtype).float() p2.copy_(p1.data) - torch.testing.assert_allclose(p1.to(p2.dtype), p2) + torch.testing.assert_close(p1.to(p2.dtype), p2) if optim_name in ["lars", "lamb"]: assert bnb_optimizer.state[p2]["unorm_vec"] > 0.0 @@ -277,7 +264,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): bnb_optimizer.step() torch_optimizer.step() - torch.testing.assert_allclose(p1, p2.float(), atol=patol, rtol=prtol) + torch.testing.assert_close(p1, p2.float(), atol=patol, rtol=prtol) dequant_states = [] for name1, name2, qmap, max_val in str2statenames[optim_name]: @@ -331,8 +318,8 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): bnb_optimizer = str2optimizers[optim_name][1]([p2]) bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt"))) rm_path(path) - torch.testing.assert_allclose(raws1cpy, bnb_optimizer.state[p2][name2]) - torch.testing.assert_allclose(qmap1, bnb_optimizer.state[p2][qmap]) + torch.testing.assert_close(raws1cpy, bnb_optimizer.state[p2][name2]) + torch.testing.assert_close(qmap1, bnb_optimizer.state[p2][qmap]) if "blockwise" in optim_name: s1 = F.dequantize_blockwise( @@ -347,17 +334,17 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): absmax=bnb_optimizer.state[p2][max_val], A=bnb_optimizer.state[p2][name2], ) - torch.testing.assert_allclose(s1cpy, s1) + torch.testing.assert_close(s1cpy, s1) num_not_close = (torch.isclose(torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol) == 0) assert num_not_close.sum().item() < 20 - torch.testing.assert_allclose(p1, p2.float(), atol=patol, rtol=prtol) + torch.testing.assert_close(p1, p2.float(), atol=patol, rtol=prtol) # the 
parameters diverge quickly. Here we keep them close # together so we can test against the Adam error p1.data = p1.data.to(gtype).float() p2.copy_(p1.data) - torch.testing.assert_allclose(p1.to(gtype), p2) + torch.testing.assert_close(p1.to(gtype), p2) for (name1, name2, qmap, max_val), s in zip(str2statenames[optim_name], dequant_states): torch_optimizer.state[p1][name1].copy_(s.data) @@ -419,28 +406,28 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): # gnorm_scale is not deterministic (warp reductions), as such there can be slight differences in state if optim_bits == 32: - torch.testing.assert_allclose(p1, p2) - torch.testing.assert_allclose( + torch.testing.assert_close(p1, p2) + torch.testing.assert_close( adam1.state[p1]["state1"], adam2.state[p2]["state1"], atol=5e-5, rtol=1e-4, ) - torch.testing.assert_allclose( + torch.testing.assert_close( adam1.state[p1]["state2"], adam2.state[p2]["state2"], atol=5e-5, rtol=1e-4, ) elif optim_bits == 8: - torch.testing.assert_allclose(p1, p2, atol=1e-4, rtol=1e-3) - torch.testing.assert_allclose( + torch.testing.assert_close(p1, p2, atol=1e-4, rtol=1e-3) + torch.testing.assert_close( adam1.state[p1]["state1"], adam2.state[p2]["state1"], atol=2, rtol=1e-3, ) - torch.testing.assert_allclose( + torch.testing.assert_close( adam1.state[p1]["state2"], adam2.state[p2]["state2"], atol=2, @@ -472,7 +459,7 @@ gtype = [torch.float32, torch.float16] # optimizer_names = ['momentum_apex', 'momentum8bit', 'momentum_pytorch'] # optimizer_names = ['lamb_apex', 'lamb8bit'] # optimizer_names = ['lars_apex', 'lars8bit'] -optimizer_names = ["adam8bit_blockwise"] +optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = [ "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values From 41a9c708148c4a16675244de88352d0437e2d87a Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 6 May 2023 18:59:59 -0700 Subject: [PATCH 86/97] Changed prefetching. 
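The page manager now prefetches paged tensors in reverse registration order. The intuition, as a rough sketch (`prefetch` stands in for bitsandbytes.functional.prefetch_tensor; the class is a toy, not the library's code):

    class ToyPageManager:
        def __init__(self):
            self.paged_tensors = []  # registered in the order they are used

        def prefetch_all(self, prefetch):
            # Walk in reverse: tensors used first are prefetched last, so if
            # later prefetches evict earlier ones, the first-used tensors are
            # the ones most recently swapped in when the step begins.
            for t in self.paged_tensors[::-1]:
                prefetch(t)

The step loop additionally prefetches each parameter's state right before its update (see prefetch_state below).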
---
 bitsandbytes/functional.py      |  5 ++++-
 bitsandbytes/optim/optimizer.py | 11 ++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index a6ed675..2542e4b 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -100,7 +100,10 @@ class GlobalPageManager:
         return cls._instance

     def prefetch_all(self, to_cpu=False):
-        for t in self.paged_tensors:
+        # assume the tensors added first will be the
+        # ones that are used first, so swap them in last
+        # in case they are evicted again
+        for t in self.paged_tensors[::-1]:
             prefetch_tensor(t, to_cpu)


diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 4f8dcc7..921ec0a 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -256,7 +256,7 @@ class Optimizer8bit(torch.optim.Optimizer):
                 self.to_gpu()  # needed for fairseq pure fp16 training
             self.initialized = True

-        if self.is_paged: self.page_mng.prefetch_all()
+        #if self.is_paged: self.page_mng.prefetch_all()
         for gindex, group in enumerate(self.param_groups):
             for pindex, p in enumerate(group["params"]):
                 if p.grad is None:
@@ -265,7 +265,9 @@ class Optimizer8bit(torch.optim.Optimizer):
                 if len(state) == 0:
                     self.init_state(group, p, gindex, pindex)

+                self.prefetch_state(p)
                 self.update_step(group, p, gindex, pindex)
+                torch.cuda.synchronize()
         if self.is_paged:
             # all paged operation are asynchronous, we need
             # to sync to make sure all tensors are in the right state
@@ -309,6 +311,13 @@ class Optimizer8bit(torch.optim.Optimizer):
             self.page_mng.paged_tensors.append(buff)
         return buff

+    def prefetch_state(self, p):
+        if self.is_paged:
+            state = self.state[p]
+            F.prefetch_tensor(state['state1'])
+            if 'state2' in state:
+                F.prefetch_tensor(state['state2'])
+

 class Optimizer2State(Optimizer8bit):
     def __init__(

From f64cfe65aad56751cabf87c2a9a610e8c43bb981 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Sat, 6 May 2023 21:49:16 -0700
Subject: [PATCH 87/97] Fixed prefetch bug for non-paged tensors; added benchmark.
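The benchmark added below measures paged-optimizer step time under GPU memory pressure. A minimal version of the timing pattern it uses (illustrative only; `model`, `batches`, and `labels` are placeholders, and the real test lives in tests/test_optim.py):

    import time
    import torch

    def time_steps(model, optim, batches, labels, warmup=2):
        for i, b in enumerate(batches):
            if i == warmup:  # start the clock only after warm-up iterations
                torch.cuda.synchronize()
                t0 = time.time()
            loss = torch.nn.functional.cross_entropy(model(b), labels[i])
            loss.backward()
            optim.step()
            optim.zero_grad(set_to_none=True)
        torch.cuda.synchronize()  # paged ops are async; sync before reading the clock
        return time.time() - t0

The warm-up steps and the final synchronize matter: paged prefetches and the CUDA stream are asynchronous, so timing without them measures launch overhead rather than the optimizer.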
---
 bitsandbytes/optim/optimizer.py |  9 ++++---
 tests/test_optim.py             | 44 +++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 921ec0a..41c8d27 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -314,9 +314,12 @@ class Optimizer8bit(torch.optim.Optimizer):
     def prefetch_state(self, p):
         if self.is_paged:
             state = self.state[p]
-            F.prefetch_tensor(state['state1'])
-            if 'state2' in state:
-                F.prefetch_tensor(state['state2'])
+            s1 = state['state1']
+            is_paged = getattr(s1, 'is_paged', False)
+            if is_paged:
+                F.prefetch_tensor(state['state1'])
+                if 'state2' in state:
+                    F.prefetch_tensor(state['state2'])


 class Optimizer2State(Optimizer8bit):
diff --git a/tests/test_optim.py b/tests/test_optim.py
index a5ecb6e..e35408e 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -490,3 +490,47 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
     params = (k - k // 5) * dim1 * dim2
     print(optim_name, gtype, s / params)
     # assert s < 3.9
+
+dim1 = [10*1024]
+gtype = [torch.float16]
+#mode = ['torch', 'bnb']
+mode = ['bnb']
+optimizer_names = ['paged_adamw']
+#optimizer_names = ['paged_adamw8bit_blockwise']
+values = list(product(dim1, gtype, optimizer_names, mode))
+names = ['dim1_{0}_gtype_{1}_optim_{2}_mode_{3}'.format(*vals) for vals in values]
+@pytest.mark.parametrize("dim1, gtype, optim_name, mode", values, ids=names)
+def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
+    layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
+    layers1 = layers1.to(gtype)
+    layers1 = layers1.cuda()
+
+    large_tensor = None
+    if mode == 'torch':
+        optim = str2optimizers[optim_name][0](layers1.parameters())
+    else:
+        optim = str2optimizers[optim_name][1](layers1.parameters())
+        # ~18 GB of fp32 (4.5e9 * 4 bytes) to create GPU memory pressure
+        large_tensor = torch.empty((int(4.5e9),), device='cuda')
+
+    torch.cuda.synchronize()
+    time.sleep(5)
+
+    num_batches = 5
+    batches = torch.randn(num_batches, 128, dim1, device='cuda').to(gtype)
+    lbls = torch.randint(0, 10, size=(num_batches, 128)).cuda()
+
+    for i in range(num_batches):
+        print(i)
+        b = batches[i]
+        if i == 2:
+            torch.cuda.synchronize()
+            t0 = time.time()
+
+        out1 = layers1(b)
+
+        loss1 = torch.nn.functional.cross_entropy(out1, lbls[i]).mean()
+        loss1.backward()
+        optim.step()
+    torch.cuda.synchronize()
+    print(mode, time.time() - t0)

From 4bd11518293ea30c6792a5baf64f0715739a09ca Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Sun, 7 May 2023 15:06:17 -0700
Subject: [PATCH 88/97] Fixed gradient accumulation test.
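The test exercises the standard gradient-accumulation pattern, summing gradients over several micro-batches before a single optimizer step. In outline (names are placeholders for the test's layers and batches):

    acc_steps = 10
    for i, batch in enumerate(batches):
        loss = model(batch).mean()
        loss.backward()  # gradients accumulate into .grad across iterations
        if i > 0 and i % acc_steps == 0:
            opt.step()
            opt.zero_grad(True)  # set_to_none=True, matching the test

The fix copies weights and biases from the fp16 reference into the 8-bit layers, switches both sides to Adam32bit, and compares gradients with a small tolerance, since int8 matmul introduces quantization noise relative to fp16.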
--- bitsandbytes/autograd/_functions.py | 1 - tests/test_modules.py | 20 +++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index acd90f5..63b7156 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -456,7 +456,6 @@ class MatMul8bitLt(torch.autograd.Function): Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) if req_gradB: - #grad_B = torch.matmul(grad_output.t(), A) CxAt, SAt = F.transform(CAt, formatB, transpose=True) C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True) gradB32, SgradB32 = F.igemmlt(C32grad, CxAt, Sgrad, SAt) diff --git a/tests/test_modules.py b/tests/test_modules.py index 1319cf7..d0a9051 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -332,12 +332,13 @@ def test_linear8bitlt_inference(threshold): def test_linear8bitlt_accumulated_gradient(): l1 = torch.nn.Sequential(*[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)]) l2 = torch.nn.Sequential(*[torch.nn.Linear(32, 32).cuda().half() for i in range(2)]) - l2[0].weight = torch.nn.Parameter(l1[0].weight.clone()) - l2[0].bias = torch.nn.Parameter(l1[0].bias.clone()) - l2[1].weight = torch.nn.Parameter(l1[1].weight.clone()) - l2[1].bias = torch.nn.Parameter(l1[1].bias.clone()) - opt1 = bnb.optim.Adam8bit(l1.parameters(), lr=0.001) - opt2 = bnb.optim.Adam8bit(l2.parameters(), lr=0.001) + l1[0].weight.data.copy_(l2[0].weight.data) + l1[1].weight.data.copy_(l2[1].weight.data) + l1[0].bias.data.copy_(l2[0].bias.data) + l1[1].bias.data.copy_(l2[1].bias.data) + + opt1 = bnb.optim.Adam32bit(l1.parameters(), lr=0.001) + opt2 = bnb.optim.Adam32bit(l2.parameters(), lr=0.001) acc_steps = 10 @@ -353,7 +354,6 @@ def test_linear8bitlt_accumulated_gradient(): assert l1[0].state.CxB is not None assert l1[1].state.CxB is not None - print(i) if i > 0 and i % acc_steps == 0: opt1.step() opt1.zero_grad(True) @@ -368,9 +368,11 @@ def test_linear8bitlt_accumulated_gradient(): # we do this copy because otherwise we have small divergences over time that add up l1[0].weight.data.copy_(l2[0].weight.data) l1[1].weight.data.copy_(l2[1].weight.data) + l1[0].bias.data.copy_(l2[0].bias.data) + l1[1].bias.data.copy_(l2[1].bias.data) else: - torch.testing.assert_close(l1[0].weight.grad, l2[0].weight.grad) - torch.testing.assert_close(l1[1].weight.grad, l2[1].weight.grad) + torch.testing.assert_close(l1[0].weight.grad, l2[0].weight.grad, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(l1[1].weight.grad, l2[1].weight.grad, atol=1e-3, rtol=1e-3) @pytest.mark.parametrize("threshold", [0.0, 2.0]) From 2bce175d156b5c5c1be925cb57fe33215675fafd Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 23 May 2023 18:42:19 -0700 Subject: [PATCH 89/97] Fixed Makefile. 
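Besides the Makefile cleanup, this drops the experimental create_custom_map search code and reworks the matmul benchmark. The create_linear_map context below shows how a k-bit code is embedded into the 256-entry lookup table by padding the unused middle slots with zeros; a toy version of that idea (illustrative, not the library function):

    import torch

    def toy_linear_map(total_bits=4):
        n = 2 ** total_bits
        values = torch.linspace(-1.0, 1.0, n)  # evenly spaced k-bit code
        gap = 256 - n                          # unused slots in the 8-bit table
        l = n // 2
        return torch.tensor(values[:l].tolist() + [0.0] * gap + values[l:].tolist())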
--- Makefile | 24 ++----------- bitsandbytes/functional.py | 69 -------------------------------------- tests/test_functional.py | 47 ++++++++++++++------------ 3 files changed, 27 insertions(+), 113 deletions(-) diff --git a/Makefile b/Makefile index ea6ee87..c113a3d 100644 --- a/Makefile +++ b/Makefile @@ -40,11 +40,6 @@ CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler # Later versions of CUDA support the new architectures -CC_CUDA10x += -gencode arch=compute_75,code=sm_75 - -CC_CUDA110 := -gencode arch=compute_75,code=sm_75 -CC_CUDA110 += -gencode arch=compute_80,code=sm_80 - CC_CUDA11x := -gencode arch=compute_75,code=sm_75 CC_CUDA11x += -gencode arch=compute_80,code=sm_80 CC_CUDA11x += -gencode arch=compute_86,code=sm_86 @@ -54,8 +49,8 @@ CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -#CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -#CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 +CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 +CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 @@ -66,16 +61,6 @@ all: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) -cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE_10x) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - cuda110_nomatmul: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o @@ -122,11 +107,6 @@ env: @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" @echo "============================" -cutlass: - if [ ! 
-d "$(ROOT_DIR)/dependencies/cutlass" ]; then \ - git clone https://github.com/NVIDIA/cutlass.git $(ROOT_DIR)/dependencies/cutlass; \ - fi \ - $(BUILD_DIR): mkdir -p build mkdir -p dependencies diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index cc82943..c0eb2de 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -128,11 +128,6 @@ class CUBLAS_Context: def initialize(self): self.context = {} - # prev_device = torch.cuda.current_device() - # for i in range(torch.cuda.device_count()): - # torch.cuda.set_device(torch.device('cuda', i)) - # self.context.append(ct.c_void_p(lib.get_context())) - # torch.cuda.set_device(prev_device) @classmethod def get_instance(cls): @@ -238,72 +233,8 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True): return values else: l = values.numel()//2 - #return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist()) return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) -def create_custom_map(seed=0, scale=0.01): - v = [12, 10, 8, 6, 3, 2, 1] - # 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45 - # 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48 - - # 13B 100 steps: - # - 4-bit evo: 86.02 - # - 4-bit norm: 78.73 - # - 4-bit FP4: - # - 16-bit: - - # interval search on normal distribution - #v = [3.090232306167813, 1.4589770349449647, 1.064410327932115, 0.7896806653244509, 0.5646884166925807, 0.3653406435875121, 0.17964844284441311] # 0.999 26.5 - #v = [2.3263478740408408, 1.4050715603096329, 1.0364333894937898, 0.7721932141886848, 0.5533847195556727, 0.3584587932511938, 0.1763741647808615] # 0.99 24.99 - #v = [1.6448536269514722, 1.2040469600267016, 0.9208229763683788, 0.6971414348463417, 0.5039653672113453, 0.3280721075316511, 0.16184416680396213] # 0.95 24.53 22.97 - #v = [1.4050715603096329, 1.0803193408149558, 0.8416212335729143, 0.643345405392917, 0.4676987991145084, 0.3054807880993974, 0.1509692154967774] # 0.92 24.81 - #v = [1.2815515655446004, 1.0062699858608395, 0.7916386077433746, 0.6084981344998837, 0.4438613119262478, 0.29050677112339396, 0.14372923370582416] # 0.9 24.68 - #v = [1.8807936081512509, 1.2980047163986055, 0.9769954022693226, 0.7341502955472268, 0.5285136765472481, 0.343225833559403, 0.16910470304375366] # 0.97 25.03 - #v = [1.7506860712521692, 1.2496468758017434, 0.9485350408266378, 0.7155233557034365, 0.5162006366043174, 0.3356393360829622, 0.16547334454641704] # 0.96 24.85 23.01 - #v = [1.5547735945968535, 1.1608220210715001, 0.893800631179489, 0.6789921163940618, 0.4918050830048072, 0.3205236191093902, 0.15821711945563585] # 0.94 24.47 - #v = [1.475791028179171, 1.1196635980209986, 0.8674156943957149, 0.6610637542614526, 0.4797170937629045, 0.31299335020578195, 0.15459215234139795] # 0.93 24.85 - #v = [1.5981931399228175, 1.1821583959486879, 0.9072289939325966, 0.6880384454306778, 0.49787602226482025, 0.3242955535308664, 0.160030379970179] # 0.945 24.287 - ##v = [1.6164363711150211, 1.1908453913294612, 0.9126463450304729, 0.6916727602238111, 0.5003095327012462, 0.3258056171348078, 0.1607558311941979] # 0.947 24.293 - #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207 - #v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30 - #v = 
[1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293 - #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88 - - # 7B evo start - #v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06 - #v = [1.6143079205628337, 1.1888081407660314, 0.8990131955745421, 0.694373759813679, 0.5083033257326773, 0.3452499746844963, 0.1148939728228951] - #v = [1.614442766030303, 1.189401918639665, 0.8998038168964273, 0.6953094818279475, 0.5073264599048384, 0.3449003790823619, 0.11428378427205564] - - # 13B evo start - #v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 0.4949093928311768, 0.30920472033044544, 0.15391602735952042] - #v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283] - v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908] - - # mean evo 7B + 13B - #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237] - - # theoretically optiomal (0.93333) - #v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333 - - if seed > 0: - v = np.array(v) - np.random.seed(seed) - v += np.random.randn(7)*scale - print(v.tolist()) - #v[0] += (np.random.randn(1)*0.001)[0] - #v[-1] += (np.random.randn(1)*0.001)[0] - #print(v[0], v[-1]) - v = v.tolist() - values = v + [0]*(256-14) + \ - v[::-1] - - values = torch.Tensor(values) - values[0:7] *= -1 - values = values.sort().values - values /= values.max() - assert values.numel() == 256 - return values - def create_normal_map(offset=0.9677083, use_extra_value=True): if use_extra_value: diff --git a/tests/test_functional.py b/tests/test_functional.py index c2d4796..cc58324 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1773,21 +1773,24 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 2 -seqdim = 2048 +batch_size = 1 +seqdim = 1 values = [] -values.append((batch_size, seqdim, 768, 4 * 768)) +#values.append((batch_size, seqdim, 768, 4 * 768)) #values.append((batch_size, seqdim, 1024, 4*1024)) #values.append((batch_size, seqdim, 1536, 4*1536)) #values.append((batch_size, seqdim, 2048, 4*2048)) #values.append((batch_size, seqdim, 2560, 4*2560)) -#values.append((batch_size, seqdim, 4096, 4*4096)) +values.append((batch_size, seqdim, 4096, 4*4096)) +values.append((batch_size, seqdim, 5120, 4*5120)) +values.append((batch_size, seqdim, 6656, 4*6656)) +values.append((batch_size, seqdim, 8192, 4*8192)) #values.append((batch_size, seqdim, 5140, 4*5140)) #values.append((batch_size, seqdim, 12288, 4*12288)) names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) def test_bench_matmul(batch, seq, model, hidden): - iters = 1 + iters = 80 formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -1799,14 +1802,14 @@ def test_bench_matmul(batch, seq, model, hidden): B_nf4, state_nf4= F.quantize_nf4(B) - 
linear8bit = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() + linear8bit = bnb.nn.Linear8bitLt(model, hidden, False, False).cuda().half() linear8bit.eval() outliers = torch.randint(0, model, size=(5,)).cuda() A[:, :, outliers] = 8.0 - linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half()) - linearMixedBit.eval() + linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, False, threshold=6.0).cuda().half()) + #linearMixedBit.eval() linear8bit_train = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() linear8bit_train_thresh = bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half() @@ -1898,21 +1901,21 @@ def test_bench_matmul(batch, seq, model, hidden): #torch.cuda.synchronize() #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linear8bit(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): - # linear8bit(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + linear8bit(A) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + linear8bit(A) + torch.cuda.synchronize() + print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linearMixedBit(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): - # linearMixedBit(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + linearMixedBit(A) + torch.cuda.synchronize() + t0 = time.time() + for i in range(iters): + linearMixedBit(A) + torch.cuda.synchronize() + print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") #linear8bit_train(A) #torch.cuda.synchronize() From 1b8772a8f33fdb47df0c849302cbb7e703571b8c Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 23 May 2023 19:37:38 -0700 Subject: [PATCH 90/97] Added PagedLion and bf16 Lion. 
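Typical usage of the new optimizers (illustrative; `model` is a placeholder):

    import bitsandbytes as bnb

    optim = bnb.optim.PagedLion8bit(model.parameters(), lr=1e-4, betas=(0.9, 0.99))
    # Lion keeps a single momentum state (state1), so its 8-bit variant roughly
    # halves optimizer memory relative to Adam; paging additionally lets that
    # state be evicted to CPU under GPU memory pressure.

PagedLion, PagedLion8bit and PagedLion32bit are thin wrappers around Optimizer1State with is_paged=True; the bf16 support comes from the new LION bfloat16 kernel instantiations below.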
--- bitsandbytes/functional.py | 6 +-- bitsandbytes/optim/__init__.py | 2 +- bitsandbytes/optim/lion.py | 95 +++++++--------------------------- csrc/kernels.cu | 3 ++ csrc/ops.cu | 2 + csrc/pythonInterface.c | 12 +++-- tests/test_optim.py | 23 ++++---- 7 files changed, 46 insertions(+), 97 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index c0eb2de..afa346e 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -37,10 +37,7 @@ if COMPILED_WITH_CUDA: lib.crmsprop32bit_grad_32, lib.crmsprop32bit_grad_16, ) - str2optimizer32bit["lion"] = ( - lib.clion32bit_grad_32, - lib.clion32bit_grad_16, - ) + str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16) str2optimizer32bit["adagrad"] = ( lib.cadagrad32bit_grad_32, lib.cadagrad32bit_grad_16, @@ -89,6 +86,7 @@ if COMPILED_WITH_CUDA: str2optimizer8bit_blockwise["lion"] = ( lib.clion_8bit_blockwise_grad_fp32, lib.clion_8bit_blockwise_grad_fp16, + lib.clion_8bit_blockwise_grad_bf16, ) str2optimizer8bit_blockwise["adagrad"] = ( lib.cadagrad_8bit_blockwise_grad_fp32, diff --git a/bitsandbytes/optim/__init__.py b/bitsandbytes/optim/__init__.py index 1cfe241..83a57bd 100644 --- a/bitsandbytes/optim/__init__.py +++ b/bitsandbytes/optim/__init__.py @@ -12,5 +12,5 @@ from .lamb import LAMB, LAMB8bit, LAMB32bit from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS from .optimizer import GlobalOptimManager from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit -from .lion import Lion, Lion8bit, Lion32bit +from .lion import Lion, Lion8bit, Lion32bit, PagedLion, PagedLion8bit, PagedLion32bit from .sgd import SGD, SGD8bit, SGD32bit diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index 2551b68..2bde1a4 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -4,84 +4,27 @@ # LICENSE file in the root directory of this source tree. 
from bitsandbytes.optim.optimizer import Optimizer1State - class Lion(Optimizer1State): - def __init__( - self, - params, - lr=1e-4, - betas=(0.9, 0.99), - weight_decay=0, - optim_bits=32, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "lion", - params, - lr, - betas, - 0., - weight_decay, - optim_bits, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) - + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion8bit(Optimizer1State): - def __init__( - self, - params, - lr=1e-4, - betas=(0.9, 0.99), - weight_decay=0, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "lion", - params, - lr, - betas, - 0., - weight_decay, - 8, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) - + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion32bit(Optimizer1State): - def __init__( - self, - params, - lr=1e-4, - betas=(0.9, 0.99), - weight_decay=0, - args=None, - min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, - ): - super().__init__( - "lion", - params, - lr, - betas, - 0., - weight_decay, - 32, - args, - min_8bit_size, - percentile_clipping, - block_wise, - ) + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + + +class PagedLion(Optimizer1State): + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedLion8bit(Optimizer1State): + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + +class PagedLion32bit(Optimizer1State): + def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 30e5e2e..11ad63f 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3666,6 +3666,7 @@ MAKE_PreconditionOptimizer32bit1State(RMSPROP, half) MAKE_PreconditionOptimizer32bit1State(RMSPROP, float) MAKE_PreconditionOptimizer32bit1State(LION, half) MAKE_PreconditionOptimizer32bit1State(LION, float) +MAKE_PreconditionOptimizer32bit1State(LION, __nv_bfloat16) MAKE_PreconditionOptimizer32bit1State(ADAGRAD, half) 
MAKE_PreconditionOptimizer32bit1State(ADAGRAD, float) @@ -3679,6 +3680,7 @@ MAKE_Optimizer32bit1State(RMSPROP, half) MAKE_Optimizer32bit1State(RMSPROP, float) MAKE_Optimizer32bit1State(LION, half) MAKE_Optimizer32bit1State(LION, float) +MAKE_Optimizer32bit1State(LION, __nv_bfloat16) MAKE_Optimizer32bit1State(ADAGRAD, half) MAKE_Optimizer32bit1State(ADAGRAD, float) @@ -3852,5 +3854,6 @@ MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, float, 2048, 8) MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, half, 2048, 8) MAKE_OptimizerStatic8bit1StateBlockwise(LION, float, 2048, 8) MAKE_OptimizerStatic8bit1StateBlockwise(LION, half, 2048, 8) +MAKE_OptimizerStatic8bit1StateBlockwise(LION, __nv_bfloat16, 2048, 8) MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, float, 2048, 8) MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, half, 2048, 8) diff --git a/csrc/ops.cu b/csrc/ops.cu index 7f3a831..9c042fa 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -802,6 +802,7 @@ MAKE_optimizer32bit(RMSPROP, half) MAKE_optimizer32bit(RMSPROP, float) MAKE_optimizer32bit(LION, half) MAKE_optimizer32bit(LION, float) +MAKE_optimizer32bit(LION, __nv_bfloat16) MAKE_optimizer32bit(ADAGRAD, half) MAKE_optimizer32bit(ADAGRAD, float) @@ -837,6 +838,7 @@ MAKE_optimizerStatic8bitBlockwise(half, RMSPROP); MAKE_optimizerStatic8bitBlockwise(float, RMSPROP); MAKE_optimizerStatic8bitBlockwise(half, LION); MAKE_optimizerStatic8bitBlockwise(float, LION); +MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, LION); MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD); MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD); diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 776497b..23a0364 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -51,8 +51,9 @@ MAKE_FUNC32(adam, ADAM, half, fp16) MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16) MAKE_FUNC32(rmsprop, RMSPROP, float, 32) MAKE_FUNC32(rmsprop, RMSPROP, half, 16) -MAKE_FUNC32(lion, LION, float, 32) -MAKE_FUNC32(lion, LION, half, 16) +MAKE_FUNC32(lion, LION, float, fp32) +MAKE_FUNC32(lion, LION, half, fp16) +MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16) MAKE_FUNC32(adagrad, ADAGRAD, float, 32) MAKE_FUNC32(adagrad, ADAGRAD, half, 16) @@ -95,6 +96,7 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32) MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) MAKE_BLOCKWISE8(lion, LION, half, fp16) MAKE_BLOCKWISE8(lion, LION, float, fp32) +MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16) void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } @@ -201,8 +203,9 @@ extern "C" MAKE_CFUNC32(momentum, half, 16) MAKE_CFUNC32(rmsprop, float, 32) MAKE_CFUNC32(rmsprop, half, 16) - MAKE_CFUNC32(lion, float, 32) - MAKE_CFUNC32(lion, half, 16) + MAKE_CFUNC32(lion, float, fp32) + MAKE_CFUNC32(lion, half, fp16) + MAKE_CFUNC32(lion, __nv_bfloat16, bf16) MAKE_CFUNC32(adagrad, float, 32) MAKE_CFUNC32(adagrad, half, 16) @@ -245,6 +248,7 @@ extern "C" MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) MAKE_CBLOCKWISE8(lion, LION, half, fp16) MAKE_CBLOCKWISE8(lion, LION, float, fp32) + MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16) void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); } void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); } diff --git a/tests/test_optim.py b/tests/test_optim.py index 98e4289..9e90083 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -19,11 
+19,11 @@ import bitsandbytes.functional as F k = 20 def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0): - idx = torch.isclose(a, b, rtol, atol) + idx = torch.isclose(a, b, rtol=rtol, atol=atol) error_count = (idx == 0).sum().item() if error_count > max_error_count: print(f"Too many values not close: assert {error_count} < {max_error_count}") - torch.testing.assert_close(a, b, rtol, atol) + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) def get_temp_dir(): @@ -35,13 +35,8 @@ def get_temp_dir(): def rm_path(path): shutil.rmtree(path) -str2bf16support = {} -str2bf16support['adam8bit_blockwise'] = True - str2optimizers = {} str2optimizers["adam_pytorch"] = (None, torch.optim.Adam, bnb.optim.Adam) -# str2optimizers['adam_apex'] = (None, apex.optimizers.FusedAdam, bnb.optim.Adam) -# str2optimizers['momentum_apex'] = (None, lambda pxx: apex.optimizers.FusedSGD(pxx, 0.01, 0.9), bnb.optim.Adam) str2optimizers["lion_pytorch"] = (None, Lion, bnb.optim.Lion) str2optimizers["momentum_pytorch"] = ( None, @@ -51,8 +46,8 @@ str2optimizers["momentum_pytorch"] = ( str2optimizers["adam"] = (torch.optim.Adam, bnb.optim.Adam) str2optimizers["paged_adamw"] = (torch.optim.AdamW, bnb.optim.PagedAdamW) str2optimizers["paged_adam"] = (torch.optim.Adam, bnb.optim.PagedAdam) -# str2optimizers['fused_adam'] = (apex.optimizers.FusedAdam, bnb.optim.Adam) str2optimizers["lion"] = (Lion, bnb.optim.Lion) +str2optimizers["paged_lion"] = (Lion, bnb.optim.PagedLion) str2optimizers["momentum"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9, block_wise=False), @@ -76,6 +71,7 @@ str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim. str2optimizers["paged_adamw8bit_blockwise"] = (torch.optim.AdamW, lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True)) str2optimizers["paged_adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True)) str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx, block_wise=True)) +str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx, block_wise=True)) str2optimizers["momentum8bit_blockwise"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9, block_wise=True), @@ -90,6 +86,7 @@ str2statenames["adam"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["paged_adamw"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["paged_adam"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["lion"] = [("exp_avg", "state1")] +str2statenames["paged_lion"] = [("exp_avg", "state1")] str2statenames["momentum"] = [("momentum_buffer", "state1")] str2statenames["lamb"] = [("exp_avg", "state1"), ("exp_avg_sq", "state2")] str2statenames["rmsprop"] = [("square_avg", "state1")] @@ -104,15 +101,17 @@ str2statenames["momentum8bit_blockwise"] = [("momentum_buffer", "state1", "qmap1 str2statenames["rmsprop8bit"] = [("square_avg", "state1", "qmap1", "max1")] str2statenames["rmsprop8bit_blockwise"] = [("square_avg", "state1", "qmap1", "absmax1")] str2statenames["lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] +str2statenames["paged_lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] dim1 = [1024] dim2 = [32, 1024, 4097, 1] -gtype = [torch.float32, torch.float16] -optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion'] +gtype = [torch.float32, torch.float16, 
torch.bfloat16] +optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer32bit(dim1, dim2, gtype, optim_name): + if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip() if dim1 == 1 and dim2 == 1: return p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 @@ -254,7 +253,7 @@ names = [ @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) def test_optimizer8bit(dim1, dim2, gtype, optim_name): - if gtype == torch.bfloat16 and optim_name not in str2bf16support: return + if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip() if dim1 == 1 and dim2 == 1: return p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 @@ -485,7 +484,7 @@ gtype = [torch.float32, torch.float16] # optimizer_names = ['momentum_apex', 'momentum8bit', 'momentum_pytorch'] # optimizer_names = ['lamb_apex', 'lamb8bit'] # optimizer_names = ['lars_apex', 'lars8bit'] -optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise'] +optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise', 'paged_lion8bit_blockwise'] values = list(product(dim1, dim2, gtype, optimizer_names)) names = [ "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values From 0f40fa3f0a198802056e29ba183eaabc6751d565 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 23 May 2023 19:55:52 -0700 Subject: [PATCH 91/97] Bumped version. --- CHANGELOG.md | 11 +++++++++++ Makefile | 3 +-- setup.py | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2de70d3..eb7ac0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -228,3 +228,14 @@ Deprecated: Features: - Added Int8 SwitchBack layers - Added Fake FP8 layers for research purposes (available under `bnb.research.nn. ...`) + + +### 0.39.0 + + +Features: + - 4-bit matrix multiplication for Float4 and NormalFloat4 data types. + - Added 4-bit quantization routines + - Doubled quantization routines for 4-bit quantization + - Paged optimizers for Adam and Lion. + - bfloat16 gradient / weight support for Adam and Lion with 8 or 32-bit states. 
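A quick sketch of the 4-bit routines listed above (NF4 shown; FP4 is the analogous quantize_fp4/dequantize_fp4 pair; shapes are arbitrary):

    import torch
    import bitsandbytes.functional as F

    W = torch.randn(4096, 4096, device='cuda', dtype=torch.float16)
    W4, state = F.quantize_nf4(W)         # packed 4-bit data + quantization state
    W_hat = F.dequantize_nf4(W4, state)   # blockwise dequantization back to fp16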
diff --git a/Makefile b/Makefile index c113a3d..1f2b281 100644 --- a/Makefile +++ b/Makefile @@ -25,8 +25,7 @@ FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib +LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib # NVIDIA NVCC compilation flags COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell diff --git a/setup.py b/setup.py index 009fd3d..b683bfc 100644 --- a/setup.py +++ b/setup.py @@ -18,10 +18,10 @@ def read(fname): setup( name=f"bitsandbytes", - version=f"0.38.1", + version=f"0.39.0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", - description="8-bit optimizers and matrix multiplication routines.", + description="k-bit optimizers and matrix multiplication routines.", license="MIT", keywords="gpu optimizers optimization 8-bit quantization compression", url="https://github.com/TimDettmers/bitsandbytes", From ac5550a0238286377ee3f58a85aeba1c40493e17 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 30 May 2023 19:06:59 -0700 Subject: [PATCH 92/97] Added changes for deployment. --- Makefile | 1 - csrc/kernels.cu | 10 +++++++--- deploy.sh | 11 ----------- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 1f2b281..5fa1f17 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,6 @@ COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta -COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 11ad63f..ab12c37 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -16,15 +16,12 @@ #include #include -#include -#include #define HLF_MAX 65504 #define TH 1024 #define NUM 4 #define NUM_BLOCK 4096 -using namespace nvcuda; // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda __device__ float atomicMax(float* address, float val) { @@ -3094,6 +3091,9 @@ template __device__ inline void vector_l #define WARPS 5 template __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc) { + +#if __CUDA_ARCH__ >= 750 + using namespace nvcuda; int col_offset = blockIdx.x *32; const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; @@ -3294,11 +3294,14 @@ template __global__ void gemm_device(int M, if(col_offset + warp_lane < M) out[col_offset + warp_lane] = smem_A[warp_lane]; +#endif } template __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) { +#if __CUDA_ARCH__ >= 750 + using namespace nvcuda; int col_offset = blockIdx.x *32; const int warp_id = threadIdx.x / 32; const int half_warp_id = threadIdx.x / 16; @@ -3459,6 +3462,7 @@ template __global__ void 
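The gemm kernels below are now guarded with __CUDA_ARCH__ >= 750, so the binaries still build for pre-Turing targets where the wmma tensor-core path does not exist. A Python-side check in the same spirit (illustrative, not library code):

    import torch

    major, minor = torch.cuda.get_device_capability()
    has_tensor_core_path = (major, minor) >= (7, 5)  # Turing (sm_75) or newer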
kgemm_4bit_inference(int M, i if(col_offset + warp_lane < M) out[col_offset + warp_lane] = smem_A[warp_lane]; +#endif } //#define ROWS 2 diff --git a/deploy.sh b/deploy.sh index 24d6cbf..a2257a2 100644 --- a/deploy.sh +++ b/deploy.sh @@ -139,17 +139,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then fi -make clean -export CUDA_HOME=$BASE_PATH/cuda-10.2 -make cuda10x_nomatmul CUDA_VERSION=102 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda102_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - - make clean export CUDA_HOME=$BASE_PATH/cuda-11.0 make cuda110_nomatmul CUDA_VERSION=110 From 4fb37d45c1b4ed0b250b2ecfa7b5b41ecda9fbbb Mon Sep 17 00:00:00 2001 From: Max Ryabinin Date: Fri, 9 Jun 2023 21:39:37 +0200 Subject: [PATCH 93/97] Extract get_tile_inds to a separate function --- bitsandbytes/autograd/_functions.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 63b7156..c2298c8 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -232,6 +232,19 @@ def supports_igemmlt(device: torch.device) -> bool: return True +def _get_tile_size(format): + assert format in ( + "col_turing", + "col_ampere", + ), f"please find this assert and manually enter tile size for {format}" + return (8, 32) if format == "col_turing" else (32, 32) + + +def get_tile_inds(format, device): + transform = lambda x: F.transform(x.to(device), from_order="row", to_order=format)[0].to(x.device) + with torch.no_grad(): + return get_inverse_transform_indices(transform, _get_tile_size(format)).to(device) + @dataclass class MatmulLtState: _tile_indices: Optional[torch.Tensor] = None @@ -267,20 +280,10 @@ class MatmulLtState: self.SBt = None self.CBt = None - def get_tile_size(self): - assert self.formatB in ( - "col_turing", - "col_ampere", - ), f"please find this assert and manually enter tile size for {self.formatB}" - return (8, 32) if self.formatB == "col_turing" else (32, 32) - @property def tile_indices(self): if self._tile_indices is None: - device = self.CxB.device - transform = lambda x: F.transform(x.to(device), from_order="row", to_order=self.formatB)[0].to(x.device) - with torch.no_grad(): - self._tile_indices = get_inverse_transform_indices(transform, self.get_tile_size()).to(device) + self._tile_indices = get_tile_inds(self.formatB, self.CxB.device) return self._tile_indices From f734076e94787a875eb9a5621c4d3d33b0bc4c4c Mon Sep 17 00:00:00 2001 From: Max Ryabinin Date: Fri, 9 Jun 2023 21:39:57 +0200 Subject: [PATCH 94/97] Improve memory efficiency of 8-bit serialization --- bitsandbytes/nn/modules.py | 59 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 3284921..101c988 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -10,7 +10,7 @@ from torch import Tensor, device, dtype, nn import bitsandbytes as bnb import bitsandbytes.functional -from bitsandbytes.autograd._functions import get_inverse_transform_indices, undo_layout +from bitsandbytes.autograd._functions import undo_layout, get_tile_inds from bitsandbytes.optim import GlobalOptimManager from bitsandbytes.utils import OutlierTracer, find_outlier_dims @@ -306,7 +306,6 @@ class Int8Params(torch.nn.Parameter): return new_param - class Linear8bitLt(nn.Linear): def 
__init__(self, input_features, output_features, bias=True, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0, index=None): @@ -324,50 +323,58 @@ class Linear8bitLt(nn.Linear): self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights) def _save_to_state_dict(self, destination, prefix, keep_vars): - if not self.state.has_fp16_weights and self.state.CB is None and self.state.CxB is not None: - # reorder weight layout back from ampere/turing to row - reorder_layout = True - weight_clone = self.weight.data.clone() - else: - reorder_layout = False + super()._save_to_state_dict(destination, prefix, keep_vars) - try: - if reorder_layout: - self.weight.data = undo_layout(self.state.CxB, self.state.tile_indices) + # we only need to save SCB as extra data, because CB for quantized weights is already stored in weight.data + scb_name = "SCB" - super()._save_to_state_dict(destination, prefix, keep_vars) + # case 1: .cuda was called, SCB is in self.weight + param_from_weight = getattr(self.weight, scb_name) + # case 2: self.init_8bit_state was called, SCB is in self.state + param_from_state = getattr(self.state, scb_name) + # case 3: SCB is in self.state, weight layout reordered after first forward() + layout_reordered = self.state.CxB is not None - # we only need to save SCB as extra data, because CB for quantized weights is already stored in weight.data - weight_name = "SCB" + key_name = prefix + f"{scb_name}" + format_name = prefix + "weight_format" - # case 1: .cuda was called, SCB is in self.weight - param_from_weight = getattr(self.weight, weight_name) - # case 2: self.init_8bit_state was called, SCB is in self.state - param_from_state = getattr(self.state, weight_name) - - key_name = prefix + f"{weight_name}" + if not self.state.has_fp16_weights: if param_from_weight is not None: destination[key_name] = param_from_weight if keep_vars else param_from_weight.detach() - elif not self.state.has_fp16_weights and param_from_state is not None: + destination[format_name] = "row" + elif param_from_state is not None and not layout_reordered: destination[key_name] = param_from_state if keep_vars else param_from_state.detach() - finally: - if reorder_layout: - self.weight.data = weight_clone + destination[format_name] = "row" + elif param_from_state is not None: + destination[key_name] = param_from_state if keep_vars else param_from_state.detach() + destination[format_name] = self.state.formatB def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - for key in unexpected_keys: + unexpected_copy = list(unexpected_keys) + + for key in unexpected_copy: input_name = key[len(prefix):] if input_name == "SCB": if self.weight.SCB is None: - # buffers not yet initialized, can't call them directly without + # buffers not yet initialized, can't access them directly without quantizing first raise RuntimeError("Loading a quantized checkpoint into non-quantized Linear8bitLt is " "not supported. 
Please call module.cuda() before module.load_state_dict()") input_param = state_dict[key] self.weight.SCB.copy_(input_param) + + if self.state.SCB is not None: + self.state.SCB = self.weight.SCB + + unexpected_keys.remove(key) + if input_name == "weight_format": + input_param = state_dict[key] + if input_param != "row": + tile_indices = get_tile_inds(input_param, self.weight.device) + self.weight.data = self.weight.CB = undo_layout(self.weight.data, tile_indices) unexpected_keys.remove(key) def init_8bit_state(self): From c1f3f56d2cc18c929dc9b257a24603d26657b0b7 Mon Sep 17 00:00:00 2001 From: Max Ryabinin Date: Fri, 9 Jun 2023 21:58:39 +0200 Subject: [PATCH 95/97] Rearrange the weights directly in state dict before loading --- bitsandbytes/nn/modules.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 101c988..b806e94 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -306,6 +306,15 @@ class Int8Params(torch.nn.Parameter): return new_param +def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): + weight = state_dict[f"{prefix}weight"] + weight_format = state_dict.pop(f"{prefix}weight_format", "row") + + if weight_format != "row": + tile_indices = get_tile_inds(weight_format, weight.device) + state_dict[f"{prefix}weight"] = undo_layout(weight, tile_indices) + + class Linear8bitLt(nn.Linear): def __init__(self, input_features, output_features, bias=True, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0, index=None): @@ -321,6 +330,7 @@ class Linear8bitLt(nn.Linear): self.state.use_pool = True self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights) + self._register_load_state_dict_pre_hook(maybe_rearrange_weight) def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) @@ -370,12 +380,6 @@ class Linear8bitLt(nn.Linear): self.state.SCB = self.weight.SCB unexpected_keys.remove(key) - if input_name == "weight_format": - input_param = state_dict[key] - if input_param != "row": - tile_indices = get_tile_inds(input_param, self.weight.device) - self.weight.data = self.weight.CB = undo_layout(self.weight.data, tile_indices) - unexpected_keys.remove(key) def init_8bit_state(self): self.state.CB = self.weight.CB From b599fdb197c6b80d3be334899b9e9620492656b9 Mon Sep 17 00:00:00 2001 From: Max Ryabinin Date: Wed, 14 Jun 2023 19:27:13 +0200 Subject: [PATCH 96/97] Only rearrange weight if it exists --- bitsandbytes/nn/modules.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index b806e94..b10d45a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -307,7 +307,10 @@ class Int8Params(torch.nn.Parameter): def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - weight = state_dict[f"{prefix}weight"] + weight = state_dict.get(f"{prefix}weight") + if weight is None: + # if the state dict has no weights for this layer (e.g., LoRA finetuning), do nothing + return weight_format = state_dict.pop(f"{prefix}weight_format", "row") if weight_format != "row": From 4395d68cf6beda911c1c3e5633debf15652e2902 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Mon, 19 Jun 2023 19:40:41 -0700 Subject: [PATCH 97/97] Release 0.39.1. 
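This release also ships the memory-efficient 8-bit serialization above. Note the loading order it requires (illustrative; the checkpoint path is hypothetical):

    import torch
    import bitsandbytes as bnb

    layer = bnb.nn.Linear8bitLt(4096, 4096, bias=False, has_fp16_weights=False)
    layer.cuda()  # quantizes the weight and creates the SCB buffer first
    layer.load_state_dict(torch.load('linear8bit.pt'))

Loading a quantized checkpoint into a module that has not been moved to the GPU raises the RuntimeError shown above.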
---
 CHANGELOG.md           |  8 ++++++++
 Makefile               | 15 +++++++++++++--
 compile_from_source.md |  5 +++++
 setup.py               |  2 +-
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb7ac0d..7c75b24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -239,3 +239,11 @@ Features:
   - Doubled quantization routines for 4-bit quantization
   - Paged optimizers for Adam and Lion.
   - bfloat16 gradient / weight support for Adam and Lion with 8 or 32-bit states.
+
+Bug fixes:
+ - Fixed a bug where 8-bit models consumed twice the memory as expected after serialization
+
+Deprecated:
+ - Kepler binaries (GTX 700s and Tesla K40/K80) are no longer provided via pip and need to be compiled from source. Kepler support might be fully removed in the future.
+
+
diff --git a/Makefile b/Makefile
index 5fa1f17..19b5b91 100644
--- a/Makefile
+++ b/Makefile
@@ -59,16 +59,27 @@ all: $(BUILD_DIR) env
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

-cuda110_nomatmul: $(BUILD_DIR) env
+cuda110_nomatmul_kepler: $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

-cuda11x_nomatmul: $(BUILD_DIR) env
+cuda11x_nomatmul_kepler: $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
+
+cuda110_nomatmul: $(BUILD_DIR) env
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
+
+cuda11x_nomatmul: $(BUILD_DIR) env
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o
./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + cuda12x_nomatmul: $(BUILD_DIR) env $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o diff --git a/compile_from_source.md b/compile_from_source.md index 9d4f89d..f5de4db 100644 --- a/compile_from_source.md +++ b/compile_from_source.md @@ -33,3 +33,8 @@ You can set `CUDA_HOME` to `/usr/local/cuda-11.7`. For example, you might be abl If you have problems compiling the library with these instructions from source, please open an issue. + +## Compilation with Kepler + +Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler` + diff --git a/setup.py b/setup.py index b683bfc..51e747c 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def read(fname): setup( name=f"bitsandbytes", - version=f"0.39.0", + version=f"0.39.1", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.",
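For reference, a round trip through the new serialization path of patches 94-96 (illustrative; shapes and the file name are arbitrary, and the layout reorder only happens on igemmlt-capable GPUs):

    import torch
    import bitsandbytes as bnb

    m = bnb.nn.Linear8bitLt(1024, 1024, bias=False, has_fp16_weights=False).cuda()
    m(torch.randn(8, 1024, device='cuda', dtype=torch.float16))  # first forward reorders the weight layout
    sd = m.state_dict()  # int8 'weight' plus 'SCB' and a 'weight_format' tag; no fp16 copy is materialized
    torch.save(sd, 'ckpt.pt')

    m2 = bnb.nn.Linear8bitLt(1024, 1024, bias=False, has_fp16_weights=False).cuda()
    m2.load_state_dict(torch.load('ckpt.pt'))  # the pre-hook undoes the tile layout if needed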