bitsandbytes-rocm/tests/test_generation.py

import pytest
import torch
import math

from itertools import product

import transformers
from transformers import (
  AutoConfig,
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig,
  GenerationConfig,
  set_seed,

)


def get_4bit_config():
  return BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
  )


def get_model_and_tokenizer(config):
    model_name_or_path, quant_type = config
    bnb_config = get_4bit_config()
    if quant_type == '16bit':
        bnb_config.load_in_4bit = False
    else:
        bnb_config.bnb_4bit_quant_type= quant_type
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
        quantization_config=bnb_config,
        max_memory={0:'48GB'},
        device_map='auto',
        torch_dtype=torch.bfloat16
        ).eval()

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)

    return model, tokenizer

def get_prompt_for_generation_eval(text, add_roles=True):
    description = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    )
    if add_roles:
        prompt = f'{description} ### Human: {text} ### Assistant:'
    else:
        prompt = f'{description} {text}'
    return prompt

def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_for_generation_eval):
    text = prompt_func(text)
    inputs = tokenizer(text, return_tensors="pt").to('cuda:0')
    outputs = model.generate(inputs=inputs['input_ids'], generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7']
dtypes = ['nf4', 'fp4']
load_in_4bit = [True, False]
values = list(product(models, dtypes))
strfunc = lambda lst: [str(x) for x in lst]
ids = ['_'.join(strfunc(x)) for x in values]
@pytest.fixture(scope='session', params=values, ids=ids)
def model_and_tokenizer(request):
    model, tokenizer = get_model_and_tokenizer(request.param)
    yield request.param, model, tokenizer
    del model

@pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False'])
@pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False'])
#@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
def test_pi(model_and_tokenizer, inference_kernel, DQ):
    print('')
    dtype = torch.float16

    fixture_config, model, tokenizer = model_and_tokenizer

    generation_config = transformers.GenerationConfig(
        max_new_tokens=20,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    generation_config.max_new_tokens = 20


    #text = 'Please write down the first 50 digits of pi.'
    #text = get_prompt_for_generation_eval(text)
    #text += ' Sure, here the first 50 digits of pi: 3.14159'
    n_cases = 6
    text = '3.14159'
    if hasattr(model.config, 'quantization_config'):
        model.config.quantization_config.bnb_4bit_compute_dtype = dtype
        model.config.quantization_config.bnb_4bit_use_double_quant = DQ

    if not inference_kernel:
        text = [text]*n_cases
    inputs = tokenizer(text, return_tensors="pt").to('cuda:0')
    x = inputs['input_ids']
    outputs = []
    if inference_kernel:
        for i in range(n_cases):
            output = model.generate(x, generation_config=generation_config)
            textout = tokenizer.decode(output[0], skip_special_tokens=True)
            outputs.append(textout)
    else:
        outputs = model.generate(x, generation_config=generation_config)
        outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


    assert len(outputs) == n_cases
    failure_count = 0
    for i in range(n_cases):
        if not outputs[i][:len(str(math.pi))] == str(math.pi):
            failure_count += 1
    failure_max = (2 if fixture_config[0] == 'huggyllama/llama-7b' else 4)
    if failure_count > failure_max:
        print(math.pi)
        for out in outputs:
            print(out)
        raise ValueError(f'Failure count: {failure_count}/{n_cases}')
Added generation tests. 2023-07-10 19:19:16 +00:00			`import pytest`
			`import torch`
			`import math`

Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`from itertools import product`

			`import transformers`
Added generation tests. 2023-07-10 19:19:16 +00:00			`from transformers import (`
			`AutoConfig,`
			`AutoModelForCausalLM,`
			`AutoTokenizer,`
			`BitsAndBytesConfig,`
			`GenerationConfig,`
			`set_seed,`

			`)`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00
Added generation tests. 2023-07-10 19:19:16 +00:00

			`def get_4bit_config():`
			`return BitsAndBytesConfig(`
			`load_in_4bit=True,`
			`load_in_8bit=False,`
			`llm_int8_threshold=6.0,`
			`llm_int8_has_fp16_weight=False,`
			`bnb_4bit_compute_dtype=torch.float16,`
			`bnb_4bit_use_double_quant=True,`
			`bnb_4bit_quant_type='nf4',`
			`)`


Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`def get_model_and_tokenizer(config):`
			`model_name_or_path, quant_type = config`
			`bnb_config = get_4bit_config()`
			`if quant_type == '16bit':`
			`bnb_config.load_in_4bit = False`
			`else:`
			`bnb_config.bnb_4bit_quant_type= quant_type`
			`model = AutoModelForCausalLM.from_pretrained(model_name_or_path,`
			`quantization_config=bnb_config,`
			`max_memory={0:'48GB'},`
			`device_map='auto',`
			`torch_dtype=torch.bfloat16`
			`).eval()`
Added generation tests. 2023-07-10 19:19:16 +00:00
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)`

			`return model, tokenizer`
Added generation tests. 2023-07-10 19:19:16 +00:00
			`def get_prompt_for_generation_eval(text, add_roles=True):`
			`description = (`
			`"A chat between a curious human and an artificial intelligence assistant. "`
			`"The assistant gives helpful, detailed, and polite answers to the user's questions."`
			`)`
			`if add_roles:`
			`prompt = f'{description} ### Human: {text} ### Assistant:'`
			`else:`
			`prompt = f'{description} {text}'`
			`return prompt`

			`def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_for_generation_eval):`
			`text = prompt_func(text)`
			`inputs = tokenizer(text, return_tensors="pt").to('cuda:0')`
			`outputs = model.generate(inputs=inputs['input_ids'], generation_config=generation_config)`
			`return tokenizer.decode(outputs[0], skip_special_tokens=True)`

Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7']`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`dtypes = ['nf4', 'fp4']`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`load_in_4bit = [True, False]`
			`values = list(product(models, dtypes))`
			`strfunc = lambda lst: [str(x) for x in lst]`
			`ids = ['_'.join(strfunc(x)) for x in values]`
			`@pytest.fixture(scope='session', params=values, ids=ids)`
			`def model_and_tokenizer(request):`
			`model, tokenizer = get_model_and_tokenizer(request.param)`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`yield request.param, model, tokenizer`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`del model`

Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`@pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False'])`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`@pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False'])`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`#@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])`
			`def test_pi(model_and_tokenizer, inference_kernel, DQ):`
			`print('')`
			`dtype = torch.float16`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`fixture_config, model, tokenizer = model_and_tokenizer`
Added generation tests. 2023-07-10 19:19:16 +00:00
			`generation_config = transformers.GenerationConfig(`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`max_new_tokens=20,`
Added generation tests. 2023-07-10 19:19:16 +00:00			`do_sample=True,`
			`top_p=0.9,`
			`temperature=0.7,`
			`)`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`generation_config.max_new_tokens = 20`
Added generation tests. 2023-07-10 19:19:16 +00:00

			`#text = 'Please write down the first 50 digits of pi.'`
			`#text = get_prompt_for_generation_eval(text)`
			`#text += ' Sure, here the first 50 digits of pi: 3.14159'`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`n_cases = 6`
Added generation tests. 2023-07-10 19:19:16 +00:00			`text = '3.14159'`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`if hasattr(model.config, 'quantization_config'):`
			`model.config.quantization_config.bnb_4bit_compute_dtype = dtype`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`model.config.quantization_config.bnb_4bit_use_double_quant = DQ`
Added generation tests. 2023-07-10 19:19:16 +00:00
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`if not inference_kernel:`
			`text = [text]*n_cases`
Added generation tests. 2023-07-10 19:19:16 +00:00			`inputs = tokenizer(text, return_tensors="pt").to('cuda:0')`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`x = inputs['input_ids']`
			`outputs = []`
			`if inference_kernel:`
			`for i in range(n_cases):`
			`output = model.generate(x, generation_config=generation_config)`
			`textout = tokenizer.decode(output[0], skip_special_tokens=True)`
			`outputs.append(textout)`
			`else:`
			`outputs = model.generate(x, generation_config=generation_config)`
			`outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]`


			`assert len(outputs) == n_cases`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`failure_count = 0`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`for i in range(n_cases):`
			`if not outputs[i][:len(str(math.pi))] == str(math.pi):`
			`failure_count += 1`
Fixed missing bias in bnb.matmul_4bit for inference; more tests. 2023-07-12 00:13:33 +00:00			`failure_max = (2 if fixture_config[0] == 'huggyllama/llama-7b' else 4)`
			`if failure_count > failure_max:`
Test for bloom that fails with inference kernels. 2023-07-11 22:40:20 +00:00			`print(math.pi)`
			`for out in outputs:`
			`print(out)`
			`raise ValueError(f'Failure count: {failure_count}/{n_cases}')`
Added generation tests. 2023-07-10 19:19:16 +00:00