"""INT8 (LLM.int8()) inference example with HuggingFace Transformers.

Loads a causal language model quantized to 8-bit via bitsandbytes and
generates a short completion for a hard-coded prompt.

Requires at least one CUDA GPU; `device_map='auto'` lets accelerate shard
the model across all visible GPUs, bounded by a per-GPU memory budget.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MAX_NEW_TOKENS = 128
model_name = 'decapoda-research/llama-7b-hf'

text = 'Hamburg is in which country?\n'
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Budget per GPU: free memory minus ~2 GB of headroom for activations and
# CUDA overhead. (Reuse free_in_GB instead of querying mem_get_info twice.)
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
max_memory = f'{free_in_GB - 2}GB'

# Same budget on every visible GPU; accelerate uses this to place shards.
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',     # shard across GPUs automatically
    load_in_8bit=True,     # bitsandbytes LLM.int8() quantization
    max_memory=max_memory,
)
# Use max_new_tokens (tokens to generate) rather than max_length, which
# also counts the prompt tokens and would silently truncate the answer.
generated_ids = model.generate(input_ids, max_new_tokens=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))