
Large Language Models

Run an LLM on the NPU

You can run your existing LLM inference script on the NPU with a single line of code:

# First import the library
import intel_npu_acceleration_library

# Call the compile function to offload kernels to the NPU.
model = intel_npu_acceleration_library.compile(model)
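
Depending on the library version, compile can also take a dtype argument to quantize the model weights, which typically reduces the memory footprint on the NPU. The dtype parameter below is an assumption; check that your installed version of intel_npu_acceleration_library supports it:

import intel_npu_acceleration_library
import torch

# Assumption: compile() accepts a dtype argument in your installed version;
# int8 weights trade a little accuracy for a smaller memory footprint on the NPU.
model = intel_npu_acceleration_library.compile(model, dtype=torch.int8)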

Here is a full example:

from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import intel_npu_acceleration_library
import torch

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
streamer = TextStreamer(tokenizer, skip_special_tokens=True)


print("Compile model for the NPU")
model = intel_npu_acceleration_library.compile(model)

query = "What is the meaning of life?"
prefix = tokenizer(query, return_tensors="pt")["input_ids"]


# Sampling configuration passed to model.generate
generation_kwargs = dict(
    input_ids=prefix,
    streamer=streamer,
    do_sample=True,
    top_k=50,
    top_p=0.9,
)

print("Run inference")
_ = model.generate(**generation_kwargs)
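
If you need the generated text as a string instead of printing it, a common transformers pattern is to run generate in a background thread and consume tokens from a TextIteratorStreamer. This is a minimal sketch of that pattern, independent of the NPU offload, reusing the model, tokenizer, and prefix defined above:

from threading import Thread
from transformers import TextIteratorStreamer

# Streamer that yields decoded text chunks instead of writing to stdout
iter_streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# Run generation in a background thread so the main thread can consume tokens
thread = Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=prefix,
        streamer=iter_streamer,
        do_sample=True,
        top_k=50,
        top_p=0.9,
    ),
)
thread.start()

generated_text = "".join(chunk for chunk in iter_streamer)
thread.join()
print(generated_text)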