MILLION
KV cache acceleration based on product quantization
The official repository is worth studying for its programming wisdom (ABC (abstract base class), AOP, ...) and for its plug-and-play integration (context manager and injector) into the Llama inference framework.
Plug-and-play integration
Context manager
Context manager usage in main_pq.py
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from .models.ModelContext import get_context

config.model_config = AutoConfig.from_pretrained(config.model_path)
# pick the patch bundle that matches the model family
config.context = get_context(config.model_config.model_type)
......
with config.context.init_context:
    model = AutoModelForCausalLM.from_pretrained(config.model_path).to(config.device)
    tokenizer = AutoTokenizer.from_pretrained(config.model_path)
    if config.half:
        model = model.half()

# ================== baseline ==================
if "baseline" in config.pipeline:
    tprint("Baseline")
    from ..benchmarks import dataset2benchmark
    benchmark = dataset2benchmark[config.dataset]
    # run the benchmark with the baseline patches active
    with config.context.baseline_context:
        score_baseline = benchmark(model, tokenizer, **(config.to_dict()))
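What makes this plug-and-play is the dispatch inside get_context: each supported model family registers a context object whose attributes (init_context, baseline_context, ...) are context managers that apply and later undo the needed patches. A minimal sketch of that registry, with hypothetical names (ModelContext's fields and register_context are illustrative, not the repo's actual code):

import contextlib

class ModelContext:
    # one patch bundle per model family; each attribute is a context manager
    def __init__(self, init_context, baseline_context):
        self.init_context = init_context
        self.baseline_context = baseline_context

_REGISTRY = {}

def register_context(model_type, context):
    _REGISTRY[model_type] = context

def get_context(model_type):
    # main_pq.py looks the bundle up by config.model_config.model_type
    return _REGISTRY[model_type]

# a trivial registration: the baseline runs the model unmodified
register_context("llama", ModelContext(
    init_context=contextlib.nullcontext(),
    baseline_context=contextlib.nullcontext(),
))

In the repository the contexts hold real patches, and several of them are grouped with the ContextList helper below so they can be entered as one: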
class ContextList(list):
    # a list of context managers that is itself a context manager
    def __enter__(self):
        # enter every contained context, then return the list itself
        for item in self:
            item.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # exit every contained context; do not suppress exceptions
        for item in self:
            item.__exit__(exc_type, exc_value, traceback)
        return False
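Because ContextList is both a list and a context manager, several patches can be composed and entered with a single with-statement. A small usage sketch (announce is an illustrative stand-in for a real patch, not repo code):

from contextlib import contextmanager

@contextmanager
def announce(name):
    # stand-in for a real patch: report when it is applied and undone
    print(f"apply {name}")
    try:
        yield
    finally:
        print(f"undo {name}")

pq_context = ContextList([announce("quantize kv cache"), announce("patch attention")])
with pq_context:
    print("inference runs with every patch active")
# leaving the block undoes all the patches in order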
Method injection (forward function)
@ ./scripts/utils/Injector.py
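The injector side of the integration swaps a module's forward method at runtime and restores it afterwards, which is the AOP-style idea behind ./scripts/utils/Injector.py. A minimal sketch of that pattern, assuming nothing about the repo's actual Injector interface (inject_forward and traced_forward are hypothetical names):

import types
from contextlib import contextmanager

import torch
import torch.nn as nn

@contextmanager
def inject_forward(module, new_forward):
    # temporarily bind new_forward as module.forward, restoring it on exit
    original = module.forward
    module.forward = types.MethodType(new_forward, module)
    try:
        yield module
    finally:
        module.forward = original

def traced_forward(self, x):
    # wrapper that delegates to the class's original implementation
    print(f"injected forward on {type(self).__name__}")
    return nn.Linear.forward(self, x)

linear = nn.Linear(4, 4)
with inject_forward(linear, traced_forward):
    linear(torch.randn(2, 4))   # uses the injected forward
linear(torch.randn(2, 4))       # original forward again

Setting the instance attribute works because nn.Module.__call__ resolves self.forward through the instance dict first, so the patch shadows the class method only for that one module and only inside the with-block.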