# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
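"""Correctness check for self-speculative decoding.

Runs each evaluation prompt through both a self-speculative generator and a
full-model autoregressive generator, then counts and reports how often the
two decoded outputs disagree.
"""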
import datetime
import json
import logging
import os
import random

import torch
import transformers
from tqdm import tqdm

from arguments import BenchmarkArguments, process_cli_arguments
from data import get_data
from self_speculation.autoregressive_generator import AutoRegressiveGenerationStrategy
from self_speculation.generator_base import (
    GenerationConfig,
    GenerationResult,
    HuggingfaceLlamaGenerator,
)
from self_speculation.self_speculation_generator import (
    SelfSpeculativeGenerationStrategy,
)

log = logging.getLogger(__name__)

def main(
    benchmark_arguments: BenchmarkArguments,
    generation_config: GenerationConfig,
    output_fname: str,
):
    torch.distributed.init_process_group(
        backend="cpu:gloo,cuda:nccl", timeout=datetime.timedelta(hours=48)
    )
    rank = int(os.environ["LOCAL_RANK"])
    random.seed(benchmark_arguments.seed)
    torch.manual_seed(benchmark_arguments.seed)
    if rank != 0:
        # Only run on rank 0; parallel inference is not supported yet.
        return
    local_model_path: str = benchmark_arguments.model

    # Initialize the tokenizer and model; weights are loaded in fp16.
    tokenizer = transformers.LlamaTokenizer.from_pretrained(
        local_model_path, use_fast=False
    )
    config = transformers.LlamaConfig.from_pretrained(local_model_path)
    model = transformers.LlamaForCausalLM.from_pretrained(
        local_model_path,
        config=config,
        torch_dtype=torch.float16,
    )
    model.cuda()
    model.half()  # redundant safeguard: weights are already fp16 via torch_dtype
    model.eval()
    # Initialize generators: both wrap the same model and tokenizer and differ
    # only in decoding strategy (self-speculative vs. plain autoregressive).
    spec_generator = HuggingfaceLlamaGenerator(
        tokenizer=tokenizer,
        model=model,
        generation_strategy=SelfSpeculativeGenerationStrategy(),
    )
    ar_generator = HuggingfaceLlamaGenerator(
        tokenizer=tokenizer,
        model=model,
        generation_strategy=AutoRegressiveGenerationStrategy(),
    )
    evaluation_set = get_data(
        random_shuffle=benchmark_arguments.random_shuffle,
        num_samples=benchmark_arguments.num_samples,
        dataset=benchmark_arguments.dataset,
        data_path=benchmark_arguments.data_path,
    )
    errors: int = 0
    for example in tqdm(evaluation_set):
        spec_response: GenerationResult = spec_generator.generate(
            prompt=example.input,
            generation_config=generation_config,
        )
        ar_response: GenerationResult = ar_generator.generate(
            prompt=example.input,
            # generation config to use the full model (no early exit, no
            # speculation)
            generation_config=GenerationConfig(
                max_steps=generation_config.max_steps,
                exit_layer=-1,
                num_speculations=-1,
                generation_strategy="autoregressive",
            ),
        )
        # An "error" here means the self-speculative output diverges from the
        # full-model autoregressive output for the same prompt.
        if spec_response.decoded_prediction != ar_response.decoded_prediction:
            errors += 1
            log.info("Error found")
            log.info(f"Spec response: {spec_response}")
            log.info(f"AR response: {ar_response}")

    # Despite the name, "error_pct" is a fraction in [0, 1], not a percentage.
    metric_result = {"errors": errors, "error_pct": errors / len(evaluation_set)}
    print(metric_result)
    with open(output_fname, "w") as f:
        json.dump(metric_result, f)

if __name__ == "__main__":
    args = process_cli_arguments()
    output_fname = (
        f"{args.benchmark_arguments.output_dir}/"
        f"correctness_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    )
    main(args.benchmark_arguments, args.generation_config, output_fname)
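
# Example invocation (a sketch, not taken from this repo's docs): the script
# initializes torch.distributed and reads LOCAL_RANK, so it is expected to be
# launched via torchrun. The flag names below are assumptions derived from the
# BenchmarkArguments fields; the real flags are defined in arguments.py.
#
#   torchrun --nproc_per_node=1 correctness.py \
#       --model /path/to/llama-checkpoint \
#       --dataset <dataset-name> \
#       --num_samples 100 \
#       --output_dir ./out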