# definition_trees.py
# To use this, you'll first need to have loaded EleutherAI/gpt-j-6B (or another model) as 'GPTmodel',
# along with its tokenizer as 'tokenizer' and its input embeddings tensor as 'embeddings'.
# You'll also need to specify 'save_directory' below.
import os
import json
import torch
from mutant_prompting import mutant_prompting
from token_utils import token_setup


class CustomEmbedding(torch.nn.Module):
    """Wraps an embedding layer so that individual token embeddings can be overridden."""

    def __init__(self, original_embedding):
        super().__init__()
        self.original_embedding = original_embedding  # keep a reference to the model's original embedding layer
        self.modifications = {}  # maps token_id -> replacement embedding vector

    def add_modification(self, token_id, new_embedding):
        self.modifications[token_id] = new_embedding

    def remove_modifications(self):
        self.modifications.clear()  # this empties the dictionary of modifications

    def forward(self, input_ids=None):
        # Get the original embeddings
        original_embeddings = self.original_embedding(input_ids)
        # Apply any modifications: wherever input_ids matches a modified
        # token id, substitute the replacement embedding
        for token_id, new_embedding in self.modifications.items():
            mask = (input_ids == token_id)
            original_embeddings[mask] = new_embedding
        return original_embeddings
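
# A minimal usage sketch (illustrative only; 'my_vector' is a hypothetical
# shape-[4096] tensor, and 'GPTmodel' is assumed loaded as described above):
#
#   wrapper = CustomEmbedding(GPTmodel.get_input_embeddings())
#   GPTmodel.set_input_embeddings(wrapper)
#   wrapper.add_modification(62, my_vector)  # token 62 ("_") now embeds as my_vector
#   # ...run the model...
#   wrapper.remove_modifications()           # restore normal behaviour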


def produce_next_token_probs(prompt, noken, GPTmodel, tokenizer, topk):
    original_wte = GPTmodel.get_input_embeddings()
    # Create a custom embedding using the original embedding layer
    custom_embedding = CustomEmbedding(original_wte)
    # Set the custom embedding as the model's new embedding layer
    GPTmodel.set_input_embeddings(custom_embedding)
    # Add the embedding modification: 62 is the ID of the placeholder token
    # "_", which will stand in for the chosen embedding.
    custom_embedding.add_modification(62, noken)
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        outputs = GPTmodel(input_ids)
    logits = outputs.logits
    last_logits = logits[0, -1, :]
    softmax_probs = torch.softmax(last_logits, dim=0)
    total_tokens = len(softmax_probs)
    prob_distribution = {tokenizer.decode([i]): softmax_probs[i].item() for i in range(total_tokens)}
    top_k_probs = dict(sorted(prob_distribution.items(), key=lambda item: item[1], reverse=True)[:topk])
    custom_embedding.remove_modifications()
    return top_k_probs
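
# For example (hypothetical output; assumes the globals set up at the bottom
# of this file):
#
#   produce_next_token_probs("A typical definition of '_' would be '",
#                            centroid, GPTmodel, tokenizer, 5)
#
# returns a dict mapping the five most probable next-token strings to their
# softmax probabilities, e.g. {' a': 0.31, ' the': 0.12, ...} (values illustrative).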


def build_def_tree(token, data, base_prompt, noken, GPTmodel, tokenizer, topk, path="", visited=None):
    if visited is None:
        visited = set()
    visited.add(path)
    current_prompt = base_prompt + path
    print(current_prompt + " PROB: " + str(data['cumulative_prob']))
    response = produce_next_token_probs(current_prompt, noken, GPTmodel, tokenizer, topk)
    for tok, prob in response.items():
        child_path = path + tok
        # Skip branches already visited, and prune any whose cumulative
        # probability falls below the global 'cutoff' (set near the bottom of this file)
        if child_path in visited or prob * data['cumulative_prob'] < cutoff:
            continue
        new_child = {"token": tok, "cumulative_prob": prob * data['cumulative_prob'], "children": []}
        data['children'].append(new_child)
        build_def_tree(tok, new_child, base_prompt, noken, GPTmodel, tokenizer, topk, child_path, visited)
    return data
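
# The returned tree is a nested dict of the form (values illustrative):
#
#   {"token": "", "cumulative_prob": 1, "children": [
#       {"token": " a", "cumulative_prob": 0.31, "children": [...]}, ...]}
#
# Each root-to-leaf path spells out a candidate definition, and each node's
# 'cumulative_prob' is the product of the token probabilities along its path.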


def count_tokens(strg, tokenizer):
    return len(tokenizer.encode(strg)) if strg else 1


def save_data(data, folder, token):
    filepath = os.path.join(folder, f'def_results_{token.strip()}.json')
    with open(filepath, 'w', encoding='utf-8') as file:  # utf-8 needed since ensure_ascii=False
        json.dump(data, file, ensure_ascii=False, indent=4)


def find_cumulative_probability(tree, target_def, tokenizer):
    # Walk the tree along the token sequence of 'target_def'; return 0 if that
    # path is absent (e.g. it was pruned below the cutoff)
    current_node = tree
    tok_id_list = tokenizer.encode(target_def)
    for tok_id in tok_id_list:
        token = tokenizer.decode([tok_id])
        next_node = next((child for child in current_node['children'] if child['token'] == token), None)
        if not next_node:
            return 0
        current_node = next_node
    return current_node.get('cumulative_prob', 0)
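
# Hypothetical usage: after building a tree, something like
#
#   find_cumulative_probability(results_dict["tree JSON"], "a vegetable", tokenizer)
#
# returns the cumulative probability of that definition string, or 0 if its
# branch is absent from the tree.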


def mainfunction(data, topk, prompt, noken, GPTmodel, tokenizer):
    results_dict = {}
    tree_json = build_def_tree("", data, prompt, noken, GPTmodel, tokenizer, topk)
    results_dict["tree JSON"] = tree_json
    print("RESULTS DICT:", results_dict)
    return results_dict


save_directory = '/content/Drive/My Drive/DefinitionTrees/'  # This will have to be changed as appropriate
embeddings = embeddings[:50257]  # keep only the standard GPT-2 vocabulary rows; GPT-J's embedding matrix is padded beyond 50257
token_strings, all_rom_token_indices, all_rom_token_gt2_indices = token_setup(tokenizer)
centroid = torch.mean(embeddings, dim=0)  # mean of all token embeddings
topk = 5
cutoff = 0.00001  # This is the threshold: if the cumulative product of probabilities drops below it, the branch ends.
# Making it smaller results in a denser tree with more branches, but takes longer and uses more compute.
noken = centroid  # OR set noken to any shape-[4096] tensor
# For an actual token, e.g. ' broccoli', you can use noken = embeddings[token_strings.index(' broccoli')]
prompt = "A typical definition of '_' would be '"
deftree_data = {"level": 0, "token": "", "cumulative_prob": 1, "children": []}
results_dict = mainfunction(deftree_data, topk, prompt, noken, GPTmodel, tokenizer)
file_path = os.path.join(save_directory, 'results.json')
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(results_dict, file, ensure_ascii=False, indent=4)
print(f"Saved results_dict to {file_path}")