-
Notifications
You must be signed in to change notification settings - Fork 3
/
vocabulary.py
154 lines (123 loc) · 4.47 KB
/
vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Copyright 2016 Eddie Antonio Santos <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import warnings
from os import PathLike
from typing import Any, Dict, Iterable, List, NewType, Sequence, Sized, Tuple
from typing import cast
# Public names of this module (what ``from <module> import *`` exports).
__all__ = 'Vocabulary', 'Entry', 'Vind', 'vocabulary'
# A vocabulary index that gets in your face.
# (A distinct static type over ``int``: at runtime ``Vind(n)`` is just ``n``.)
Vind = NewType('Vind', int)

# A vocabulary entry
# (A distinct static type over ``str``.)
Entry = NewType('Entry', str)

# Text of the special entries that every Vocabulary starts with.
UNK_TOKEN = '<UNK>'
START_TOKEN = '<s>'
END_TOKEN = '</s>'


class Vocabulary(Sized):
    """
    One-to-one mapping of vocabulary strings to vocabulary indices (Vinds).

    The special entries (unknown, start, end) always occupy the first
    indices; the entries given to the constructor follow, in order.

    >>> v = Vocabulary(['var', '<IDENTIFIER>', ';'])
    >>> v[4]
    '<IDENTIFIER>'
    >>> v.to_index(';')
    5
    >>> len(v)
    6
    """

    SPECIAL_ENTRIES = (UNK_TOKEN, START_TOKEN, END_TOKEN)

    def __init__(self, entries: Iterable[str]) -> None:
        """
        Build the two lookup tables from the given entries.

        Raises AssertionError if any entry occurs more than once (this
        includes an entry that repeats one of the special entries).
        """
        self._index2text = cast(Sequence[Entry],
                                self.SPECIAL_ENTRIES + tuple(entries))
        self._text2index: Dict[str, Vind] = {
            text: Vind(index) for index, text in enumerate(self._index2text)
        }
        # Dict keys are unique, so a size mismatch means a duplicate entry.
        # Raised explicitly (rather than with an ``assert`` statement) so
        # that the check still runs under ``python -O``; the exception type
        # and message are the same as before.
        if len(self._text2index) != len(self._index2text):
            raise AssertionError('Duplicate entries in vocabulary')

    def entries(self) -> Iterable[Entry]:
        """
        Yields all "true" entries of the vocabulary (all minus the special
        entries).
        """
        yield from self._index2text[len(self.SPECIAL_ENTRIES):]

    def to_text(self, index: Vind) -> str:
        """
        Return the text of the entry stored at the given index.
        """
        return self._index2text[index]

    def to_index(self, text: str) -> Vind:
        """
        Return the index of the given entry text.

        Raises KeyError if the text is not in the vocabulary.
        """
        return self._text2index[text]

    def __len__(self) -> int:
        """
        Number of entries, including the special entries.
        """
        return len(self._index2text)

    def __getitem__(self, idx: Vind) -> Entry:
        """
        Return the entry stored at the given index (same lookup as
        to_text()).
        """
        return self._index2text[idx]

    def to_lexeme(self, idx: Vind) -> None:
        # TODO: return a lexeme
        raise NotImplementedError

    @classmethod
    def from_json_file(cls, filename: PathLike) -> 'Vocabulary':
        """
        Create a vocabulary from a JSON file; the parsed JSON document is
        passed to the constructor as the entries.
        """
        # JSON text is UTF-8; do not depend on the locale's default
        # encoding, which would garble non-ASCII entries on some platforms.
        with open(filename, encoding='UTF-8') as json_file:
            return cls(json.load(json_file))

    # Indices and text of the special entries; the indices match the
    # positions of the tokens in SPECIAL_ENTRIES.
    unk_token_index = Vind(0)
    start_token_index = Vind(1)
    end_token_index = Vind(2)
    unk_token = UNK_TOKEN
    start_token = START_TOKEN
    end_token = END_TOKEN
class LegacyVocabulary(Sized):
    """
    One-to-one mapping of vocabulary strings to vocabulary indices (Vinds).

    Deprecated. Wraps ``javascript.vocabulary`` and answers lookups for the
    start and end tokens itself; every other lookup is forwarded to the
    wrapped vocabulary.
    """

    def __init__(self) -> None:
        # stacklevel=2 attributes the warning to the code that instantiates
        # this class instead of to this line.
        warnings.warn('deprecated', DeprecationWarning, stacklevel=2)
        # Imported here rather than at module level.
        import javascript
        self._vocab = javascript.vocabulary
        # The hard-coded end_token_index below relies on this size.
        assert len(self._vocab) == 101

    def to_text(self, index: Vind) -> str:
        """
        Return the text for the given index; the start and end indices map
        to this class's own start and end tokens.
        """
        if index == self.start_token_index:
            return self.start_token
        if index == self.end_token_index:
            return self.end_token
        return self._vocab.to_text(index)

    def to_index(self, text: str) -> Vind:
        """
        Return the index for the given text; this class's own start and end
        tokens map to the start and end indices.
        """
        if text == self.start_token:
            return self.start_token_index
        if text == self.end_token:
            return self.end_token_index
        return self._vocab.to_index(text)

    def __len__(self) -> int:
        """
        One less than the size of the wrapped vocabulary (100, given the
        size asserted in the constructor), so end_token_index is the last
        index.
        """
        return len(self._vocab) - 1

    # Indices and text of the legacy start and end tokens.
    start_token_index = Vind(0)
    end_token_index = Vind(99)
    start_token = '/*<START>*/'
    end_token = '/*<END>*/'
# TODO: Once again with the proxy...?
class VocabularyProxy:
    """
    Access to the vocabulary proxy.

    Attribute access and len() are forwarded to ``language.vocabulary``,
    which is looked up anew on every access.
    """

    def __getattr__(self, name: str) -> Any:
        """
        Forward lookup of any attribute not found on the proxy itself.

        Raises AttributeError for ``__wrapped__`` without touching the
        language.
        """
        # Avoid accessing the proxy prematurely: introspection helpers such
        # as inspect.unwrap() probe for ``__wrapped__``, and answering that
        # probe must not trigger the import below.
        if name == '__wrapped__':
            raise AttributeError(name)

        from .language import language
        # Delegate to the current language's vocabulary
        return getattr(language.vocabulary, name)

    def __len__(self) -> int:
        # This method must be explicitly defined: len() looks up __len__ on
        # the type, not the instance, so the implicit lookup never reaches
        # __getattr__.
        from .language import language
        # Delegate to the current language's vocabulary
        return len(language.vocabulary)
# Module-level vocabulary instance, exported through __all__.
# NOTE(review): this instantiates the deprecated LegacyVocabulary at import
# time (emitting its DeprecationWarning and importing ``javascript``), while
# VocabularyProxy is never instantiated in the visible code -- confirm that
# this is intended.
vocabulary = LegacyVocabulary()