import json
import logging
from typing import Dict

from allennlp.common import Params
from allennlp.common.file_utils import cached_path
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter, SpacyWordSplitter
from overrides import overrides

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("europarl_parallel_english_french")
class EuroparlEnglishFrenchReader(DatasetReader):
"""
Reads a jsonl containing parallel English-French utterances from the
proceedings of the European Parliament - http://www.statmt.org/europarl/
Expected format of each line: {"id": int, "en": str, "fr": str}
Fields not listed above will be ignored.
Each ``read`` yields a data instance of
en: ``TextField``
fr: ``TextField``
Parameters
----------
lazy : ``bool`` (optional, default=False)
Passed to ``DatasetReader``. If this is ``True``, training will start sooner, but will
take longer per batch. This also allows training with datasets that are too large to fit
in memory.
en_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split English utterances into tokens.
Defaults to ``WordTokenizer()``.
fr_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split French utterances into tokens.
Defaults to ``WordTokenizer(SpacyWordSplitter(language=;fr_core_news_sm)``.
en_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define English token representations. Defaults to ``{"tokens":
SingleIdTokenIndexer(namespace="en", lowercase_tokens=True)}``.
fr_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define French token representations. Defaults to ``{"tokens":
SingleIdTokenIndexer(namespace="fr", lowercase_tokens=True)}``.
"""
    def __init__(self,
                 lazy: bool = False,
                 en_tokenizer: Tokenizer = None,
                 fr_tokenizer: Tokenizer = None,
                 en_token_indexers: Dict[str, TokenIndexer] = None,
                 fr_token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._en_tokenizer = en_tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._fr_tokenizer = fr_tokenizer or WordTokenizer(
            # Specify spaCy's French model instead (English is the default).
            word_splitter=SpacyWordSplitter(language='fr_core_news_sm'),
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._en_token_indexers = en_token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="source", lowercase_tokens=True)
        }
        self._fr_token_indexers = fr_token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="target", lowercase_tokens=True)
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), 'r') as data_file:
            logger.info("Reading instances from lines in file: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                parallel_utterance = json.loads(line)
                en_utterance = parallel_utterance['en']
                fr_utterance = parallel_utterance['fr']
                yield self.text_to_instance(en_utterance, fr_utterance)

    @overrides
    def text_to_instance(self, en_utterance: str, fr_utterance: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        en_utterance_tokenized = self._en_tokenizer.tokenize(en_utterance)
        fr_utterance_tokenized = self._fr_tokenizer.tokenize(fr_utterance)
        fields = {
            'source': TextField(en_utterance_tokenized, self._en_token_indexers),
            'target': TextField(fr_utterance_tokenized, self._fr_token_indexers)
        }
        return Instance(fields)
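

# Usage sketch: the reader above can be referenced from an AllenNLP experiment
# config via its registered name. The config fragment and data path below are
# illustrative examples only (the path is hypothetical):
#
#     "dataset_reader": {"type": "europarl_parallel_english_french"},
#     "train_data_path": "data/europarl_en_fr_train.jsonl",
#
# where each line of the jsonl is a JSON object such as:
#
#     {"id": 1, "en": "Resumption of the session", "fr": "Reprise de la session"}

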
@DatasetReader.register("europarl_parallel_english_french_pretokenized")
class EuroparlEnglishFrenchReaderPretokenized(DatasetReader):
"""
Identical to ``EuroparlEnglishFrenchReader`` but assumes its input is already tokenized.
Each ``read`` yields a data instance of
en: ``TextField``
fr: ``TextField``
Parameters
----------
lazy : ``bool`` (optional, default=False)
Passed to ``DatasetReader``. If this is ``True``, training will start sooner, but will
take longer per batch. This also allows training with datasets that are too large to fit
in memory.
en_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split English utterances into tokens.
fr_tokenizer : ``Tokenizer``, optional
Tokenizer to use to split French utterances into tokens.
en_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define English token representations. Defaults to ``{"tokens":
SingleIdTokenIndexer(namespace="en", lowercase_tokens=True)}``.
fr_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define French token representations. Defaults to ``{"tokens":
SingleIdTokenIndexer(namespace="fr", lowercase_tokens=True)}``.
"""
    def __init__(self,
                 lazy: bool = False,
                 en_tokenizer: Tokenizer = None,
                 fr_tokenizer: Tokenizer = None,
                 en_token_indexers: Dict[str, TokenIndexer] = None,
                 fr_token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._en_tokenizer = en_tokenizer or WordTokenizer(
            word_splitter=JustSpacesWordSplitter(),
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._fr_tokenizer = fr_tokenizer or WordTokenizer(
            word_splitter=JustSpacesWordSplitter(),
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._en_token_indexers = en_token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="source", lowercase_tokens=True)
        }
        self._fr_token_indexers = fr_token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="target", lowercase_tokens=True)
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), 'r') as data_file:
            logger.info("Reading instances from lines in file: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                parallel_utterance = json.loads(line)
                en_utterance = parallel_utterance['en']
                fr_utterance = parallel_utterance['fr']
                yield self.text_to_instance(en_utterance, fr_utterance)

    @overrides
    def text_to_instance(self, en_utterance: str, fr_utterance: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        en_utterance_tokenized = self._en_tokenizer.tokenize(en_utterance)
        fr_utterance_tokenized = self._fr_tokenizer.tokenize(fr_utterance)
        fields = {
            'source': TextField(en_utterance_tokenized, self._en_token_indexers),
            'target': TextField(fr_utterance_tokenized, self._fr_token_indexers)
        }
        return Instance(fields)
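

if __name__ == "__main__":
    # Quick smoke-test sketch: read a jsonl file with the pretokenized reader and
    # print the first source/target token lists. The default path below is a
    # hypothetical example; pass a real file path on the command line to run it.
    import sys

    example_path = sys.argv[1] if len(sys.argv) > 1 else "data/europarl_en_fr_sample.jsonl"
    reader = EuroparlEnglishFrenchReaderPretokenized()
    for instance in reader.read(example_path):
        print(instance.fields["source"].tokens)
        print(instance.fields["target"].tokens)
        break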