Merge pull request #1025 from PrimozGodec/language-normalizers
[ENH] Lemmagen - Use ISO language codes
VesnaT authored Dec 12, 2023
2 parents c0edee3 + 6ee6269 commit 0495fd5
Showing 4 changed files with 89 additions and 78 deletions.
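The user-facing change: LemmagenLemmatizer is now constructed with an ISO 639-1 code rather than a full language name. A minimal sketch of the new call, assuming the add-on is importable as orangecontrib.text:

from orangecontrib.text.preprocess import LemmagenLemmatizer

# before this commit: LemmagenLemmatizer("Slovenian")
# after this commit the constructor expects an ISO 639-1 code
lemmatizer = LemmagenLemmatizer("sl")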
46 changes: 15 additions & 31 deletions orangecontrib/text/preprocess/normalize.py
@@ -213,43 +213,27 @@ def __setstate__(self, state):

class LemmagenLemmatizer(BaseNormalizer):
name = 'Lemmagen Lemmatizer'
lemmagen_languages = {
"Bulgarian": "bg",
"Croatian": "hr",
"Czech": "cs",
"English": "en",
"Estonian": "et",
"Farsi/Persian": "fa",
"French": "fr",
"German": "de",
"Hungarian": "hu",
"Italian": "it",
"Macedonian": "mk",
"Polish": "pl",
"Romanian": "ro",
"Russian": "ru",
"Serbian": "sr",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Ukrainian": "uk"
}
supported_languages = set(Lemmatizer.list_supported_languages())

def __init__(self, language='English'):
def __init__(self, language="en"):
super().__init__()
self.language = language
self.lemmatizer = None

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
# the lemmagen3 lemmatizer is not picklable; create it on call and discard it afterward
self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
output_corpus = super().__call__(corpus, callback)
self.lemmatizer = None
return output_corpus
self.language = language  # used only for unpickling
self.lemmatizer = Lemmatizer(language)

def normalizer(self, token):
assert self.lemmatizer is not None
t = self.lemmatizer.lemmatize(token)
# Lemmagen sometimes returns an empty string; return the original
# token in that case
return t if t else token

def __getstate__(self):
"""Remove model that cannot be pickled"""
state = super().__getstate__()
state["lemmatizer"] = None
return state

def __setstate__(self, state):
"""Reinstate the model when upickled"""
super().__setstate__(state)
self.lemmatizer = Lemmatizer(self.language)
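The rewritten class keeps the lemmatizer usable after pickling even though the underlying lemmagen3 model itself cannot be pickled: __getstate__ drops the model and __setstate__ rebuilds it from the stored ISO code. A stripped-down sketch of that pattern, assuming lemmagen3 is installed (PickleSafeLemmatizer is a hypothetical stand-in, not the Orange class):

import pickle
from lemmagen3 import Lemmatizer

class PickleSafeLemmatizer:
    def __init__(self, language="en"):
        self.language = language                 # plain data, survives pickling
        self.lemmatizer = Lemmatizer(language)   # native model, not picklable

    def __getstate__(self):
        state = self.__dict__.copy()
        state["lemmatizer"] = None               # drop the model before pickling
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.lemmatizer = Lemmatizer(self.language)  # rebuild it on unpickling

restored = pickle.loads(pickle.dumps(PickleSafeLemmatizer("sl")))
assert restored.lemmatizer.lemmatize("gori") == Lemmatizer("sl").lemmatize("gori")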
11 changes: 9 additions & 2 deletions orangecontrib/text/tests/test_preprocess.py
@@ -342,15 +342,22 @@ def test_udpipe_deepcopy(self):
)

def test_lemmagen(self):
normalizer = preprocess.LemmagenLemmatizer('Slovenian')
sentence = 'Gori na gori hiša gori'
normalizer = preprocess.LemmagenLemmatizer("sl")
sentence = "Gori na gori hiša gori"
with self.corpus.unlocked():
self.corpus.metas[0, 0] = sentence
self.assertEqual(
[Lemmatizer("sl").lemmatize(t) for t in sentence.split()],
normalizer(self.corpus).tokens[0],
)

def test_lemmagen_all_langs(self):
for language in preprocess.LemmagenLemmatizer.supported_languages:
normalizer = preprocess.LemmagenLemmatizer(language)
tokens = normalizer(self.corpus).tokens
self.assertEqual(len(self.corpus), len(tokens))
self.assertTrue(all(tokens))

def test_normalizers_picklable(self):
""" Normalizers must be picklable, tests if it is true"""
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
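test_lemmagen_all_langs exercises every language that lemmagen3 reports as supported. A stand-alone sketch of the same loop without a Corpus, assuming lemmagen3 is installed:

from lemmagen3 import Lemmatizer

for code in sorted(Lemmatizer.list_supported_languages()):
    lemmatizer = Lemmatizer(code)
    # lemmatize() can return an empty string; the normalizer above then
    # falls back to the original token
    print(code, lemmatizer.lemmatize("lemma") or "lemma")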
36 changes: 22 additions & 14 deletions orangecontrib/text/widgets/owpreprocess.py
@@ -24,7 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
@@ -475,13 +475,15 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_SNOWBALL_LANG = "English"  # todo: remove when Snowball uses ISO codes
DEFAULT_UDPIPE_LANG = "English"  # todo: remove when UDPipe uses ISO codes
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

@@ -490,15 +492,17 @@ def __init__(self, parent=None, **kwargs):
self.__snowball_lang, self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
self.__set_udpipe_lang
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
)
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
self.__combo_lemm = LanguageComboBox(
self,
LemmagenLemmatizer.supported_languages,
self.__lemmagen_lang,
False,
self.__set_lemmagen_lang,
)

label = QLabel("Language:")
@@ -530,9 +534,9 @@ def __enable_udpipe(self):

def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
@@ -562,7 +566,7 @@ def __set_udpipe_lang(self, language: str):
def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.setCurrentText(language)
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
@@ -587,12 +591,14 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_lang)}
args = {"language": params.get("snowball_language", def_snowball)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_lang),
args = {"language": params.get("udpipe_language", def_udpipe),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
@@ -1384,6 +1390,8 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]


if __name__ == "__main__":
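The migration step added above converts settings stored by older workflows, which kept the full language name, into the ISO code the new lemmatizer expects. A minimal sketch using the LANG2ISO mapping imported at the top of the file (the settings dict here is illustrative):

from orangecontrib.text.language import LANG2ISO

old_settings = {"lemmagen_language": "Slovenian"}  # stored by a pre-migration workflow
old_settings["lemmagen_language"] = LANG2ISO[old_settings["lemmagen_language"]]
assert old_settings["lemmagen_language"] == "sl"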
74 changes: 43 additions & 31 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -271,30 +271,16 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)

def test_migrate_language_settings(self):
def test_migrate_filter_language_settings(self):
"""Test migration to iso langauge codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
(
"preprocess.normalize",
{
"snowball_language": "French",
"udpipe_language": "German",
"lemmagen_language": "Slovenian",
},
),
("preprocess.filter", {"language": "Finnish"}),
]
"preprocessors": [("preprocess.filter", {"language": "Finnish"})]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
filter_settings = widget.storedsettings["preprocessors"][1][1]
self.assertEqual("Slovenian", normalize_settings["lemmagen_language"])
self.assertEqual("French", normalize_settings["snowball_language"])
self.assertEqual("German", normalize_settings["udpipe_language"])
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("fi", filter_settings["language"])

# NLTK uses Slovene instead of Slovenian, this is also the reason
@@ -320,6 +306,32 @@ def test_migrate_language_settings(self):
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertIsNone(filter_settings["language"])

def test_migrate_lemmagen_language_settings(self):
"""Test migration to iso langauge codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"lemmagen_language": "Slovenian"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("sl", normalize_settings["lemmagen_language"])

settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"lemmagen_language": "English"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["lemmagen_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
@@ -459,19 +471,23 @@ def test_init(self):
self.assertFalse(self.check_use.isChecked())

def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"udpipe_tokenizer": False}
params = {
"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "Bulgarian",
"udpipe_tokenizer": True}
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
self.assertEqual(self.combo_sbl.currentText(), "Dutch")
@@ -738,10 +754,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)

# TODO - implement StanfordPOSTagger
# pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
# self.assertIsInstance(pp, StanfordPOSTagger)

def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")

