diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 735666714..bc8989495 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -213,39 +213,12 @@ def __setstate__(self, state): class LemmagenLemmatizer(BaseNormalizer): name = 'Lemmagen Lemmatizer' - lemmagen_languages = { - "Bulgarian": "bg", - "Croatian": "hr", - "Czech": "cs", - "English": "en", - "Estonian": "et", - "Farsi/Persian": "fa", - "French": "fr", - "German": "de", - "Hungarian": "hu", - "Italian": "it", - "Macedonian": "mk", - "Polish": "pl", - "Romanian": "ro", - "Russian": "ru", - "Serbian": "sr", - "Slovak": "sk", - "Slovenian": "sl", - "Spanish": "es", - "Ukrainian": "uk" - } + supported_languages = set(Lemmatizer.list_supported_languages()) - def __init__(self, language='English'): + def __init__(self, language="en"): super().__init__() - self.language = language - self.lemmatizer = None - - def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: - # lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward - self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language]) - output_corpus = super().__call__(corpus, callback) - self.lemmatizer = None - return output_corpus + self.language = language # used only for unpickling + self.lemmatizer = Lemmatizer(language) def normalizer(self, token): assert self.lemmatizer is not None @@ -253,3 +226,14 @@ def normalizer(self, token): # sometimes Lemmagen returns an empty string, return original tokens # in this case return t if t else token + + def __getstate__(self): + """Remove model that cannot be pickled""" + state = super().__getstate__() + state["lemmatizer"] = None + return state + + def __setstate__(self, state): + """Reinstate the model when unpickled""" + super().__setstate__(state) + self.lemmatizer = Lemmatizer(self.language) diff --git a/orangecontrib/text/tests/test_preprocess.py 
b/orangecontrib/text/tests/test_preprocess.py index 26dc54821..0b9bdb4c8 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -342,8 +342,8 @@ def test_udpipe_deepcopy(self): ) def test_lemmagen(self): - normalizer = preprocess.LemmagenLemmatizer('Slovenian') - sentence = 'Gori na gori hiša gori' + normalizer = preprocess.LemmagenLemmatizer("sl") + sentence = "Gori na gori hiša gori" with self.corpus.unlocked(): self.corpus.metas[0, 0] = sentence self.assertEqual( @@ -351,6 +351,13 @@ def test_lemmagen(self): normalizer(self.corpus).tokens[0], ) + def test_lemmagen_all_langs(self): + for language in preprocess.LemmagenLemmatizer.supported_languages: + normalizer = preprocess.LemmagenLemmatizer(language) + tokens = normalizer(self.corpus).tokens + self.assertEqual(len(self.corpus), len(tokens)) + self.assertTrue(all(tokens)) + def test_normalizers_picklable(self): """ Normalizers must be picklable, tests if it is true""" for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}: diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index 35f1b245c..eac04df92 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -24,7 +24,7 @@ from Orange.widgets.widget import Input, Output, Msg, Message from orangecontrib.text import Corpus -from orangecontrib.text.language import ISO2LANG +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import nltk_data_dir from orangecontrib.text.preprocess import * from orangecontrib.text.preprocess.normalize import UDPipeStopIteration @@ -475,13 +475,15 @@ class NormalizationModule(SingleMethodModule): UDPipe: UDPipeLemmatizer, Lemmagen: LemmagenLemmatizer} DEFAULT_METHOD = Porter - DEFAULT_LANGUAGE = "English" + DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball use iso + DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe 
use iso + DEFAULT_LANGUAGE = "en" DEFAULT_USE_TOKE = False def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) - self.__snowball_lang = self.DEFAULT_LANGUAGE - self.__udpipe_lang = self.DEFAULT_LANGUAGE + self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG + self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG self.__lemmagen_lang = self.DEFAULT_LANGUAGE self.__use_tokenizer = self.DEFAULT_USE_TOKE @@ -490,15 +492,17 @@ def __init__(self, parent=None, **kwargs): self.__snowball_lang, self.__set_snowball_lang ) self.__combo_udl = UDPipeComboBox( - self, self.__udpipe_lang, self.DEFAULT_LANGUAGE, - self.__set_udpipe_lang + self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang ) self.__check_use = QCheckBox("UDPipe tokenizer", checked=self.DEFAULT_USE_TOKE) self.__check_use.clicked.connect(self.__set_use_tokenizer) - self.__combo_lemm = ComboBox( - self, LemmagenLemmatizer.lemmagen_languages, - self.__lemmagen_lang, self.__set_lemmagen_lang + self.__combo_lemm = LanguageComboBox( + self, + LemmagenLemmatizer.supported_languages, + self.__lemmagen_lang, + False, + self.__set_lemmagen_lang, ) label = QLabel("Language:") @@ -530,9 +534,9 @@ def __enable_udpipe(self): def setParameters(self, params: Dict): super().setParameters(params) - snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE) + snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG) self.__set_snowball_lang(snowball_lang) - udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE) + udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG) self.__set_udpipe_lang(udpipe_lang) use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE) self.__set_use_tokenizer(use_tokenizer) @@ -562,7 +566,7 @@ def __set_udpipe_lang(self, language: str): def __set_lemmagen_lang(self, language: str): if self.__lemmagen_lang != language: self.__lemmagen_lang = language - self.__combo_lemm.setCurrentText(language) + 
self.__combo_lemm.set_current_language(language) self.changed.emit() if self.method == self.Lemmagen: self.edited.emit() @@ -587,12 +591,14 @@ def parameters(self) -> Dict: def createinstance(params: Dict) -> BaseNormalizer: method = params.get("method", NormalizationModule.DEFAULT_METHOD) args = {} + def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG + def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG def_lang = NormalizationModule.DEFAULT_LANGUAGE if method == NormalizationModule.Snowball: - args = {"language": params.get("snowball_language", def_lang)} + args = {"language": params.get("snowball_language", def_snowball)} elif method == NormalizationModule.UDPipe: def_use = NormalizationModule.DEFAULT_USE_TOKE - args = {"language": params.get("udpipe_language", def_lang), + args = {"language": params.get("udpipe_language", def_udpipe), "use_tokenizer": params.get("udpipe_tokenizer", def_use)} elif method == NormalizationModule.Lemmagen: args = {"language": params.get("lemmagen_language", def_lang)} @@ -1384,6 +1390,8 @@ def str_into_paths(label): pp["language"] = None else: pp["language"] = StopwordsFilter.lang_to_iso(pp["language"]) + if pp_name == "preprocess.normalize" and "lemmagen_language" in pp: + pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]] if __name__ == "__main__": diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index d5cfc633b..ea2abcd39 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -271,30 +271,16 @@ def test_migrate_settings(self): } self.create_widget(OWPreprocess, stored_settings=settings) - def test_migrate_language_settings(self): + def test_migrate_filter_language_settings(self): """Test migration to iso langauge codes""" settings = { "__version__": 3, "storedsettings": { - "preprocessors": [ - ( - "preprocess.normalize", - { - "snowball_language": "French", - 
"udpipe_language": "German", - "lemmagen_language": "Slovenian", - }, - ), - ("preprocess.filter", {"language": "Finnish"}), - ] + "preprocessors": [("preprocess.filter", {"language": "Finnish"})] }, } widget = self.create_widget(OWPreprocess, stored_settings=settings) - normalize_settings = widget.storedsettings["preprocessors"][0][1] - filter_settings = widget.storedsettings["preprocessors"][1][1] - self.assertEqual("Slovenian", normalize_settings["lemmagen_language"]) - self.assertEqual("French", normalize_settings["snowball_language"]) - self.assertEqual("German", normalize_settings["udpipe_language"]) + filter_settings = widget.storedsettings["preprocessors"][0][1] self.assertEqual("fi", filter_settings["language"]) # NLTK uses Slovene instead of Slovenian, this is also the reason @@ -320,6 +306,32 @@ def test_migrate_language_settings(self): filter_settings = widget.storedsettings["preprocessors"][0][1] self.assertIsNone(filter_settings["language"]) + def test_migrate_lemmagen_language_settings(self): + """Test migration to iso langauge codes""" + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ("preprocess.normalize", {"lemmagen_language": "Slovenian"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("sl", normalize_settings["lemmagen_language"]) + + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ("preprocess.normalize", {"lemmagen_language": "English"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("en", normalize_settings["lemmagen_language"]) + class TestTransformationModule(WidgetTest): def setUp(self): @@ -459,19 +471,23 @@ def test_init(self): self.assertFalse(self.check_use.isChecked()) def test_parameters(self): - params = {"method": 
NormalizationModule.Porter, - "snowball_language": "English", - "udpipe_language": "English", - "lemmagen_language": "English", - "udpipe_tokenizer": False} + params = { + "method": NormalizationModule.Porter, + "snowball_language": "English", + "udpipe_language": "English", + "lemmagen_language": "en", + "udpipe_tokenizer": False, + } self.assertDictEqual(self.editor.parameters(), params) def test_set_parameters(self): - params = {"method": NormalizationModule.UDPipe, - "snowball_language": "Dutch", - "udpipe_language": "Slovenian", - "lemmagen_language": "Bulgarian", - "udpipe_tokenizer": True} + params = { + "method": NormalizationModule.UDPipe, + "snowball_language": "Dutch", + "udpipe_language": "Slovenian", + "lemmagen_language": "bg", + "udpipe_tokenizer": True, + } self.editor.setParameters(params) self.assertDictEqual(self.editor.parameters(), params) self.assertEqual(self.combo_sbl.currentText(), "Dutch") @@ -738,10 +754,6 @@ def test_createinstance(self): pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt}) self.assertIsInstance(pp, MaxEntTagger) - # TODO - implement StanfordPOSTagger - # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford}) - # self.assertIsInstance(pp, StanfordPOSTagger) - def test_repr(self): self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")