From 446a6fb6684b65ed908693fa1010f79ba3046a9f Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 15:00:02 +0200 Subject: [PATCH] Normalize - Use language from Corpus --- orangecontrib/text/language.py | 2 +- orangecontrib/text/preprocess/normalize.py | 63 ++++++++++++++------- orangecontrib/text/tests/test_preprocess.py | 42 +++++++------- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index d65c03d29..f4f8af28e 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -41,7 +41,7 @@ "ga": "Irish", "gl": "Galician", "got": "Gothic", - "grc": "Ancient greek", + "grc": "Ancient Greek", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 07f85f761..397965b55 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -84,22 +84,12 @@ def __init__(self, language='en'): self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem -def language_to_name(language): - return language.lower().replace(' ', '') + 'ud' - - -def file_to_name(file): - return file.replace('-', '').replace('_', '') - - -def file_to_language(file): - return file[:file.find('ud') - 1] \ - .replace('-', ' ').replace('_', ' ').capitalize() - - class UDPipeModels: server_url = "https://file.biolab.si/files/udpipe/" + # some languages differ between udpipe and iso standard + UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"} + def __init__(self): self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/') self.serverfiles = serverfiles.ServerFiles(self.server_url) @@ -107,23 +97,30 @@ def __init__(self): serverfiles=self.serverfiles) def __getitem__(self, language): - file_name = self._find_file(language_to_name(language)) + file_name = self._find_file(language) return 
self.localfiles.localpath_download(file_name) @property def model_files(self): try: - return self.serverfiles.listfiles() + files = self.serverfiles.listfiles() except ConnectionError: - return self.localfiles.listfiles() + files = self.localfiles.listfiles() + return self.__files_to_dict(files) def _find_file(self, language): - return next(filter(lambda f: file_to_name(f).startswith(language), - map(lambda f: f[0], self.model_files))) + return self.model_files[language][1] + + def __files_to_dict(self, files): + iso2lang = {} + for f in files: + language, iso = self.__file_to_language(f[0]) + iso2lang[iso] = (language, f[0]) + return iso2lang @property def supported_languages(self): - return list(map(lambda f: file_to_language(f[0]), self.model_files)) + return [(name, iso) for iso, (name, _) in self.model_files.items()] @property def online(self): @@ -133,6 +130,32 @@ def online(self): except ConnectionError: return False + def __file_to_language(self, file): + """ + Transform filenames to language strings and iso codes. + Language name has format "Language (Model)" + ISO code consists of the real iso code, to which we append the model variation, for + example "en_lines" for the lines English model. 
+ """ + # language and potential model variation are delimited with - + name_split = file[: file.find("ud") - 1].split("-") + # capitalize multi-word languages separated by _ + lg = name_split[0].replace("_", " ").title() + # fix wrong spelling for Norwegian Bokmål + lg = self.UDPIPE2LANG.get(lg, lg) + + if len(name_split) > 1: + # languages with multiple models have model name as second item in split + return f"{lg} ({name_split[1]})", self.__lang2iso(lg, name_split[1]) + return lg, self.__lang2iso(lg, None) + + @staticmethod + def __lang2iso(language, model): + language = [LANG2ISO[language]] + if model: + language.append(model) + return "_".join(language) + class UDPipeStopIteration(StopIteration): pass @@ -141,7 +164,7 @@ class UDPipeStopIteration(StopIteration): class UDPipeLemmatizer(BaseNormalizer): name = 'UDPipe Lemmatizer' - def __init__(self, language='English', use_tokenizer=False): + def __init__(self, language="en", use_tokenizer=False): super().__init__() self.__language = language self.__use_tokenizer = use_tokenizer diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index 516c2627c..fd020d6aa 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -20,12 +20,7 @@ PreprocessorList, StopwordsFilter, ) -from orangecontrib.text.preprocess.normalize import ( - file_to_language, - file_to_name, - language_to_name, - UDPipeModels, -) +from orangecontrib.text.preprocess.normalize import UDPipeModels SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles" @@ -270,7 +265,7 @@ def test_call_word_net(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) def test_call_UDPipe(self): - pp = preprocess.UDPipeLemmatizer(language="Lithuanian") + pp = preprocess.UDPipeLemmatizer(language="lt") self.assertFalse(self.corpus.has_tokens()) corpus = pp(self.corpus) self.assertTrue(corpus.has_tokens()) @@ -304,7 +299,7 @@ 
def test_snowball_all_langs(self): def test_udpipe(self): """Test udpipe token lemmatization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian") + normalizer = preprocess.UDPipeLemmatizer("lt") with self.corpus.unlocked(): self.corpus.metas[0, 0] = "esu" corpus = normalizer(self.corpus) @@ -313,7 +308,7 @@ def test_udpipe(self): def test_udpipe_doc(self): """Test udpipe lemmatization with its own tokenization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) with self.corpus.unlocked(): self.corpus.metas[0, 0] = "Ant kalno dega namas" corpus = normalizer(self.corpus) @@ -321,7 +316,7 @@ def test_udpipe_doc(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1) def test_udpipe_pickle(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) # udpipe store model after first call - model is not picklable normalizer(self.corpus) loaded = pickle.loads(pickle.dumps(normalizer)) @@ -336,7 +331,7 @@ def test_udpipe_pickle(self): ) def test_udpipe_deepcopy(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) copied = copy.deepcopy(normalizer) self.assertEqual(normalizer._UDPipeLemmatizer__language, copied._UDPipeLemmatizer__language) @@ -370,7 +365,7 @@ def test_normalizers_picklable(self): for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}: normalizer = getattr(preprocess.normalize, nm) normalizer = ( - normalizer(language="Lithuanian") + normalizer(language="lt") if normalizer is preprocess.UDPipeLemmatizer else normalizer() ) @@ -379,7 +374,7 @@ def test_normalizers_picklable(self): loaded(self.corpus) def test_cache(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian") + normalizer = preprocess.UDPipeLemmatizer("lt") with self.corpus.unlocked(): self.corpus.metas[0, 0] = "esu" normalizer(self.corpus) @@ -395,21 
+390,21 @@ def test_cache(self): class UDPipeModelsTests(unittest.TestCase): def test_label_transform(self, _): """Test helper functions for label transformation""" - self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'), - 'Slovenian sst') - self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'), - 'sloveniansstud2.0170801.udpipe') - self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud') + fun = UDPipeModels()._UDPipeModels__file_to_language + r = fun("slovenian-sst-ud-2.0-170801.udpipe") + self.assertTupleEqual(r, ("Slovenian (sst)", "sl_sst")) + r = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe") + self.assertTupleEqual(r, ("Norwegian Bokmål (sst)", "nb_sst")) @patch(SF_DOWNLOAD, download_patch) def test_udpipe_model(self, _): """Test udpipe models loading from server""" models = UDPipeModels() - self.assertIn("Lithuanian", models.supported_languages) + self.assertIn(('Lithuanian', 'lt'), models.supported_languages) self.assertEqual(7, len(models.supported_languages)) local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe") - model = models["Lithuanian"] + model = models["lt"] self.assertEqual(model, local_file) self.assertTrue(os.path.isfile(local_file)) @@ -419,10 +414,11 @@ def test_udpipe_local_models(self, sf_mock): models = UDPipeModels() [models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()] # use Uyghur, it is the smallest model, we can have it in the repository - _ = models["Lithuanian"] + _ = models["lt"] sf_mock.side_effect = ConnectionError() - self.assertIn("Lithuanian", UDPipeModels().supported_languages) - self.assertEqual(1, len(UDPipeModels().supported_languages)) + exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')} + self.assertDictEqual(exp, models.model_files) + self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages) def test_udpipe_offline(self, sf_mock): """Test if UDPipe works offline"""