From 446a6fb6684b65ed908693fa1010f79ba3046a9f Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:00:02 +0200
Subject: [PATCH] Normalize - Use language from Corpus
---
orangecontrib/text/language.py | 2 +-
orangecontrib/text/preprocess/normalize.py | 63 ++++++++++++++-------
orangecontrib/text/tests/test_preprocess.py | 42 +++++++-------
3 files changed, 63 insertions(+), 44 deletions(-)
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index d65c03d29..f4f8af28e 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
- "grc": "Ancient greek",
+ "grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index 07f85f761..397965b55 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -84,22 +84,12 @@ def __init__(self, language='en'):
self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem
-def language_to_name(language):
- return language.lower().replace(' ', '') + 'ud'
-
-
-def file_to_name(file):
- return file.replace('-', '').replace('_', '')
-
-
-def file_to_language(file):
- return file[:file.find('ud') - 1] \
- .replace('-', ' ').replace('_', ' ').capitalize()
-
-
class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"
+ # some language names differ between UDPipe file names and the ISO standard
+ UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}
+
def __init__(self):
self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
self.serverfiles = serverfiles.ServerFiles(self.server_url)
@@ -107,23 +97,30 @@ def __init__(self):
serverfiles=self.serverfiles)
def __getitem__(self, language):
- file_name = self._find_file(language_to_name(language))
+ file_name = self._find_file(language)
return self.localfiles.localpath_download(file_name)
@property
def model_files(self):
try:
- return self.serverfiles.listfiles()
+ files = self.serverfiles.listfiles()
except ConnectionError:
- return self.localfiles.listfiles()
+ files = self.localfiles.listfiles()
+ return self.__files_to_dict(files)
def _find_file(self, language):
- return next(filter(lambda f: file_to_name(f).startswith(language),
- map(lambda f: f[0], self.model_files)))
+ return self.model_files[language][1]
+
+ def __files_to_dict(self, files):
+ iso2lang = {}
+ for f in files:
+ langauge, iso = self.__file_to_language(f[0])
+ iso2lang[iso] = (langauge, f[0])
+ return iso2lang
@property
def supported_languages(self):
- return list(map(lambda f: file_to_language(f[0]), self.model_files))
+ return [(name, iso) for iso, (name, _) in self.model_files.items()]
@property
def online(self):
@@ -133,6 +130,32 @@ def online(self):
except ConnectionError:
return False
+ def __file_to_language(self, file):
+ """
+ Transform a filename to a language string and an ISO code.
+ The language name has the format "Language (Model)" when the file names a model.
+ The ISO code consists of the real ISO code with the model variation appended,
+ for example "en_lines" for the lines English model.
+ """
+ # language and potential model variation are delimited with -
+ name_split = file[: file.find("ud") - 1].split("-")
+ # capitalize multi-word languages separated by _
+ lg = name_split[0].replace("_", " ").title()
+ # fix wrong spelling for Norwegian Bokmål
+ lg = self.UDPIPE2LANG.get(lg, lg)
+
+ if len(name_split) > 1:
+ # languages with multiple models have the model name as the second item in the split
+ return f"{lg} ({name_split[1]})", self.__lang2iso(lg, name_split[1])
+ return lg, self.__lang2iso(lg, None)
+
+ @staticmethod
+ def __lang2iso(language, model):
+ language = [LANG2ISO[language]]
+ if model:
+ language.append(model)
+ return "_".join(language)
+
class UDPipeStopIteration(StopIteration):
pass
@@ -141,7 +164,7 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'
- def __init__(self, language='English', use_tokenizer=False):
+ def __init__(self, language="en", use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 516c2627c..fd020d6aa 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -20,12 +20,7 @@
PreprocessorList,
StopwordsFilter,
)
-from orangecontrib.text.preprocess.normalize import (
- file_to_language,
- file_to_name,
- language_to_name,
- UDPipeModels,
-)
+from orangecontrib.text.preprocess.normalize import UDPipeModels
SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
@@ -270,7 +265,7 @@ def test_call_word_net(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_call_UDPipe(self):
- pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
+ pp = preprocess.UDPipeLemmatizer(language="lt")
self.assertFalse(self.corpus.has_tokens())
corpus = pp(self.corpus)
self.assertTrue(corpus.has_tokens())
@@ -304,7 +299,7 @@ def test_snowball_all_langs(self):
def test_udpipe(self):
"""Test udpipe token lemmatization"""
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+ normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
corpus = normalizer(self.corpus)
@@ -313,7 +308,7 @@ def test_udpipe(self):
def test_udpipe_doc(self):
"""Test udpipe lemmatization with its own tokenization"""
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "Ant kalno dega namas"
corpus = normalizer(self.corpus)
@@ -321,7 +316,7 @@ def test_udpipe_doc(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)
def test_udpipe_pickle(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
# udpipe store model after first call - model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
@@ -336,7 +331,7 @@ def test_udpipe_pickle(self):
)
def test_udpipe_deepcopy(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
copied._UDPipeLemmatizer__language)
@@ -370,7 +365,7 @@ def test_normalizers_picklable(self):
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
normalizer = getattr(preprocess.normalize, nm)
normalizer = (
- normalizer(language="Lithuanian")
+ normalizer(language="lt")
if normalizer is preprocess.UDPipeLemmatizer
else normalizer()
)
@@ -379,7 +374,7 @@ def test_normalizers_picklable(self):
loaded(self.corpus)
def test_cache(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+ normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
normalizer(self.corpus)
@@ -395,21 +390,21 @@ def test_cache(self):
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
- self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
- 'Slovenian sst')
- self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
- 'sloveniansstud2.0170801.udpipe')
- self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
+ fun = UDPipeModels()._UDPipeModels__file_to_language
+ r = fun("slovenian-sst-ud-2.0-170801.udpipe")
+ self.assertTupleEqual(r, ("Slovenian (sst)", "sl_sst"))
+ r = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe")
+ self.assertTupleEqual(r, ("Norwegian Bokmål (sst)", "nb_sst"))
@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
- self.assertIn("Lithuanian", models.supported_languages)
+ self.assertIn(('Lithuanian', 'lt'), models.supported_languages)
self.assertEqual(7, len(models.supported_languages))
local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
- model = models["Lithuanian"]
+ model = models["lt"]
self.assertEqual(model, local_file)
self.assertTrue(os.path.isfile(local_file))
@@ -419,10 +414,11 @@ def test_udpipe_local_models(self, sf_mock):
models = UDPipeModels()
[models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
# use Uyghur, it is the smallest model, we can have it in the repository
- _ = models["Lithuanian"]
+ _ = models["lt"]
sf_mock.side_effect = ConnectionError()
- self.assertIn("Lithuanian", UDPipeModels().supported_languages)
- self.assertEqual(1, len(UDPipeModels().supported_languages))
+ exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')}
+ self.assertDictEqual(exp, models.model_files)
+ self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages)
def test_udpipe_offline(self, sf_mock):
"""Test if UDPipe works offline"""