Skip to content

Commit

Permalink
Normalize - Use language from Corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 21, 2023
1 parent 02b1892 commit 446a6fb
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 44 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down
63 changes: 43 additions & 20 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,46 +84,43 @@ def __init__(self, language='en'):
self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem


def language_to_name(language):
    """Return the udpipe model-name prefix for a human-readable language name.

    E.g. "Slovenian sst" -> "sloveniansstud"; used to match model file names.
    """
    compact = language.replace(' ', '').lower()
    return compact + 'ud'


def file_to_name(file):
    """Strip '-' and '_' from a udpipe model file name in a single pass."""
    return file.translate(str.maketrans('', '', '-_'))


def file_to_language(file):
    """Derive a human-readable language label from a udpipe model file name.

    Takes everything before the "ud" marker (minus the delimiter), turns
    '-' and '_' into spaces, and capitalizes the first character.
    """
    stem = file[:file.find('ud') - 1]
    words = stem.replace('-', ' ').replace('_', ' ')
    return words.capitalize()


class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"

# some languages differ between udpipe and iso standard
UDPIPE2LANG = {"Norwegian Bokmaal": "Norwegian Bokmål"}

def __init__(self):
    # Directory where downloaded udpipe models are cached locally.
    self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
    # Remote index of available model files on the server.
    self.serverfiles = serverfiles.ServerFiles(self.server_url)
    # Local view backed by the server index: downloads on demand,
    # serves from disk when offline.
    self.localfiles = serverfiles.LocalFiles(self.local_data,
                                             serverfiles=self.serverfiles)

def __getitem__(self, language):
file_name = self._find_file(language_to_name(language))
file_name = self._find_file(language)
return self.localfiles.localpath_download(file_name)

@property
def model_files(self):
try:
return self.serverfiles.listfiles()
files = self.serverfiles.listfiles()
except ConnectionError:
return self.localfiles.listfiles()
files = self.localfiles.listfiles()
return self.__files_to_dict(files)

def _find_file(self, language):
return next(filter(lambda f: file_to_name(f).startswith(language),
map(lambda f: f[0], self.model_files)))
return self.model_files[language][1]

def __files_to_dict(self, files):
    """
    Build a mapping from ISO code to (language name, model file name).

    Parameters
    ----------
    files
        Iterable of tuples whose first item is a udpipe model file name.

    Returns
    -------
    Dict mapping each model's ISO code to a tuple of its human-readable
    language name and its file name.
    """
    iso2lang = {}
    for f in files:
        # fix: local variable was misspelled "langauge"
        language, iso = self.__file_to_language(f[0])
        iso2lang[iso] = (language, f[0])
    return iso2lang

@property
def supported_languages(self):
return list(map(lambda f: file_to_language(f[0]), self.model_files))
return [(name, iso) for iso, (name, _) in self.model_files.items()]

@property
def online(self):
Expand All @@ -133,6 +130,32 @@ def online(self):
except ConnectionError:
return False

def __file_to_language(self, file):
    """
    Transform a model file name into a language string and an ISO code.

    The language name has the format "Language (Model)" when the file
    carries a model variation. The ISO code is the plain ISO code with
    the model variation appended, e.g. "en_lines" for the English
    "lines" model.

    Parameters
    ----------
    file
        Udpipe model file name, e.g. "slovenian-sst-ud-2.0-170801.udpipe".

    Returns
    -------
    Tuple of the human-readable language name and its (extended) ISO code.
    """
    # language and potential model variation are delimited with -
    name_split = file[: file.find("ud") - 1].split("-")
    # capitalize multi-word languages separated by _
    lg = name_split[0].replace("_", " ").title()
    # fix wrong spelling for Norwegian Bokmål
    lg = self.UDPIPE2LANG.get(lg, lg)

    if len(name_split) > 1:
        # languages with multiple models have model name as second item in split
        return f"{lg} ({name_split[1]})", self.__lang2iso(lg, name_split[1])
    return lg, self.__lang2iso(lg, None)

@staticmethod
def __lang2iso(language, model):
    """Return the ISO code for *language*, extended with *model* if given.

    E.g. ("English", "lines") -> "en_lines"; ("English", None) -> "en".
    """
    iso = LANG2ISO[language]
    return f"{iso}_{model}" if model else iso


class UDPipeStopIteration(StopIteration):
    # NOTE(review): presumably a distinct StopIteration subclass so UDPipe's
    # own end-of-input can be told apart from an ordinary StopIteration —
    # confirm against the callers that catch it.
    pass
Expand All @@ -141,7 +164,7 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
def __init__(self, language="en", use_tokenizer=False):
super().__init__()
self.__language = language
self.__use_tokenizer = use_tokenizer
Expand Down
42 changes: 19 additions & 23 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@
PreprocessorList,
StopwordsFilter,
)
from orangecontrib.text.preprocess.normalize import (
file_to_language,
file_to_name,
language_to_name,
UDPipeModels,
)
from orangecontrib.text.preprocess.normalize import UDPipeModels


SF_LIST = "orangecontrib.text.preprocess.normalize.serverfiles.ServerFiles.listfiles"
Expand Down Expand Up @@ -270,7 +265,7 @@ def test_call_word_net(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_call_UDPipe(self):
pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
pp = preprocess.UDPipeLemmatizer(language="lt")
self.assertFalse(self.corpus.has_tokens())
corpus = pp(self.corpus)
self.assertTrue(corpus.has_tokens())
Expand Down Expand Up @@ -304,7 +299,7 @@ def test_snowball_all_langs(self):

def test_udpipe(self):
"""Test udpipe token lemmatization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
corpus = normalizer(self.corpus)
Expand All @@ -313,15 +308,15 @@ def test_udpipe(self):

def test_udpipe_doc(self):
"""Test udpipe lemmatization with its own tokenization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "Ant kalno dega namas"
corpus = normalizer(self.corpus)
self.assertListEqual(list(corpus.tokens[0]), ["ant", "kalno", "degas", "namas"])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)

def test_udpipe_pickle(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
# udpipe store model after first call - model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
Expand All @@ -336,7 +331,7 @@ def test_udpipe_pickle(self):
)

def test_udpipe_deepcopy(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
copied._UDPipeLemmatizer__language)
Expand Down Expand Up @@ -370,7 +365,7 @@ def test_normalizers_picklable(self):
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
normalizer = getattr(preprocess.normalize, nm)
normalizer = (
normalizer(language="Lithuanian")
normalizer(language="lt")
if normalizer is preprocess.UDPipeLemmatizer
else normalizer()
)
Expand All @@ -379,7 +374,7 @@ def test_normalizers_picklable(self):
loaded(self.corpus)

def test_cache(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
normalizer(self.corpus)
Expand All @@ -395,21 +390,21 @@ def test_cache(self):
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
'Slovenian sst')
self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
'sloveniansstud2.0170801.udpipe')
self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
fun = UDPipeModels()._UDPipeModels__file_to_language
r = fun("slovenian-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(r, ("Slovenian (sst)", "sl_sst"))
r = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(r, ("Norwegian Bokmål (sst)", "nb_sst"))

@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
self.assertIn("Lithuanian", models.supported_languages)
self.assertIn(('Lithuanian', 'lt'), models.supported_languages)
self.assertEqual(7, len(models.supported_languages))

local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
model = models["Lithuanian"]
model = models["lt"]
self.assertEqual(model, local_file)
self.assertTrue(os.path.isfile(local_file))

Expand All @@ -419,10 +414,11 @@ def test_udpipe_local_models(self, sf_mock):
models = UDPipeModels()
[models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
# use Uyghur, it is the smallest model, we can have it in the repository
_ = models["Lithuanian"]
_ = models["lt"]
sf_mock.side_effect = ConnectionError()
self.assertIn("Lithuanian", UDPipeModels().supported_languages)
self.assertEqual(1, len(UDPipeModels().supported_languages))
exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')}
self.assertDictEqual(exp, models.model_files)
self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages)

def test_udpipe_offline(self, sf_mock):
"""Test if UDPipe works offline"""
Expand Down

0 comments on commit 446a6fb

Please sign in to comment.