Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Jun 1, 2023
1 parent d8f4366 commit 1996d27
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 69 deletions.
1 change: 1 addition & 0 deletions orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer

# todo
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
Expand Down
3 changes: 3 additions & 0 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
"he": "Hebrew",
"hi": "Hindi",
"hi-Latn": "Hindi (latin)",
# https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have an ISO
# code we made one up to be able to use it for stopwords (supported in NLTK)
"hi_eng": "Hinglish",
"hr": "Croatian",
"ht": "Haitian",
"hu": "Hungarian",
Expand Down
29 changes: 16 additions & 13 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -91,7 +91,8 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
name = 'Stopwords'

# nltk uses different language names for some languages
nltk_mapping = {"Slovenian": "Slovene"}
nltk_mapping = {"Slovene": "Slovenian"}
nltk_mapping_inv = {v: k for k, v in nltk_mapping.items()}

def __init__(
self,
Expand All @@ -107,12 +108,13 @@ def __init__(

@wait_nltk_data
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
# use language set in init if not None and Corpus's language otherwise
la = ISO2LANG[self.__language or corpus.language]
la = self.nltk_mapping.get(la, la)
if self.__use_default_stopwords:
if la in self.supported_languages():
self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
# use language from attr if not None and Corpus's language otherwise
lang = self.__language or corpus.language
if lang in self.supported_languages():
lang = ISO2LANG[lang]
lang = self.nltk_mapping_inv.get(lang, lang).lower()
self.__stopwords = set(x.strip() for x in stopwords.words(lang))
else:
raise ValueError(
"The stopwords filter does not support the Corpus's or "
Expand All @@ -124,15 +126,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
@wait_nltk_data
def supported_languages():
    """Return the set of ISO codes of languages with NLTK stopword lists.

    NLTK ships one lowercase file per language in its stopwords corpus
    root; each filename is the capitalized English language name.

    Returns
    -------
    Set of ISO language codes (as used by this project's LANG2ISO map).
    """
    try:
        stopwords_listdir = [
            file for file in os.listdir(stopwords._get_root()) if file.islower()
        ]
    except LookupError:  # when no NLTK data is available
        stopwords_listdir = []

    def to_iso(lang):
        # translate NLTK's language name to the project's name first
        # (e.g. Slovene -> Slovenian), then to its ISO code
        return LANG2ISO[StopwordsFilter.nltk_mapping.get(lang, lang)]

    return {to_iso(file.capitalize()) for file in stopwords_listdir}

def _check(self, token):
return token not in self.__stopwords and token not in self._lexicon
Expand Down
20 changes: 15 additions & 5 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ def _find_file(self, language):
def supported_languages(self):
return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files))

@property
def supported_languages_iso(self):
return [self.language_to_iso(lg) for lg in self.supported_languages]

Expand All @@ -156,17 +155,28 @@ def online(self):
except ConnectionError:
return False

# TODO: clean up
# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

def language_to_iso(self, language):
@staticmethod
def language_to_iso(language):
if "(" in language:
language, model = language.split("(")
return self.VARIATION_DELIMITER.join((language, model.strip(")")))
return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")")))
return LANG2ISO[language]

def iso_to_file(self, iso_langauge):
lg_var = iso_langauge.split(self.VARIATION_DELIMITER)
@staticmethod
def iso_to_language(iso_language):
    """Convert an ISO code (optionally carrying a model-variation suffix,
    e.g. ``en_ewt``) back to the display name, e.g. ``English (ewt)``."""
    parts = iso_language.split(UDPipeModels.VARIATION_DELIMITER)
    if len(parts) == 2:
        code, variation = parts
    else:
        code, variation = parts[0], ""
    name = ISO2LANG[code]
    # append the variation in parentheses only when one is present
    return f"{name} ({variation})" if variation else name

def iso_to_file(self, iso_language):
lg_var = iso_language.split(self.VARIATION_DELIMITER)
lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], None)
lg = ISO2LANG[lg]
lg = [self.LANG2UDPIPE.get(lg, lg).lower().replace(" ", "_")]
Expand Down
Loading

0 comments on commit 1996d27

Please sign in to comment.