From a9ed149bf65d7ed3fb9dfb5b6f2bdde5f0a3614c Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 23 Feb 2024 14:06:15 +0100
Subject: [PATCH 1/3] Keywords widget - Use language from Corpus
---
orangecontrib/text/widgets/owkeywords.py | 64 +++++++++--
.../text/widgets/tests/test_owkeywords.py | 102 ++++++++++++++++--
2 files changed, 150 insertions(+), 16 deletions(-)
diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py
index a5df964a8..92f926ef8 100644
--- a/orangecontrib/text/widgets/owkeywords.py
+++ b/orangecontrib/text/widgets/owkeywords.py
@@ -22,12 +22,12 @@
from orangecontrib.text import Corpus
from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \
- YAKE_LANGUAGE_MAPPING, RAKE_LANGUAGES
+ YAKE_LANGUAGES, RAKE_LANGUAGES
+from orangecontrib.text.language import LanguageModel
from orangecontrib.text.preprocess import BaseNormalizer
from orangecontrib.text.widgets.utils.words import create_words_table, \
WORDS_COLUMN_NAME
-YAKE_LANGUAGES = list(YAKE_LANGUAGE_MAPPING.keys())
CONNECTION_WARNING = (
f"{ScoringMethods.MBERT} could not extract keywords from some "
"documents due to connection error. Please rerun keyword extraction."
@@ -202,15 +202,17 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
keywords = "extract keywords, characteristic, term"
buttons_area_orientation = Qt.Vertical
+ settings_version = 2
# Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr)
# in setting we want to save integer and not Enum object (in case of PyQt6)
DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder))
+ DEFAULT_LANGUAGE = "en"
settingsHandler = DomainContextHandler()
selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF})
- yake_lang_index: int = Setting(YAKE_LANGUAGES.index("English"))
- rake_lang_index: int = Setting(RAKE_LANGUAGES.index("English"))
+ yake_language: Optional[str] = Setting(None, schema_only=True)
+ rake_language: Optional[str] = Setting(None, schema_only=True)
agg_method: int = Setting(AggregationMethods.MEAN)
sel_method: int = ContextSetting(SelectionMethods.N_BEST)
n_selected: int = ContextSetting(3)
@@ -236,6 +238,15 @@ def __init__(self):
self.words: Optional[List] = None
self.__cached_keywords = {}
self.model = KeywordsTableModel(parent=self)
+
+ # languages saved in the workflow must be retained when data arrives
+ self.__pending_yake_language = self.yake_language
+ self.__pending_rake_language = self.rake_language
+ # language settings are None by default so the default language is not
+ # saved as pending; the default is applied here, after pending is stored
+ self.yake_language = self.yake_language or self.DEFAULT_LANGUAGE
+ self.rake_language = self.rake_language or self.DEFAULT_LANGUAGE
+
self._setup_gui()
def _setup_gui(self):
@@ -243,11 +254,17 @@ def _setup_gui(self):
box = gui.widgetBox(self.controlArea, "Scoring Methods", grid)
yake_cb = gui.comboBox(
- self.controlArea, self, "yake_lang_index", items=YAKE_LANGUAGES,
+ self.controlArea,
+ self,
+ "yake_language",
+ model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES),
callback=self.__on_yake_lang_changed
)
rake_cb = gui.comboBox(
- self.controlArea, self, "rake_lang_index", items=RAKE_LANGUAGES,
+ self.controlArea,
+ self,
+ "rake_language",
+ model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES),
callback=self.__on_rake_lang_changed
)
@@ -371,6 +388,17 @@ def set_corpus(self, corpus: Optional[Corpus]):
self.corpus = corpus
self.openContext(self.corpus)
self.__sel_method_buttons.button(self.sel_method).setChecked(True)
+ if corpus is not None and corpus.language is not None:
+ if self.__pending_rake_language is not None:
+ self.yake_language = self.__pending_yake_language
+ self.rake_language = self.__pending_rake_language
+ self.__pending_yake_language = None
+ self.__pending_rake_language = None
+ else:
+ if corpus.language in YAKE_LANGUAGES:
+ self.yake_language = corpus.language
+ if corpus.language in RAKE_LANGUAGES:
+ self.rake_language = corpus.language
def _clear(self):
self.clear_messages()
@@ -397,11 +425,11 @@ def update_scores(self):
self.Warning.extraction_warnings.clear()
kwargs = {
ScoringMethods.YAKE: {
- "language": YAKE_LANGUAGES[self.yake_lang_index],
+ "language": self.yake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
ScoringMethods.RAKE: {
- "language": RAKE_LANGUAGES[self.rake_lang_index],
+ "language": self.rake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1,
},
}
@@ -508,6 +536,26 @@ def send_report(self):
self.report_paragraph("Words", ", ".join(self.words))
self.report_table("Keywords", self.view, num_format="{:.3f}")
+ @classmethod
+ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
+ if version is None or version < 2:
+ # before version 2 the settings were indexes; now they are strings
+ # holding the language ISO code
+ if "yake_lang_index" in settings:
+ settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]]
+ if "rake_lang_index" in settings:
+ # historic copy of RAKE_LANGUAGES, since current list (now set) depends
+ # on languages in NLTK. If the order changes or a language is added,
+ # settings would not be migrated correctly
+ # fmt: off
+ previous_order = [
+ "ar", "az", "eu", "bn", "ca", "zh", "da", "nl", "en", "fi",
+ "fr", "de", "el", "he", "hi_eng", "hu", "id", "it", "kk",
+ "ne", "no", "pt", "ro", "ru", "sl", "es", "sv", "tg", "tr"
+ ]
+ # fmt: on
+ settings["rake_language"] = previous_order[settings["rake_lang_index"]]
+
if __name__ == "__main__":
# pylint: disable=ungrouped-imports
diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py
index 200e77246..76c4f36d0 100644
--- a/orangecontrib/text/widgets/tests/test_owkeywords.py
+++ b/orangecontrib/text/widgets/tests/test_owkeywords.py
@@ -215,23 +215,27 @@ def dummy_mbert(tokens, progress_callback=None):
scores = {"TF-IDF", "YAKE!", "Rake", "MBERT"}
settings = {"selected_scoring_methods": scores}
widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ self.wait_until_finished(widget=widget, timeout=10000)
+
+ for i in range(4):
+ m[i][1].assert_called_once()
+ m[i][1].reset_mock()
- cb = widget.controls.yake_lang_index
+ cb = widget.controls.yake_language
simulate.combobox_activate_item(cb, "Arabic")
- cb = widget.controls.rake_lang_index
+ self.wait_until_finished(widget=widget, timeout=10000)
+ cb = widget.controls.rake_language
simulate.combobox_activate_item(cb, "Finnish")
-
- self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
self.wait_until_finished(widget=widget, timeout=10000)
+
out = self.get_output(widget.Outputs.words, widget=widget)
self.assertEqual(scores, {a.name for a in out.domain.attributes})
- m[0][1].assert_called_once()
m[1][1].assert_called_once()
m[2][1].assert_called_once()
- m[3][1].assert_called_once()
- self.assertEqual(m[1][1].call_args[1]["language"], "Arabic")
- self.assertEqual(m[2][1].call_args[1]["language"], "Finnish")
+ self.assertEqual(m[1][1].call_args[1]["language"], "ar")
+ self.assertEqual(m[2][1].call_args[1]["language"], "fi")
def test_method_change(self):
"""Test method change by clicking"""
@@ -333,6 +337,88 @@ def test_connection_error(self):
np.testing.assert_array_equal(output.X, [[7.5], [1]])
self.assertFalse(self.widget.Warning.extraction_warnings.is_shown())
+ def test_language_from_corpus(self):
+ self.corpus.attributes["language"] = "it"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("it", self.widget.yake_language)
+ self.assertEqual("it", self.widget.rake_language)
+
+ simulate.combobox_activate_item(self.widget.controls.yake_language, "Finnish")
+ simulate.combobox_activate_item(self.widget.controls.rake_language, "Finnish")
+ self.assertEqual("fi", self.widget.yake_language)
+ self.assertEqual("fi", self.widget.rake_language)
+
+ # language none of them support - language should not change
+ self.corpus.attributes["language"] = "mr"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("fi", self.widget.yake_language)
+ self.assertEqual("fi", self.widget.rake_language)
+
+ # language that is supported by RAKE - language sets for RAKE
+ self.corpus.attributes["language"] = "hi_eng"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("fi", self.widget.yake_language)
+ self.assertEqual("hi_eng", self.widget.rake_language)
+
+ # language that is supported by YAKE! - language sets for YAKE
+ self.corpus.attributes["language"] = "uk"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("uk", self.widget.yake_language)
+ self.assertEqual("hi_eng", self.widget.rake_language)
+
+ # language that both support - widget sets both languages
+ self.corpus.attributes["language"] = "it"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("it", self.widget.yake_language)
+ self.assertEqual("it", self.widget.rake_language)
+
+ # language is None - nothing changes
+ self.corpus.attributes["language"] = None
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual("it", self.widget.yake_language)
+ self.assertEqual("it", self.widget.rake_language)
+
+ # corpus None - nothing changes
+ self.send_signal(self.widget.Inputs.corpus, None)
+ self.assertEqual("it", self.widget.yake_language)
+ self.assertEqual("it", self.widget.rake_language)
+
+ def test_language_from_settings(self):
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ simulate.combobox_activate_item(self.widget.controls.yake_language, "Slovenian")
+ simulate.combobox_activate_item(self.widget.controls.rake_language, "Nepali")
+
+ self.assertEqual("sl", self.widget.yake_language)
+ self.assertEqual("ne", self.widget.rake_language)
+ settings = self.widget.settingsHandler.pack_data(self.widget)
+
+ widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.assertEqual("en", self.corpus.language)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ self.assertEqual("sl", widget.yake_language)
+ self.assertEqual("ne", widget.rake_language)
+
+ def test_language_migration(self):
+ settings = {"__version__": 1, "yake_lang_index": 0, "rake_lang_index": 0}
+ widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.assertEqual("ar", widget.yake_language)
+ self.assertEqual("ar", widget.rake_language)
+
+ settings = {"__version__": 1, "yake_lang_index": 4, "rake_lang_index": 4}
+ widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.assertEqual("zh", widget.yake_language)
+ self.assertEqual("ca", widget.rake_language)
+
+ settings = {"__version__": 1, "yake_lang_index": 20, "rake_lang_index": 20}
+ widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.assertEqual("lv", widget.yake_language)
+ self.assertEqual("no", widget.rake_language)
+
+ settings = {"__version__": 1, "yake_lang_index": 33, "rake_lang_index": 28}
+ widget = self.create_widget(OWKeywords, stored_settings=settings)
+ self.assertEqual("uk", widget.yake_language)
+ self.assertEqual("tr", widget.rake_language)
+
if __name__ == "__main__":
unittest.main()
From f6a2e691f6c4eb59d8eeeaec93eae1e68613ac11 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 23 Feb 2024 14:06:30 +0100
Subject: [PATCH 2/3] Keywords - Use ISO languages
---
orangecontrib/text/keywords/__init__.py | 61 ++++++-------------------
1 file changed, 15 insertions(+), 46 deletions(-)
diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py
index 783ecad13..deb22c2c0 100644
--- a/orangecontrib/text/keywords/__init__.py
+++ b/orangecontrib/text/keywords/__init__.py
@@ -17,51 +17,18 @@
from orangecontrib.text.keywords.rake import Rake
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.preprocess import StopwordsFilter
-
-# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer
-
-# todo: refactor when refactoring language for keywords module
-# this is a temporary solution since supported_languages now returns lang ISO codes
-RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
+# all available languages for RAKE
+RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
-YAKE_LANGUAGE_MAPPING = {
- "Arabic": "ar",
- "Armenian": "hy",
- "Breton": "br",
- "Bulgarian": "bg",
- "Chinese": "zh",
- "Croatian": "hr",
- "Czech": "cz",
- "Danish": "da",
- "Dutch": "nl",
- "English": "en",
- "Estonian": "et",
- "Finnish": "fi",
- "French": "fr",
- "German": "de",
- "Greek": "el",
- "Hindi": "hi",
- "Hungarian": "hu",
- "Indonesian": "id",
- "Italian": "it",
- "Japanese": "ja",
- "Latvian": "lv",
- "Lithuanian": "lt",
- "Norwegian": "no",
- "Persian": "fa",
- "Polish": "pl",
- "Portuguese": "pt",
- "Romanian": "ro",
- "Russian": "ru",
- "Slovak": "sk",
- "Slovenian": "sl",
- "Spanish": "es",
- "Swedish": "sv",
- "Turkish": "tr",
- "Ukrainian": "uk"
-}
+# fmt: off
+YAKE_LANGUAGES = [
+ "ar", "hy", "br", "bg", "zh", "hr", "cs", "da", "nl", "en", "et", "fi",
+ "fr", "de", "el", "hi", "hu", "id", "it", "ja", "lv", "lt", "no", "fa",
+ "pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr", "uk"
+]
+# fmt: on
def tfidf_keywords(
@@ -110,7 +77,7 @@ def tfidf_keywords(
def yake_keywords(
texts: List[str],
- language: str = "English",
+ language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
@@ -135,7 +102,6 @@ def yake_keywords(
if progress_callback is None:
progress_callback = dummy_callback
- language = YAKE_LANGUAGE_MAPPING[language]
extractor = yake.KeywordExtractor(lan=language, n=max_len)
keywords = []
@@ -148,7 +114,7 @@ def yake_keywords(
def rake_keywords(
texts: List[str],
- language: str = "English",
+ language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
@@ -174,9 +140,12 @@ def rake_keywords(
if progress_callback is None:
progress_callback = dummy_callback
- if language.lower() not in [l.lower() for l in RAKE_LANGUAGES]:
+ if language not in RAKE_LANGUAGES:
raise ValueError(f"Language must be one of: {RAKE_LANGUAGES}")
+ language = ISO2LANG[language]
+ # some languages (e.g. Slovenian) have a different name in nltk than the ISO name
+ language = StopwordsFilter.LANG2NLTK.get(language, language)
stop_words_ = [x.strip() for x in stopwords.words(language.lower())]
rake_object = Rake(stop_words_, max_words_length=max_len)
From 153e06f8d5a5012b2625238c329e342dc3e08df7 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 23 Feb 2024 13:26:50 +0100
Subject: [PATCH 3/3] LanguageModel - Fix languages order and tests
---
orangecontrib/text/language.py | 3 ++-
orangecontrib/text/tests/test_language.py | 29 ++++++++++++++++++++++-
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index 1250c9757..fb64ddde3 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -124,7 +124,8 @@ def __init__(
"""
if languages is None:
# if languages not provided take all available languages
- languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
+ languages = filter(None, ISO2LANG)
+ languages = sorted(languages, key=ISO2LANG.get)
if include_none:
languages = [None] + languages
super().__init__(iterable=languages)
diff --git a/orangecontrib/text/tests/test_language.py b/orangecontrib/text/tests/test_language.py
index 24780152f..d068b2a5f 100644
--- a/orangecontrib/text/tests/test_language.py
+++ b/orangecontrib/text/tests/test_language.py
@@ -5,7 +5,34 @@
from Orange.data import StringVariable, Domain
from orangecontrib.text import Corpus
-from orangecontrib.text.language import detect_language, ISO2LANG
+from orangecontrib.text.language import detect_language, ISO2LANG, LanguageModel
+
+
+class TestLanguageModel(TestCase):
+ def test_model_without_languages(self):
+ # no None, all languages
+ lm = LanguageModel()
+ self.assertEqual(len(ISO2LANG) - 1, lm.rowCount())
+ all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
+ expected = sorted(list(ISO2LANG.values())[:-1])
+ self.assertEqual(expected, all_langs)
+
+ lm = LanguageModel(include_none=True)
+ self.assertEqual(len(ISO2LANG), lm.rowCount())
+ all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
+ expected = sorted(list(ISO2LANG.values())[:-1])
+ self.assertEqual(["(no language)"] + expected, all_langs)
+
+ def test_model_with_languages(self):
+ lm = LanguageModel(include_none=True, languages=["en", "ar", "it"])
+ self.assertEqual(4, lm.rowCount())
+ all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
+ self.assertEqual(["(no language)", "Arabic", "English", "Italian"], all_langs)
+
+ lm = LanguageModel(languages=["en", "ar", "it"])
+ self.assertEqual(3, lm.rowCount())
+ all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
+ self.assertEqual(["Arabic", "English", "Italian"], all_langs)
class TestLanguage(TestCase):