From a9ed149bf65d7ed3fb9dfb5b6f2bdde5f0a3614c Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 23 Feb 2024 14:06:15 +0100 Subject: [PATCH 1/3] Keywords widget - Use language from Corpus --- orangecontrib/text/widgets/owkeywords.py | 64 +++++++++-- .../text/widgets/tests/test_owkeywords.py | 102 ++++++++++++++++-- 2 files changed, 150 insertions(+), 16 deletions(-) diff --git a/orangecontrib/text/widgets/owkeywords.py b/orangecontrib/text/widgets/owkeywords.py index a5df964a8..92f926ef8 100644 --- a/orangecontrib/text/widgets/owkeywords.py +++ b/orangecontrib/text/widgets/owkeywords.py @@ -22,12 +22,12 @@ from orangecontrib.text import Corpus from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \ - YAKE_LANGUAGE_MAPPING, RAKE_LANGUAGES + YAKE_LANGUAGES, RAKE_LANGUAGES +from orangecontrib.text.language import LanguageModel from orangecontrib.text.preprocess import BaseNormalizer from orangecontrib.text.widgets.utils.words import create_words_table, \ WORDS_COLUMN_NAME -YAKE_LANGUAGES = list(YAKE_LANGUAGE_MAPPING.keys()) CONNECTION_WARNING = ( f"{ScoringMethods.MBERT} could not extract keywords from some " "documents due to connection error. Please rerun keyword extraction." 
@@ -202,15 +202,17 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin): keywords = "extract keywords, characteristic, term" buttons_area_orientation = Qt.Vertical + settings_version = 2 # Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr) # in setting we want to save integer and not Enum object (in case of PyQt6) DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder)) + DEFAULT_LANGUAGE = "en" settingsHandler = DomainContextHandler() selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF}) - yake_lang_index: int = Setting(YAKE_LANGUAGES.index("English")) - rake_lang_index: int = Setting(RAKE_LANGUAGES.index("English")) + yake_language: Optional[str] = Setting(None, schema_only=True) + rake_language: Optional[str] = Setting(None, schema_only=True) agg_method: int = Setting(AggregationMethods.MEAN) sel_method: int = ContextSetting(SelectionMethods.N_BEST) n_selected: int = ContextSetting(3) @@ -236,6 +238,15 @@ def __init__(self): self.words: Optional[List] = None self.__cached_keywords = {} self.model = KeywordsTableModel(parent=self) + + # languages from workflow should be retained when data on input + self.__pending_yake_language = self.yake_language + self.__pending_rake_language = self.rake_language + # language setting is None by default to prevent default language is + # saved as pending. 
It is set to default (here) after pending is stored + self.yake_language = self.yake_language or self.DEFAULT_LANGUAGE + self.rake_language = self.rake_language or self.DEFAULT_LANGUAGE + self._setup_gui() def _setup_gui(self): @@ -243,11 +254,17 @@ def _setup_gui(self): box = gui.widgetBox(self.controlArea, "Scoring Methods", grid) yake_cb = gui.comboBox( - self.controlArea, self, "yake_lang_index", items=YAKE_LANGUAGES, + self.controlArea, + self, + "yake_language", + model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES), callback=self.__on_yake_lang_changed ) rake_cb = gui.comboBox( - self.controlArea, self, "rake_lang_index", items=RAKE_LANGUAGES, + self.controlArea, + self, + "rake_language", + model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES), callback=self.__on_rake_lang_changed ) @@ -371,6 +388,17 @@ def set_corpus(self, corpus: Optional[Corpus]): self.corpus = corpus self.openContext(self.corpus) self.__sel_method_buttons.button(self.sel_method).setChecked(True) + if corpus is not None and corpus.language is not None: + if self.__pending_rake_language is not None: + self.yake_language = self.__pending_yake_language + self.rake_language = self.__pending_rake_language + self.__pending_yake_language = None + self.__pending_rake_language = None + else: + if corpus.language in YAKE_LANGUAGES: + self.yake_language = corpus.language + if corpus.language in RAKE_LANGUAGES: + self.rake_language = corpus.language def _clear(self): self.clear_messages() @@ -397,11 +425,11 @@ def update_scores(self): self.Warning.extraction_warnings.clear() kwargs = { ScoringMethods.YAKE: { - "language": YAKE_LANGUAGES[self.yake_lang_index], + "language": self.yake_language, "max_len": self.corpus.ngram_range[1] if self.corpus else 1 }, ScoringMethods.RAKE: { - "language": RAKE_LANGUAGES[self.rake_lang_index], + "language": self.rake_language, "max_len": self.corpus.ngram_range[1] if self.corpus else 1, }, } @@ -508,6 +536,26 @@ def send_report(self): 
self.report_paragraph("Words", ", ".join(self.words)) self.report_table("Keywords", self.view, num_format="{:.3f}") + @classmethod + def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]): + if version is None or version < 2: + # before version 2 settings were indexes now they are strings + # with language name and selected aggregator name + if "yake_lang_index" in settings: + settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]] + if "rake_lang_index" in settings: + # historic copy of RAKE_LANGUAGES, since current list (now set) depends + # on languages in NLTK. If they change order or add a language settings + # will not be migrated correctly + # fmt: off + previous_order = [ + "ar", "az", "eu", "bn", "ca", "zh", "da", "nl", "en", "fi", + "fr", "de", "el", "he", "hi_eng", "hu", "id", "it", "kk", + "ne", "no", "pt", "ro", "ru", "sl", "es", "sv", "tg", "tr" + ] + # fmt: on + settings["rake_language"] = previous_order[settings["rake_lang_index"]] + if __name__ == "__main__": # pylint: disable=ungrouped-imports diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py index 200e77246..76c4f36d0 100644 --- a/orangecontrib/text/widgets/tests/test_owkeywords.py +++ b/orangecontrib/text/widgets/tests/test_owkeywords.py @@ -215,23 +215,27 @@ def dummy_mbert(tokens, progress_callback=None): scores = {"TF-IDF", "YAKE!", "Rake", "MBERT"} settings = {"selected_scoring_methods": scores} widget = self.create_widget(OWKeywords, stored_settings=settings) + self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) + self.wait_until_finished(widget=widget, timeout=10000) + + for i in range(4): + m[i][1].assert_called_once() + m[i][1].reset_mock() - cb = widget.controls.yake_lang_index + cb = widget.controls.yake_language simulate.combobox_activate_item(cb, "Arabic") - cb = widget.controls.rake_lang_index + self.wait_until_finished(widget=widget, timeout=10000) + cb = 
widget.controls.rake_language simulate.combobox_activate_item(cb, "Finnish") - - self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) self.wait_until_finished(widget=widget, timeout=10000) + out = self.get_output(widget.Outputs.words, widget=widget) self.assertEqual(scores, {a.name for a in out.domain.attributes}) - m[0][1].assert_called_once() m[1][1].assert_called_once() m[2][1].assert_called_once() - m[3][1].assert_called_once() - self.assertEqual(m[1][1].call_args[1]["language"], "Arabic") - self.assertEqual(m[2][1].call_args[1]["language"], "Finnish") + self.assertEqual(m[1][1].call_args[1]["language"], "ar") + self.assertEqual(m[2][1].call_args[1]["language"], "fi") def test_method_change(self): """Test method change by clicking""" @@ -333,6 +337,88 @@ def test_connection_error(self): np.testing.assert_array_equal(output.X, [[7.5], [1]]) self.assertFalse(self.widget.Warning.extraction_warnings.is_shown()) + def test_language_from_corpus(self): + self.corpus.attributes["language"] = "it" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("it", self.widget.yake_language) + self.assertEqual("it", self.widget.rake_language) + + simulate.combobox_activate_item(self.widget.controls.yake_language, "Finnish") + simulate.combobox_activate_item(self.widget.controls.rake_language, "Finnish") + self.assertEqual("fi", self.widget.yake_language) + self.assertEqual("fi", self.widget.rake_language) + + # language none of them support - language should not change + self.corpus.attributes["language"] = "mr" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("fi", self.widget.yake_language) + self.assertEqual("fi", self.widget.rake_language) + + # language that is supported by RAKE - language sets for RAKE + self.corpus.attributes["language"] = "hi_eng" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("fi", self.widget.yake_language) + self.assertEqual("hi_eng", 
self.widget.rake_language) + + # language that is supported by YAKE! - language sets for YAKE + self.corpus.attributes["language"] = "uk" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("uk", self.widget.yake_language) + self.assertEqual("hi_eng", self.widget.rake_language) + + # language that both support - widget sets both languages + self.corpus.attributes["language"] = "it" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("it", self.widget.yake_language) + self.assertEqual("it", self.widget.rake_language) + + # language is None - nothing changes + self.corpus.attributes["language"] = None + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual("it", self.widget.yake_language) + self.assertEqual("it", self.widget.rake_language) + + # corpus None - nothing changes + self.send_signal(self.widget.Inputs.corpus, None) + self.assertEqual("it", self.widget.yake_language) + self.assertEqual("it", self.widget.rake_language) + + def test_language_from_settings(self): + self.send_signal(self.widget.Inputs.corpus, self.corpus) + simulate.combobox_activate_item(self.widget.controls.yake_language, "Slovenian") + simulate.combobox_activate_item(self.widget.controls.rake_language, "Nepali") + + self.assertEqual("sl", self.widget.yake_language) + self.assertEqual("ne", self.widget.rake_language) + settings = self.widget.settingsHandler.pack_data(self.widget) + + widget = self.create_widget(OWKeywords, stored_settings=settings) + self.assertEqual("en", self.corpus.language) + self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) + self.assertEqual("sl", widget.yake_language) + self.assertEqual("ne", widget.rake_language) + + def test_language_migration(self): + settings = {"__version__": 1, "yake_lang_index": 0, "rake_lang_index": 0} + widget = self.create_widget(OWKeywords, stored_settings=settings) + self.assertEqual("ar", widget.yake_language) + self.assertEqual("ar", 
widget.rake_language) + + settings = {"__version__": 1, "yake_lang_index": 4, "rake_lang_index": 4} + widget = self.create_widget(OWKeywords, stored_settings=settings) + self.assertEqual("zh", widget.yake_language) + self.assertEqual("ca", widget.rake_language) + + settings = {"__version__": 1, "yake_lang_index": 20, "rake_lang_index": 20} + widget = self.create_widget(OWKeywords, stored_settings=settings) + self.assertEqual("lv", widget.yake_language) + self.assertEqual("no", widget.rake_language) + + settings = {"__version__": 1, "yake_lang_index": 33, "rake_lang_index": 28} + widget = self.create_widget(OWKeywords, stored_settings=settings) + self.assertEqual("uk", widget.yake_language) + self.assertEqual("tr", widget.rake_language) + if __name__ == "__main__": unittest.main() From f6a2e691f6c4eb59d8eeeaec93eae1e68613ac11 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 23 Feb 2024 14:06:30 +0100 Subject: [PATCH 2/3] Keywords - Use ISO languages --- orangecontrib/text/keywords/__init__.py | 61 ++++++------------------- 1 file changed, 15 insertions(+), 46 deletions(-) diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index 783ecad13..deb22c2c0 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -17,51 +17,18 @@ from orangecontrib.text.keywords.rake import Rake from orangecontrib.text.language import ISO2LANG from orangecontrib.text.preprocess import StopwordsFilter - -# all available languages for RAKE from orangecontrib.text.vectorization import BowVectorizer - -# todo: refactor when refactoring language for keywords module -# this is a temporary solution since supported_languages now returns lang ISO codes -RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()] +# all available languages for RAKE +RAKE_LANGUAGES = StopwordsFilter.supported_languages() # all available languages for YAKE! 
-YAKE_LANGUAGE_MAPPING = { - "Arabic": "ar", - "Armenian": "hy", - "Breton": "br", - "Bulgarian": "bg", - "Chinese": "zh", - "Croatian": "hr", - "Czech": "cz", - "Danish": "da", - "Dutch": "nl", - "English": "en", - "Estonian": "et", - "Finnish": "fi", - "French": "fr", - "German": "de", - "Greek": "el", - "Hindi": "hi", - "Hungarian": "hu", - "Indonesian": "id", - "Italian": "it", - "Japanese": "ja", - "Latvian": "lv", - "Lithuanian": "lt", - "Norwegian": "no", - "Persian": "fa", - "Polish": "pl", - "Portuguese": "pt", - "Romanian": "ro", - "Russian": "ru", - "Slovak": "sk", - "Slovenian": "sl", - "Spanish": "es", - "Swedish": "sv", - "Turkish": "tr", - "Ukrainian": "uk" -} +# fmt: off +YAKE_LANGUAGES = [ + "ar", "hy", "br", "bg", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", + "fr", "de", "el", "hi", "hu", "id", "it", "ja", "lv", "lt", "no", "fa", + "pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr", "uk" +] +# fmt: on def tfidf_keywords( @@ -110,7 +77,7 @@ def tfidf_keywords( def yake_keywords( texts: List[str], - language: str = "English", + language: str = "en", max_len: int = 1, progress_callback: Callable = None ) -> List[List[Tuple[str, float]]]: @@ -135,7 +102,6 @@ def yake_keywords( if progress_callback is None: progress_callback = dummy_callback - language = YAKE_LANGUAGE_MAPPING[language] extractor = yake.KeywordExtractor(lan=language, n=max_len) keywords = [] @@ -148,7 +114,7 @@ def yake_keywords( def rake_keywords( texts: List[str], - language: str = "English", + language: str = "en", max_len: int = 1, progress_callback: Callable = None ) -> List[List[Tuple[str, float]]]: @@ -174,9 +140,12 @@ def rake_keywords( if progress_callback is None: progress_callback = dummy_callback - if language.lower() not in [l.lower() for l in RAKE_LANGUAGES]: + if language not in RAKE_LANGUAGES: raise ValueError(f"Language must be one of: {RAKE_LANGUAGES}") + language = ISO2LANG[language] + # some languages (e.g. 
Slovenian) have a different name in nltk than their ISO name + language = StopwordsFilter.LANG2NLTK.get(language, language) stop_words_ = [x.strip() for x in stopwords.words(language.lower())] rake_object = Rake(stop_words_, max_words_length=max_len) From 153e06f8d5a5012b2625238c329e342dc3e08df7 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 23 Feb 2024 13:26:50 +0100 Subject: [PATCH 3/3] LanguageModel - Fix languages order and tests --- orangecontrib/text/language.py | 3 ++- orangecontrib/text/tests/test_language.py | 29 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index 1250c9757..fb64ddde3 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -124,7 +124,8 @@ def __init__( """ if languages is None: # if languages not provided take all available languages - languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get) + languages = filter(None, ISO2LANG) + languages = sorted(languages, key=ISO2LANG.get) if include_none: languages = [None] + languages super().__init__(iterable=languages) diff --git a/orangecontrib/text/tests/test_language.py b/orangecontrib/text/tests/test_language.py index 24780152f..d068b2a5f 100644 --- a/orangecontrib/text/tests/test_language.py +++ b/orangecontrib/text/tests/test_language.py @@ -5,7 +5,34 @@ from Orange.data import StringVariable, Domain from orangecontrib.text import Corpus -from orangecontrib.text.language import detect_language, ISO2LANG +from orangecontrib.text.language import detect_language, ISO2LANG, LanguageModel + + +class TestLanguageModel(TestCase): + def test_model_without_languages(self): + # no None, all languages + lm = LanguageModel() + self.assertEqual(len(ISO2LANG) - 1, lm.rowCount()) + all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())] + expected = sorted(list(ISO2LANG.values())[:-1]) + self.assertEqual(expected, all_langs) + + lm = 
LanguageModel(include_none=True) + self.assertEqual(len(ISO2LANG), lm.rowCount()) + all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())] + expected = sorted(list(ISO2LANG.values())[:-1]) + self.assertEqual(["(no language)"] + expected, all_langs) + + def test_model_with_languages(self): + lm = LanguageModel(include_none=True, languages=["en", "ar", "it"]) + self.assertEqual(4, lm.rowCount()) + all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())] + self.assertEqual(["(no language)", "Arabic", "English", "Italian"], all_langs) + + lm = LanguageModel(languages=["en", "ar", "it"]) + self.assertEqual(3, lm.rowCount()) + all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())] + self.assertEqual(["Arabic", "English", "Italian"], all_langs) class TestLanguage(TestCase):