Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Preprocess - Use language from Corpus #963

Merged
merged 1 commit into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from orangecontrib.text.vectorization import BowVectorizer

# all available languages for RAKE
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
RAKE_LANGUAGES = StopwordsFilter.supported_languages
# all available languages for YAKE!
# fmt: off
YAKE_LANGUAGES = [
Expand Down
5 changes: 3 additions & 2 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def lang_to_iso(language: str) -> str:
"""
return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)]

@staticmethod
@classmethod
@property
@wait_nltk_data
def supported_languages() -> Set[str]:
def supported_languages(_) -> Set[str]:
"""
List all languages supported by NLTK

Expand Down
4 changes: 4 additions & 0 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]
def supported_languages(self) -> List[Tuple[str, str]]:
return [(name, iso) for iso, (name, _) in self.model_files.items()]

@property
def supported_languages_iso(self) -> set:
    """
    ISO codes of the languages listed in ``supported_languages``.

    Returns
    -------
    set
        ISO code (second element) of every ``(name, iso)`` pair in
        ``self.supported_languages``; duplicates collapse into one entry.
    """
    # the result is a set of strings, not a list of (name, iso) tuples
    return {iso for _, iso in self.supported_languages}

@property
def online(self) -> bool:
try:
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_stopwords_slovene(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_supported_languages(self):
langs = preprocess.StopwordsFilter.supported_languages()
langs = preprocess.StopwordsFilter.supported_languages
self.assertIsInstance(langs, set)
# just testing few of most important languages since I want for test to be
# resistant for any potentially newly introduced languages by NLTK
Expand Down
85 changes: 76 additions & 9 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from AnyQt.QtGui import QBrush, QValidator

from Orange.util import wrap_callback
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath

import Orange.widgets.data.owpreprocess
Expand Down Expand Up @@ -112,7 +114,8 @@ def set_current_language(self, iso_language: Optional[str]):
The ISO language code of element to be selected.
"""
index = self.findData(iso_language)
self.setCurrentIndex(index)
if index >= 0:
self.setCurrentIndex(index)


class UDPipeComboBox(LanguageComboBox):
Expand All @@ -130,15 +133,9 @@ def items(self) -> List:
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
super().add_items(self.__items, include_none, language)

def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
if iso_language in iso_items:
super().set_current_language(iso_language)
elif self.__default_lang in iso_items:
if language not in iso_items and self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
elif self.__items:
self.setCurrentIndex(0)

def showPopup(self):
if self.__items != self.items:
Expand Down Expand Up @@ -657,7 +654,7 @@ def __init__(self, parent=None, **kwargs):

self.__combo = LanguageComboBox(
self,
StopwordsFilter.supported_languages(),
StopwordsFilter.supported_languages,
self.__sw_lang,
True,
self.__set_language,
Expand Down Expand Up @@ -1044,6 +1041,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()


class PreprocessSettingsHandler(SettingsHandler):
    """
    Settings handler, that makes all language settings, which are
    a part of common preprocess settings, schema_only. It removes them when
    settings are not loaded from schema but from common settings.
    """

    def _remove_schema_only(self, settings_dict):
        """
        Remove schema-only settings and every preprocessor parameter whose
        name contains "language".

        Parameters
        ----------
        settings_dict
            Settings to clean. The parameter dictionaries found under
            ``storedsettings -> preprocessors`` are modified in place.
        """
        super()._remove_schema_only(settings_dict)
        for _, data, _ in self.provider.traverse_settings(data=settings_dict):
            # settings may lack "storedsettings"/"preprocessors";
            # then there is nothing to strip
            stored = data.get("storedsettings") or {}
            for _, params in stored.get("preprocessors", ()):
                # collect keys first, dict must not change size while iterated
                for key in [name for name in params if "language" in name]:
                    del params[key]


PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
Expand Down Expand Up @@ -1127,12 +1139,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical

def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
self.__store_pending_languages()

box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
Expand All @@ -1150,6 +1164,16 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)

def set_model(self, pmodel):
    """
    Set the preprocessor model and watch it for newly inserted rows.

    After delegating to the parent implementation, the model's
    ``rowsInserted`` signal is connected to the handler that sets the
    language from corpus. Nothing is connected for a falsy ``pmodel``.
    """
    super().set_model(pmodel)
    if not pmodel:
        return
    pmodel.rowsInserted.connect(self.__on_item_inserted)

def __on_item_inserted(self, _, first: int, last: int):
    """
    Set the language from corpus for every newly inserted editor.

    Slot for the model's ``rowsInserted`` signal.

    Parameters
    ----------
    _
        Parent index sent by the signal; unused.
    first
        Index of the first inserted row.
    last
        Index of the last inserted row (inclusive).
    """
    # handle the whole inclusive range instead of asserting a single row:
    # asserts are stripped under -O and several rows may be inserted at once
    for row in range(first, last + 1):
        self.__set_languages_single_editor(row)

def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
Expand Down Expand Up @@ -1179,6 +1203,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
def set_data(self, data: Corpus):
"""
Store the input corpus and update editors' languages accordingly.

Calls ``cancel()`` first (presumably to stop a task started for the
previous input - confirm), then stores ``data`` and lets
``__set_languages`` apply the corpus language to the editors.
"""
self.cancel()
self.data = data
self.__set_languages()

# Language parameters per preprocessor module: maps the module's qualname to
# a list of (parameter name, languages supported by that method) pairs.
# The corpus language is written to a parameter only when it is contained
# in the paired collection of supported languages.
# NOTE(review): UDPipeModels() is instantiated while this class body is
# executed - confirm that creating it at that point is acceptable.
LANG_PARAMS = {
"preprocess.normalize": [
("snowball_language", SnowballStemmer.supported_languages),
("udpipe_language", UDPipeModels().supported_languages_iso),
("lemmagen_language", LemmagenLemmatizer.supported_languages),
],
"preprocess.filter": [("language", StopwordsFilter.supported_languages)],
}

def __store_pending_languages(self):
    """
    Remember which language parameters are present in the stored settings.

    For every preprocessor in ``storedsettings`` collect the names of its
    parameters that contain "language"; the result maps the preprocessor
    name to that set of parameter names.
    """
    pending = {}
    for qualname, params in self.storedsettings["preprocessors"]:
        pending[qualname] = {name for name in params if "language" in name}
    self.__pending_languages = pending

def __set_languages(self):
"""
Apply the corpus language to the editors of all rows in the
preprocessor model and clear the pending (schema-provided) languages.

Rows are visited only when a corpus is present.
"""
if self.data is not None:
for i in range(self.preprocessormodel.rowCount()):
self.__set_languages_single_editor(i)
# drop the pending languages, so later calls are free to overwrite
# every language parameter
self.__pending_languages = {}

def __set_languages_single_editor(self, item_index: int):
"""
Set language from corpus for single editor/module,
keep language unchanged if it comes from schema (pending).

Parameters
----------
item_index
    Row of the editor's item in the preprocessor model.
"""
# NOTE(review): truthiness of self.data is tested here, while
# __set_languages tests `is not None` - presumably an empty corpus is
# falsy and therefore skipped; confirm this is intended
if self.data and self.data.language:
model = self.preprocessormodel
item = model.item(item_index)
pp_name = item.data(DescriptionRole).qualname
params = item.data(ParametersRole)
# names of language parameters that must stay as loaded from the schema
pending = self.__pending_languages.get(pp_name, set())
for param, available_langs in self.LANG_PARAMS.get(pp_name, []):
if param not in pending and self.data.language in available_langs:
# set language if not pending from schema - should not be changed
# and if available for the method
params[param] = self.data.language
with disconnected(model.dataChanged, self.__on_modelchanged):
# disconnecting prevents a double apply call; apply is already called
# on new data and when a row is inserted - both callers of this method
item.setData(params, ParametersRole)

def buildpreproc(self) -> PreprocessorList:
plist = []
Expand Down
163 changes: 159 additions & 4 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from unittest.mock import patch, PropertyMock, MagicMock, Mock

import numpy as np
from AnyQt.QtGui import QStandardItem, QIcon
from Orange.data import Domain, StringVariable
from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate
Expand Down Expand Up @@ -180,6 +182,153 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())

def test_language_from_corpus(self):
"""Test language from corpus is set correctly"""
initial = {
"name": "",
"preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})],
}
self.widget.storedsettings = initial
self.widget._initialize()
self.assertDictEqual(initial, self.widget.storedsettings)
combos = self.widget.mainArea.findChildren(LanguageComboBox)
# without a corpus every language combo shows English
self.assertEqual(
["English", "English", "English", "English"],
[c.currentText() for c in combos]
)

# test with Slovenian - language should be set for all preprocessors except
# Snowball, which doesn't support Slovenian
self.corpus.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["English", "Slovenian", "Slovenian", "Slovenian"],
[c.currentText() for c in combos]
)
settings = self.widget.storedsettings["preprocessors"]
self.assertEqual("sl", settings[0][1]["udpipe_language"])
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
self.assertEqual("sl", settings[1][1]["language"])

# test with Lithuanian, which is supported by only one preprocessor
self.corpus.attributes["language"] = "lt"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["English", "Lithuanian", "Slovenian", "Slovenian"],
[c.currentText() for c in combos]
)
settings = self.widget.storedsettings["preprocessors"]
self.assertEqual("lt", settings[0][1]["udpipe_language"])
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
self.assertEqual("sl", settings[1][1]["language"])

# test with Portuguese - set everywhere except Lemmagen, which keeps Slovenian
self.corpus.attributes["language"] = "pt"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
[c.currentText() for c in combos]
)
settings = self.widget.storedsettings["preprocessors"]
self.assertEqual("pt", settings[0][1]["snowball_language"])
self.assertEqual("pt", settings[0][1]["udpipe_language"])
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
self.assertEqual("pt", settings[1][1]["language"])

# language not supported by any preprocessor - language shouldn't change
self.corpus.attributes["language"] = "bo"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
[c.currentText() for c in combos]
)
settings = self.widget.storedsettings["preprocessors"]
self.assertEqual("pt", settings[0][1]["snowball_language"])
self.assertEqual("pt", settings[0][1]["udpipe_language"])
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
self.assertEqual("pt", settings[1][1]["language"])

# test with missing language - language shouldn't change
self.corpus.attributes["language"] = None
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
[c.currentText() for c in combos]
)
settings = self.widget.storedsettings["preprocessors"]
self.assertEqual("pt", settings[0][1]["snowball_language"])
self.assertEqual("pt", settings[0][1]["udpipe_language"])
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
self.assertEqual("pt", settings[1][1]["language"])

def test_language_from_schema(self):
"""Test language from schema/workflow is retained"""
initial = {
"name": "",
"preprocessors": [
(
"preprocess.normalize",
{
"lemmagen_language": "sl",
"snowball_language": "nl",
"udpipe_language": "lt",
},
),
("preprocess.filter", {"language": "nl"}),
],
}
self.widget.storedsettings = initial

# create a new widget from the packed settings of the first one
settings = self.widget.settingsHandler.pack_data(self.widget)
widget = self.create_widget(OWPreprocess, stored_settings=settings)
# sending a corpus must not overwrite the languages given in stored settings
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
self.assertDictEqual(initial, widget.storedsettings)
combos = widget.mainArea.findChildren(LanguageComboBox)
self.assertEqual(
["Dutch", "Lithuanian", "Slovenian", "Dutch"],
[c.currentText() for c in combos]
)

def test_language_from_corpus_editor_inserted(self):
"""Test language from corpus is set to new editor too"""
initial = {
"name": "",
"preprocessors": [("preprocess.filter", {})],
}
self.widget.storedsettings = initial
self.widget._initialize()
self.assertDictEqual(initial, self.widget.storedsettings)
combos = self.widget.mainArea.findChildren(LanguageComboBox)
self.assertEqual(
["English"],
[c.currentText() for c in combos]
)

# insert data - language of stopwords combo should change to Slovenian
self.corpus.attributes["language"] = "sl"
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
["Slovenian"],
[c.currentText() for c in combos]
)

# insert new editor - all languages except snowball should be set to Slovenian
pp_def = self.widget._qname2ppdef["preprocess.normalize"]
description = pp_def.description
item = QStandardItem(description.title)
icon = QIcon(description.icon)
item.setIcon(icon)
item.setToolTip(description.summary)
item.setData(pp_def, DescriptionRole)
item.setData({}, ParametersRole)
self.widget.preprocessormodel.insertRow(0, [item])
self.wait_until_finished()

# collect combos again, the inserted editor added new ones
combos = self.widget.mainArea.findChildren(LanguageComboBox)
self.assertEqual(
['Slovenian', 'English', 'Slovenian', 'Slovenian'],
[c.currentText() for c in combos]
)


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
Expand Down Expand Up @@ -983,14 +1132,20 @@ def test_set_current_language(self):
self.assertEqual("Portuguese", cb.currentText())
cb.set_current_language("sl")
self.assertEqual("Slovenian", cb.currentText())
cb.set_current_language("abc") # should set to default
self.assertEqual("English", cb.currentText())
cb.set_current_language("abc") # language not in list - keep current selection
self.assertEqual("Slovenian", cb.currentText())

def test_set_language_to_default(self):
"""In case current item not in dropdown anymore set language to default"""
mock = Mock()
cb = UDPipeComboBox(None, "pt", "en", mock)
self.assertEqual("Portuguese", cb.currentText())
# when no default language in the dropdown set to first
cb.removeItem(0)
x = cb._UDPipeComboBox__items
cb._UDPipeComboBox__items = x[:3] + x[4:]
cb.set_current_language("abc")
self.assertEqual("English (lines)", cb.currentText())
cb.showPopup()
self.assertEqual("English", cb.currentText())

def test_change_item(self):
mock = Mock()
Expand Down
Loading