Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Keywords - language from corpus #961

Merged
merged 3 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 15 additions & 46 deletions orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,18 @@
from orangecontrib.text.keywords.rake import Rake
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.preprocess import StopwordsFilter

# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer


# todo: refactor when refactoring language for keywords module
# this is a temporary solution since supported_languages now returns lang ISO codes
RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
# all available languages for RAKE
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",
"Armenian": "hy",
"Breton": "br",
"Bulgarian": "bg",
"Chinese": "zh",
"Croatian": "hr",
"Czech": "cz",
"Danish": "da",
"Dutch": "nl",
"English": "en",
"Estonian": "et",
"Finnish": "fi",
"French": "fr",
"German": "de",
"Greek": "el",
"Hindi": "hi",
"Hungarian": "hu",
"Indonesian": "id",
"Italian": "it",
"Japanese": "ja",
"Latvian": "lv",
"Lithuanian": "lt",
"Norwegian": "no",
"Persian": "fa",
"Polish": "pl",
"Portuguese": "pt",
"Romanian": "ro",
"Russian": "ru",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Swedish": "sv",
"Turkish": "tr",
"Ukrainian": "uk"
}
# fmt: off
YAKE_LANGUAGES = [
"ar", "hy", "br", "bg", "zh", "hr", "cs", "da", "nl", "en", "et", "fi",
"fr", "de", "el", "hi", "hu", "id", "it", "ja", "lv", "lt", "no", "fa",
"pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr", "uk"
]
# fmt: on


def tfidf_keywords(
Expand Down Expand Up @@ -110,7 +77,7 @@ def tfidf_keywords(

def yake_keywords(
texts: List[str],
language: str = "English",
language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
Expand All @@ -135,7 +102,6 @@ def yake_keywords(
if progress_callback is None:
progress_callback = dummy_callback

language = YAKE_LANGUAGE_MAPPING[language]
extractor = yake.KeywordExtractor(lan=language, n=max_len)

keywords = []
Expand All @@ -148,7 +114,7 @@ def yake_keywords(

def rake_keywords(
texts: List[str],
language: str = "English",
language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
Expand All @@ -174,9 +140,12 @@ def rake_keywords(
if progress_callback is None:
progress_callback = dummy_callback

if language.lower() not in [l.lower() for l in RAKE_LANGUAGES]:
if language not in RAKE_LANGUAGES:
raise ValueError(f"Language must be one of: {RAKE_LANGUAGES}")

language = ISO2LANG[language]
# some languages (e.g. Slovenian) have a different name in nltk than the ISO name
language = StopwordsFilter.LANG2NLTK.get(language, language)
stop_words_ = [x.strip() for x in stopwords.words(language.lower())]
rake_object = Rake(stop_words_, max_words_length=max_len)

Expand Down
3 changes: 2 additions & 1 deletion orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def __init__(
"""
if languages is None:
# if languages not provided take all available languages
languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
languages = filter(None, ISO2LANG)
languages = sorted(languages, key=ISO2LANG.get)
if include_none:
languages = [None] + languages
super().__init__(iterable=languages)
Expand Down
29 changes: 28 additions & 1 deletion orangecontrib/text/tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,34 @@
from Orange.data import StringVariable, Domain

from orangecontrib.text import Corpus
from orangecontrib.text.language import detect_language, ISO2LANG
from orangecontrib.text.language import detect_language, ISO2LANG, LanguageModel


class TestLanguageModel(TestCase):
    """Unit tests for LanguageModel row contents and ordering."""

    @staticmethod
    def _displayed(model):
        # Gather the display string of every row, in model order.
        return [model.data(model.index(row)) for row in range(model.rowCount())]

    def test_model_without_languages(self):
        # Without an explicit language list the model holds all languages,
        # sorted by name; the final ISO2LANG entry (presumably the None
        # mapping — confirm against language.py) is excluded.
        model = LanguageModel()
        self.assertEqual(len(ISO2LANG) - 1, model.rowCount())
        names = sorted(list(ISO2LANG.values())[:-1])
        self.assertEqual(names, self._displayed(model))

        # include_none=True prepends the "(no language)" placeholder row.
        model = LanguageModel(include_none=True)
        self.assertEqual(len(ISO2LANG), model.rowCount())
        names = sorted(list(ISO2LANG.values())[:-1])
        self.assertEqual(["(no language)"] + names, self._displayed(model))

    def test_model_with_languages(self):
        # An explicit ISO-code list restricts the rows to those languages,
        # displayed by name in alphabetical order, after the placeholder.
        model = LanguageModel(include_none=True, languages=["en", "ar", "it"])
        self.assertEqual(4, model.rowCount())
        self.assertEqual(
            ["(no language)", "Arabic", "English", "Italian"],
            self._displayed(model),
        )

        model = LanguageModel(languages=["en", "ar", "it"])
        self.assertEqual(3, model.rowCount())
        self.assertEqual(["Arabic", "English", "Italian"], self._displayed(model))


class TestLanguage(TestCase):
Expand Down
64 changes: 56 additions & 8 deletions orangecontrib/text/widgets/owkeywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@

from orangecontrib.text import Corpus
from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \
YAKE_LANGUAGE_MAPPING, RAKE_LANGUAGES
YAKE_LANGUAGES, RAKE_LANGUAGES
from orangecontrib.text.language import LanguageModel
from orangecontrib.text.preprocess import BaseNormalizer
from orangecontrib.text.widgets.utils.words import create_words_table, \
WORDS_COLUMN_NAME

YAKE_LANGUAGES = list(YAKE_LANGUAGE_MAPPING.keys())
CONNECTION_WARNING = (
f"{ScoringMethods.MBERT} could not extract keywords from some "
"documents due to connection error. Please rerun keyword extraction."
Expand Down Expand Up @@ -202,15 +202,17 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
keywords = "extract keywords, characteristic, term"

buttons_area_orientation = Qt.Vertical
settings_version = 2

# Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr)
# in setting we want to save integer and not Enum object (in case of PyQt6)
DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder))
DEFAULT_LANGUAGE = "en"

settingsHandler = DomainContextHandler()
selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF})
yake_lang_index: int = Setting(YAKE_LANGUAGES.index("English"))
rake_lang_index: int = Setting(RAKE_LANGUAGES.index("English"))
yake_language: Optional[str] = Setting(None, schema_only=True)
rake_language: Optional[str] = Setting(None, schema_only=True)
agg_method: int = Setting(AggregationMethods.MEAN)
sel_method: int = ContextSetting(SelectionMethods.N_BEST)
n_selected: int = ContextSetting(3)
Expand All @@ -236,18 +238,33 @@ def __init__(self):
self.words: Optional[List] = None
self.__cached_keywords = {}
self.model = KeywordsTableModel(parent=self)

# languages from workflow should be retained when data on input
self.__pending_yake_language = self.yake_language
self.__pending_rake_language = self.rake_language
# language setting is None by default to prevent default language is
# saved as pending. It is set to default (here) after pending is stored
self.yake_language = self.yake_language or self.DEFAULT_LANGUAGE
self.rake_language = self.rake_language or self.DEFAULT_LANGUAGE

self._setup_gui()

def _setup_gui(self):
grid = QGridLayout()
box = gui.widgetBox(self.controlArea, "Scoring Methods", grid)

yake_cb = gui.comboBox(
self.controlArea, self, "yake_lang_index", items=YAKE_LANGUAGES,
self.controlArea,
self,
"yake_language",
model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES),
callback=self.__on_yake_lang_changed
)
rake_cb = gui.comboBox(
self.controlArea, self, "rake_lang_index", items=RAKE_LANGUAGES,
self.controlArea,
self,
"rake_language",
model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES),
callback=self.__on_rake_lang_changed
)

Expand Down Expand Up @@ -371,6 +388,17 @@ def set_corpus(self, corpus: Optional[Corpus]):
self.corpus = corpus
self.openContext(self.corpus)
self.__sel_method_buttons.button(self.sel_method).setChecked(True)
if corpus is not None and corpus.language is not None:
if self.__pending_rake_language is not None:
self.yake_language = self.__pending_yake_language
self.rake_language = self.__pending_rake_language
self.__pending_yake_language = None
self.__pending_rake_language = None
else:
if corpus.language in YAKE_LANGUAGES:
self.yake_language = corpus.language
if corpus.language in RAKE_LANGUAGES:
self.rake_language = corpus.language

def _clear(self):
self.clear_messages()
Expand All @@ -397,11 +425,11 @@ def update_scores(self):
self.Warning.extraction_warnings.clear()
kwargs = {
ScoringMethods.YAKE: {
"language": YAKE_LANGUAGES[self.yake_lang_index],
"language": self.yake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
ScoringMethods.RAKE: {
"language": RAKE_LANGUAGES[self.rake_lang_index],
"language": self.rake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1,
},
}
Expand Down Expand Up @@ -508,6 +536,26 @@ def send_report(self):
self.report_paragraph("Words", ", ".join(self.words))
self.report_table("Keywords", self.view, num_format="{:.3f}")

@classmethod
def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
    """Migrate settings saved by widget versions older than 2.

    Before version 2 the YAKE/RAKE language settings stored an index into
    the language lists current at that time; since version 2 they store the
    ISO language code directly.

    Parameters
    ----------
    settings:
        The settings dictionary loaded from the workflow; updated in place.
    version:
        The ``settings_version`` the workflow was saved with (None for very
        old workflows).
    """
    if version is None or version < 2:
        # Historic copies of both language orders as they were in version 1.
        # Indexing into the *live* YAKE_LANGUAGES/RAKE_LANGUAGES would
        # silently mis-migrate if those collections ever change order or
        # content (RAKE's is even derived from NLTK at runtime), so the old
        # orders are frozen here.
        # fmt: off
        yake_previous_order = [
            "ar", "hy", "br", "bg", "zh", "hr", "cs", "da", "nl", "en", "et",
            "fi", "fr", "de", "el", "hi", "hu", "id", "it", "ja", "lv", "lt",
            "no", "fa", "pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr",
            "uk"
        ]
        rake_previous_order = [
            "ar", "az", "eu", "bn", "ca", "zh", "da", "nl", "en", "fi",
            "fr", "de", "el", "he", "hi_eng", "hu", "id", "it", "kk",
            "ne", "no", "pt", "ro", "ru", "sl", "es", "sv", "tg", "tr"
        ]
        # fmt: on
        if "yake_lang_index" in settings:
            index = settings["yake_lang_index"]
            # Guard against a corrupt/out-of-range saved index: fall back to
            # the widget default instead of raising and breaking the load.
            settings["yake_language"] = (
                yake_previous_order[index]
                if 0 <= index < len(yake_previous_order)
                else cls.DEFAULT_LANGUAGE
            )
        if "rake_lang_index" in settings:
            index = settings["rake_lang_index"]
            settings["rake_language"] = (
                rake_previous_order[index]
                if 0 <= index < len(rake_previous_order)
                else cls.DEFAULT_LANGUAGE
            )


if __name__ == "__main__":
# pylint: disable=ungrouped-imports
Expand Down
Loading
Loading