diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index df3f23bb7..7967518c2 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -1,5 +1,4 @@
 import os
-import warnings
 from collections import Counter, defaultdict
 from copy import copy, deepcopy
 from numbers import Integral
@@ -9,9 +8,6 @@
 
 import nltk
 import numpy as np
-import scipy.sparse as sp
-from gensim import corpora
-
 from Orange.data import (
     Variable,
     ContinuousVariable,
@@ -23,17 +19,12 @@
 )
 from Orange.preprocess.transformation import Identity
 from Orange.data.util import get_unique_names
+from gensim import corpora
+from orangewidget.utils.signals import summarize, PartialSummary
+import scipy.sparse as sp
 
 from orangecontrib.text.language import ISO2LANG
 
-try:
-    from orangewidget.utils.signals import summarize, PartialSummary
-    # import to check if Table summary is available - if summarize_by_name does
-    # not exist Orange (3.28) does not support automated summaries
-    from Orange.widgets.utils.state_summary import summarize_by_name
-except ImportError:
-    summarize, PartialSummary = None, None
-
 
 def get_sample_corpora_dir():
     path = os.path.dirname(__file__)
@@ -88,7 +79,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
         """
         self.text_features = []  # list of text features for mining
         self._tokens = None
-        self._dictionary = None
         self.ngram_range = (1, 1)
         self._pos_tags = None
         from orangecontrib.text.preprocess import PreprocessorList
@@ -397,8 +387,13 @@ def store_tokens(self, tokens, dictionary=None):
         Args:
             tokens (list): List of lists containing tokens.
         """
+        if dictionary is not None:
+            warn(
+                "The dictionary argument is deprecated and has no effect. "
+                "It will be removed in orange3-text 1.15.",
+                FutureWarning,
+            )
         self._tokens = np.array(tokens, dtype=object)
-        self._dictionary = dictionary or corpora.Dictionary(self.tokens)
 
     @property
     def tokens(self):
@@ -407,7 +402,7 @@
         present, run default preprocessor and return tokens.
         """
         if self._tokens is None:
-            return self._base_tokens()[0]
+            return self._base_tokens()
         return self._tokens
 
     def has_tokens(self):
@@ -419,19 +414,17 @@ def _base_tokens(self):
             BASE_TOKENIZER, PreprocessorList
 
         # don't use anything that requires NLTK data to assure async download
-        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
-                                               BASE_TOKENIZER])
+        base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
         corpus = base_preprocessors(self)
-        return corpus.tokens, corpus.dictionary
+        return corpus.tokens
 
     @property
     def dictionary(self):
-        """
-        corpora.Dictionary: A token to id mapper.
-        """
-        if self._dictionary is None:
-            return self._base_tokens()[1]
-        return self._dictionary
+        warn(
+            "The dictionary property is deprecated and will be removed in orange3-text 1.15.",
+            FutureWarning,
+        )
+        return corpora.Dictionary(self.tokens)
 
     @property
     def pos_tags(self):
@@ -468,6 +461,16 @@ def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
                 for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
                 for doc in data)
 
+    def count_tokens(self) -> int:
+        """Count the number of all (non-unique) tokens in the corpus."""
+        return sum(map(len, self.tokens))
+
+    def count_unique_tokens(self) -> int:
+        """Count the number of unique tokens in the corpus."""
+        # fast enough even for very large datasets, so we avoid caching and
+        # the invalidation problems that would come with it
+        return len({tk for lst in self.tokens for tk in lst})
+
     @property
     def ngrams(self):
         """generator: Ngram representations of documents."""
@@ -476,10 +479,9 @@
     def copy(self):
         """Return a copy of the table."""
         c = super().copy()
-        # since tokens and dictionary are considered immutable copies are not needed
         c._setup_corpus(text_features=copy(self.text_features))
+        # tokens are considered immutable, so no copy is needed
         c._tokens = self._tokens
-        c._dictionary = self._dictionary
         c.ngram_range = self.ngram_range
         c.pos_tags = self.pos_tags
         c.name = self.name
@@ -640,7 +642,6 @@ def retain_preprocessing(orig, new, key=...):
             new.pos_tags = orig.pos_tags
         else:
             raise TypeError('Indexing by type {} not supported.'.format(type(key)))
-        new._dictionary = orig._dictionary
 
     if isinstance(new, Corpus):
         # _find_identical_feature returns non when feature not found
@@ -665,23 +666,20 @@ def retain_preprocessing(orig, new, key=...):
         new._infer_text_features()
 
 
-if summarize:
-    # summarize is not available in older versions of orange-widget-base
-    # skip if not available
-    @summarize.register(Corpus)
-    def summarize_corpus(corpus: Corpus) -> PartialSummary:
-        """
-        Provides automated input and output summaries for Corpus
-        """
-        table_summary = summarize.dispatch(Table)(corpus)
-        extras = (
-            (
-                f"<br/>Tokens: {sum(map(len, corpus.tokens))}, "
-                f"Types: {len(corpus.dictionary)}"
-            )
-            if corpus.has_tokens()
-            else "<br/>Corpus is not preprocessed"
+@summarize.register(Corpus)
+def summarize_corpus(corpus: Corpus) -> PartialSummary:
+    """
+    Provides automated input and output summaries for Corpus
+    """
+    table_summary = summarize.dispatch(Table)(corpus)
+    extras = (
+        (
+            f"<br/>Tokens: {corpus.count_tokens()}, "
+            f"Types: {corpus.count_unique_tokens()}"
         )
-        language = ISO2LANG[corpus.language] if corpus.language else "not set"
-        extras += f"<br/>Language: {language}"
-        return PartialSummary(table_summary.summary, table_summary.details + extras)
+        if corpus.has_tokens()
+        else "<br/>Corpus is not preprocessed"
+    )
+    language = ISO2LANG[corpus.language] if corpus.language else "not set"
+    extras += f"<br/>Language: {language}"
+    return PartialSummary(table_summary.summary, table_summary.details + extras)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..50f748c31 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
         corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
         return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))
 
-    def _filter_tokens(self, corpus: Corpus, callback: Callable,
-                       dictionary=None) -> Corpus:
+    def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
         callback(0, "Filtering...")
         filtered_tokens = []
         filtered_tags = []
@@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable,
             if corpus.pos_tags is not None:
                 filtered_tags.append(list(compress(corpus.pos_tags[i], filter_map)))
 
-        if dictionary is None:
-            corpus.store_tokens(filtered_tokens)
-        else:
-            corpus.store_tokens(filtered_tokens, dictionary)
+        corpus.store_tokens(filtered_tokens)
         if filtered_tags:
             corpus.pos_tags = np.array(filtered_tags, dtype=object)
         return corpus
@@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     def _fit(self, corpus: Corpus):
         raise NotImplemented
 
-    def _filter_tokens(self, corpus: Corpus, callback: Callable,
-                       dictionary=None) -> Corpus:
-        corpus = super()._filter_tokens(corpus, callback,
-                                        dictionary=self._dictionary)
-        return corpus
+    def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
+        return super()._filter_tokens(corpus, callback)
 
     def _check(self, token):
         assert self._lexicon is not None
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index cb743e6d2..d5d3691e6 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -2,38 +2,30 @@
 import pickle
 import unittest
 from datetime import datetime
-from unittest import skipIf
 
 import numpy as np
 from numpy.testing import assert_array_equal
-from orangecontrib.text.preprocess import (
-    RegexpTokenizer,
-    LowercaseTransformer,
-    StopwordsFilter,
-)
-from scipy.sparse import csr_matrix, issparse
-
 from Orange.data import (
-    Table,
+    ContinuousVariable,
     DiscreteVariable,
-    StringVariable,
     Domain,
-    ContinuousVariable,
+    StringVariable,
+    Table,
     dataset_dirs,
 )
+from orangewidget.utils.signals import summarize
+from scipy.sparse import csr_matrix, issparse
 
+import orangecontrib
 from orangecontrib.text import preprocess
 from orangecontrib.text.corpus import Corpus
+from orangecontrib.text.preprocess import (
+    LowercaseTransformer,
+    RegexpTokenizer,
+    StopwordsFilter,
+)
 from orangecontrib.text.tag import AveragedPerceptronTagger
 
-try:
-    from orangewidget.utils.signals import summarize
-    # import to check if Table summary is available - if summarize_by_name does
-    # not exist Orange (3.28) does not support automated summaries
-    from Orange.widgets.utils.state_summary import summarize_by_name
-except ImportError:
-    summarize = None
-
 
 class CorpusTests(unittest.TestCase):
     def setUp(self):
@@ -198,7 +190,6 @@ def test_extend_attributes_keep_preprocessing(self):
         self.assertEqual(len(new_c._tokens), len(c))
         np.testing.assert_equal(new_c._tokens, new_c._tokens)
-        self.assertEqual(new_c._dictionary, c._dictionary)
         self.assertEqual(new_c.text_features, c.text_features)
         self.assertEqual(new_c.ngram_range, c.ngram_range)
         self.assertEqual(new_c.attributes, c.attributes)
 
@@ -415,20 +406,17 @@ def test_getitem(self):
         self.assertEqual(len(sel), 1)
         self.assertEqual(len(sel._tokens), 1)
         np.testing.assert_equal(sel._tokens, np.array([c._tokens[0]]))
-        self.assertEqual(sel._dictionary, c._dictionary)
 
         sel = c[0:5]
         self.assertEqual(len(sel), 5)
         self.assertEqual(len(sel._tokens), 5)
         np.testing.assert_equal(sel._tokens, c._tokens[0:5])
-        self.assertEqual(sel._dictionary, c._dictionary)
 
         ind = [3, 4, 5, 6]
         sel = c[ind]
         self.assertEqual(len(sel), len(ind))
         self.assertEqual(len(sel._tokens), len(ind))
         np.testing.assert_equal(sel._tokens, c._tokens[ind])
-        self.assertEqual(sel._dictionary, c._dictionary)
         self.assertEqual(sel.text_features, c.text_features)
         self.assertEqual(sel.ngram_range, c.ngram_range)
         self.assertEqual(sel.attributes, c.attributes)
@@ -438,7 +426,6 @@ def test_getitem(self):
         self.assertEqual(len(sel), len(ind))
         self.assertEqual(len(sel._tokens), len(ind))
         np.testing.assert_equal(sel._tokens, c._tokens[ind])
-        self.assertEqual(sel._dictionary, c._dictionary)
         self.assertEqual(sel.text_features, c.text_features)
         self.assertEqual(sel.ngram_range, c.ngram_range)
         self.assertEqual(sel.attributes, c.attributes)
@@ -448,7 +435,6 @@ def test_getitem(self):
         self.assertEqual(len(sel), len(ind))
         self.assertEqual(len(sel._tokens), len(ind))
         np.testing.assert_equal(sel._tokens, c._tokens[list(ind)])
-        self.assertEqual(sel._dictionary, c._dictionary)
         self.assertEqual(sel.text_features, c.text_features)
         self.assertEqual(sel.ngram_range, c.ngram_range)
         self.assertEqual(sel.attributes, c.attributes)
@@ -457,7 +443,6 @@ def test_getitem(self):
         self.assertEqual(len(sel), len(c))
         self.assertEqual(len(sel._tokens), len(c))
         np.testing.assert_equal(sel._tokens, c._tokens)
-        self.assertEqual(sel._dictionary, c._dictionary)
         self.assertEqual(sel.text_features, c.text_features)
         self.assertEqual(sel.ngram_range, c.ngram_range)
         self.assertEqual(sel.attributes, c.attributes)
@@ -466,7 +451,6 @@ def test_getitem(self):
         self.assertEqual(len(sel), 5)
         self.assertEqual(len(sel._tokens), 5)
         np.testing.assert_equal(sel._tokens, c._tokens[0:5])
-        self.assertEqual(sel._dictionary, c._dictionary)
 
     def test_set_text_features(self):
         c = Corpus.from_file('friends-transcripts')[:100]
@@ -707,8 +691,48 @@ def test_language_unpickle(self):
         corpus = Corpus.from_file(file)
         self.assertIsNone(corpus.attributes["language"])
 
+    def test_count_tokens(self):
+        domain = Domain([], metas=[StringVariable("Text")])
+        texts = np.array([["Test text"], ["This is another test text"], ["Text 3"]])
+        corpus = Corpus.from_numpy(
+            domain,
+            np.empty((3, 0)),
+            metas=texts,
+            text_features=domain.metas,
+            language="en",
+        )
+        corpus = RegexpTokenizer()(corpus)
+        self.assertEqual(9, corpus.count_tokens())
+        # test on Corpus subset
+        self.assertEqual(7, corpus[:2].count_tokens())
+        self.assertEqual(2, corpus[:1].count_tokens())
+
+    def test_count_unique_tokens(self):
+        domain = Domain([], metas=[StringVariable("Text")])
+        texts = np.array([["Test text"], ["This is another test text"], ["Text 3"]])
+        corpus = Corpus.from_numpy(
+            domain,
+            np.empty((3, 0)),
+            metas=texts,
+            text_features=domain.metas,
+            language="en",
+        )
+        corpus = RegexpTokenizer()(LowercaseTransformer()(corpus))
+        self.assertEqual(6, corpus.count_unique_tokens())
+        # test on Corpus subset
+        self.assertEqual(5, corpus[:2].count_unique_tokens())
+        self.assertEqual(2, corpus[:1].count_unique_tokens())
+
+    def test_remove_dictionary(self):
+        """
+        When this test starts to fail, remove:
+        - this test
+        - the dictionary property from Corpus
+        - the dictionary argument from Corpus.store_tokens
+        """
+        self.assertFalse(orangecontrib.text.__version__.startswith("1.15"))
+
 
-@skipIf(summarize is None, "summarize is not available for orange3<=3.28")
 class TestCorpusSummaries(unittest.TestCase):
     def test_corpus_not_preprocessed(self):
         """Check if details part of the summary is formatted correctly"""
@@ -747,6 +771,40 @@ def test_corpus_preprocessed(self):
         self.assertEqual(140, summary.summary)
         self.assertEqual(details, summary.details)
 
+    def test_corpus_subset(self):
+        """Test that numbers are correct on a corpus subset"""
+        # use a custom corpus to have more control
+        domain = Domain([], metas=[StringVariable("Text")])
+        corpus = Corpus.from_numpy(
+            domain,
+            np.empty((2, 0)),
+            metas=np.array([["This is test text 1"], ["This is test another text"]]),
+            text_features=domain.metas,
+            language="en",
+        )
+        corpus = RegexpTokenizer()(corpus)
+
+        details = (
" + f"Metas: string
" + f"Tokens: 10, Types: 6
" + f"Language: English" + ) + summary = summarize.dispatch(Corpus)(corpus) + self.assertEqual(2, summary.summary) + self.assertEqual(details, summary.details) + + corpus = corpus[:1] + details = ( + f"{len(corpus)} instance, 1 variable
" + f"Metas: string
" + f"Tokens: 5, Types: 5
" + f"Language: English" + ) + summary = summarize.dispatch(Corpus)(corpus) + self.assertEqual(1, summary.summary) + self.assertEqual(details, summary.details) + def test_language(self): """Check if details part of the summary is formatted correctly""" corpus = Corpus.from_file("book-excerpts") diff --git a/orangecontrib/text/tests/test_topic_modeling.py b/orangecontrib/text/tests/test_topic_modeling.py index 309697bc0..94cd906d2 100644 --- a/orangecontrib/text/tests/test_topic_modeling.py +++ b/orangecontrib/text/tests/test_topic_modeling.py @@ -18,8 +18,8 @@ def test_fit_transform(self): def test_get_topic_table_by_id(self): self.model.fit(self.corpus) topic1 = self.model.get_topics_table_by_id(1) - self.assertEqual(len(topic1), len(self.corpus.dictionary)) - self.assertEqual(topic1.metas.shape, (len(self.corpus.dictionary), 2)) + self.assertEqual(len(topic1), self.corpus.count_unique_tokens()) + self.assertEqual(topic1.metas.shape, (self.corpus.count_unique_tokens(), 2)) # self.assertAlmostEqual(topic1.W.sum(), 1.) self.assertFalse(any(topic1.W == np.nan)) diff --git a/orangecontrib/text/topics/topics.py b/orangecontrib/text/topics/topics.py index e53ac58a9..a96162179 100644 --- a/orangecontrib/text/topics/topics.py +++ b/orangecontrib/text/topics/topics.py @@ -77,7 +77,9 @@ def infer_ngrams_corpus(corpus, return_dict=False): dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None) idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep]) keep = [keep[i][0] for i in idx_of_keep] - result = Sparse2Corpus(corpus.X[:, keep].T) + result = [] + if len(dictionary) > 0: + result = Sparse2Corpus(corpus.X[:, keep].T) return (result, dictionary) if return_dict else result @@ -106,7 +108,8 @@ def fit(self, corpus, on_progress=dummy_callback): Args: corpus (Corpus): A corpus to learn topics from. 
""" - if not len(corpus.dictionary): + ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True) + if len(dictionary) == 0: return None model_kwars = self.kwargs if "callbacks" in inspect.getfullargspec(self.Model).args: @@ -116,7 +119,6 @@ def fit(self, corpus, on_progress=dummy_callback): model_kwars, callbacks=[GensimProgressCallback(on_progress)] ) - ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True) self.model = self.Model( corpus=ngrams_corpus, id2word=dictionary, **model_kwars ) diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py index 663503c8a..ed51ed2eb 100644 --- a/orangecontrib/text/vectorization/bagofwords.py +++ b/orangecontrib/text/vectorization/bagofwords.py @@ -70,10 +70,12 @@ def __init__(self, norm=NONE, wlocal=COUNT, wglobal=NONE): self.wglobal = wglobal def _transform(self, corpus, source_dict=None, callback=dummy_callback): - if not (len(corpus.dictionary) or source_dict) or not len(corpus): + if len(corpus) == 0: return corpus temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True)) dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict + if len(dic) == 0: + return corpus callback(0.3) temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus] model = models.TfidfModel(dictionary=dic, normalize=False, diff --git a/orangecontrib/text/widgets/owcorpusviewer.py b/orangecontrib/text/widgets/owcorpusviewer.py index 4ab0af28f..ef18fdaef 100644 --- a/orangecontrib/text/widgets/owcorpusviewer.py +++ b/orangecontrib/text/widgets/owcorpusviewer.py @@ -613,8 +613,8 @@ def update_info(self): if self.corpus is not None: has_tokens = self.corpus.has_tokens() self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.corpus)}" - self.n_tokens = sum(map(len, self.corpus.tokens)) if has_tokens else "n/a" - self.n_types = len(self.corpus.dictionary) if has_tokens else "n/a" + self.n_tokens = self.corpus.count_tokens() if has_tokens else "n/a" + self.n_types = self.corpus.count_unique_tokens() if has_tokens else "n/a" else: self.n_matching = "n/a" self.n_matches = "n/a" diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index d31af4911..adbb4d024 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -1186,7 +1186,7 @@ def callback(i: float, status=""): if not pp_data.has_tokens(): pp_data = BASE_TOKENIZER( pp_data, wrap_callback(callback, start=0.9)) - if pp_data is not None and len(pp_data.dictionary) == 0: + if pp_data is not None and pp_data.count_tokens() == 0: msgs.append(self.Warning.no_token_left) pp_data = None return Result(corpus=pp_data, msgs=msgs) @@ -1212,9 +1212,8 @@ def update_preview(self, data): try: tokens = next(data.ngrams_iterator(include_postags=True)) self.preview = ", ".join(tokens[:5]) - n_tokens = sum( - map(len, data.tokens)) if data.has_tokens() else '' - n_types = len(data.dictionary) if data.has_tokens() else '' + n_tokens = data.count_tokens() if data.has_tokens() else '' + n_types = data.count_unique_tokens() if data.has_tokens() else '' self.output_info = f"Tokens: {n_tokens}\nTypes: {n_types}" except StopIteration: self.preview = "" diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index c4f1e47ab..d1e47f2a3 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ 
@@ -1,6 +1,8 @@
 import unittest
 from unittest.mock import patch, PropertyMock, MagicMock, Mock
 
+import numpy as np
+from Orange.data import Domain, StringVariable
 from orangewidget.utils.filedialogs import RecentPath
 from Orange.widgets.tests.base import WidgetTest
 
@@ -31,8 +33,8 @@ def test_outputs(self):
     def test_previews(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.wait_until_finished()
-        self.assertTrue(self.widget.preview)
-        self.assertTrue(self.widget.output_info)
+        self.assertEqual("human, machine, interface, lab, abc", self.widget.preview)
+        self.assertEqual("Tokens: 52\nTypes: 35", self.widget.output_info)
         self.send_signal(self.widget.Inputs.corpus, None)
         self.wait_until_finished()
         self.assertFalse(self.widget.preview)
@@ -139,6 +141,35 @@ def fun(*_):
         widget = self.create_widget(OWPreprocess)
         self.assertTrue(widget.Error.invalid_encoding.is_shown())
 
+    @patch(
+        "orangecontrib.text.widgets.owpreprocess.OWPreprocess.storedsettings",
+        PropertyMock(
+            return_value={
+                "preprocessors": [
+                    ("preprocess.tokenize", {"method": TokenizerModule.Word}),
+                    ("preprocess.filter", {"method": FilteringModule.Stopwords}),
+                ]
+            }
+        ),
+    )
+    def test_no_tokens_left(self):
+        # make a corpus of only stop words to trigger the no_token_left warning
+        domain = Domain([], metas=[StringVariable("Text")])
+        corpus = Corpus.from_numpy(
+            domain,
+            np.empty((2, 0)),
+            metas=np.array([["is are"], ["been"]]),
+            text_features=domain.metas,
+            language="en",
+        )
+        self.send_signal(self.widget.Inputs.corpus, corpus)
+        self.wait_until_finished()
+        self.assertTrue(self.widget.Warning.no_token_left.is_shown())
+
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.wait_until_finished()
+        self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+
 
 @patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
 class TestOWPreprocessMigrateSettings(WidgetTest):
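
Reviewer note, not part of the patch: a minimal usage sketch of the token-counting API introduced above, assuming the patched orange3-text is installed. The corpus construction mirrors test_count_tokens/test_count_unique_tokens, and the printed counts match the assertions in those tests.

# Usage sketch for Corpus.count_tokens / Corpus.count_unique_tokens
import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.preprocess import LowercaseTransformer, RegexpTokenizer

domain = Domain([], metas=[StringVariable("Text")])
corpus = Corpus.from_numpy(
    domain,
    np.empty((3, 0)),
    metas=np.array([["Test text"], ["This is another test text"], ["Text 3"]]),
    text_features=domain.metas,
    language="en",
)
corpus = RegexpTokenizer()(LowercaseTransformer()(corpus))
print(corpus.count_tokens())         # 9: all tokens, duplicates included
print(corpus.count_unique_tokens())  # 6: distinct types after lowercasing
_ = corpus.dictionary                # still returns a gensim Dictionary, but
                                     # now emits a FutureWarning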