diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index df3f23bb7..7967518c2 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -1,5 +1,5 @@
 import os
-import warnings
+from warnings import warn
from collections import Counter, defaultdict
from copy import copy, deepcopy
from numbers import Integral
@@ -9,9 +8,6 @@
import nltk
import numpy as np
-import scipy.sparse as sp
-from gensim import corpora
-
from Orange.data import (
Variable,
ContinuousVariable,
@@ -23,17 +19,12 @@
)
from Orange.preprocess.transformation import Identity
from Orange.data.util import get_unique_names
+from gensim import corpora
+from orangewidget.utils.signals import summarize, PartialSummary
+import scipy.sparse as sp
from orangecontrib.text.language import ISO2LANG
-try:
- from orangewidget.utils.signals import summarize, PartialSummary
- # import to check if Table summary is available - if summarize_by_name does
- # not exist Orange (3.28) does not support automated summaries
- from Orange.widgets.utils.state_summary import summarize_by_name
-except ImportError:
- summarize, PartialSummary = None, None
-
def get_sample_corpora_dir():
path = os.path.dirname(__file__)
@@ -88,7 +79,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
"""
self.text_features = [] # list of text features for mining
self._tokens = None
- self._dictionary = None
self.ngram_range = (1, 1)
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
@@ -397,8 +387,13 @@ def store_tokens(self, tokens, dictionary=None):
Args:
tokens (list): List of lists containing tokens.
"""
+ if dictionary is not None:
+ warn(
+ "dictionary argument is deprecated and doesn't have effect."
+ "It will be removed in future orange3-text 1.15.",
+ FutureWarning,
+ )
self._tokens = np.array(tokens, dtype=object)
- self._dictionary = dictionary or corpora.Dictionary(self.tokens)
@property
def tokens(self):
@@ -407,7 +402,7 @@ def tokens(self):
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
- return self._base_tokens()[0]
+ return self._base_tokens()
return self._tokens
def has_tokens(self):
@@ -419,19 +414,17 @@ def _base_tokens(self):
BASE_TOKENIZER, PreprocessorList
# don't use anything that requires NLTK data to assure async download
- base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
- BASE_TOKENIZER])
+ base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
corpus = base_preprocessors(self)
- return corpus.tokens, corpus.dictionary
+ return corpus.tokens
@property
def dictionary(self):
- """
- corpora.Dictionary: A token to id mapper.
- """
- if self._dictionary is None:
- return self._base_tokens()[1]
- return self._dictionary
+ warn(
+ "dictionary is deprecated and will be removed in Orange3-text 1.15",
+ FutureWarning,
+ )
+ return corpora.Dictionary(self.tokens)
@property
def pos_tags(self):
@@ -468,6 +461,16 @@ def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
for doc in data)
+ def count_tokens(self) -> int:
+ """Count number of all (non-unique) tokens in the corpus"""
+ return sum(map(len, self.tokens))
+
+ def count_unique_tokens(self) -> int:
+ """Count number of all (unique) tokens in the corpus"""
+ # it seems to be fast enough even datasets very large dataset, so I
+ # would avoid caching to prevetnt potential problems connected to that
+ return len({tk for lst in self.tokens for tk in lst})
+
@property
def ngrams(self):
"""generator: Ngram representations of documents."""
@@ -476,10 +479,9 @@ def ngrams(self):
def copy(self):
"""Return a copy of the table."""
c = super().copy()
- # since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(text_features=copy(self.text_features))
+        # since tokens are considered immutable, copies are not needed
c._tokens = self._tokens
- c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
c.pos_tags = self.pos_tags
c.name = self.name
@@ -640,7 +642,6 @@ def retain_preprocessing(orig, new, key=...):
new.pos_tags = orig.pos_tags
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
- new._dictionary = orig._dictionary
if isinstance(new, Corpus):
        # _find_identical_feature returns None when the feature is not found
@@ -665,23 +666,20 @@ def retain_preprocessing(orig, new, key=...):
new._infer_text_features()
-if summarize:
- # summarize is not available in older versions of orange-widget-base
- # skip if not available
- @summarize.register(Corpus)
- def summarize_corpus(corpus: Corpus) -> PartialSummary:
- """
- Provides automated input and output summaries for Corpus
- """
- table_summary = summarize.dispatch(Table)(corpus)
- extras = (
- (
- f"
Tokens: {sum(map(len, corpus.tokens))}, "
- f"Types: {len(corpus.dictionary)}"
- )
- if corpus.has_tokens()
- else "
Corpus is not preprocessed"
+@summarize.register(Corpus)
+def summarize_corpus(corpus: Corpus) -> PartialSummary:
+ """
+ Provides automated input and output summaries for Corpus
+ """
+ table_summary = summarize.dispatch(Table)(corpus)
+ extras = (
+ (
+ f"
Tokens: {corpus.count_tokens()}, "
+ f"Types: {corpus.count_unique_tokens()}"
)
- language = ISO2LANG[corpus.language] if corpus.language else "not set"
- extras += f"
Language: {language}"
- return PartialSummary(table_summary.summary, table_summary.details + extras)
+ if corpus.has_tokens()
+ else "
Corpus is not preprocessed"
+ )
+ language = ISO2LANG[corpus.language] if corpus.language else "not set"
+ extras += f"
Language: {language}"
+ return PartialSummary(table_summary.summary, table_summary.details + extras)
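
A minimal usage sketch of the replacement API (illustration only, not part of
the patch; it assumes the bundled "deerwester" sample corpus, but any
tokenized corpus behaves the same):

    from orangecontrib.text.corpus import Corpus
    from orangecontrib.text.preprocess import BASE_TOKENIZER

    corpus = BASE_TOKENIZER(Corpus.from_file("deerwester"))
    corpus.count_tokens()         # total number of (non-unique) tokens
    corpus.count_unique_tokens()  # number of distinct tokens (types)
    corpus.dictionary             # still works, but now emits a FutureWarning
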
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..50f748c31 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))
- def _filter_tokens(self, corpus: Corpus, callback: Callable,
- dictionary=None) -> Corpus:
+ def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
callback(0, "Filtering...")
filtered_tokens = []
filtered_tags = []
@@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable,
if corpus.pos_tags is not None:
filtered_tags.append(list(compress(corpus.pos_tags[i],
filter_map)))
- if dictionary is None:
- corpus.store_tokens(filtered_tokens)
- else:
- corpus.store_tokens(filtered_tokens, dictionary)
+ corpus.store_tokens(filtered_tokens)
if filtered_tags:
corpus.pos_tags = np.array(filtered_tags, dtype=object)
return corpus
@@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
def _fit(self, corpus: Corpus):
        raise NotImplementedError
- def _filter_tokens(self, corpus: Corpus, callback: Callable,
- dictionary=None) -> Corpus:
- corpus = super()._filter_tokens(corpus, callback,
- dictionary=self._dictionary)
- return corpus
+ def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
+ return super()._filter_tokens(corpus, callback)
def _check(self, token):
assert self._lexicon is not None
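
With the dictionary plumbing gone, a token filter only needs to implement
_check; _filter_tokens keeps the plain (corpus, callback) signature. A hedged
sketch of a custom filter (MaxLengthFilter is hypothetical and assumes
BaseTokenFilter is the base class this module exports):

    from orangecontrib.text.preprocess.filter import BaseTokenFilter

    class MaxLengthFilter(BaseTokenFilter):
        """Illustrative filter: drop tokens longer than max_len characters."""

        def __init__(self, max_len: int = 10):
            super().__init__()
            self.max_len = max_len

        def _check(self, token):
            # keep a token only if it is short enough
            return len(token) <= self.max_len
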
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index cb743e6d2..d5d3691e6 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -2,38 +2,30 @@
import pickle
import unittest
from datetime import datetime
-from unittest import skipIf
import numpy as np
from numpy.testing import assert_array_equal
-from orangecontrib.text.preprocess import (
- RegexpTokenizer,
- LowercaseTransformer,
- StopwordsFilter,
-)
-from scipy.sparse import csr_matrix, issparse
-
from Orange.data import (
- Table,
+ ContinuousVariable,
DiscreteVariable,
- StringVariable,
Domain,
- ContinuousVariable,
+ StringVariable,
+ Table,
dataset_dirs,
)
+from orangewidget.utils.signals import summarize
+from scipy.sparse import csr_matrix, issparse
+import orangecontrib
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
+from orangecontrib.text.preprocess import (
+ LowercaseTransformer,
+ RegexpTokenizer,
+ StopwordsFilter,
+)
from orangecontrib.text.tag import AveragedPerceptronTagger
-try:
- from orangewidget.utils.signals import summarize
- # import to check if Table summary is available - if summarize_by_name does
- # not exist Orange (3.28) does not support automated summaries
- from Orange.widgets.utils.state_summary import summarize_by_name
-except ImportError:
- summarize = None
-
class CorpusTests(unittest.TestCase):
def setUp(self):
@@ -198,7 +190,6 @@ def test_extend_attributes_keep_preprocessing(self):
self.assertEqual(len(new_c._tokens), len(c))
        np.testing.assert_equal(new_c._tokens, c._tokens)
- self.assertEqual(new_c._dictionary, c._dictionary)
self.assertEqual(new_c.text_features, c.text_features)
self.assertEqual(new_c.ngram_range, c.ngram_range)
self.assertEqual(new_c.attributes, c.attributes)
@@ -415,20 +406,17 @@ def test_getitem(self):
self.assertEqual(len(sel), 1)
self.assertEqual(len(sel._tokens), 1)
np.testing.assert_equal(sel._tokens, np.array([c._tokens[0]]))
- self.assertEqual(sel._dictionary, c._dictionary)
sel = c[0:5]
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
- self.assertEqual(sel._dictionary, c._dictionary)
ind = [3, 4, 5, 6]
sel = c[ind]
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
- self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
@@ -438,7 +426,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[ind])
- self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
@@ -448,7 +435,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(ind))
self.assertEqual(len(sel._tokens), len(ind))
np.testing.assert_equal(sel._tokens, c._tokens[list(ind)])
- self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
@@ -457,7 +443,6 @@ def test_getitem(self):
self.assertEqual(len(sel), len(c))
self.assertEqual(len(sel._tokens), len(c))
np.testing.assert_equal(sel._tokens, c._tokens)
- self.assertEqual(sel._dictionary, c._dictionary)
self.assertEqual(sel.text_features, c.text_features)
self.assertEqual(sel.ngram_range, c.ngram_range)
self.assertEqual(sel.attributes, c.attributes)
@@ -466,7 +451,6 @@ def test_getitem(self):
self.assertEqual(len(sel), 5)
self.assertEqual(len(sel._tokens), 5)
np.testing.assert_equal(sel._tokens, c._tokens[0:5])
- self.assertEqual(sel._dictionary, c._dictionary)
def test_set_text_features(self):
c = Corpus.from_file('friends-transcripts')[:100]
@@ -707,8 +691,48 @@ def test_language_unpickle(self):
corpus = Corpus.from_file(file)
self.assertIsNone(corpus.attributes["language"])
+ def test_count_tokens(self):
+ domain = Domain([], metas=[StringVariable("Text")])
+ texts = np.array([["Test text"], ["This is another test text"], ["Text 3"]])
+ corpus = Corpus.from_numpy(
+ domain,
+ np.empty((3, 0)),
+ metas=texts,
+ text_features=domain.metas,
+ language="en",
+ )
+ corpus = RegexpTokenizer()(corpus)
+ self.assertEqual(9, corpus.count_tokens())
+ # test on Corpus subset
+ self.assertEqual(7, corpus[:2].count_tokens())
+ self.assertEqual(2, corpus[:1].count_tokens())
+
+ def test_count_unique_tokens(self):
+ domain = Domain([], metas=[StringVariable("Text")])
+ texts = np.array([["Test text"], ["This is another test text"], ["Text 3"]])
+ corpus = Corpus.from_numpy(
+ domain,
+ np.empty((3, 0)),
+ metas=texts,
+ text_features=domain.metas,
+ language="en",
+ )
+ corpus = RegexpTokenizer()(LowercaseTransformer()(corpus))
+ self.assertEqual(6, corpus.count_unique_tokens())
+ # test on Corpus subset
+ self.assertEqual(5, corpus[:2].count_unique_tokens())
+ self.assertEqual(2, corpus[:1].count_unique_tokens())
+
+ def test_remove_dictionary(self):
+ """
+        When this test starts to fail, remove:
+ - this test
+ - dictionary property from Corpus
+ - dictionary argument from Corpus.store_tokens
+ """
+ self.assertFalse(orangecontrib.text.__version__.startswith("1.15"))
+
-@skipIf(summarize is None, "summarize is not available for orange3<=3.28")
class TestCorpusSummaries(unittest.TestCase):
def test_corpus_not_preprocessed(self):
"""Check if details part of the summary is formatted correctly"""
@@ -747,6 +771,40 @@ def test_corpus_preprocessed(self):
self.assertEqual(140, summary.summary)
self.assertEqual(details, summary.details)
+ def test_corpus_subset(self):
+ """Test numbers are correct on corpus subset"""
+        # use a custom corpus to have more control over the numbers
+ domain = Domain([], metas=[StringVariable("Text")])
+ corpus = Corpus.from_numpy(
+ domain,
+ np.empty((2, 0)),
+ metas=np.array([["This is test text 1"], ["This is test another text"]]),
+ text_features=domain.metas,
+ language="en",
+ )
+ corpus = RegexpTokenizer()(corpus)
+
+ details = (
+ f"{len(corpus)} instances, 1 variable
"
+ f"Metas: string
"
+ f"Tokens: 10, Types: 6
"
+ f"Language: English"
+ )
+ summary = summarize.dispatch(Corpus)(corpus)
+ self.assertEqual(2, summary.summary)
+ self.assertEqual(details, summary.details)
+
+ corpus = corpus[:1]
+ details = (
+ f"{len(corpus)} instance, 1 variable
"
+ f"Metas: string
"
+ f"Tokens: 5, Types: 5
"
+ f"Language: English"
+ )
+ summary = summarize.dispatch(Corpus)(corpus)
+ self.assertEqual(1, summary.summary)
+ self.assertEqual(details, summary.details)
+
def test_language(self):
"""Check if details part of the summary is formatted correctly"""
corpus = Corpus.from_file("book-excerpts")
diff --git a/orangecontrib/text/tests/test_topic_modeling.py b/orangecontrib/text/tests/test_topic_modeling.py
index 309697bc0..94cd906d2 100644
--- a/orangecontrib/text/tests/test_topic_modeling.py
+++ b/orangecontrib/text/tests/test_topic_modeling.py
@@ -18,8 +18,8 @@ def test_fit_transform(self):
def test_get_topic_table_by_id(self):
self.model.fit(self.corpus)
topic1 = self.model.get_topics_table_by_id(1)
- self.assertEqual(len(topic1), len(self.corpus.dictionary))
- self.assertEqual(topic1.metas.shape, (len(self.corpus.dictionary), 2))
+ self.assertEqual(len(topic1), self.corpus.count_unique_tokens())
+ self.assertEqual(topic1.metas.shape, (self.corpus.count_unique_tokens(), 2))
# self.assertAlmostEqual(topic1.W.sum(), 1.)
self.assertFalse(any(topic1.W == np.nan))
diff --git a/orangecontrib/text/topics/topics.py b/orangecontrib/text/topics/topics.py
index e53ac58a9..a96162179 100644
--- a/orangecontrib/text/topics/topics.py
+++ b/orangecontrib/text/topics/topics.py
@@ -77,7 +77,9 @@ def infer_ngrams_corpus(corpus, return_dict=False):
dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None)
idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep])
keep = [keep[i][0] for i in idx_of_keep]
- result = Sparse2Corpus(corpus.X[:, keep].T)
+ result = []
+ if len(dictionary) > 0:
+ result = Sparse2Corpus(corpus.X[:, keep].T)
return (result, dictionary) if return_dict else result
@@ -106,7 +108,8 @@ def fit(self, corpus, on_progress=dummy_callback):
Args:
corpus (Corpus): A corpus to learn topics from.
"""
- if not len(corpus.dictionary):
+ ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True)
+ if len(dictionary) == 0:
return None
model_kwars = self.kwargs
if "callbacks" in inspect.getfullargspec(self.Model).args:
@@ -116,7 +119,6 @@ def fit(self, corpus, on_progress=dummy_callback):
model_kwars, callbacks=[GensimProgressCallback(on_progress)]
)
- ngrams_corpus, dictionary = infer_ngrams_corpus(corpus, return_dict=True)
self.model = self.Model(
corpus=ngrams_corpus, id2word=dictionary, **model_kwars
)
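
Fitting now infers the ngram dictionary up front and returns early when it is
empty, instead of consulting the removed corpus.dictionary. A sketch of the
resulting behaviour (illustration only; empty_corpus stands for any corpus
whose preprocessing left no tokens):

    from orangecontrib.text.topics import LdaWrapper

    wrapper = LdaWrapper(num_topics=5)
    wrapper.fit(empty_corpus)  # returns early; no gensim model is trained
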
diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index 663503c8a..ed51ed2eb 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -70,10 +70,12 @@ def __init__(self, norm=NONE, wlocal=COUNT, wglobal=NONE):
self.wglobal = wglobal
def _transform(self, corpus, source_dict=None, callback=dummy_callback):
- if not (len(corpus.dictionary) or source_dict) or not len(corpus):
+ if len(corpus) == 0:
return corpus
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
+ if len(dic) == 0:
+ return corpus
callback(0.3)
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
model = models.TfidfModel(dictionary=dic, normalize=False,
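
The bag-of-words vectorizer gets the same two-step guard: bail out on an empty
corpus first, then on an empty vocabulary. Sketched from the caller's side
(illustration only; empty_corpus again stands for a corpus with no tokens):

    from orangecontrib.text.vectorization import BowVectorizer

    bow = BowVectorizer()
    out = bow.transform(empty_corpus)
    # out comes back without any bag-of-words columns added
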
diff --git a/orangecontrib/text/widgets/owcorpusviewer.py b/orangecontrib/text/widgets/owcorpusviewer.py
index 4ab0af28f..ef18fdaef 100644
--- a/orangecontrib/text/widgets/owcorpusviewer.py
+++ b/orangecontrib/text/widgets/owcorpusviewer.py
@@ -613,8 +613,8 @@ def update_info(self):
if self.corpus is not None:
has_tokens = self.corpus.has_tokens()
self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.corpus)}"
- self.n_tokens = sum(map(len, self.corpus.tokens)) if has_tokens else "n/a"
- self.n_types = len(self.corpus.dictionary) if has_tokens else "n/a"
+ self.n_tokens = self.corpus.count_tokens() if has_tokens else "n/a"
+ self.n_types = self.corpus.count_unique_tokens() if has_tokens else "n/a"
else:
self.n_matching = "n/a"
self.n_matches = "n/a"
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index d31af4911..adbb4d024 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -1186,7 +1186,7 @@ def callback(i: float, status=""):
if not pp_data.has_tokens():
pp_data = BASE_TOKENIZER(
pp_data, wrap_callback(callback, start=0.9))
- if pp_data is not None and len(pp_data.dictionary) == 0:
+ if pp_data is not None and pp_data.count_tokens() == 0:
msgs.append(self.Warning.no_token_left)
pp_data = None
return Result(corpus=pp_data, msgs=msgs)
@@ -1212,9 +1212,8 @@ def update_preview(self, data):
try:
tokens = next(data.ngrams_iterator(include_postags=True))
self.preview = ", ".join(tokens[:5])
- n_tokens = sum(
- map(len, data.tokens)) if data.has_tokens() else ''
- n_types = len(data.dictionary) if data.has_tokens() else ''
+ n_tokens = data.count_tokens() if data.has_tokens() else ''
+ n_types = data.count_unique_tokens() if data.has_tokens() else ''
self.output_info = f"Tokens: {n_tokens}\nTypes: {n_types}"
except StopIteration:
self.preview = ""
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index c4f1e47ab..d1e47f2a3 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -1,6 +1,8 @@
import unittest
from unittest.mock import patch, PropertyMock, MagicMock, Mock
+import numpy as np
+from Orange.data import Domain, StringVariable
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
@@ -31,8 +33,8 @@ def test_outputs(self):
def test_previews(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()
- self.assertTrue(self.widget.preview)
- self.assertTrue(self.widget.output_info)
+ self.assertEqual("human, machine, interface, lab, abc", self.widget.preview)
+ self.assertEqual("Tokens: 52\nTypes: 35", self.widget.output_info)
self.send_signal(self.widget.Inputs.corpus, None)
self.wait_until_finished()
self.assertFalse(self.widget.preview)
@@ -139,6 +141,35 @@ def fun(*_):
widget = self.create_widget(OWPreprocess)
self.assertTrue(widget.Error.invalid_encoding.is_shown())
+ @patch(
+ "orangecontrib.text.widgets.owpreprocess.OWPreprocess.storedsettings",
+ PropertyMock(
+ return_value={
+ "preprocessors": [
+ ("preprocess.tokenize", {"method": TokenizerModule.Word}),
+ ("preprocess.filter", {"method": FilteringModule.Stopwords}),
+ ]
+ }
+ ),
+ )
+ def test_no_tokens_left(self):
+ # make corpus with only stop words to get no_token_left warning
+ domain = Domain([], metas=[StringVariable("Text")])
+ corpus = Corpus.from_numpy(
+ domain,
+ np.empty((2, 0)),
+ metas=np.array([["is are"], ["been"]]),
+ text_features=domain.metas,
+ language="en",
+ )
+ self.send_signal(self.widget.Inputs.corpus, corpus)
+ self.wait_until_finished()
+ self.assertTrue(self.widget.Warning.no_token_left.is_shown())
+
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
+ self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):