From 6628a38aac1c680282cdd953e5c2ed400fb2a4f7 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Tue, 1 Aug 2023 10:37:01 +0200
Subject: [PATCH] Corpus - Remove dictionary
---
orangecontrib/text/corpus.py | 24 +++++-------------------
orangecontrib/text/preprocess/filter.py | 15 ++++-----------
2 files changed, 9 insertions(+), 30 deletions(-)
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index d6a3ec99b..a5bbad2ec 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -78,7 +78,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
"""
self.text_features = [] # list of text features for mining
self._tokens = None
- self._dictionary = None
self.ngram_range = (1, 1)
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
@@ -382,13 +381,12 @@ def documents_from_features(self, feats):
return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row))
for row in data.metas]
- def store_tokens(self, tokens, dictionary=None):
+ def store_tokens(self, tokens):
"""
Args:
tokens (list): List of lists containing tokens.
"""
self._tokens = np.array(tokens, dtype=object)
- self._dictionary = dictionary or corpora.Dictionary(self.tokens)
@property
def tokens(self):
@@ -397,7 +395,7 @@ def tokens(self):
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
- return self._base_tokens()[0]
+ return self._base_tokens()
return self._tokens
def has_tokens(self):
@@ -409,19 +407,9 @@ def _base_tokens(self):
BASE_TOKENIZER, PreprocessorList
# don't use anything that requires NLTK data to assure async download
- base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
- BASE_TOKENIZER])
+ base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
corpus = base_preprocessors(self)
- return corpus.tokens, corpus.dictionary
-
- @property
- def dictionary(self):
- """
- corpora.Dictionary: A token to id mapper.
- """
- if self._dictionary is None:
- return self._base_tokens()[1]
- return self._dictionary
+ return corpus.tokens
@property
def pos_tags(self):
@@ -476,10 +464,9 @@ def ngrams(self):
def copy(self):
"""Return a copy of the table."""
c = super().copy()
- # since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(text_features=copy(self.text_features))
+ # since tokens are considered immutable, copies are not needed
c._tokens = self._tokens
- c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
c.pos_tags = self.pos_tags
c.name = self.name
@@ -640,7 +627,6 @@ def retain_preprocessing(orig, new, key=...):
new.pos_tags = orig.pos_tags
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
- new._dictionary = orig._dictionary
if isinstance(new, Corpus):
# _find_identical_feature returns non when feature not found
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..50f748c31 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))
- def _filter_tokens(self, corpus: Corpus, callback: Callable,
- dictionary=None) -> Corpus:
+ def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
callback(0, "Filtering...")
filtered_tokens = []
filtered_tags = []
@@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable,
if corpus.pos_tags is not None:
filtered_tags.append(list(compress(corpus.pos_tags[i],
filter_map)))
- if dictionary is None:
- corpus.store_tokens(filtered_tokens)
- else:
- corpus.store_tokens(filtered_tokens, dictionary)
+ corpus.store_tokens(filtered_tokens)
if filtered_tags:
corpus.pos_tags = np.array(filtered_tags, dtype=object)
return corpus
@@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
def _fit(self, corpus: Corpus):
raise NotImplemented
- def _filter_tokens(self, corpus: Corpus, callback: Callable,
- dictionary=None) -> Corpus:
- corpus = super()._filter_tokens(corpus, callback,
- dictionary=self._dictionary)
- return corpus
+ def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
+ return super()._filter_tokens(corpus, callback)
def _check(self, token):
assert self._lexicon is not None