Skip to content

Commit

Permalink
Corpus - Remove dictionary
Browse files — browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 1, 2023
1 parent 13429e5 commit 6628a38
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 30 deletions.
24 changes: 5 additions & 19 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
"""
self.text_features = [] # list of text features for mining
self._tokens = None
self._dictionary = None
self.ngram_range = (1, 1)
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
Expand Down Expand Up @@ -382,13 +381,12 @@ def documents_from_features(self, feats):
return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row))
for row in data.metas]

def store_tokens(self, tokens, dictionary=None):
def store_tokens(self, tokens):
"""
Args:
tokens (list): List of lists containing tokens.
"""
self._tokens = np.array(tokens, dtype=object)
self._dictionary = dictionary or corpora.Dictionary(self.tokens)

@property
def tokens(self):
Expand All @@ -397,7 +395,7 @@ def tokens(self):
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
return self._base_tokens()[0]
return self._base_tokens()
return self._tokens

def has_tokens(self):
Expand All @@ -409,19 +407,9 @@ def _base_tokens(self):
BASE_TOKENIZER, PreprocessorList

# don't use anything that requires NLTK data to assure async download
base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
BASE_TOKENIZER])
base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
corpus = base_preprocessors(self)
return corpus.tokens, corpus.dictionary

@property
def dictionary(self):
"""
corpora.Dictionary: A token to id mapper.
"""
if self._dictionary is None:
return self._base_tokens()[1]
return self._dictionary
return corpus.tokens

@property
def pos_tags(self):
Expand Down Expand Up @@ -476,10 +464,9 @@ def ngrams(self):
def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(text_features=copy(self.text_features))
# since tokens are considered immutable copies are not needed
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
c.pos_tags = self.pos_tags
c.name = self.name
Expand Down Expand Up @@ -640,7 +627,6 @@ def retain_preprocessing(orig, new, key=...):
new.pos_tags = orig.pos_tags
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary

if isinstance(new, Corpus):
# _find_identical_feature returns non when feature not found
Expand Down
15 changes: 4 additions & 11 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
callback(0, "Filtering...")
filtered_tokens = []
filtered_tags = []
Expand All @@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable,
if corpus.pos_tags is not None:
filtered_tags.append(list(compress(corpus.pos_tags[i],
filter_map)))
if dictionary is None:
corpus.store_tokens(filtered_tokens)
else:
corpus.store_tokens(filtered_tokens, dictionary)
corpus.store_tokens(filtered_tokens)
if filtered_tags:
corpus.pos_tags = np.array(filtered_tags, dtype=object)
return corpus
Expand Down Expand Up @@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
def _fit(self, corpus: Corpus):
raise NotImplemented

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
corpus = super()._filter_tokens(corpus, callback,
dictionary=self._dictionary)
return corpus
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
return super()._filter_tokens(corpus, callback)

def _check(self, token):
assert self._lexicon is not None
Expand Down

0 comments on commit 6628a38

Please sign in to comment.