From 6628a38aac1c680282cdd953e5c2ed400fb2a4f7 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Tue, 1 Aug 2023 10:37:01 +0200 Subject: [PATCH] Corpus - Remove dictionary --- orangecontrib/text/corpus.py | 24 +++++------------------- orangecontrib/text/preprocess/filter.py | 15 ++++----------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index d6a3ec99b..a5bbad2ec 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -78,7 +78,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None: """ self.text_features = [] # list of text features for mining self._tokens = None - self._dictionary = None self.ngram_range = (1, 1) self._pos_tags = None from orangecontrib.text.preprocess import PreprocessorList @@ -382,13 +381,12 @@ def documents_from_features(self, feats): return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row)) for row in data.metas] - def store_tokens(self, tokens, dictionary=None): + def store_tokens(self, tokens): """ Args: tokens (list): List of lists containing tokens. """ self._tokens = np.array(tokens, dtype=object) - self._dictionary = dictionary or corpora.Dictionary(self.tokens) @property def tokens(self): @@ -397,7 +395,7 @@ def tokens(self): present, run default preprocessor and return tokens. """ if self._tokens is None: - return self._base_tokens()[0] + return self._base_tokens() return self._tokens def has_tokens(self): @@ -409,19 +407,9 @@ def _base_tokens(self): BASE_TOKENIZER, PreprocessorList # don't use anything that requires NLTK data to assure async download - base_preprocessors = PreprocessorList([BASE_TRANSFORMER, - BASE_TOKENIZER]) + base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER]) corpus = base_preprocessors(self) - return corpus.tokens, corpus.dictionary - - @property - def dictionary(self): - """ - corpora.Dictionary: A token to id mapper. - """ - if self._dictionary is None: - return self._base_tokens()[1] - return self._dictionary + return corpus.tokens @property def pos_tags(self): @@ -476,10 +464,9 @@ def ngrams(self): def copy(self): """Return a copy of the table.""" c = super().copy() - # since tokens and dictionary are considered immutable copies are not needed c._setup_corpus(text_features=copy(self.text_features)) + # since tokens are considered immutable copies are not needed c._tokens = self._tokens - c._dictionary = self._dictionary c.ngram_range = self.ngram_range c.pos_tags = self.pos_tags c.name = self.name @@ -640,7 +627,6 @@ def retain_preprocessing(orig, new, key=...): new.pos_tags = orig.pos_tags else: raise TypeError('Indexing by type {} not supported.'.format(type(key))) - new._dictionary = orig._dictionary if isinstance(new, Corpus): # _find_identical_feature returns non when feature not found diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index 851c5b7ef..50f748c31 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: corpus = super().__call__(corpus, wrap_callback(callback, end=0.2)) return self._filter_tokens(corpus, wrap_callback(callback, start=0.2)) - def _filter_tokens(self, corpus: Corpus, callback: Callable, - dictionary=None) -> Corpus: + def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus: callback(0, "Filtering...") filtered_tokens = [] filtered_tags = [] @@ -37,10 +36,7 @@ def _filter_tokens(self, corpus: Corpus, callback: Callable, if corpus.pos_tags is not None: filtered_tags.append(list(compress(corpus.pos_tags[i], filter_map))) - if dictionary is None: - corpus.store_tokens(filtered_tokens) - else: - corpus.store_tokens(filtered_tokens, dictionary) + corpus.store_tokens(filtered_tokens) if filtered_tags: corpus.pos_tags = np.array(filtered_tags, dtype=object) return corpus @@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: def _fit(self, corpus: Corpus): raise NotImplemented - def _filter_tokens(self, corpus: Corpus, callback: Callable, - dictionary=None) -> Corpus: - corpus = super()._filter_tokens(corpus, callback, - dictionary=self._dictionary) - return corpus + def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus: + return super()._filter_tokens(corpus, callback) def _check(self, token): assert self._lexicon is not None