Skip to content

Commit

Permalink
Statistics - Select statistic computation source
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 2, 2024
1 parent 3c281b0 commit 0af2b21
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 83 deletions.
77 changes: 34 additions & 43 deletions orangecontrib/text/widgets/owstatistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from copy import copy
from itertools import groupby
from string import punctuation
from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator
from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator, Dict

import numpy as np
from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
Expand All @@ -14,6 +14,7 @@
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output, OWWidget
from nltk import tokenize
from orangecanvas.gui.utils import disconnected
from orangewidget.widget import Msg

from orangecontrib.text import Corpus
Expand Down Expand Up @@ -73,28 +74,6 @@ def count_appearances(
return sum(d.lower().count(c) for c in characters for d in document)


def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Run a minimal word-only preprocessing pipeline on the corpus:
    lower-case the text, then tokenize with the default regexp tokenizer
    (which keeps only words — no punctuation, no spaces).

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
    """
    steps = [
        LowercaseTransformer(),
        # default regexp keeps only words (no punctuations, no spaces)
        RegexpTokenizer(),
    ]
    pipeline = PreprocessorList(steps)
    return pipeline(corpus)


def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
"""
Extract source from corpus according to source variable:
Expand Down Expand Up @@ -252,7 +231,6 @@ def per_cent_unique_words(
Ratio between unique words count and all words count
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def perc_unique(tokens: str):
callback()
Expand All @@ -270,7 +248,6 @@ def starts_with(
Number of words that starts with the string in `prefix`.
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def number_starts_with(tokens: List[str]):
callback()
Expand All @@ -289,7 +266,6 @@ def ends_with(
Number of words that ends with the string in `postfix`.
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def number_ends_with(tokens: List[str]):
callback()
Expand Down Expand Up @@ -393,7 +369,6 @@ def lix(
https://en.wikipedia.org/wiki/Lix_(readability_test)
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)
tokenizer = tokenize.PunktSentenceTokenizer()

def lix_index(document, tokens):
Expand Down Expand Up @@ -426,18 +401,21 @@ class ComputeValue:
pattern
Some statistics need additional parameter with the pattern
(e.g. starts with), for others it is set to empty string.
source
Part of the corpus used for computation: either tokens/ngrams or whole documents
"""

def __init__(self, function: Callable, pattern: str) -> None:
def __init__(self, function: Callable, pattern: str, source: str) -> None:
self.function = function
self.pattern = pattern
self.source = source

def __call__(self, data: Corpus) -> np.ndarray:
"""
This function compute values on new table.
"""
# lambda is added as a placeholder for a callback.
return self.function(data, self.pattern, lambda: True)[0]
return self.function(data, self.pattern, self.source, lambda: True)[0]

def __eq__(self, other):
    """Compare all identity-defining state, including ``source``.

    ``source`` was added alongside ``function`` and ``pattern``; omitting
    it here would make two compute values that run the same statistic on
    different corpus parts (tokens vs. documents) compare equal, which
    breaks column reuse and the ``__eq__``/``__hash__`` contract.
    """
    return (
        self.function == other.function
        and self.pattern == other.pattern
        and self.source == other.source
    )
Expand All @@ -455,7 +433,7 @@ def __hash__(self):
("Word count", words_count, None, (Sources.DOCUMENTS,)),
("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
("Average word length", average_word_len, None, (Sources.DOCUMENTS,)), # todo: discuss
("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
Expand Down Expand Up @@ -505,7 +483,7 @@ def advance():
fun = STATISTICS_FUNCTIONS[s]
result = fun(corpus, patern, source, advance)
if result is not None:
result = result + (ComputeValue(fun, patern),)
result = result + (ComputeValue(fun, patern, source),)
state.set_partial_result((s, patern, source, result))


Expand All @@ -530,6 +508,7 @@ class Warning(OWWidget.Warning):
want_main_area = False
mainArea_width_height_ratio = None

settings_version = 2
# rules used to reset the active rules
default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[0][-1][0])]
active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
Expand Down Expand Up @@ -633,10 +612,10 @@ def _add_line():
def _remove_line():
self.statistics_combos.pop().deleteLater()
self.line_edits.pop().deleteLater()
self.source_combos.pop().deleteLater()
self.remove_buttons.pop().deleteLater()

def _fix_tab_order():
# TODO: write it differently - check create class
for i, (r, c, l, s) in enumerate(
zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
):
Expand All @@ -646,9 +625,10 @@ def _fix_tab_order():
l.setVisible(True)
else:
l.setVisible(False)
s.clear()
s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
s.setCurrentText(r[2])
with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
s.clear()
s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
s.setCurrentText(r[2])

n = len(self.active_rules)
while n > len(self.statistics_combos):
Expand All @@ -673,7 +653,7 @@ def _sync_edit_combo(self) -> None:
combo = self.sender()
edit_index = self.statistics_combos.index(combo)
selected_i = combo.currentIndex()
default_value = STATISTICS_DEFAULT_VALUE[selected_i]
default_value = STATISTICS_DEFAULT_VALUE[selected_i] or ""
default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0]
self.active_rules[edit_index] = (selected_i, default_value, default_source)
self.adjust_n_rule_rows()
Expand All @@ -682,18 +662,14 @@ def _sync_edit_line(self) -> None:
""" Update rules when line edit value changed """
line_edit = self.sender()
edit_index = self.line_edits.index(line_edit)
self.active_rules[edit_index] = (
self.active_rules[edit_index][0],
line_edit.text(),
STATISTICS_DEFAULT_SOURCES[edit_index][0]
)
arules = self.active_rules[edit_index]
self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2])

def _sync_edit_source_combo(self) -> None:
""" Update rules when line edit value changed """
""" Update rules when source value change """
combo = self.sender()
edit_index = self.source_combos.index(combo)
value = combo.currentText()
print(value)
arules = self.active_rules[edit_index]
self.active_rules[edit_index] = (arules[0], arules[1], value)

Expand Down Expand Up @@ -766,6 +742,21 @@ def output_results(self) -> None:
)
self.Outputs.corpus.send(new_corpus)

@classmethod
def migrate_settings(cls, settings: Dict, version: int):
    """Upgrade settings stored by an older version of the widget.

    Version 1 saved ``active_rules`` as ``(statistic_index, pattern)``
    pairs; version 2 added a third element — the computation source.
    """

    def _legacy_source(statistic: int) -> str:
        """Pick the source that matches the pre-version-2 behaviour best."""
        # "Regex" used to run on tokens in the previous version; every
        # other statistic that allows both sources ran on documents.
        if STATISTICS_NAMES[statistic] == "Regex":
            return Sources.TOKENS
        return STATISTICS_DEFAULT_SOURCES[statistic][0]

    if version < 2 and "active_rules" in settings:
        settings["active_rules"] = [
            (rule, value, _legacy_source(rule))
            for rule, value in settings["active_rules"]
        ]


if __name__ == "__main__":
WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
Loading

0 comments on commit 0af2b21

Please sign in to comment.