Skip to content

Commit

Permalink
Statistics - Select statistic computation source
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 2, 2024
1 parent 3c281b0 commit 0af2b21
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 83 deletions.
77 changes: 34 additions & 43 deletions orangecontrib/text/widgets/owstatistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from copy import copy
from itertools import groupby
from string import punctuation
from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator
from typing import Callable, List, Optional, Tuple, Union, Generator, Iterator, Dict

import numpy as np
from AnyQt.QtWidgets import QComboBox, QGridLayout, QLabel, QLineEdit, QSizePolicy
Expand All @@ -14,6 +14,7 @@
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input, Output, OWWidget
from nltk import tokenize
from orangecanvas.gui.utils import disconnected
from orangewidget.widget import Msg

from orangecontrib.text import Corpus
Expand Down Expand Up @@ -73,28 +74,6 @@ def count_appearances(
return sum(d.lower().count(c) for c in characters for d in document)


def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Run a minimal word-only preprocessing pipeline on the corpus:
    lower-case the text, then tokenize with the default regexp tokenizer
    (which keeps only words — no punctuation, no spaces).

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
    """
    steps = [
        LowercaseTransformer(),
        # default regexp keeps only words (no punctuations, no spaces)
        RegexpTokenizer(),
    ]
    pipeline = PreprocessorList(steps)
    return pipeline(corpus)


def get_source(corpus: Corpus, source: str) -> Union[List[str], Iterator[List[str]]]:
"""
Extract source from corpus according to source variable:
Expand Down Expand Up @@ -252,7 +231,6 @@ def per_cent_unique_words(
Ratio between unique words count and all words count
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def perc_unique(tokens: str):
callback()
Expand All @@ -270,7 +248,6 @@ def starts_with(
Number of words that starts with the string in `prefix`.
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def number_starts_with(tokens: List[str]):
callback()
Expand All @@ -289,7 +266,6 @@ def ends_with(
Number of words that ends with the string in `postfix`.
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)

def number_ends_with(tokens: List[str]):
callback()
Expand Down Expand Up @@ -393,7 +369,6 @@ def lix(
https://en.wikipedia.org/wiki/Lix_(readability_test)
"""
assert source == Sources.TOKENS
corpus = preprocess_only_words(corpus)
tokenizer = tokenize.PunktSentenceTokenizer()

def lix_index(document, tokens):
Expand Down Expand Up @@ -426,18 +401,21 @@ class ComputeValue:
pattern
Some statistics need additional parameter with the pattern
(e.g. starts with), for others it is set to empty string.
source
Part of the corpus used for computation: either tokens/ngrams or whole documents
"""

def __init__(self, function: Callable, pattern: str) -> None:
def __init__(self, function: Callable, pattern: str, source: str) -> None:
self.function = function
self.pattern = pattern
self.source = source

def __call__(self, data: Corpus) -> np.ndarray:
"""
This function compute values on new table.
"""
# lambda is added as a placeholder for a callback.
return self.function(data, self.pattern, lambda: True)[0]
return self.function(data, self.pattern, self.source, lambda: True)[0]

def __eq__(self, other):
    """Compare all identity-defining state, including ``source``.

    ``source`` was added alongside ``function`` and ``pattern``; omitting
    it here would make two compute values that run the same statistic on
    different corpus parts (tokens vs. documents) compare equal, which
    breaks column reuse and the ``__eq__``/``__hash__`` contract.
    """
    return (
        self.function == other.function
        and self.pattern == other.pattern
        and self.source == other.source
    )
Expand All @@ -455,7 +433,7 @@ def __hash__(self):
("Word count", words_count, None, (Sources.DOCUMENTS,)),
("Character count", characters_count, None, (Sources.DOCUMENTS, Sources.TOKENS)),
("N-gram count", n_gram_count, None, (Sources.TOKENS,)),
("Average word length", average_word_len, None, (Sources.DOCUMENTS,)), # todo: discuss
("Average word length", average_word_len, None, (Sources.DOCUMENTS,)),
("Punctuation count", punctuation_count, None, (Sources.DOCUMENTS,)),
("Capital letter count", capital_count, None, (Sources.DOCUMENTS,)),
("Vowel count", vowel_count, "a,e,i,o,u", (Sources.DOCUMENTS,)),
Expand Down Expand Up @@ -505,7 +483,7 @@ def advance():
fun = STATISTICS_FUNCTIONS[s]
result = fun(corpus, patern, source, advance)
if result is not None:
result = result + (ComputeValue(fun, patern),)
result = result + (ComputeValue(fun, patern, source),)
state.set_partial_result((s, patern, source, result))


Expand All @@ -530,6 +508,7 @@ class Warning(OWWidget.Warning):
want_main_area = False
mainArea_width_height_ratio = None

settings_version = 2
# rules used to reset the active rules
default_rules = [(0, "", STATISTICS[0][-1][0]), (1, "", STATISTICS[0][-1][0])]
active_rules: List[Tuple[int, str, str]] = Setting(default_rules[:])
Expand Down Expand Up @@ -633,10 +612,10 @@ def _add_line():
def _remove_line():
self.statistics_combos.pop().deleteLater()
self.line_edits.pop().deleteLater()
self.source_combos.pop().deleteLater()
self.remove_buttons.pop().deleteLater()

def _fix_tab_order():
# TODO: write it differently - check create class
for i, (r, c, l, s) in enumerate(
zip(self.active_rules, self.statistics_combos, self.line_edits, self.source_combos)
):
Expand All @@ -646,9 +625,10 @@ def _fix_tab_order():
l.setVisible(True)
else:
l.setVisible(False)
s.clear()
s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
s.setCurrentText(r[2])
with disconnected(s.currentIndexChanged, self._sync_edit_source_combo):
s.clear()
s.addItems(STATISTICS_DEFAULT_SOURCES[r[0]])
s.setCurrentText(r[2])

n = len(self.active_rules)
while n > len(self.statistics_combos):
Expand All @@ -673,7 +653,7 @@ def _sync_edit_combo(self) -> None:
combo = self.sender()
edit_index = self.statistics_combos.index(combo)
selected_i = combo.currentIndex()
default_value = STATISTICS_DEFAULT_VALUE[selected_i]
default_value = STATISTICS_DEFAULT_VALUE[selected_i] or ""
default_source = STATISTICS_DEFAULT_SOURCES[selected_i][0]
self.active_rules[edit_index] = (selected_i, default_value, default_source)
self.adjust_n_rule_rows()
Expand All @@ -682,18 +662,14 @@ def _sync_edit_line(self) -> None:
""" Update rules when line edit value changed """
line_edit = self.sender()
edit_index = self.line_edits.index(line_edit)
self.active_rules[edit_index] = (
self.active_rules[edit_index][0],
line_edit.text(),
STATISTICS_DEFAULT_SOURCES[edit_index][0]
)
arules = self.active_rules[edit_index]
self.active_rules[edit_index] = (arules[0], line_edit.text(), arules[2])

def _sync_edit_source_combo(self) -> None:
""" Update rules when line edit value changed """
""" Update rules when source value change """
combo = self.sender()
edit_index = self.source_combos.index(combo)
value = combo.currentText()
print(value)
arules = self.active_rules[edit_index]
self.active_rules[edit_index] = (arules[0], arules[1], value)

Expand Down Expand Up @@ -766,6 +742,21 @@ def output_results(self) -> None:
)
self.Outputs.corpus.send(new_corpus)

@classmethod
def migrate_settings(cls, settings: Dict, version: int):
    """Upgrade settings stored by an older version of the widget.

    Version 1 saved ``active_rules`` as ``(statistic_index, pattern)``
    pairs; version 2 added a third element — the computation source.
    """

    def _legacy_source(statistic: int) -> str:
        """Pick the source that matches the pre-version-2 behaviour best."""
        # "Regex" used to run on tokens in the previous version; every
        # other statistic that allows both sources ran on documents.
        if STATISTICS_NAMES[statistic] == "Regex":
            return Sources.TOKENS
        return STATISTICS_DEFAULT_SOURCES[statistic][0]

    if version < 2 and "active_rules" in settings:
        settings["active_rules"] = [
            (rule, value, _legacy_source(rule))
            for rule, value in settings["active_rules"]
        ]


if __name__ == "__main__":
WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
Loading

0 comments on commit 0af2b21

Please sign in to comment.