From 9e0457ee2a866177b57679d00a02a3b26b432221 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 27 Sep 2024 14:56:23 +0200 Subject: [PATCH] Refactoring --- .flake8 | 6 + .github/FUNDING.yml | 1 - .github/workflows/syntax-and-unit-tests.yml | 33 -- .github/workflows/test.yml | 25 + .gitignore | 1 + .pre-commit-config.yaml | 27 ++ .zenodo.json | 2 +- LICENSE | 2 +- MANIFEST.in | 3 - README.md | 8 +- VERSION | 1 + demo/demo_quaxa.py | 22 +- pyproject.toml | 34 ++ quaxa/__init__.py | 35 +- quaxa/quaxa.py | 502 ++++++++++++++------ requirements-dev.txt | 12 +- requirements.txt | 4 +- setup.py | 37 -- test/test_quaxa.py | 402 +++++++--------- 19 files changed, 678 insertions(+), 479 deletions(-) create mode 100644 .flake8 delete mode 100644 .github/FUNDING.yml delete mode 100644 .github/workflows/syntax-and-unit-tests.yml create mode 100644 .github/workflows/test.yml create mode 100644 .pre-commit-config.yaml delete mode 100644 MANIFEST.in create mode 100644 VERSION create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..82dbbd2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 80 +extend-select = B950 +extend-ignore = E203,E501,E701 +per-file-ignores = + quaxa/__init__.py:F401 diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index 3fdc614..0000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1 +0,0 @@ -github: [ulf1, LydiaKoerber] diff --git a/.github/workflows/syntax-and-unit-tests.yml b/.github/workflows/syntax-and-unit-tests.yml deleted file mode 100644 index f98e88d..0000000 --- a/.github/workflows/syntax-and-unit-tests.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Python application - -on: [push] - -jobs: - build: - - runs-on: ubuntu-18.04 - - strategy: - matrix: - python-version: ['3.7', '3.x'] - - name: Python ${{ matrix.python-version }} Tests - - steps: - - uses: actions/checkout@v1 - - name: Setup python - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-dev.txt - - name: Lint with flake8 - run: | - flake8 --ignore=F401 --exclude=$(grep -v '^#' .gitignore | xargs | sed -e 's/ /,/g') - - name: Unit Test with unittest - run: | - PYTHONPATH=. python -m unittest diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..730db91 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,25 @@ +name: Python application + +on: [push] + +jobs: + build: + strategy: + matrix: + platform: [windows-latest, macos-latest, ubuntu-latest] + + runs-on: ${{ matrix.platform }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install dependencies + run: | + pip install -U pip + pip install -r requirements-dev.txt + pip install . 
+      - name: Run unit tests +        run: | +          python -m unittest diff --git a/.gitignore b/.gitignore index 91fcca3..1fb03ec 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ venv.bak/ .vscode profile/data* .theia +*.temp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..29ced40 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + additional_dependencies: [flake8-bugbear] +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy diff --git a/.zenodo.json b/.zenodo.json index a7a0fa0..063eee3 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -18,4 +18,4 @@ "good example extractor", "German" ] -} \ No newline at end of file +} diff --git a/LICENSE b/LICENSE index 5de6c20..01644d6 100644 --- a/LICENSE +++ b/LICENSE @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 783709f..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include README.md -include README.rst -recursive-include test *.py diff --git a/README.md b/README.md index b092e64..5c13c87 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Wenn 1 Knock-out Kriterium identifiziert wird, dann wird der Score direkt | `has_blacklist_words` | bool | Satzbeleg enthält Wörter, sodass in keinem Fall der Satzbeleg als Wörterbuchbeispiel in Betracht gezogen wird; ausgenommen das Blacklist-Wort ist selbst der Wörterbucheintrag. (dt. Blacklist ist voreingestellt) | [1] GDEX blacklist | ### Diskontierungsfaktoren -Je Kriterium wird ein Faktor berechnet, und alle Faktoren miteinander multipliziert. +Je Kriterium wird ein Faktor berechnet, und alle Faktoren miteinander multipliziert. Wenn bspw. ein Faktor eine Penalty von 0.1 bekommt, dann ist der Faktor 0.9. Für den Gesamtscore wird der Gesamtfaktor mit 0.5 multipliziert (siehe die Skizze unten). @@ -79,11 +79,11 @@ pip install -r requirements-dev.txt --no-cache-dir Publish ```sh -python setup.py sdist +python setup.py sdist twine upload -r pypi dist/* ``` -### Clean up +### Clean up ```sh find . -type f -name "*.pyc" | xargs rm @@ -106,4 +106,4 @@ The "Evidence" project was funded by the Deutsche Forschungsgemeinschaft (DFG, G ### Maintenance - till 31.Aug.2023 (v0.1.0) the code repository was maintained within the DFG project [433249742](https://gepris.dfg.de/gepris/projekt/433249742) -- since 01.Sep.2023 (v0.1.0) the code repository is maintained by Ulf Hamster. \ No newline at end of file +- since 01.Sep.2023 (v0.1.0) the code repository is maintained by Ulf Hamster.
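The Diskontierungsfaktoren section of the README above describes the arithmetic that quaxa/quaxa.py implements: each gradual criterion yields a factor, all factors are multiplied together, and the product contributes half of the total score next to the knockout check. A minimal sketch of that arithmetic, with illustrative penalty values (0.1 and 0.034 are examples here, not the library's defaults for every criterion):

```python
# Sketch of the score composition described in the README above.
# Penalty values are illustrative; quaxa's defaults vary per criterion.
knockout_ok = True           # all knockout criteria passed
penalties = [0.1, 0.034]     # e.g. one rare-char hit, one deixis hit
factor = 1.0
for p in penalties:
    factor *= 1.0 - p        # a penalty of 0.1 turns into a factor of 0.9
total = 0.5 * knockout_ok + 0.5 * factor
print(round(total, 4))       # 0.9347
```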
diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..17e51c3 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.1 diff --git a/demo/demo_quaxa.py b/demo/demo_quaxa.py index 42c4ba3..eec1e04 100644 --- a/demo/demo_quaxa.py +++ b/demo/demo_quaxa.py @@ -1,5 +1,7 @@ -import conllu import random + +import conllu + import quaxa import quaxa.reader @@ -8,21 +10,19 @@ def demo(): # read conllu file - corpus = conllu.parse(open('demo.conllu', 'r').read()) + corpus = conllu.parse(open("demo.conllu", "r").read()) # compute scores for example sentences for annot in corpus: lemmas_content = [ - tok.get('lemma') for tok in annot - if tok.get('upos') in {'NOUN', 'VERB', 'ADJ'} + tok.get("lemma") + for tok in annot + if tok.get("upos") in {"NOUN", "VERB", "ADJ"} ] - sent = annot.metadata['text'] + sent = annot.metadata["text"] for headword in lemmas_content: - factor = quaxa.total_score( - headword=headword, txt=sent, annotation=annot) - print(( - "total_score:" - f"{factor: 7.4f} | {headword} | {sent[:50]} ...")) + factor = quaxa.total_score(headword=headword, txt=sent, annotation=annot) + print(("total_score:" f"{factor: 7.4f} | {headword} | {sent[:50]} ...")) -if __name__ == '__main__': +if __name__ == "__main__": demo() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9fe1d39 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "quaxa" +description = "QUAlity of sentence eXAmples scoring" +authors = [{name = "Ulf Hamster", email = "554c46@gmail.com"}] +classifiers = [ + "Development Status :: 1 - Planning", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Text Processing :: Linguistic" +] +requires-python = ">=3.7" +dynamic = ["dependencies", "version", "readme"] + +[project.urls] +Homepage = "https://github.com/ulf1/quaxa" + +[tool.isort] +profile = "black" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +version = {file = ["VERSION"]} +readme = {file = ["README.md"], content-type = "text/markdown"} + +[tool.setuptools.packages.find] +include = ["quaxa*"] +exclude = ["test*"] diff --git a/quaxa/__init__.py b/quaxa/__init__.py index ea3c612..49b57dc 100644 --- a/quaxa/__init__.py +++ b/quaxa/__init__.py @@ -1,19 +1,28 @@ -__version__ = '0.1.1' +from pathlib import Path + +__version__ = (Path(__file__) / ".." / ".." 
/ "VERSION").resolve().read_text().strip() from .quaxa import ( - total_score, - isa_knockout_criteria, + BLACKLIST_WORDS_DE, + DEFAULT_SPACE_DEIXIS_TERMS, + DEFAULT_TIME_DEIXIS_TERMS, + ORD_RARE_CHARS_DE, + ORDS_QWERTZ_DE, + QWERTZ_DE, + RARE_CHARS_DE, + deixis_person, + deixis_space, + deixis_time, factor_gradual_criteria, - has_finite_verb_and_subject, - is_misparsed, - has_illegal_chars, - has_blacklist_words, BLACKLIST_WORDS_DE, - factor_rarechars, RARE_CHARS_DE, ORD_RARE_CHARS_DE, - factor_notkeyboardchar, QWERTZ_DE, ORDS_QWERTZ_DE, factor_graylist_words, factor_named_entity, - deixis_space, DEFAULT_SPACE_DEIXIS_TERMS, - deixis_time, DEFAULT_TIME_DEIXIS_TERMS, - deixis_person, - optimal_interval + factor_notkeyboardchar, + factor_rarechars, + has_blacklist_words, + has_finite_verb_and_subject, + has_illegal_chars, + is_misparsed, + isa_knockout_criteria, + optimal_interval, + total_score, ) diff --git a/quaxa/quaxa.py b/quaxa/quaxa.py index 3840e22..017cf94 100644 --- a/quaxa/quaxa.py +++ b/quaxa/quaxa.py @@ -2,7 +2,7 @@ def total_score(**kwargs) -> float: - """ Rule-based sentence scoring formula + """Rule-based sentence scoring formula Parameters: ----------- @@ -17,19 +17,19 @@ def total_score(**kwargs) -> float: float Score if a sentence example is suitable as dictionary example. """ - score = .5 * isa_knockout_criteria(**kwargs) - score += .5 * factor_gradual_criteria(**kwargs) + score = 0.5 * isa_knockout_criteria(**kwargs) + score += 0.5 * factor_gradual_criteria(**kwargs) return score def isa_knockout_criteria(**kwargs): # read input arguments - headword = kwargs.get('headword') - txt = kwargs.get('txt') - annotation = kwargs.get('annotation') - blacklist = kwargs.get('blacklist') # optional + headword = kwargs.get("headword") + txt = kwargs.get("txt") + annotation = kwargs.get("annotation") + blacklist = kwargs.get("blacklist") # optional # prepare variables - lemmas = [t.get('lemma') for t in annotation] + lemmas = [t.get("lemma") for t in annotation] # compute factor if not has_finite_verb_and_subject(annotation): return False @@ -44,24 +44,24 @@ def isa_knockout_criteria(**kwargs): def factor_gradual_criteria(**kwargs): # read input arguments - headword = kwargs.get('headword') - txt = kwargs.get('txt') - annotation = kwargs.get('annotation') - graylist = kwargs.get('graylist') # optional + headword = kwargs.get("headword") + txt = kwargs.get("txt") + annotation = kwargs.get("annotation") + graylist = kwargs.get("graylist") # optional # prepare variables - lemmas = [t.get('lemma') for t in annotation] + lemmas = [t.get("lemma") for t in annotation] num_tokens = len(annotation) # penalties - penalty_rarechars = kwargs.get('penalty_rarechars', 0.125) - penalty_notkeyboardchar = kwargs.get('penalty_notkeyboardchar', True) - penalty_graylist_words = kwargs.get('penalty_graylist_words', 0.075) - penalty_named_entity = kwargs.get('penalty_named_entity', 0.1667) - penalty_interval = kwargs.get('penalty_interval', True) - optimal_interval_low = kwargs.get('optimal_interval_low', 10) - optimal_interval_high = kwargs.get('optimal_interval_high', 20) - penalty_space_deixis = kwargs.get('penalty_space_deixis', 0.034) - penalty_time_deixis = kwargs.get('penalty_time_deixis', 0.034) - penalty_person_deixis = kwargs.get('penalty_person_deixis', 0.034) + penalty_rarechars = kwargs.get("penalty_rarechars", 0.125) + penalty_notkeyboardchar = kwargs.get("penalty_notkeyboardchar", True) + penalty_graylist_words = kwargs.get("penalty_graylist_words", 0.075) + penalty_named_entity = 
kwargs.get("penalty_named_entity", 0.1667) +    penalty_interval = kwargs.get("penalty_interval", True) +    optimal_interval_low = kwargs.get("optimal_interval_low", 10) +    optimal_interval_high = kwargs.get("optimal_interval_high", 20) +    penalty_space_deixis = kwargs.get("penalty_space_deixis", 0.034) +    penalty_time_deixis = kwargs.get("penalty_time_deixis", 0.034) +    penalty_person_deixis = kwargs.get("penalty_person_deixis", 0.034) # compute factor factor = 1.0 if penalty_rarechars >= 0.0: @@ -70,22 +70,24 @@ factor *= factor_notkeyboardchar(txt) if penalty_graylist_words >= 0.0: factor *= factor_graylist_words( - headword, lemmas, graylist, penalty_factor=penalty_graylist_words) + headword, lemmas, graylist, penalty_factor=penalty_graylist_words + ) if penalty_named_entity >= 0.0: factor *= factor_named_entity( - headword, annotation, penalty_factor=penalty_named_entity) + headword, annotation, penalty_factor=penalty_named_entity + ) if penalty_interval: factor *= optimal_interval( - num_tokens, low=optimal_interval_low, high=optimal_interval_high) + num_tokens, low=optimal_interval_low, high=optimal_interval_high + ) if penalty_space_deixis >= 0.0: - factor *= deixis_space( - headword, lemmas, penalty_factor=penalty_space_deixis) + factor *= deixis_space(headword, lemmas, penalty_factor=penalty_space_deixis) if penalty_time_deixis >= 0.0: - factor *= deixis_time( - headword, lemmas, penalty_factor=penalty_time_deixis) + factor *= deixis_time(headword, lemmas, penalty_factor=penalty_time_deixis) if penalty_person_deixis >= 0.0: factor *= deixis_person( - headword, annotation, penalty_factor=penalty_person_deixis) + headword, annotation, penalty_factor=penalty_person_deixis + ) # done return factor @@ -96,32 +98,35 @@ def has_finite_verb_and_subject(annotation: List[dict]) -> bool: It is a knockout criterion. """ # find the root of the dependency tree - root = [token for token in annotation if token['deprel'].lower() == 'root'] - if len(root) != 1: + roots = [token for token in annotation if token["deprel"].lower() == "root"] + if len(roots) != 1: return False - root = root[0] - root_id = root['id'] + root = roots[0] + root_id = root["id"] # find finite verb def is_finite_verb(tok): - if tok.get('upos', '') in {'AUX', 'VERB'}: - flag = tok.get('feats', '').get('VerbForm', '') == 'Fin' - return flag or tok.get('xpos', '').endswith('FIN') + if tok.get("upos", "") in {"AUX", "VERB"}: + flag = tok.get("feats", {}).get("VerbForm", "") == "Fin" + return flag or tok.get("xpos", "").endswith("FIN") return False + # find finite verbs that are a) the root, or b) a child of the root verb = [ - tok for tok in annotation - if is_finite_verb(tok) and ( - tok['id'] == root_id or tok.get('head', '') == root_id) + tok + for tok in annotation + if is_finite_verb(tok) + and (tok["id"] == root_id or tok.get("head", "") == root_id) ] if len(verb) == 0: return False # find subjects that are a) the root, or b) a child of the root subj = [ - tok for tok in annotation - if (tok['upos'] in {'NOUN', 'PROPN', 'PRON'}) and ( - tok['id'] == root_id or tok.get('head', '') == root_id) + tok + for tok in annotation + if (tok["upos"] in {"NOUN", "PROPN", "PRON"}) + and (tok["id"] == root_id or tok.get("head", "") == root_id) ] if len(subj) == 0: return False @@ -152,13 +157,13 @@ def is_misparsed(txt: str): conditions = [ txt[0].islower(), txt[0].isspace(), - txt[0] in ',.?!()/&%-_:;#+*~<>|^°', - txt[-1] not in '?!.'
+ txt[0] in ",.?!()/&%-_:;#+*~<>|^°", + txt[-1] not in "?!.", ] return any(conditions) -def has_illegal_chars(txt: str, illegal_chars='<>|[]/\\^@'): +def has_illegal_chars(txt: str, illegal_chars="<>|[]/\\^@"): """Blacklist of illegal characters Rules: @@ -193,33 +198,33 @@ def has_illegal_chars(txt: str, illegal_chars='<>|[]/\\^@'): BLACKLIST_WORDS_DE = [ - 'negroid', - 'Zigeunerbande', - 'Mischling', - 'Zigeunerleben', - 'Zigeunerkind', - 'durchvögeln', - 'durchficken', - 'durchbumsen', - 'Idiot', - 'Polenböller', - 'geisteskrank', - 'Neger', - 'Zigeuner', - 'Nigger', - 'Schwuchtel', - 'Herrenrasse', - 'Negersklave', - 'Negerin', - 'Negerblut', - 'Negerkind', - 'Negerstamm' + "negroid", + "Zigeunerbande", + "Mischling", + "Zigeunerleben", + "Zigeunerkind", + "durchvögeln", + "durchficken", + "durchbumsen", + "Idiot", + "Polenböller", + "geisteskrank", + "Neger", + "Zigeuner", + "Nigger", + "Schwuchtel", + "Herrenrasse", + "Negersklave", + "Negerin", + "Negerblut", + "Negerkind", + "Negerstamm", ] -def has_blacklist_words(headword: str, - lemmas: List[str], - blacklist_words: List[str] = BLACKLIST_WORDS_DE): +def has_blacklist_words( + headword: str, lemmas: List[str], blacklist_words: List[str] = BLACKLIST_WORDS_DE +): if blacklist_words is None: blacklist_words = BLACKLIST_WORDS_DE a = set(lemmas) @@ -227,14 +232,14 @@ def has_blacklist_words(headword: str, return len(a.intersection(b)) > 0 -RARE_CHARS_DE = '0123456789\'.,!?)(;:-' +RARE_CHARS_DE = "0123456789'.,!?)(;:-" ORD_RARE_CHARS_DE = [ord(c) for c in RARE_CHARS_DE] -def factor_rarechars(txt: str, - rare_chars: List[int] = ORD_RARE_CHARS_DE, - penalty_factor: float = 0.1): +def factor_rarechars( + txt: str, rare_chars: List[int] = ORD_RARE_CHARS_DE, penalty_factor: float = 0.1 +): """Penalize rare characters Parameters: @@ -255,30 +260,205 @@ def factor_rarechars(txt: str, QWERTZ_DE = [ - '^', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ß', "'", - 'q', 'w', 'e', 'r', 't', 'z', 'u', 'i', 'o', 'p', 'ü', '+', - 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', '#', - '<', 'y', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-', - '°', '!', '"', '§', '$', '%', '&', '/', '(', ')', '=', '?', '`', - 'Q', 'W', 'E', 'R', 'T', 'Z', 'U', 'I', 'O', 'P', 'Ü', '*', - 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', "'", - '>', 'Y', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_', - '′', '¹', '²', '³', '¼', '½', '¬', '{', '[', ']', '}', '\\', '¸', - '@', 'ł', '€', '¶', 'ŧ', '←', '↓', '→', 'ø', 'þ', '"', '~', - 'æ', 'ſ', 'ð', 'đ', 'ŋ', 'ħ', '̣', 'ĸ', 'ł', '˝', '^', '’', - '|', '»', '«', '¢', '„', '“', '”', 'µ', '·', '…', '–', - '″', '¡', '⅛', '£', '¤', '⅜', '⅝', '⅞', '™', '±', '°', '¿', '˛', - 'Ω', 'Ł', '€', '®', 'Ŧ', '¥', '↑', 'ı', 'Ø', 'Þ', '°', '¯', - 'Æ', 'ẞ', 'Ð', 'ª', 'Ŋ', 'Ħ', '˙', '&', 'Ł', '̣', '̣', '˘', - '', '›', '‹', '©', '‚', '‘', '’', 'º', '×', '÷', '—', - ' ' + "^", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "0", + "ß", + "'", + "q", + "w", + "e", + "r", + "t", + "z", + "u", + "i", + "o", + "p", + "ü", + "+", + "a", + "s", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "ö", + "ä", + "#", + "<", + "y", + "x", + "c", + "v", + "b", + "n", + "m", + ",", + ".", + "-", + "°", + "!", + '"', + "§", + "$", + "%", + "&", + "/", + "(", + ")", + "=", + "?", + "`", + "Q", + "W", + "E", + "R", + "T", + "Z", + "U", + "I", + "O", + "P", + "Ü", + "*", + "A", + "S", + "D", + "F", + "G", + "H", + "J", + "K", + "L", + "Ö", + "Ä", + "'", + ">", + "Y", + "X", + "C", + "V", + "B", + "N", + "M", + ";", + ":", + "_", + "′", 
+    "¹", +    "²", +    "³", +    "¼", +    "½", +    "¬", +    "{", +    "[", +    "]", +    "}", +    "\\", +    "¸", +    "@", +    "ł", +    "€", +    "¶", +    "ŧ", +    "←", +    "↓", +    "→", +    "ø", +    "þ", +    '"', +    "~", +    "æ", +    "ſ", +    "ð", +    "đ", +    "ŋ", +    "ħ", +    "̣", +    "ĸ", +    "ł", +    "˝", +    "^", +    "’", +    "|", +    "»", +    "«", +    "¢", +    "„", +    "“", +    "”", +    "µ", +    "·", +    "…", +    "–", +    "″", +    "¡", +    "⅛", +    "£", +    "¤", +    "⅜", +    "⅝", +    "⅞", +    "™", +    "±", +    "°", +    "¿", +    "˛", +    "Ω", +    "Ł", +    "€", +    "®", +    "Ŧ", +    "¥", +    "↑", +    "ı", +    "Ø", +    "Þ", +    "°", +    "¯", +    "Æ", +    "ẞ", +    "Ð", +    "ª", +    "Ŋ", +    "Ħ", +    "˙", +    "&", +    "Ł", +    "̣", +    "̣", +    "˘", +    "", +    "›", +    "‹", +    "©", +    "‚", +    "‘", +    "’", +    "º", +    "×", +    "÷", +    "—", +    " ", ] ORDS_QWERTZ_DE = sorted([ord(c) for c in QWERTZ_DE if c]) -def factor_notkeyboardchar( - txt: str, eligible: List[int] = ORDS_QWERTZ_DE): +def factor_notkeyboardchar(txt: str, eligible: List[int] = ORDS_QWERTZ_DE): """Computes the percentage of characters typable on a German keyboard. Parameters: @@ -298,22 +478,24 @@ return len([c for c in txt if ord(c) in eligible]) / len(txt) -def factor_graylist_words(headword: str, - lemmas: List[str], - graylist_words: List[str], - penalty_factor: float = 0.1): +def factor_graylist_words( + headword: str, + lemmas: List[str], + graylist_words: List[str], + penalty_factor: float = 0.1, +): """Penalize graylist words""" if graylist_words is None: - return 1.0 # no default list + return 1.0  # no default list - num_matches = len([ - lem for lem in lemmas - if lem != headword and lem in graylist_words]) + num_matches = len( + [lem for lem in lemmas if lem != headword and lem in graylist_words] + ) return max(0.0, 1.0 - penalty_factor * num_matches) -def factor_named_entity(headword: str, - annotation: List[dict], - penalty_factor: float = 0.15): +def factor_named_entity( + headword: str, annotation: List[dict], penalty_factor: float = 0.15 +): """Named Entity / Proper Noun penalty If the headword is a named entity, we want to avoid using the sentence as an example. @@ -342,35 +524,50 @@ """ num_matches = 0 for tok in annotation: - if tok.get('lemma', '') == headword: - if (tok.get('upos', '') == 'PROPN') or (tok.get('xpos') == 'NE'): + if tok.get("lemma", "") == headword: + if (tok.get("upos", "") == "PROPN") or (tok.get("xpos") == "NE"): num_matches += 1 return max(0.0, 1.0 - penalty_factor * num_matches) -def _deixis(headword: str, - lemmas: List[str], - deixis_terms: List[str], - penalty_factor: float = 0.1): +def _deixis( + headword: str, + lemmas: List[str], + deixis_terms: List[str], + penalty_factor: float = 0.1, +): """Deixis factor function Utility function used for deixis_space and deixis_time.
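Each lemma other than the headword that also occurs in deixis_terms discounts the factor by penalty_factor, i.e. max(0.0, 1.0 - penalty_factor * num_matches). Illustrative example (hypothetical values): _deixis("gehen", ["hier", "gehen", "dort"], ["hier", "dort"], 0.1) returns 0.8.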
""" - num_matches = len([ - lem for lem in lemmas - if lem != headword and lem in deixis_terms]) + num_matches = len( + [lem for lem in lemmas if lem != headword and lem in deixis_terms] + ) return max(0.0, 1.0 - penalty_factor * num_matches) DEFAULT_SPACE_DEIXIS_TERMS = [ - 'hier', 'dort', 'über', 'da', 'vor', 'hinter', 'links', 'von', 'rechts', - 'von', 'oben', 'unten'] + "hier", + "dort", + "über", + "da", + "vor", + "hinter", + "links", + "von", + "rechts", + "von", + "oben", + "unten", +] -def deixis_space(headword: str, - lemmas: List[str], - space_deixis_terms: List[str] = DEFAULT_SPACE_DEIXIS_TERMS, - penalty_factor: float = 0.1) -> float: +def deixis_space( + headword: str, + lemmas: List[str], + space_deixis_terms: List[str] = DEFAULT_SPACE_DEIXIS_TERMS, + penalty_factor: float = 0.1, +) -> float: """Space deixis penality Parameters: @@ -398,21 +595,32 @@ def deixis_space(headword: str, ------------ https://gsw.phil-fak.uni-duesseldorf.de/diskurslinguistik/index.php?title=Deiktischer_Ausdruck """ - return _deixis(headword=headword, - lemmas=lemmas, - deixis_terms=space_deixis_terms, - penalty_factor=penalty_factor) + return _deixis( + headword=headword, + lemmas=lemmas, + deixis_terms=space_deixis_terms, + penalty_factor=penalty_factor, + ) DEFAULT_TIME_DEIXIS_TERMS = [ - 'jetzt', 'heute', 'gestern', 'morgen', 'dann', 'damals', 'bald', - 'kürzlich'] + "jetzt", + "heute", + "gestern", + "morgen", + "dann", + "damals", + "bald", + "kürzlich", +] -def deixis_time(headword: str, - lemmas: List[str], - time_deixis_terms: List[str] = DEFAULT_TIME_DEIXIS_TERMS, - penalty_factor: float = 0.1) -> float: +def deixis_time( + headword: str, + lemmas: List[str], + time_deixis_terms: List[str] = DEFAULT_TIME_DEIXIS_TERMS, + penalty_factor: float = 0.1, +) -> float: """Time deixis penality Parameters: @@ -440,15 +648,17 @@ def deixis_time(headword: str, ------------ https://gsw.phil-fak.uni-duesseldorf.de/diskurslinguistik/index.php?title=Deiktischer_Ausdruck """ - return _deixis(headword=headword, - lemmas=lemmas, - deixis_terms=time_deixis_terms, - penalty_factor=penalty_factor) + return _deixis( + headword=headword, + lemmas=lemmas, + deixis_terms=time_deixis_terms, + penalty_factor=penalty_factor, + ) -def deixis_person(headword: str, - annotation: List[dict], - penalty_factor: float = 0.1) -> float: +def deixis_person( + headword: str, annotation: List[dict], penalty_factor: float = 0.1 +) -> float: """Personal deixis penality We use UD's UPOS and features as filter criteria. The following @@ -478,12 +688,12 @@ def deixis_person(headword: str, factors : float Number between 0.0 and 1.0 """ - PTyp = {'Prs', 'Dem', 'Ind', 'Neg', 'Tot'} + PTyp = {"Prs", "Dem", "Ind", "Neg", "Tot"} num_matches = 0 for t in annotation: - if t['lemma'] != headword: - if t.get('upos', '') == 'PRON': - if t.get('feats', {}).get('PronType', '') in PTyp: + if t["lemma"] != headword: + if t.get("upos", "") == "PRON": + if t.get("feats", {}).get("PronType", "") in PTyp: num_matches += 1 return max(0.0, 1.0 - penalty_factor * num_matches) @@ -509,15 +719,15 @@ def optimal_interval(num_tokens: int, low: int = 10, high: int = 20) -> float: 0.0 (=sentence length bad), 1.0 (=sentence length ok) """ if low <= num_tokens <= high: - return 1. + return 1.0 elif num_tokens < low: - if num_tokens < low / 2.: - return 0. + if num_tokens < low / 2.0: + return 0.0 else: diff = low - num_tokens - return 1 - diff * (1. / (low / 2.)) + return 1 - diff * (1.0 / (low / 2.0)) else: if num_tokens > (2 * high): - return 0. 
+ return 0.0 diff = (2 * high) - num_tokens return diff / high diff --git a/requirements-dev.txt b/requirements-dev.txt index 8f95515..5255621 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,5 @@ -# publish -setuptools>=56.0.0 -twine==3.3.0 -wheel>=0.31.0 -# syntax check, unit test, profiling -flake8>=3.8.4 -pytest>=6.2.1 +flake8 +flake8-bugbear +mypy +pre-commit +pytest diff --git a/requirements.txt b/requirements.txt index 3522ee8..b300bfe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -# public packages (see setup.py) -conllu>=4.5.3 +spacy==3.7.6 +de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e diff --git a/setup.py b/setup.py deleted file mode 100644 index 18da362..0000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -import setuptools -import os - - -def read(fname): - with open(os.path.join(os.path.dirname(__file__), fname)) as fp: - s = fp.read() - return s - - -def get_version(path): - with open(path, "r") as fp: - lines = fp.read() - for line in lines.split("\n"): - if line.startswith('__version__'): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - raise RuntimeError("Unable to find version string.") - - -setuptools.setup( - name='quaxa', - version=get_version("quaxa/__init__.py"), - description='QUAlity of sentence eXAmples scoring', - long_description=read('README.md'), - long_description_content_type='text/markdown', - url='http://github.com/ulf1/quaxa', - author='Ulf Hamster', - author_email='554c46@gmail.com', - license='Apache License 2.0', - packages=['quaxa'], - install_requires=[ - "conllu>=4.5.3" - ], - python_requires='>=3.7', - zip_safe=True -) diff --git a/test/test_quaxa.py b/test/test_quaxa.py index 51b1120..808fcb6 100644 --- a/test/test_quaxa.py +++ b/test/test_quaxa.py @@ -1,13 +1,14 @@ import unittest -import quaxa +import quaxa SENTS = [ "Manasse ist ein einzigartiger Parfümeur.", "Ich hatte Gelegenheit eines seiner Seminare zu besuchen.", ( "7 Tage Erholung im Ferienhaus am Müritz See in einer idyllischen " - "Landschaft inmitten der Mecklenburgischen Seenplatte.") + "Landschaft inmitten der Mecklenburgischen Seenplatte." 
+ ), ] @@ -18,13 +19,9 @@ "text": "Manasse", "lemma": "Manasse", "upos": "PROPN", - "feats": { - "Case": "Nom", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Nom", "Gender": "Fem", "Number": "Sing"}, "head": 5, - "deprel": "nsubj" + "deprel": "nsubj", }, { "id": 2, @@ -36,10 +33,10 @@ "Number": "Sing", "Person": "3", "Tense": "Pres", - "VerbForm": "Fin" + "VerbForm": "Fin", }, "head": 5, - "deprel": "cop" + "deprel": "cop", }, { "id": 3, @@ -52,10 +49,10 @@ "Gender": "Masc", "Number": "Sing", "NumType": "Card", - "PronType": "Art" + "PronType": "Art", }, "head": 5, - "deprel": "det" + "deprel": "det", }, { "id": 4, @@ -66,23 +63,19 @@ "Case": "Nom", "Degree": "Pos", "Gender": "Masc", - "Number": "Sing" + "Number": "Sing", }, "head": 5, - "deprel": "amod" + "deprel": "amod", }, { "id": 5, "text": "Parfümeur", "lemma": "Parfümeur", "upos": "NOUN", - "feats": { - "Case": "Nom", - "Gender": "Masc", - "Number": "Sing" - }, + "feats": {"Case": "Nom", "Gender": "Masc", "Number": "Sing"}, "head": 0, - "deprel": "root" + "deprel": "root", }, { "id": 6, @@ -90,8 +83,8 @@ "lemma": ".", "upos": "PUNCT", "head": 5, - "deprel": "punct" - } + "deprel": "punct", + }, ], [ { @@ -103,10 +96,10 @@ "Case": "Nom", "Number": "Sing", "Person": "1", - "PronType": "Prs" + "PronType": "Prs", }, "head": 2, - "deprel": "nsubj" + "deprel": "nsubj", }, { "id": 2, @@ -118,23 +111,19 @@ "Number": "Sing", "Person": "1", "Tense": "Past", - "VerbForm": "Fin" + "VerbForm": "Fin", }, "head": 0, - "deprel": "root" + "deprel": "root", }, { "id": 3, "text": "Gelegenheit", "lemma": "Gelegenheit", "upos": "NOUN", - "feats": { - "Case": "Acc", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Acc", "Gender": "Fem", "Number": "Sing"}, "head": 2, - "deprel": "obj" + "deprel": "obj", }, { "id": 4, @@ -147,10 +136,10 @@ "Gender": "Neut", "Number": "Sing", "NumType": "Card", - "PronType": "Art" + "PronType": "Art", }, "head": 6, - "deprel": "det" + "deprel": "det", }, { "id": 5, @@ -165,23 +154,19 @@ "Number[psor]": "Sing", "Person": "3", "Poss": "Yes", - "PronType": "Prs" + "PronType": "Prs", }, "head": 6, - "deprel": "det:poss" + "deprel": "det:poss", }, { "id": 6, "text": "Seminare", "lemma": "Seminar", "upos": "NOUN", - "feats": { - "Case": "Gen", - "Gender": "Neut", - "Number": "Plur" - }, + "feats": {"Case": "Gen", "Gender": "Neut", "Number": "Plur"}, "head": 8, - "deprel": "obj" + "deprel": "obj", }, { "id": 7, @@ -189,18 +174,16 @@ "lemma": "zu", "upos": "PART", "head": 8, - "deprel": "mark" + "deprel": "mark", }, { "id": 8, "text": "besuchen", "lemma": "besuchen", "upos": "VERB", - "feats": { - "VerbForm": "Inf" - }, + "feats": {"VerbForm": "Inf"}, "head": 3, - "deprel": "xcomp" + "deprel": "xcomp", }, { "id": 9, @@ -208,8 +191,8 @@ "lemma": ".", "upos": "PUNCT", "head": 2, - "deprel": "punct" - } + "deprel": "punct", + }, ], [ { @@ -217,52 +200,36 @@ "text": "7", "lemma": "7", "upos": "NUM", - "feats": { - "NumType": "Card" - }, + "feats": {"NumType": "Card"}, "head": 2, - "deprel": "nummod" + "deprel": "nummod", }, { "id": 2, "text": "Tage", "lemma": "Tag", "upos": "NOUN", - "feats": { - "Case": "Nom", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Nom", "Gender": "Fem", "Number": "Sing"}, "head": 3, - "deprel": "nmod" + "deprel": "nmod", }, { "id": 3, "text": "Erholung", "lemma": "Erholung", "upos": "NOUN", - "feats": { - "Case": "Acc", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Acc", "Gender": "Fem", "Number": "Sing"}, "head": 0, - "deprel": "root" - }, - 
{ - "id": (4, "-", 5), - "text": "im", - "lemma": "_", - "upos": "_", - "deprel": "_" + "deprel": "root", }, + {"id": (4, "-", 5), "text": "im", "lemma": "_", "upos": "_", "deprel": "_"}, { "id": 4, "text": "in", "lemma": "in", "upos": "ADP", "head": 6, - "deprel": "case" + "deprel": "case", }, { "id": 5, @@ -274,38 +241,28 @@ "Definite": "Def", "Gender": "Neut", "Number": "Sing", - "PronType": "Art" + "PronType": "Art", }, "head": 6, - "deprel": "det" + "deprel": "det", }, { "id": 6, "text": "Ferienhaus", "lemma": "Ferienhaus", "upos": "NOUN", - "feats": { - "Case": "Dat", - "Gender": "Neut", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Neut", "Number": "Sing"}, "head": 3, - "deprel": "nmod" - }, - { - "id": (7, "-", 8), - "text": "am", - "lemma": "_", - "upos": "_", - "deprel": "_" + "deprel": "nmod", }, + {"id": (7, "-", 8), "text": "am", "lemma": "_", "upos": "_", "deprel": "_"}, { "id": 7, "text": "an", "lemma": "an", "upos": "ADP", "head": 9, - "deprel": "case" + "deprel": "case", }, { "id": 8, @@ -317,36 +274,28 @@ "Definite": "Def", "Gender": "Masc", "Number": "Sing", - "PronType": "Art" + "PronType": "Art", }, "head": 9, - "deprel": "det" + "deprel": "det", }, { "id": 9, "text": "Müritz", "lemma": "Müritz", "upos": "PROPN", - "feats": { - "Case": "Dat", - "Gender": "Masc", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Masc", "Number": "Sing"}, "head": 6, - "deprel": "nmod" + "deprel": "nmod", }, { "id": 10, "text": "See", "lemma": "See", "upos": "PROPN", - "feats": { - "Case": "Dat", - "Gender": "Masc", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Masc", "Number": "Sing"}, "head": 9, - "deprel": "flat" + "deprel": "flat", }, { "id": 11, @@ -354,7 +303,7 @@ "lemma": "in", "upos": "ADP", "head": 14, - "deprel": "case" + "deprel": "case", }, { "id": 12, @@ -367,10 +316,10 @@ "Gender": "Fem", "Number": "Sing", "NumType": "Card", - "PronType": "Art" + "PronType": "Art", }, "head": 14, - "deprel": "det" + "deprel": "det", }, { "id": 13, @@ -381,23 +330,19 @@ "Case": "Dat", "Degree": "Pos", "Gender": "Fem", - "Number": "Sing" + "Number": "Sing", }, "head": 14, - "deprel": "amod" + "deprel": "amod", }, { "id": 14, "text": "Landschaft", "lemma": "Landschaft", "upos": "NOUN", - "feats": { - "Case": "Dat", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Fem", "Number": "Sing"}, "head": 9, - "deprel": "nmod" + "deprel": "nmod", }, { "id": 15, @@ -405,7 +350,7 @@ "lemma": "inmitten", "upos": "ADP", "head": 18, - "deprel": "case" + "deprel": "case", }, { "id": 16, @@ -417,36 +362,28 @@ "Definite": "Def", "Gender": "Fem", "Number": "Sing", - "PronType": "Art" + "PronType": "Art", }, "head": 18, - "deprel": "det" + "deprel": "det", }, { "id": 17, "text": "Mecklenburgischen", "lemma": "Mecklenburgischen", "upos": "PROPN", - "feats": { - "Case": "Dat", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Fem", "Number": "Sing"}, "head": 18, - "deprel": "amod" + "deprel": "amod", }, { "id": 18, "text": "Seenplatte", "lemma": "Seenplatte", "upos": "PROPN", - "feats": { - "Case": "Dat", - "Gender": "Fem", - "Number": "Sing" - }, + "feats": {"Case": "Dat", "Gender": "Fem", "Number": "Sing"}, "head": 9, - "deprel": "nmod" + "deprel": "nmod", }, { "id": 19, @@ -454,9 +391,9 @@ "lemma": ".", "upos": "PUNCT", "head": 3, - "deprel": "punct" - } - ] + "deprel": "punct", + }, + ], ] @@ -464,46 +401,57 @@ class QuaxTester(unittest.TestCase): def setUp(self): self.sents = SENTS self.annots = ANNOTS - 
self.lemmata = [ - [tok.get('lemma') for tok in tree] for tree in self.annots] + self.lemmata = [[tok.get("lemma") for tok in tree] for tree in self.annots] def test_total_score(self): for txt, annot in zip(self.sents, self.annots): for tok in annot: - if tok.get('upos', '') in {'NOUN', 'VERB', 'ADJ'}: - headword = tok['lemma'] + if tok.get("upos", "") in {"NOUN", "VERB", "ADJ"}: + headword = tok["lemma"] factor = quaxa.total_score( - headword=headword, txt=txt, annotation=annot) - print(( - "total_score:" - f"{factor: 7.4f} | {headword} | {txt[:20]} ...")) - self.assertGreaterEqual(factor, 0.) - self.assertLessEqual(factor, 1.) + headword=headword, txt=txt, annotation=annot + ) + print( + ( + "total_score:" + f"{factor: 7.4f} | {headword} | {txt[:20]} ..." + ) + ) + self.assertGreaterEqual(factor, 0.0) + self.assertLessEqual(factor, 1.0) def test_isa_knockout_criteria(self): for txt, annot in zip(self.sents, self.annots): for tok in annot: - if tok.get('upos', '') in {'NOUN', 'VERB', 'ADJ'}: - headword = tok['lemma'] + if tok.get("upos", "") in {"NOUN", "VERB", "ADJ"}: + headword = tok["lemma"] flag = quaxa.isa_knockout_criteria( - headword=headword, txt=txt, annotation=annot) - print(( - "isa_knockout_criteria:" - f"{flag} | {headword} | {txt[:20]} ...")) + headword=headword, txt=txt, annotation=annot + ) + print( + ( + "isa_knockout_criteria:" + f"{flag} | {headword} | {txt[:20]} ..." + ) + ) self.assertIs(flag is True or flag is False, True) def test_factor_gradual_criteria(self): for txt, annot in zip(self.sents, self.annots): for tok in annot: - if tok.get('upos', '') in {'NOUN', 'VERB', 'ADJ'}: - headword = tok['lemma'] + if tok.get("upos", "") in {"NOUN", "VERB", "ADJ"}: + headword = tok["lemma"] factor = quaxa.factor_gradual_criteria( - headword=headword, txt=txt, annotation=annot) - print(( - "factor_gradual_criteria:" - f"{factor:7.4f} | {headword} | {txt[:20]} ...")) - self.assertGreaterEqual(factor, 0.) - self.assertLessEqual(factor, 1.) + headword=headword, txt=txt, annotation=annot + ) + print( + ( + "factor_gradual_criteria:" + f"{factor:7.4f} | {headword} | {txt[:20]} ..." 
+ ) + ) + self.assertGreaterEqual(factor, 0.0) + self.assertLessEqual(factor, 1.0) def test_has_finite_verb_and_subject(self): target = [True, True, False] @@ -516,16 +464,16 @@ def test_is_misparsed(self): res = quaxa.is_misparsed(sent) self.assertFalse(res) - res = quaxa.is_misparsed('Das ist ein Beispieltext.') + res = quaxa.is_misparsed("Das ist ein Beispieltext.") self.assertFalse(res) - res = quaxa.is_misparsed('Das ist ein Beispieltext') + res = quaxa.is_misparsed("Das ist ein Beispieltext") self.assertTrue(res) - res = quaxa.is_misparsed('das ist ein Beispieltext.') + res = quaxa.is_misparsed("das ist ein Beispieltext.") self.assertTrue(res) - res = quaxa.is_misparsed('\tDas ist ein Beispieltext.') + res = quaxa.is_misparsed("\tDas ist ein Beispieltext.") self.assertTrue(res) def test_has_illegal_chars(self): @@ -533,30 +481,32 @@ def test_has_illegal_chars(self): res = quaxa.has_illegal_chars(sent) self.assertFalse(res) - res = quaxa.has_illegal_chars('https://somerandomurl.com') + res = quaxa.has_illegal_chars("https://somerandomurl.com") self.assertTrue(res) - res = quaxa.has_illegal_chars('name@mail.com') + res = quaxa.has_illegal_chars("name@mail.com") self.assertTrue(res) - res = quaxa.has_illegal_chars('my test\rnew windows paragraph') + res = quaxa.has_illegal_chars("my test\rnew windows paragraph") self.assertTrue(res) def test_has_blacklist_words(self): for annot in self.annots: - lemmas = [tok.get('lemma') for tok in annot] + lemmas = [tok.get("lemma") for tok in annot] for tok in annot: - headword = tok['lemma'] - res = quaxa.has_blacklist_words( - headword=headword, lemmas=lemmas) + headword = tok["lemma"] + res = quaxa.has_blacklist_words(headword=headword, lemmas=lemmas) self.assertFalse(res) - res = quaxa.has_blacklist_words('Beispielsatz', [ - 'und', 'der', 'sein', 'ein', 'Beispielsatz', 'mit', 'Idiot', '--']) + res = quaxa.has_blacklist_words( + "Beispielsatz", + ["und", "der", "sein", "ein", "Beispielsatz", "mit", "Idiot", "--"], + ) self.assertTrue(res) - res = quaxa.has_blacklist_words('Idiot', [ - 'und', 'der', 'sein', 'ein', 'Beispielsatz', 'mit', 'Idiot', '--']) + res = quaxa.has_blacklist_words( + "Idiot", ["und", "der", "sein", "ein", "Beispielsatz", "mit", "Idiot", "--"] + ) self.assertFalse(res) def test_factor_graylist_rarechars(self): @@ -565,30 +515,30 @@ def test_factor_graylist_rarechars(self): res = quaxa.factor_rarechars(sent) self.assertEqual(res, target[i]) - res = quaxa.factor_rarechars("\'\'..??") + res = quaxa.factor_rarechars("''..??") self.assertAlmostEqual(res, 0.4) # rounding error def test_factor_graylist_notkeyboardchar(self): for sent in self.sents: res = quaxa.factor_notkeyboardchar(sent) - self.assertEqual(res, 1.) + self.assertEqual(res, 1.0) - res = quaxa.factor_notkeyboardchar('ßÄÖÜäöü') - self.assertEqual(res, 1.) 
+ res = quaxa.factor_notkeyboardchar("ßÄÖÜäöü") + self.assertEqual(res, 1.0) - res = quaxa.factor_notkeyboardchar( - 'À la carte, s\'il vous plaît\n') + res = quaxa.factor_notkeyboardchar("À la carte, s'il vous plaît\n") self.assertLess(res, 1.0) def test_factor_graylist_words(self): - GRAYLIST = ['Seminar'] + GRAYLIST = ["Seminar"] target = [1.0, 0.9, 1.0] for i, annot in enumerate(self.annots): - lemmas = [tok.get('lemma') for tok in annot] + lemmas = [tok.get("lemma") for tok in annot] for tok in annot: - headword = tok['lemma'] + headword = tok["lemma"] res = quaxa.factor_graylist_words( - headword=headword, lemmas=lemmas, graylist_words=GRAYLIST) + headword=headword, lemmas=lemmas, graylist_words=GRAYLIST + ) if headword in GRAYLIST: self.assertEqual(res, target[i] + 0.1) else: @@ -597,36 +547,45 @@ def test_factor_graylist_words(self): def test_factor_named_entity(self): for annot in self.annots: for tok in annot: - headword = tok['lemma'] + headword = tok["lemma"] res = quaxa.factor_named_entity( - headword=headword, annotation=annot, penalty_factor=0.15) - flag = tok.get('upos', '') == 'PROPN' - flag = flag or tok.get('xpos', '') == 'NE' + headword=headword, annotation=annot, penalty_factor=0.15 + ) + flag = tok.get("upos", "") == "PROPN" + flag = flag or tok.get("xpos", "") == "NE" if flag: self.assertEqual(res, 0.85) else: self.assertEqual(res, 1.0) def test_deixis(self): - lemmas = ['heute', 'hier', '--', 'morgen', 'dort', '--'] - result2 = [quaxa.deixis_space('heute', lemmas), - quaxa.deixis_time('heute', lemmas)] - self.assertEqual(result2, [.8, .9]) - - result3 = [quaxa.deixis_space('hier', lemmas), - quaxa.deixis_time('hier', lemmas)] - self.assertEqual(result3, [.9, .8]) + lemmas = ["heute", "hier", "--", "morgen", "dort", "--"] + result2 = [ + quaxa.deixis_space("heute", lemmas), + quaxa.deixis_time("heute", lemmas), + ] + self.assertEqual(result2, [0.8, 0.9]) + + result3 = [ + quaxa.deixis_space("hier", lemmas), + quaxa.deixis_time("hier", lemmas), + ] + self.assertEqual(result3, [0.9, 0.8]) def test_deixis_person(self): target = [1.0, 0.9, 1.0] for i, annot in enumerate(self.annots): for tok in annot: - headword = tok['lemma'] - res = quaxa.deixis_person( - headword=headword, annotation=annot) - flag = tok.get('upos', '') == 'PRON' - flag = flag and tok.get('feats', {}).get('PronType', '') in [ - 'Prs', 'Dem', 'Ind', 'Neg', 'Tot'] + headword = tok["lemma"] + res = quaxa.deixis_person(headword=headword, annotation=annot) + flag = tok.get("upos", "") == "PRON" + flag = flag and tok.get("feats", {}).get("PronType", "") in [ + "Prs", + "Dem", + "Ind", + "Neg", + "Tot", + ] if flag: self.assertEqual(res, target[i] + 0.1) else: @@ -636,29 +595,32 @@ def test_optimal_interval(self): for annot in self.annots: num_tokens = len(annot) res = quaxa.optimal_interval( - num_tokens=num_tokens, - low=num_tokens * 2, - high=num_tokens * 3) - self.assertLess(res, 1.) + num_tokens=num_tokens, low=num_tokens * 2, high=num_tokens * 3 + ) + self.assertLess(res, 1.0) res = quaxa.optimal_interval( - num_tokens=num_tokens, - low=num_tokens // 2, - high=num_tokens * 2) - self.assertEqual(res, 1.) - - num_tokens = len(( - "Das ist ein Beispielsatz mit optimaler Länge von über 10 Tokens." 
- ).split(" ")) + num_tokens=num_tokens, low=num_tokens // 2, high=num_tokens * 2 + ) + self.assertEqual(res, 1.0) + + num_tokens = len( + ("Das ist ein Beispielsatz mit optimaler Länge von über 10 Tokens.").split( + " " + ) + ) result2 = quaxa.optimal_interval(num_tokens=num_tokens) - self.assertEqual(result2, 1.) + self.assertEqual(result2, 1.0) - num_tokens = len('Viel zu kurz.'.split(" ")) + num_tokens = len("Viel zu kurz.".split(" ")) result3 = quaxa.optimal_interval(num_tokens=num_tokens) - self.assertEqual(result3, 0.) - - num_tokens = len(( - "Dieser hingegen ist leider zu lang. Das macht ihn weniger " - "angenehm zu lesen. Daher ist der zurückgegebene Wert kleiner " - "als 1, schade.").split(" ")) + self.assertEqual(result3, 0.0) + + num_tokens = len( + ( + "Dieser hingegen ist leider zu lang. Das macht ihn weniger " + "angenehm zu lesen. Daher ist der zurückgegebene Wert kleiner " + "als 1, schade." + ).split(" ") + ) result4 = quaxa.optimal_interval(num_tokens=num_tokens) - self.assertLess(result4, 1.) + self.assertLess(result4, 1.0)
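The refactored package keeps quaxa.total_score as the public entry point, so the workflow from demo/demo_quaxa.py still applies. A minimal end-to-end sketch under stated assumptions: the file name demo.conllu and the headword choice are illustrative, the CoNLL-U input carries a `# text =` metadata line, and the conllu package is installed separately (requirements.txt no longer pins it, although the demo still imports it):

```python
# Usage sketch based on demo/demo_quaxa.py: parse one CoNLL-U sentence
# and score a content-word lemma as a dictionary-example candidate.
import conllu  # assumption: installed separately, no longer in requirements.txt

import quaxa

with open("demo.conllu", "r") as fp:  # illustrative file name
    annot = conllu.parse(fp.read())[0]

# pick the first content word as headword (illustrative choice; assumes
# the sentence contains at least one NOUN, VERB, or ADJ token)
headword = next(
    tok["lemma"] for tok in annot if tok.get("upos") in {"NOUN", "VERB", "ADJ"}
)
score = quaxa.total_score(
    headword=headword, txt=annot.metadata["text"], annotation=annot
)
print(f"total_score: {score:7.4f} | {headword}")  # value between 0.0 and 1.0
```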