forked from ulf1/quaxa
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
22 changed files
with
3,838 additions
and
1,441 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[flake8] | ||
max-line-length = 80 | ||
extend-select = B950 | ||
extend-ignore = E203,E501,E701 | ||
per-file-ignores = | ||
quaxa/__init__.py:F401 |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: Python application | ||
|
||
on: [push] | ||
|
||
jobs: | ||
build: | ||
strategy: | ||
matrix: | ||
platform: [windows-latest, macos-latest, ubuntu-latest] | ||
|
||
runs-on: ${{ matrix.platform }} | ||
|
||
steps: | ||
- uses: actions/checkout@v4 | ||
- uses: actions/setup-python@v5 | ||
with: | ||
python-version: "3.x" | ||
- name: Install dependencies | ||
run: | | ||
pip install -U pip | ||
pip install '.[dev]' | ||
- name: Run unit tests | ||
run: | | ||
pytest --cov=quaxa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,3 +108,4 @@ venv.bak/ | |
.vscode | ||
profile/data* | ||
.theia | ||
*.temp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# See https://pre-commit.com for more information | ||
# See https://pre-commit.com/hooks.html for more hooks | ||
repos: | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v3.2.0 | ||
hooks: | ||
- id: trailing-whitespace | ||
- id: end-of-file-fixer | ||
- id: check-yaml | ||
- id: check-added-large-files | ||
- repo: https://github.com/psf/black | ||
rev: 22.10.0 | ||
hooks: | ||
- id: black | ||
- repo: https://github.com/PyCQA/flake8 | ||
rev: 7.0.0 | ||
hooks: | ||
- id: flake8 | ||
additional_dependencies: [flake8-bugbear] | ||
- repo: https://github.com/pycqa/isort | ||
rev: 5.12.0 | ||
hooks: | ||
- id: isort | ||
- repo: https://github.com/pre-commit/mirrors-mypy | ||
rev: v1.8.0 | ||
hooks: | ||
- id: mypy |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,38 @@ | ||
[![PyPI version](https://badge.fury.io/py/quaxa.svg)](https://badge.fury.io/py/quaxa) | ||
[![PyPi downloads](https://img.shields.io/pypi/dm/quaxa)](https://img.shields.io/pypi/dm/quaxa) | ||
[![DOI](https://zenodo.org/badge/667310199.svg)](https://zenodo.org/badge/latestdoi/667310199) | ||
# quaxa - QUAlity of sentence eXAmples | ||
|
||
# QUAXA: QUAlity of sentence eXAmples scoring | ||
Rule-based sentence scoring algorithm based on GDEX. | ||
_Rule-based sentence scoring algorithm based on GDEX_ | ||
|
||
## Installation | ||
|
||
`quaxa` can be installed as a package from its GitHub source repository: | ||
|
||
```sh | ||
pip install git+https://github.com/zentrum-lexikographie/quaxa.git | ||
``` | ||
|
||
For development, clone it from GitHub and install it locally, including optional dependencies: | ||
|
||
``` sh | ||
pip install -e .[dev] | ||
``` | ||
|
||
## Usage | ||
|
||
|
||
``` python-console | ||
>>> import spacy, quaxa | ||
>>> nlp = spacy.load("de_core_news_sm") | ||
>>> [s._.quaxa for s in quaxa.de_core(nlp("Achtung! Das ist ein toller Test.")).sents] | ||
[0.0, 0.5966] | ||
``` | ||
|
||
## Testing | ||
|
||
Run tests, including calculation of code coverage: | ||
|
||
``` sh | ||
pytest --cov=quaxa | ||
``` | ||
|
||
## Rules | ||
|
||
|
@@ -25,7 +53,7 @@ Wenn 1 Knock-out Kriterium identifiziert wird, dann wird direkt der Score direkt | |
| `has_blacklist_words` | bool | Satzbeleg enthält Wörter, sodass in keinem Fall der Satzbeleg als Wörterbuchbeispiel in Betracht gezogen wird; ausgenommen das Blacklist-Wort ist selbst der Wörterbucheintrag. (dt. Blacklist ist voreingestellt) | [1] GDEX blacklist | | ||
|
||
### Diskontierungsfaktoren | ||
Je Kriterium wird ein Faktor berechnet, und alle Faktoren miteinander multipliziert. | ||
Je Kriterium wird ein Faktor berechnet, und alle Faktoren miteinander multipliziert. | ||
Wenn bspw. ein Faktor eine Penalty von 0.1 bekommt, dann ist der Faktor 0.9. | ||
Für den Gesamtscore wird der Gesamtfaktor mit 0.5 multipliziert. | ||
|
||
|
@@ -40,70 +68,20 @@ Für den Gesamtscore wird der Gesamtfaktor mit 0.5 multipliziert. | |
| `deixis_person` | [0.0, 1.0] | Strafe Wörter mit `UPOS=PRON` und `PronType=Prs|Dem|Ind|Neg|Tot` ab. Entspricht STTS PoS-Tags `PDS` (`PRON` + `Dem`, z.B. das, dies, die, diese, der), `PIS` (`PRON` + `Ind|Neg|Tot`, z.B. man, allem, nichts, alles, mehr), `PPER` (`PRON` + `Prs`, z.B. es, sie, er, wir, ich), `PPOSS` (`PRON` + `Prs`, z.B. ihren, Seinen, seinem, unsrigen, meiner). | [1] GDEX graylist PoS-Tags, [2] GBEX Deixis; [3], [4] | | ||
| `optimal_interval` | [0.0, 1.0] | Strafe Satzbelege mit zu wenigen/vielen Wörtern ab. | [1] GDEX | | ||
|
||
## Acknowledgements | ||
|
||
Quellen: | ||
- [1] Lexical Computing, "GDEX configuration introduction", URL: https://www.sketchengine.eu/syntax-of-gdex-configuration-files/ | ||
- [2] Didakowski, Lemnitzer, Geyken, 2012, "Automatic example sentence extraction for a contemporary German dictionary", URL: https://euralex.org/publications/automatic-example-sentence-extraction-for-a-contemporary-german-dictionary/ | ||
- [3] LingTermNet, URL: https://gsw.phil-fak.uni-duesseldorf.de/diskurslinguistik/index.php?title=Deiktischer_Ausdruck | ||
- [4] Universal Dependencies, UPOS-STTS conversion table, URL: https://universaldependencies.org/tagset-conversion/de-stts-uposf.html | ||
|
||
|
||
## Appendix | ||
|
||
### Installation | ||
The `quaxa` [git repo](http://github.com/ulf1/quaxa) is available as [PyPi package](https://pypi.org/project/quaxa) | ||
|
||
```sh | ||
pip install quaxa | ||
pip install git+ssh://[email protected]/ulf1/quaxa.git | ||
``` | ||
|
||
### Install a virtual environment | ||
|
||
```sh | ||
python3 -m venv .venv | ||
source .venv/bin/activate | ||
pip install --upgrade pip | ||
pip install -r requirements.txt --no-cache-dir | ||
pip install -r requirements-dev.txt --no-cache-dir | ||
``` | ||
|
||
(If your git repo is stored in a folder with whitespaces, then don't use the subfolder `.venv`. Use an absolute path without whitespaces.) | ||
|
||
### Python commands | ||
|
||
* Jupyter for the examples: `jupyter lab` | ||
* Check syntax: `flake8 --ignore=F401 --exclude=$(grep -v '^#' .gitignore | xargs | sed -e 's/ /,/g')` | ||
* Run Unit Tests: `PYTHONPATH=. python -m unittest` | ||
|
||
Publish | ||
|
||
```sh | ||
python setup.py sdist | ||
twine upload -r pypi dist/* | ||
``` | ||
|
||
### Clean up | ||
|
||
```sh | ||
find . -type f -name "*.pyc" | xargs rm | ||
find . -type d -name "__pycache__" | xargs rm -r | ||
rm -r .pytest_cache | ||
rm -r .venv | ||
``` | ||
|
||
|
||
### Support | ||
Please [open an issue](https://github.com/ulf1/quaxa/issues/new) for support. | ||
|
||
|
||
### Contributing | ||
Please contribute using [Github Flow](https://guides.github.com/introduction/flow/). Create a branch, add commits, and [open a pull request](https://github.com/ulf1/quaxa/compare/). | ||
This package was initially developed as part of the [EVIDENCE | ||
project](https://gepris.dfg.de/gepris/projekt/433249742) and funded by | ||
the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation, | ||
GU 798/27-1; GE 1119/11-1). Between August 2023 and October 2024, it | ||
was maintained by [Ulf Hamster](https://github.com/ulf1/). | ||
|
||
QUAXA makes use of [VulGer](https://aclanthology.org/W19-3513), a | ||
lexicon covering words from the lower end of the German language | ||
register — terms typically considered rough, vulgar, or | ||
obscene. VulGer is used under the terms of the CC-BY-SA license. | ||
|
||
### Acknowledgements | ||
The "Evidence" project was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) - [433249742](https://gepris.dfg.de/gepris/projekt/433249742) (GU 798/27-1; GE 1119/11-1). | ||
## Bibliography | ||
|
||
### Maintenance | ||
- till 31.Aug.2023 (v0.1.0) the code repository was maintained within the DFG project [433249742](https://gepris.dfg.de/gepris/projekt/433249742) | ||
- since 01.Sep.2023 (v0.1.0) the code repository is maintained by Ulf Hamster. | ||
* Lexical Computing, "GDEX configuration introduction", URL: https://www.sketchengine.eu/syntax-of-gdex-configuration-files/ | ||
* Didakowski, Lemnitzer, Geyken, 2012, "Automatic example sentence extraction for a contemporary German dictionary", URL: https://euralex.org/publications/automatic-example-sentence-extraction-for-a-contemporary-german-dictionary/ |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
[build-system] | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project] | ||
name = "quaxa" | ||
version = "0.1.1" | ||
description = "QUAlity of sentence eXAmples scoring" | ||
authors = [{name = "Ulf Hamster", email = "[email protected]"}] | ||
classifiers = [ | ||
"Development Status :: 1 - Planning", | ||
"Intended Audience :: Developers", | ||
"Intended Audience :: Science/Research", | ||
"License :: OSI Approved :: Apache Software License", | ||
"Topic :: Education", | ||
"Topic :: Scientific/Engineering", | ||
"Topic :: Text Processing :: Linguistic" | ||
] | ||
requires-python = ">=3.7" | ||
dependencies = ["spacy>=3.7"] | ||
dynamic = ["readme"] | ||
|
||
[project.optional-dependencies] | ||
dev = [ | ||
"coverage", | ||
"flake8", | ||
"flake8-bugbear", | ||
"mypy", | ||
"pre-commit", | ||
"pytest", | ||
"pytest-cov", | ||
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e" | ||
] | ||
|
||
[project.urls] | ||
Homepage = "https://github.com/ulf1/quaxa" | ||
|
||
[tool.isort] | ||
profile = "black" | ||
|
||
[tool.setuptools.dynamic] | ||
readme = {file = ["README.md"], content-type = "text/markdown"} | ||
|
||
[tool.setuptools.packages.find] | ||
exclude = ["tests"] | ||
|
||
[tool.setuptools.package-data] | ||
"quaxa" = ["VulGer.csv"] |
Oops, something went wrong.