diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml new file mode 100644 index 00000000..d446096f --- /dev/null +++ b/.github/workflows/python-build.yml @@ -0,0 +1,83 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Python Build + +on: + push: + branches: + - main + paths: + - 'Cargo.lock' + - 'Cargo.toml' + - 'pyproject.toml' + - 'requirements.txt' + - 'src/**' + - 'tests/**' + - '**.yml' + pull_request: + branches: + - main + paths: + - 'Cargo.lock' + - 'Cargo.toml' + - 'pyproject.toml' + - 'requirements.txt' + - 'src/**' + - 'tests/**' + - '**.yml' + +jobs: + python-build: + name: Python ${{ matrix.python-version }} on ${{ matrix.name }} + + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, macos-latest, windows-latest ] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + include: + - os: ubuntu-latest + name: Linux 64-Bit + + - os: macos-latest + name: MacOS 64-Bit + + - os: windows-latest + name: Windows 64-Bit + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install maturin and pytest + run: pip install -r requirements.txt + + - name: Build Python extension + run: maturin build + + - name: Install Python extension + run: pip install 
--find-links=target/wheels lingua-language-detector + + - name: Run Python unit tests + run: pytest tests/python diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml new file mode 100644 index 00000000..2ef283f6 --- /dev/null +++ b/.github/workflows/python-release.yml @@ -0,0 +1,141 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Python Release + +on: + push: + tags: + - v1.* + +jobs: + linux: + name: Python on Linux and target ${{ matrix.target }} + + runs-on: ubuntu-latest + + strategy: + matrix: + target: [x86_64, x86, aarch64] + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist + + windows: + name: Python on Windows and target ${{ matrix.target }} + + runs-on: windows-latest + + strategy: + matrix: + target: [x64, x86] + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + architecture: ${{ matrix.target }} + cache: 'pip' + + - name: Build wheels + 
uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist + + macos: + name: Python on MacOS and target ${{ matrix.target }} + + runs-on: macos-latest + + strategy: + matrix: + target: [x86_64, aarch64] + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist + + release: + name: Publish wheels to PyPI + + runs-on: ubuntu-latest + + needs: [linux, windows, macos] + + steps: + - name: Download wheels from previous jobs + uses: actions/download-artifact@v3 + with: + name: wheels + + - name: Upload to PyPI + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --skip-existing * diff --git a/.github/workflows/build.yml b/.github/workflows/rust-build.yml similarity index 96% rename from .github/workflows/build.yml rename to .github/workflows/rust-build.yml index c094633a..15e970a2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/rust-build.yml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: build +name: Rust Build on: push: @@ -36,8 +36,8 @@ on: - '**.yml' jobs: - build-and-test: - name: ${{ matrix.name }} + rust-build: + name: Rust on ${{ matrix.name }} runs-on: ${{ matrix.os }} @@ -100,10 +100,10 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - - name: Build in debug mode + - name: Build target in debug mode run: cargo build --target ${{ matrix.target }} --locked - - name: Run unit tests + - name: Test default target in debug mode run: cargo test --target ${{ matrix.target }} - name: Run WASM integration tests on NodeJS diff --git a/.gitignore b/.gitignore index a51f52b6..b0584025 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ *.launch .settings/ .metadata/ +.venv *.sublime-workspace tmp/ out/ diff --git a/Cargo.lock b/Cargo.lock index f497d59c..6d4bfc15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -654,7 +654,7 @@ checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "lingua" -version = "1.5.0" +version = "1.6.0" dependencies = [ "ahash 0.8.6", "brotli", @@ -743,6 +743,7 @@ dependencies = [ "lingua-zulu-language-model", "maplit", "once_cell", + "pyo3", "rayon", "regex 1.10.2", "rstest", @@ -1296,6 +1297,16 @@ version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.19" @@ -1420,6 +1431,29 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + [[package]] name = "pin-project-lite" version = "0.2.9" @@ -1475,6 +1509,67 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pyo3" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8453b658fe480c3e70c8ed4e3d3ec33eb74988bd186561b0cc66b85c3bc4b" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96fe70b176a89cff78f2fa7b3c930081e163d5379b4dcdf993e3ae29ca662e5" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214929900fd25e6604661ed9cf349727c8920d47deff196c4e28165a6ef2a96b" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac53072f717aa1bfa4db832b39de8c875b7c7af4f4a6fe93cdbf9264cf8383b" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7774b5a8282bd4f25f803b1f0d945120be959a36c72e08e7cd031c792fdfd424" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] 
name = "quote" version = "1.0.28" @@ -1728,6 +1823,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "smallvec" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" + [[package]] name = "static_assertions" version = "1.1.0" @@ -1764,6 +1865,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "target-lexicon" +version = "0.12.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a" + [[package]] name = "tempfile" version = "3.8.1" @@ -1841,6 +1948,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + [[package]] name = "utf8-ranges" version = "0.1.3" diff --git a/Cargo.toml b/Cargo.toml index adec2d4d..562b86f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ members = ["language-models/*"] [package] name = "lingua" -version = "1.5.0" +version = "1.6.0" authors = ["Peter M. 
Stahl "] description = """ An accurate natural language detection library, suitable for long and short text alike @@ -144,6 +144,7 @@ lingua-zulu-language-model = { path = "language-models/zu", version = "1.1.0", o ahash = "0.8.6" cld2 = { version = "1.0.2", optional = true } indoc = { version = "2.0.4", optional = true } +pyo3 = { version = "0.20.0", optional = true } rayon = "1.8.0" titlecase = { version = "2.2.0", optional = true } whatlang = { version = "0.16.3", optional = true } @@ -183,6 +184,7 @@ default = [ ] accuracy-reports = ["cld2", "indoc", "titlecase", "whatlang", "whichlang"] benchmark = ["cld2", "whatlang", "whichlang"] +python = ["pyo3"] afrikaans = ["lingua-afrikaans-language-model"] albanian = ["lingua-albanian-language-model"] arabic = ["lingua-arabic-language-model"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..81500760 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,117 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[project] +name = "lingua-language-detector" +version = "1.4.0" +authors = [{name = "Peter M. 
Stahl", email = "pemistahl@gmail.com"}] +description = "An accurate natural language detection library, suitable for long and short text alike" +#readme = "README_PYPI.md" +requires-python = ">=3.8" +license = {file = "LICENSE"} +keywords = [ + "language-processing", + "language-detection", + "language-recognition", + "nlp" +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: Afrikaans", + "Natural Language :: Arabic", + "Natural Language :: Basque", + "Natural Language :: Bengali", + "Natural Language :: Bosnian", + "Natural Language :: Bulgarian", + "Natural Language :: Catalan", + "Natural Language :: Chinese (Simplified)", + "Natural Language :: Chinese (Traditional)", + "Natural Language :: Croatian", + "Natural Language :: Czech", + "Natural Language :: Danish", + "Natural Language :: Dutch", + "Natural Language :: English", + "Natural Language :: Esperanto", + "Natural Language :: Finnish", + "Natural Language :: French", + "Natural Language :: German", + "Natural Language :: Greek", + "Natural Language :: Hebrew", + "Natural Language :: Hindi", + "Natural Language :: Hungarian", + "Natural Language :: Icelandic", + "Natural Language :: Indonesian", + "Natural Language :: Irish", + "Natural Language :: Italian", + "Natural Language :: Japanese", + "Natural Language :: Korean", + "Natural Language :: Latin", + "Natural Language :: Latvian", + "Natural Language :: Lithuanian", + "Natural Language :: Macedonian", + "Natural Language :: Malay", + "Natural Language :: Marathi", + "Natural Language :: Norwegian", + "Natural Language :: Panjabi", + "Natural Language :: Persian", + "Natural Language :: Polish", + "Natural Language :: Portuguese", + "Natural Language :: Romanian", + "Natural Language :: Russian", + "Natural Language :: Serbian", + 
"Natural Language :: Slovak", + "Natural Language :: Slovenian", + "Natural Language :: Spanish", + "Natural Language :: Swedish", + "Natural Language :: Tamil", + "Natural Language :: Telugu", + "Natural Language :: Thai", + "Natural Language :: Turkish", + "Natural Language :: Ukrainian", + "Natural Language :: Urdu", + "Natural Language :: Vietnamese", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Rust", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Software Development :: Internationalization", + "Topic :: Software Development :: Localization", + "Topic :: Text Processing :: Linguistic", + "Typing :: Typed" +] + +[project.urls] +homepage = "https://github.com/pemistahl/lingua-py" +repository = "https://github.com/pemistahl/lingua-py" + +[project.optional-dependencies] +test = ["pytest == 7.4.3"] + +[tool.maturin] +module-name = "lingua" +features = ["pyo3/extension-module", "python"] + +[build-system] +requires = ["maturin>=1.1,<2.0"] +build-backend = "maturin" + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..5b21f2be --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +maturin == 1.3.1 +pytest == 7.4.3 diff --git a/src/builder.rs b/src/builder.rs index c0ce0eec..4bcccc30 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -28,6 +28,7 @@ pub(crate) const MINIMUM_RELATIVE_DISTANCE_MESSAGE: &str = /// This struct configures and creates an instance of [LanguageDetector]. 
#[derive(Clone)] +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub struct LanguageDetectorBuilder { languages: HashSet, minimum_relative_distance: f64, diff --git a/src/detector.rs b/src/detector.rs index 54f66308..689fc84c 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -50,6 +50,7 @@ static QUADRIGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMa static FIVEGRAM_MODELS: LazyLanguageModelMap = Lazy::new(|| RwLock::new(HashMap::new())); /// This struct detects the language of given input text. +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub struct LanguageDetector { languages: HashSet, minimum_relative_distance: f64, diff --git a/src/isocode.rs b/src/isocode.rs index 33d75c63..122fbc9a 100644 --- a/src/isocode.rs +++ b/src/isocode.rs @@ -38,6 +38,7 @@ use strum_macros::{EnumIter, EnumString}; )] #[allow(clippy::upper_case_acronyms)] #[strum(ascii_case_insensitive)] +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub enum IsoCode639_1 { #[cfg(feature = "afrikaans")] /// The ISO 639-1 code for [`Afrikaans`](crate::language::Language::Afrikaans) @@ -359,6 +360,7 @@ pub enum IsoCode639_1 { )] #[allow(clippy::upper_case_acronyms)] #[strum(ascii_case_insensitive)] +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub enum IsoCode639_3 { #[cfg(feature = "afrikaans")] /// The ISO 639-3 code for [`Afrikaans`](crate::language::Language::Afrikaans) diff --git a/src/language.rs b/src/language.rs index 9fd93edb..ac8ab8a4 100644 --- a/src/language.rs +++ b/src/language.rs @@ -42,6 +42,7 @@ use crate::isocode::{IsoCode639_1, IsoCode639_3}; )] #[serde(rename_all(serialize = "UPPERCASE", deserialize = "UPPERCASE"))] #[strum(ascii_case_insensitive)] +#[cfg_attr(feature = "python", pyo3::prelude::pyclass(rename_all = "UPPERCASE"))] pub enum Language { #[cfg(feature = "afrikaans")] Afrikaans, @@ -277,10 +278,12 @@ impl Display for Language { } impl Language { + /// Returns a set of all supported languages. 
pub fn all() -> HashSet { Language::iter().collect() } + /// Returns a set of all supported spoken languages. pub fn all_spoken_ones() -> HashSet { Language::iter() .filter(|it| { @@ -293,42 +296,51 @@ impl Language { .collect() } + /// Returns a set of all languages supporting the Arabic script. pub fn all_with_arabic_script() -> HashSet { Language::iter() .filter(|it| it.alphabets().contains(&Alphabet::Arabic)) .collect() } + /// Returns a set of all languages supporting the Cyrillic script. pub fn all_with_cyrillic_script() -> HashSet { Language::iter() .filter(|it| it.alphabets().contains(&Alphabet::Cyrillic)) .collect() } + /// Returns a set of all languages supporting the Devanagari script. pub fn all_with_devanagari_script() -> HashSet { Language::iter() .filter(|it| it.alphabets().contains(&Alphabet::Devanagari)) .collect() } + /// Returns a set of all languages supporting the Latin script. pub fn all_with_latin_script() -> HashSet { Language::iter() .filter(|it| it.alphabets().contains(&Alphabet::Latin)) .collect() } + /// Returns the language associated with the ISO 639-1 code + /// passed to this method. pub fn from_iso_code_639_1(iso_code: &IsoCode639_1) -> Language { Language::iter() .find(|it| &it.iso_code_639_1() == iso_code) .unwrap() } + /// Returns the language associated with the ISO 639-3 code + /// passed to this method. pub fn from_iso_code_639_3(iso_code: &IsoCode639_3) -> Language { Language::iter() .find(|it| &it.iso_code_639_3() == iso_code) .unwrap() } + /// Returns the ISO 639-1 code of this language. pub fn iso_code_639_1(&self) -> IsoCode639_1 { match self { #[cfg(feature = "afrikaans")] @@ -558,6 +570,7 @@ impl Language { } } + /// Returns the ISO 639-3 code of this language. 
pub fn iso_code_639_3(&self) -> IsoCode639_3 { match self { #[cfg(feature = "afrikaans")] diff --git a/src/lib.rs b/src/lib.rs index 2d53c768..2eebfb2d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -402,6 +402,9 @@ mod result; mod script; mod writer; +#[cfg(feature = "python")] +mod python; + #[cfg(target_family = "wasm")] mod wasm; diff --git a/src/python.rs b/src/python.rs new file mode 100644 index 00000000..abb9f75e --- /dev/null +++ b/src/python.rs @@ -0,0 +1,602 @@ +/* + * Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::any::Any; +use std::collections::hash_map::DefaultHasher; +use std::collections::HashSet; +use std::hash::{Hash, Hasher}; +use std::io; +use std::panic; +use std::path::PathBuf; + +use pyo3::exceptions::{PyException, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyTuple, PyType}; + +use crate::builder::{ + LanguageDetectorBuilder, MINIMUM_RELATIVE_DISTANCE_MESSAGE, MISSING_LANGUAGE_MESSAGE, +}; +use crate::detector::LanguageDetector; +use crate::isocode::{IsoCode639_1, IsoCode639_3}; +use crate::language::Language; +use crate::result::DetectionResult; +use crate::writer::{LanguageModelFilesWriter, TestDataFilesWriter}; + +#[pymodule] +fn lingua(_py: Python<'_>, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} + +/// This class describes a language's confidence value. +/// +/// Attributes: +/// +/// language (Language): +/// The language associated with this confidence value. +/// +/// value (float): +/// The language's confidence value which lies between 0.0 and 1.0. +#[pyclass] +struct ConfidenceValue { + language: Language, + value: f64, +} + +#[pymethods] +impl ConfidenceValue { + #[new] + fn new(language: Language, value: f64) -> Self { + Self { language, value } + } + + /// Return the language of the associated confidence value. + #[getter] + fn language(&self) -> Language { + self.language + } + + /// Return the confidence value for the associated language. + /// + /// The confidence value is a value between 0.0 and 1.0. + #[getter] + fn value(&self) -> f64 { + self.value + } +} + +#[pymethods] +impl DetectionResult { + /// Return the start index of the identified single-language substring. 
+ #[pyo3(name = "start_index")] + #[getter] + fn py_start_index(&self) -> usize { + self.start_index() + } + + /// Return the end index of the identified single-language substring. + #[pyo3(name = "end_index")] + #[getter] + fn py_end_index(&self) -> usize { + self.end_index() + } + + /// Return the number of words being part of the identified + /// single-language substring. + #[pyo3(name = "word_count")] + #[getter] + fn py_word_count(&self) -> usize { + self.word_count() + } + + /// Return the detected language of the identified single-language substring. + #[pyo3(name = "language")] + #[getter] + fn py_language(&self) -> Language { + self.language() + } +} + +#[pymethods] +impl IsoCode639_1 { + fn __hash__(&self) -> u64 { + let mut hasher = DefaultHasher::new(); + self.hash(&mut hasher); + hasher.finish() + } + + #[getter] + fn name(&self) -> String { + self.to_string().to_uppercase() + } +} + +#[pymethods] +impl IsoCode639_3 { + fn __hash__(&self) -> u64 { + let mut hasher = DefaultHasher::new(); + self.hash(&mut hasher); + hasher.finish() + } + + #[getter] + fn name(&self) -> String { + self.to_string().to_uppercase() + } +} + +#[pymethods] +impl Language { + fn __hash__(&self) -> u64 { + let mut hasher = DefaultHasher::new(); + self.hash(&mut hasher); + hasher.finish() + } + + /// Return a set of all supported languages. + #[pyo3(name = "all")] + #[classmethod] + fn py_all(_cls: &PyType) -> HashSet { + Self::all() + } + + /// Return a set of all supported spoken languages. + #[pyo3(name = "all_spoken_ones")] + #[classmethod] + fn py_all_spoken_ones(_cls: &PyType) -> HashSet { + Self::all_spoken_ones() + } + + /// Return a set of all languages supporting the Arabic script. + #[pyo3(name = "all_with_arabic_script")] + #[classmethod] + fn py_all_with_arabic_script(_cls: &PyType) -> HashSet { + Self::all_with_arabic_script() + } + + /// Return a set of all languages supporting the Cyrillic script. 
+ #[pyo3(name = "all_with_cyrillic_script")] + #[classmethod] + fn py_all_with_cyrillic_script(_cls: &PyType) -> HashSet { + Self::all_with_cyrillic_script() + } + + /// Return a set of all languages supporting the Devanagari script. + #[pyo3(name = "all_with_devanagari_script")] + #[classmethod] + fn py_all_with_devanagari_script(_cls: &PyType) -> HashSet { + Self::all_with_devanagari_script() + } + + /// Return a set of all languages supporting the Latin script. + #[pyo3(name = "all_with_latin_script")] + #[classmethod] + fn py_all_with_latin_script(_cls: &PyType) -> HashSet { + Self::all_with_latin_script() + } + + /// Return the language associated with the ISO 639-1 code + /// passed to this method. + /// + /// Raises: + /// ValueError: if there is no language for the given ISO code + #[pyo3(name = "from_iso_code_639_1")] + #[classmethod] + fn py_from_iso_code_639_1(_cls: &PyType, iso_code: &IsoCode639_1) -> Self { + Self::from_iso_code_639_1(iso_code) + } + + /// Return the language associated with the ISO 639-3 code + /// passed to this method. + /// + /// Raises: + /// ValueError: if there is no language for the given ISO code + #[pyo3(name = "from_iso_code_639_3")] + #[classmethod] + fn py_from_iso_code_639_3(_cls: &PyType, iso_code: &IsoCode639_3) -> Self { + Self::from_iso_code_639_3(iso_code) + } + + /// Return the ISO 639-1 code of this language. + #[pyo3(name = "iso_code_639_1")] + #[getter] + fn py_iso_code_639_1(&self) -> IsoCode639_1 { + self.iso_code_639_1() + } + + /// Return the ISO 639-3 code of this language. + #[pyo3(name = "iso_code_639_3")] + #[getter] + fn py_iso_code_639_3(&self) -> IsoCode639_3 { + self.iso_code_639_3() + } + + #[getter] + fn name(&self) -> String { + self.to_string().to_uppercase() + } +} + +#[pymethods] +impl LanguageDetectorBuilder { + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages. 
+ #[pyo3(name = "from_all_languages")] + #[classmethod] + fn py_from_all_languages(_cls: &PyType) -> Self { + Self::from_all_languages() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in spoken languages. + #[pyo3(name = "from_all_spoken_languages")] + #[classmethod] + fn py_from_all_spoken_languages(_cls: &PyType) -> Self { + Self::from_all_spoken_languages() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Arabic script. + #[pyo3(name = "from_all_languages_with_arabic_script")] + #[classmethod] + fn py_from_all_languages_with_arabic_script(_cls: &PyType) -> Self { + Self::from_all_languages_with_arabic_script() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Cyrillic script. + #[pyo3(name = "from_all_languages_with_cyrillic_script")] + #[classmethod] + fn py_from_all_languages_with_cyrillic_script(_cls: &PyType) -> Self { + Self::from_all_languages_with_cyrillic_script() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Devanagari script. + #[pyo3(name = "from_all_languages_with_devanagari_script")] + #[classmethod] + fn py_from_all_languages_with_devanagari_script(_cls: &PyType) -> Self { + Self::from_all_languages_with_devanagari_script() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Latin script. + #[pyo3(name = "from_all_languages_with_latin_script")] + #[classmethod] + fn py_from_all_languages_with_latin_script(_cls: &PyType) -> Self { + Self::from_all_languages_with_latin_script() + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages except those passed to this method. 
+ #[pyo3(name = "from_all_languages_without")] + #[pyo3(signature = (*languages))] + #[classmethod] + fn py_from_all_languages_without(_cls: &PyType, languages: &PyTuple) -> PyResult { + match languages.extract::>() { + Ok(vector) => match panic::catch_unwind(|| Self::from_all_languages_without(&vector)) { + Ok(builder) => Ok(builder), + Err(_) => Err(PyValueError::new_err(MISSING_LANGUAGE_MESSAGE)), + }, + Err(err) => Err(err), + } + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages passed to this method. + #[pyo3(name = "from_languages")] + #[pyo3(signature = (*languages))] + #[classmethod] + fn py_from_languages(_cls: &PyType, languages: &PyTuple) -> PyResult { + match languages.extract::>() { + Ok(vector) => match panic::catch_unwind(|| Self::from_languages(&vector)) { + Ok(builder) => Ok(builder), + Err(_) => Err(PyValueError::new_err(MISSING_LANGUAGE_MESSAGE)), + }, + Err(err) => Err(err), + } + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages specified by the ISO 639-1 codes passed + /// to this method. + /// + /// Raises: + /// ValueError: if less than two ISO codes are specified + #[pyo3(name = "from_iso_codes_639_1")] + #[pyo3(signature = (*iso_codes))] + #[classmethod] + fn py_from_iso_codes_639_1(_cls: &PyType, iso_codes: &PyTuple) -> PyResult { + match iso_codes.extract::>() { + Ok(vector) => match panic::catch_unwind(|| Self::from_iso_codes_639_1(&vector)) { + Ok(builder) => Ok(builder), + Err(_) => Err(PyValueError::new_err(MISSING_LANGUAGE_MESSAGE)), + }, + Err(err) => Err(err), + } + } + + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages specified by the ISO 639-3 codes passed + /// to this method. 
+ /// + /// Raises: + /// ValueError: if less than two ISO codes are specified + #[pyo3(name = "from_iso_codes_639_3")] + #[pyo3(signature = (*iso_codes))] + #[classmethod] + fn py_from_iso_codes_639_3(_cls: &PyType, iso_codes: &PyTuple) -> PyResult { + match iso_codes.extract::>() { + Ok(vector) => match panic::catch_unwind(|| Self::from_iso_codes_639_3(&vector)) { + Ok(builder) => Ok(builder), + Err(_) => Err(PyValueError::new_err(MISSING_LANGUAGE_MESSAGE)), + }, + Err(err) => Err(err), + } + } + + /// Set the desired value for the minimum relative distance measure. + /// + /// By default, Lingua returns the most likely language for a given + /// input text. However, there are certain words that are spelled the + /// same in more than one language. The word 'prologue', for instance, + /// is both a valid English and French word. Lingua would output either + /// English or French which might be wrong in the given context. + /// For cases like that, it is possible to specify a minimum relative + /// distance that the logarithmized and summed up probabilities for + /// each possible language have to satisfy. + /// + /// Be aware that the distance between the language probabilities is + /// dependent on the length of the input text. The longer the input + /// text, the larger the distance between the languages. So if you + /// want to classify very short text phrases, do not set the minimum + /// relative distance too high. Otherwise you will get most results + /// returned as None which is the return value for cases where + /// language detection is not reliably possible. 
+ /// + /// Raises: + /// ValueError: if distance is smaller than 0.0 or greater than 0.99 + #[pyo3(name = "with_minimum_relative_distance")] + fn py_with_minimum_relative_distance( + mut self_: PyRefMut<Self>, + distance: f64, + ) -> PyResult<PyRefMut<Self>> { + if !(0.0..=0.99).contains(&distance) { + Err(PyValueError::new_err(MINIMUM_RELATIVE_DISTANCE_MESSAGE)) + } else { + self_.with_minimum_relative_distance(distance); + Ok(self_) + } + } + + /// Preload all language models when creating the LanguageDetector + /// instance. + /// + /// By default, Lingua uses lazy-loading to load only those language + /// models on demand which are considered relevant by the rule-based + /// filter engine. For web services, for instance, it is rather + /// beneficial to preload all language models into memory to avoid + /// unexpected latency while waiting for the service response. This + /// method allows to switch between these two loading modes. + #[pyo3(name = "with_preloaded_language_models")] + fn py_with_preloaded_language_models(mut self_: PyRefMut<Self>) -> PyRefMut<Self> { + self_.with_preloaded_language_models(); + self_ + } + + /// Disable the high accuracy mode in order to save memory + /// and increase performance. + /// + /// By default, Lingua's high detection accuracy comes at the cost + /// of loading large language models into memory which might not be + /// feasible for systems running low on resources. + /// + /// This method disables the high accuracy mode so that only a small + /// subset of language models is loaded into memory. The downside of + /// this approach is that detection accuracy for short texts consisting + /// of less than 120 characters will drop significantly. However, + /// detection accuracy for texts which are longer than 120 characters + /// will remain mostly unaffected.
+ #[pyo3(name = "with_low_accuracy_mode")] + fn py_with_low_accuracy_mode(mut self_: PyRefMut) -> PyRefMut { + self_.with_low_accuracy_mode(); + self_ + } + + /// Create and return the configured LanguageDetector instance. + #[pyo3(name = "build")] + fn py_build(&mut self) -> LanguageDetector { + self.build() + } +} + +#[pymethods] +impl LanguageDetector { + /// Clear all language models loaded by this LanguageDetector instance. + /// + /// This helps to free allocated memory previously consumed by the models. + #[pyo3(name = "unload_language_models")] + fn py_unload_language_models(&self) { + self.unload_language_models() + } + + /// Detect the language of text. + /// + /// If the language cannot be reliably detected, None is returned. + #[pyo3(name = "detect_language_of")] + fn py_detect_language_of(&self, text: String) -> Option { + self.detect_language_of(text) + } + + /// Attempt to detect multiple languages in mixed-language text. + /// + /// This feature is experimental and under continuous development. + /// + /// A list of DetectionResult is returned containing an entry for each + /// contiguous single-language text section as identified by the library. + /// Each entry consists of the identified language, a start index and an + /// end index. The indices denote the substring that has been identified + /// as a contiguous single-language text section. + #[pyo3(name = "detect_multiple_languages_of")] + fn py_detect_multiple_languages_of(&self, text: String) -> Vec { + self.detect_multiple_languages_of(text) + } + + /// Compute confidence values for each language supported + /// by this detector for the given text. + /// + /// The confidence values denote how likely it is that the + /// given text has been written in any of the languages + /// supported by this detector. + /// + /// A list is returned containing those languages which the + /// calling instance of LanguageDetector has been built from. 
+ /// The entries are sorted by their confidence value in + /// descending order. Each value is a probability between + /// 0.0 and 1.0. The probabilities of all languages will sum to 1.0. + /// If the language is unambiguously identified by the rule engine, + /// the value 1.0 will always be returned for this language. The + /// other languages will receive a value of 0.0. + #[pyo3(name = "compute_language_confidence_values")] + fn py_compute_language_confidence_values(&self, text: String) -> Vec<ConfidenceValue> { + self.compute_language_confidence_values(text) + .iter() + .map(|tup| ConfidenceValue { + language: tup.0, + value: tup.1, + }) + .collect() + } + + /// Compute the confidence value for the given language and input text. + /// + /// The confidence value denotes how likely it is that the given text + /// has been written in the given language. The value that this method + /// computes is a number between 0.0 and 1.0. If the language is + /// unambiguously identified by the rule engine, the value 1.0 will + /// always be returned. If the given language is not supported by this + /// detector instance, the value 0.0 will always be returned. + #[pyo3(name = "compute_language_confidence")] + fn py_compute_language_confidence(&self, text: String, language: Language) -> f64 { + self.compute_language_confidence(text, language) + } +} + +#[pymethods] +impl LanguageModelFilesWriter { + /// Create language model files and write them to a directory. + /// + /// Args: + /// input_file_path: The path to a txt file used for language + /// model creation. The assumed encoding of the txt file is UTF-8. + /// output_directory_path: The path to an existing directory where the + /// language model files are to be written. + /// language: The language for which to create language models. + /// char_class: A regex character class such as \\p{L} to restrict the + /// set of characters that the language models are built from.
+ /// + /// Raises: + /// Exception: if the input file path is not absolute or does not point + /// to an existing txt file; if the input file's encoding is not + /// UTF-8; if the output directory path is not absolute or does not + /// point to an existing directory; if the character class cannot + /// be compiled to a valid regular expression + #[pyo3(name = "create_and_write_language_model_files")] + #[classmethod] + fn py_create_and_write_language_model_files( + _cls: &PyType, + input_file_path: PathBuf, + output_directory_path: PathBuf, + language: &Language, + char_class: &str, + ) -> PyResult<()> { + convert_io_result_to_py_result(panic::catch_unwind(|| { + Self::create_and_write_language_model_files( + input_file_path.as_path(), + output_directory_path.as_path(), + language, + char_class, + ) + })) + } +} + +#[pymethods] +impl TestDataFilesWriter { + /// Create test data files for accuracy report generation and + /// write them to a directory. + /// + /// Args: + /// input_file_path: The path to a txt file used for test data + /// creation. The assumed encoding of the txt file is UTF-8. + /// output_directory_path: The path to an existing directory where + /// the test data files are to be written. + /// char_class: A regex character class such as \\p{L} to restrict + /// the set of characters that the test data are built from. + /// maximum_lines: The maximum number of lines each test data file + /// should have. 
+ /// + /// Raises: + /// Exception: if the input file path is not absolute or does not point + /// to an existing txt file; if the input file's encoding is not + /// UTF-8; if the output directory path is not absolute or does not + /// point to an existing directory; if the character class cannot + /// be compiled to a valid regular expression + #[pyo3(name = "create_and_write_test_data_files")] + #[classmethod] + fn py_create_and_write_test_data_files( + _cls: &PyType, + input_file_path: PathBuf, + output_directory_path: PathBuf, + char_class: &str, + maximum_lines: u32, + ) -> PyResult<()> { + convert_io_result_to_py_result(panic::catch_unwind(|| { + Self::create_and_write_test_data_files( + input_file_path.as_path(), + output_directory_path.as_path(), + char_class, + maximum_lines, + ) + })) + } +} + +fn convert_io_result_to_py_result( + io_result: Result<io::Result<()>, Box<(dyn Any + Send + 'static)>>, +) -> PyResult<()> { + match io_result { + Ok(_) => Ok(()), + Err(err) => { + let panic_info = match err.downcast::<String>() { + Ok(message) => *message, + Err(err) => match err.downcast::<&str>() { + Ok(message) => message.to_string(), + Err(_) => "Unknown error occurred".to_string(), + }, + }; + Err(PyException::new_err(panic_info)) + } + } +} diff --git a/src/result.rs b/src/result.rs index a57998af..09b7a0c4 100644 --- a/src/result.rs +++ b/src/result.rs @@ -19,6 +19,7 @@ use crate::language::Language; /// This struct describes a contiguous single-language /// text section within a possibly mixed-language text. #[derive(Copy, Clone, Debug)] +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub struct DetectionResult { pub(crate) start_index: usize, pub(crate) end_index: usize, @@ -35,6 +36,11 @@ impl DetectionResult { pub fn end_index(&self) -> usize { self.end_index } + /// Returns the number of words being part of the identified + /// single-language substring.
+ pub fn word_count(&self) -> usize { + self.word_count + } /// Returns the detected language of the identified single-language substring. pub fn language(&self) -> Language { self.language diff --git a/src/writer.rs b/src/writer.rs index ee06c5ef..23f40455 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -30,14 +30,16 @@ use crate::ngram::Ngram; use crate::Language; /// This struct creates language model files and writes them to a directory. +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub struct LanguageModelFilesWriter; /// This struct creates test data files for accuracy report generation /// and writes them to a directory. +#[cfg_attr(feature = "python", pyo3::prelude::pyclass)] pub struct TestDataFilesWriter; impl LanguageModelFilesWriter { - /// Creates language model files for accuracy report generation and writes them to a directory. + /// Creates language model files and writes them to a directory. /// /// `input_file_path`: The path to a txt file used for language model creation. /// The assumed encoding of the txt file is UTF-8. @@ -359,7 +361,7 @@ fn check_output_directory_path(output_directory_path: &Path) { } if !output_directory_path.exists() { panic!( - "Output directory '{}' does not exist", + "Output directory path '{}' does not exist", output_directory_path.display() ); } diff --git a/tests/python/test_builder.py b/tests/python/test_builder.py new file mode 100644 index 00000000..f62e4c85 --- /dev/null +++ b/tests/python/test_builder.py @@ -0,0 +1,104 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from lingua import IsoCode639_1, IsoCode639_3, Language, LanguageDetectorBuilder + + +def test_build_from_blacklist_does_not_panic(): + languages = {Language.TURKISH, Language.ROMANIAN} + LanguageDetectorBuilder.from_all_languages_without(*languages) + + +def test_cannot_build_from_blacklist(): + languages = Language.all().difference({Language.GERMAN}) + with pytest.raises(ValueError) as exception_info: + LanguageDetectorBuilder.from_all_languages_without(*languages) + assert ( + exception_info.value.args[0] + == "LanguageDetector needs at least 2 languages to choose from" + ) + + +def test_build_from_whitelist_does_not_panic(): + languages = {Language.GERMAN, Language.ENGLISH} + LanguageDetectorBuilder.from_languages(*languages) + + +def test_cannot_build_from_whitelist(): + with pytest.raises(ValueError) as exception_info: + LanguageDetectorBuilder.from_languages(Language.GERMAN) + assert ( + exception_info.value.args[0] + == "LanguageDetector needs at least 2 languages to choose from" + ) + + +def test_build_from_iso_639_1_codes_does_not_panic(): + LanguageDetectorBuilder.from_iso_codes_639_1( + IsoCode639_1.DE, IsoCode639_1.SV + ) + + +def test_cannot_build_from_iso_639_1_codes(): + with pytest.raises(ValueError) as exception_info: + LanguageDetectorBuilder.from_iso_codes_639_1(IsoCode639_1.DE) + assert ( + exception_info.value.args[0] + == "LanguageDetector needs at least 2 languages to choose from" + ) + + +def test_build_from_iso_639_3_codes_does_not_panic(): + LanguageDetectorBuilder.from_iso_codes_639_3( + IsoCode639_3.DEU, 
IsoCode639_3.SWE + ) + + +def test_cannot_build_from_iso_639_3_codes(): + with pytest.raises(ValueError) as exception_info: + LanguageDetectorBuilder.from_iso_codes_639_3(IsoCode639_3.DEU) + assert ( + exception_info.value.args[0] + == "LanguageDetector needs at least 2 languages to choose from" + ) + + +def test_build_with_minimum_relative_distance_does_not_panic(): + ( + LanguageDetectorBuilder + .from_all_languages() + .with_minimum_relative_distance(0.2) + ) + + +def test_cannot_build_with_minimum_relative_distance(): + builder = LanguageDetectorBuilder.from_all_languages() + for value in (-0.01, -2.3, 1.0, 1.7): + with pytest.raises(ValueError) as exception_info: + builder.with_minimum_relative_distance(value) + assert ( + exception_info.value.args[0] + == "Minimum relative distance must lie in between 0.0 and 0.99" + ) + + +def test_build_with_low_accuracy_mode_does_not_panic(): + ( + LanguageDetectorBuilder + .from_all_languages() + .with_low_accuracy_mode() + ) diff --git a/tests/python/test_detector.py b/tests/python/test_detector.py new file mode 100644 index 00000000..78f25864 --- /dev/null +++ b/tests/python/test_detector.py @@ -0,0 +1,184 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from lingua import ( + ConfidenceValue, + Language, + LanguageDetector, + LanguageDetectorBuilder +) + +detector_for_english_and_german = ( + LanguageDetectorBuilder.from_languages( + Language.ENGLISH, Language.GERMAN) + .with_preloaded_language_models() + .build() +) + + +def test_detect_language(): + assert ( + detector_for_english_and_german + .detect_language_of("Alter") + == Language.GERMAN + ) + + +def test_no_language_is_returned(): + assert ( + detector_for_english_and_german + .detect_language_of("проарплап") + is None + ) + + +def test_detect_multiple_languages_for_empty_string(): + assert ( + detector_for_english_and_german + .detect_multiple_languages_of("") + == [] + ) + + +@pytest.mark.parametrize( + ",".join( + [ + "sentence", + "expected_first_substring", + "expected_first_word_count", + "expected_first_language", + "expected_second_substring", + "expected_second_word_count", + "expected_second_language", + ] + ), + [ + pytest.param( + ' He turned around and asked: "Entschuldigen Sie, sprechen Sie Deutsch?"', + " He turned around and asked: ", + 5, + Language.ENGLISH, + '"Entschuldigen Sie, sprechen Sie Deutsch?"', + 5, + Language.GERMAN, + ) + ], +) +def test_detect_multiple_languages_with_two_languages( + sentence, + expected_first_substring, + expected_first_word_count, + expected_first_language, + expected_second_substring, + expected_second_word_count, + expected_second_language, +): + results = detector_for_english_and_german.detect_multiple_languages_of(sentence) + assert len(results) == 2 + + first_result = results[0] + first_substring = sentence[first_result.start_index : first_result.end_index] + assert first_substring == expected_first_substring + assert first_result.word_count == expected_first_word_count + assert first_result.language == expected_first_language + + second_result = results[1] + second_substring = sentence[second_result.start_index : second_result.end_index] + assert second_substring == 
expected_second_substring + assert second_result.word_count == expected_second_word_count + assert second_result.language == expected_second_language + + +@pytest.mark.parametrize( + "text,expected_confidence_values", + [ + pytest.param( + "groß", + [ + ConfidenceValue(Language.GERMAN, 1.0), + ConfidenceValue(Language.ENGLISH, 0.0), + ], + ), + pytest.param( + "Alter", + [ + ConfidenceValue(Language.GERMAN, 0.68), + ConfidenceValue(Language.ENGLISH, 0.32), + ], + ), + pytest.param( + "проарплап", + [ + ConfidenceValue(Language.ENGLISH, 0.0), + ConfidenceValue(Language.GERMAN, 0.0), + ], + ), + ], +) +def test_compute_language_confidence_values( + text, expected_confidence_values +): + confidence_values = ( + detector_for_english_and_german.compute_language_confidence_values( + text + ) + ) + assert len(confidence_values) == 2 + + first, second = confidence_values + expected_first, expected_second = expected_confidence_values + + assert first.language == expected_first.language + assert round(first.value, 2) == expected_first.value + + assert second.language == expected_second.language + assert round(second.value, 2) == expected_second.value + + +@pytest.mark.parametrize( + "text,expected_confidence_for_german,expected_confidence_for_english", + [ + pytest.param("groß", 1.0, 0.0), + pytest.param("Alter", 0.68, 0.32), + pytest.param("проарплап", 0.0, 0.0), + ], +) +def test_compute_language_confidence( + text, + expected_confidence_for_german, + expected_confidence_for_english, +): + confidence_for_german = ( + detector_for_english_and_german.compute_language_confidence( + text, Language.GERMAN + ) + ) + assert round(confidence_for_german, 2) == expected_confidence_for_german + + confidence_for_english = ( + detector_for_english_and_german.compute_language_confidence( + text, Language.ENGLISH + ) + ) + assert round(confidence_for_english, 2) == expected_confidence_for_english + + confidence_for_french = ( + detector_for_english_and_german.compute_language_confidence( 
+ text, Language.FRENCH + ) + ) + assert confidence_for_french == 0.0 diff --git a/tests/python/test_language.py b/tests/python/test_language.py new file mode 100644 index 00000000..c67258b2 --- /dev/null +++ b/tests/python/test_language.py @@ -0,0 +1,282 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from lingua import IsoCode639_1, IsoCode639_3, Language + + +def test_iso_code_639_1_name(): + assert IsoCode639_1.EN.name == "EN" + + +def test_iso_code_639_3_name(): + assert IsoCode639_3.ENG.name == "ENG" + + +def test_language_name(): + assert Language.ENGLISH.name == "ENGLISH" + + +def test_all_languages_are_available(): + assert Language.all() == frozenset( + [ + Language.AFRIKAANS, + Language.ALBANIAN, + Language.ARABIC, + Language.ARMENIAN, + Language.AZERBAIJANI, + Language.BASQUE, + Language.BELARUSIAN, + Language.BENGALI, + Language.BOKMAL, + Language.BOSNIAN, + Language.BULGARIAN, + Language.CATALAN, + Language.CHINESE, + Language.CROATIAN, + Language.CZECH, + Language.DANISH, + Language.DUTCH, + Language.ENGLISH, + Language.ESPERANTO, + Language.ESTONIAN, + Language.FINNISH, + Language.FRENCH, + Language.GANDA, + Language.GEORGIAN, + Language.GERMAN, + Language.GREEK, + Language.GUJARATI, + Language.HEBREW, + Language.HINDI, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.INDONESIAN, + Language.IRISH, + Language.ITALIAN, + Language.JAPANESE, + Language.KAZAKH, + Language.KOREAN, 
+ Language.LATIN, + Language.LATVIAN, + Language.LITHUANIAN, + Language.MACEDONIAN, + Language.MALAY, + Language.MAORI, + Language.MARATHI, + Language.MONGOLIAN, + Language.NYNORSK, + Language.PERSIAN, + Language.POLISH, + Language.PORTUGUESE, + Language.PUNJABI, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SERBIAN, + Language.SHONA, + Language.SLOVAK, + Language.SLOVENE, + Language.SOMALI, + Language.SOTHO, + Language.SPANISH, + Language.SWAHILI, + Language.SWEDISH, + Language.TAGALOG, + Language.TAMIL, + Language.TELUGU, + Language.THAI, + Language.TSONGA, + Language.TSWANA, + Language.TURKISH, + Language.UKRAINIAN, + Language.URDU, + Language.VIETNAMESE, + Language.WELSH, + Language.XHOSA, + Language.YORUBA, + Language.ZULU, + ] + ) + + +def test_all_spoken_languages_are_available(): + assert Language.all_spoken_ones() == frozenset( + [ + Language.AFRIKAANS, + Language.ALBANIAN, + Language.ARABIC, + Language.ARMENIAN, + Language.AZERBAIJANI, + Language.BASQUE, + Language.BELARUSIAN, + Language.BENGALI, + Language.BOKMAL, + Language.BOSNIAN, + Language.BULGARIAN, + Language.CATALAN, + Language.CHINESE, + Language.CROATIAN, + Language.CZECH, + Language.DANISH, + Language.DUTCH, + Language.ENGLISH, + Language.ESPERANTO, + Language.ESTONIAN, + Language.FINNISH, + Language.FRENCH, + Language.GANDA, + Language.GEORGIAN, + Language.GERMAN, + Language.GREEK, + Language.GUJARATI, + Language.HEBREW, + Language.HINDI, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.INDONESIAN, + Language.IRISH, + Language.ITALIAN, + Language.JAPANESE, + Language.KAZAKH, + Language.KOREAN, + Language.LATVIAN, + Language.LITHUANIAN, + Language.MACEDONIAN, + Language.MALAY, + Language.MAORI, + Language.MARATHI, + Language.MONGOLIAN, + Language.NYNORSK, + Language.PERSIAN, + Language.POLISH, + Language.PORTUGUESE, + Language.PUNJABI, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SERBIAN, + Language.SHONA, + Language.SLOVAK, + Language.SLOVENE, + Language.SOMALI, + 
Language.SOTHO, + Language.SPANISH, + Language.SWAHILI, + Language.SWEDISH, + Language.TAGALOG, + Language.TAMIL, + Language.TELUGU, + Language.THAI, + Language.TSONGA, + Language.TSWANA, + Language.TURKISH, + Language.UKRAINIAN, + Language.URDU, + Language.VIETNAMESE, + Language.WELSH, + Language.XHOSA, + Language.YORUBA, + Language.ZULU, + ] + ) + + +def test_languages_support_arabic_script(): + assert Language.all_with_arabic_script() == frozenset( + [Language.ARABIC, Language.PERSIAN, Language.URDU] + ) + + +def test_languages_support_cyrillic_alphabet(): + assert Language.all_with_cyrillic_script() == frozenset( + [ + Language.BELARUSIAN, + Language.BULGARIAN, + Language.KAZAKH, + Language.MACEDONIAN, + Language.MONGOLIAN, + Language.RUSSIAN, + Language.SERBIAN, + Language.UKRAINIAN, + ] + ) + + +def test_languages_support_devanagari_script(): + assert Language.all_with_devanagari_script() == frozenset( + [Language.HINDI, Language.MARATHI] + ) + + +def test_languages_support_latin_script(): + assert Language.all_with_latin_script() == frozenset( + [ + Language.AFRIKAANS, + Language.ALBANIAN, + Language.AZERBAIJANI, + Language.BASQUE, + Language.BOKMAL, + Language.BOSNIAN, + Language.CATALAN, + Language.CROATIAN, + Language.CZECH, + Language.DANISH, + Language.DUTCH, + Language.ENGLISH, + Language.ESPERANTO, + Language.ESTONIAN, + Language.FINNISH, + Language.FRENCH, + Language.GANDA, + Language.GERMAN, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.INDONESIAN, + Language.IRISH, + Language.ITALIAN, + Language.LATIN, + Language.LATVIAN, + Language.LITHUANIAN, + Language.MALAY, + Language.MAORI, + Language.NYNORSK, + Language.POLISH, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.SHONA, + Language.SLOVAK, + Language.SLOVENE, + Language.SOMALI, + Language.SOTHO, + Language.SPANISH, + Language.SWAHILI, + Language.SWEDISH, + Language.TAGALOG, + Language.TSONGA, + Language.TSWANA, + Language.TURKISH, + Language.VIETNAMESE, + Language.WELSH, + 
Language.XHOSA, + Language.YORUBA, + Language.ZULU, + ] + ) + + +def test_language_from_iso_code_639_1(): + assert Language.from_iso_code_639_1(IsoCode639_1.DE) == Language.GERMAN + + +def test_language_from_iso_code_639_3(): + assert Language.from_iso_code_639_3(IsoCode639_3.DEU) == Language.GERMAN diff --git a/tests/python/test_writer.py b/tests/python/test_writer.py new file mode 100644 index 00000000..e27460d7 --- /dev/null +++ b/tests/python/test_writer.py @@ -0,0 +1,267 @@ +# +# Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pytest + +from pathlib import Path +from tempfile import NamedTemporaryFile, TemporaryDirectory + +from lingua import Language, LanguageModelFilesWriter, TestDataFilesWriter + +# prevent pytest from trying to collect methods in TestDataFilesWriter as tests +TestDataFilesWriter.__test__ = False + + +@pytest.fixture +def language_model_files_text(): + return ( + "These sentences are intended for testing purposes.\n" + "Do not use them in production!\n" + "By the way, they consist of 23 words in total." 
+ ) + + +@pytest.fixture +def test_data_files_text(): + return ( + "There are many attributes associated with good software.\n" + "Some of these can be mutually contradictory, and different customers and participants may have different priorities.\n" + "Weinberg provides an example of how different goals can have a dramatic effect on both effort required and efficiency.\n" + "Furthermore, he notes that programmers will generally aim to achieve any explicit goals which may be set, probably at the expense of any other quality attributes.\n" + "Sommerville has identified four generalised attributes which are not concerned with what a program does, but how well the program does it:\n" + "Maintainability, Dependability, Efficiency, Usability\n" + ) + + +def test_language_model_files_writer(language_model_files_text): + input_file = create_temp_input_file(language_model_files_text) + input_file_path = Path(input_file.name) + + output_directory = TemporaryDirectory() + output_directory_path = Path(output_directory.name) + + LanguageModelFilesWriter.create_and_write_language_model_files( + input_file_path=input_file_path, + output_directory_path=output_directory_path, + language=Language.ENGLISH, + char_class="\\p{L}", + ) + + files = read_directory_content(output_directory_path) + + assert len(files) == 5 + assert files[4] == "unigrams.json.br" + assert files[0] == "bigrams.json.br" + assert files[3] == "trigrams.json.br" + assert files[2] == "quadrigrams.json.br" + assert files[1] == "fivegrams.json.br" + + +def test_test_data_files_writer(test_data_files_text): + input_file = create_temp_input_file(test_data_files_text) + input_file_path = Path(input_file.name) + + output_directory = TemporaryDirectory() + output_directory_path = Path(output_directory.name) + + TestDataFilesWriter.create_and_write_test_data_files( + input_file_path=input_file_path, + output_directory_path=output_directory_path, + char_class="\\p{L}", + maximum_lines=4, + ) + + files = 
read_directory_content(output_directory_path) + + assert len(files) == 3 + assert files[0] == "sentences.txt" + assert files[1] == "single-words.txt" + assert files[2] == "word-pairs.txt" + + +def test_relative_input_file_path_raises_exception(): + relative_input_file_path = Path("some/relative/path/file.txt") + expected_error_message = ( + f"Input file path '{relative_input_file_path}' is not absolute" + ) + + with pytest.raises(Exception) as exception_info1: + LanguageModelFilesWriter.create_and_write_language_model_files( + input_file_path=relative_input_file_path, + output_directory_path=Path("/some/output/directory"), + language=Language.ENGLISH, + char_class="\\p{L}", + ) + assert exception_info1.value.args[0] == expected_error_message + + with pytest.raises(Exception) as exception_info2: + TestDataFilesWriter.create_and_write_test_data_files( + input_file_path=relative_input_file_path, + output_directory_path=Path("/some/output/directory"), + char_class="\\p{L}", + maximum_lines=4, + ) + assert exception_info2.value.args[0] == expected_error_message + + +def test_non_existing_input_file_raises_exception(): + non_existing_input_file_path = ( + Path.cwd() / "some" / "non-existing" / "path" / "file.txt" + ) + expected_error_message = ( + f"Input file '{non_existing_input_file_path}' does not exist" + ) + + with pytest.raises(Exception) as exception_info1: + LanguageModelFilesWriter.create_and_write_language_model_files( + input_file_path=non_existing_input_file_path, + output_directory_path=Path("/some/output/directory"), + language=Language.ENGLISH, + char_class="\\p{L}", + ) + assert exception_info1.value.args[0] == expected_error_message + + with pytest.raises(Exception) as exception_info2: + TestDataFilesWriter.create_and_write_test_data_files( + input_file_path=non_existing_input_file_path, + output_directory_path=Path("/some/output/directory"), + char_class="\\p{L}", + maximum_lines=4, + ) + assert exception_info2.value.args[0] == expected_error_message + 
def test_directory_as_input_file_raises_exception():
    """A directory passed as the input file must be rejected by both writers."""
    input_file = TemporaryDirectory()
    input_file_path = Path(input_file.name)
    output_directory_path = Path("/some/output/directory")
    expected_error_message = (
        f"Input file path '{input_file_path}' does not represent a regular file"
    )

    with pytest.raises(Exception) as model_writer_error:
        LanguageModelFilesWriter.create_and_write_language_model_files(
            input_file_path=input_file_path,
            output_directory_path=output_directory_path,
            language=Language.ENGLISH,
            char_class="\\p{L}",
        )
    assert model_writer_error.value.args[0] == expected_error_message

    with pytest.raises(Exception) as test_data_writer_error:
        TestDataFilesWriter.create_and_write_test_data_files(
            input_file_path=input_file_path,
            output_directory_path=output_directory_path,
            char_class="\\p{L}",
            maximum_lines=4,
        )
    assert test_data_writer_error.value.args[0] == expected_error_message


def test_relative_output_directory_path_raises_exception():
    """A relative (non-absolute) output directory path must be rejected by both writers."""
    input_file = create_temp_input_file("some content")
    input_file_path = Path(input_file.name)

    relative_output_directory_path = Path("some/relative/path")
    expected_error_message = (
        f"Output directory path '{relative_output_directory_path}' is not absolute"
    )

    with pytest.raises(Exception) as model_writer_error:
        LanguageModelFilesWriter.create_and_write_language_model_files(
            input_file_path=input_file_path,
            output_directory_path=relative_output_directory_path,
            language=Language.ENGLISH,
            char_class="\\p{L}",
        )
    assert model_writer_error.value.args[0] == expected_error_message

    with pytest.raises(Exception) as test_data_writer_error:
        TestDataFilesWriter.create_and_write_test_data_files(
            input_file_path=input_file_path,
            output_directory_path=relative_output_directory_path,
            char_class="\\p{L}",
            maximum_lines=4,
        )
    assert test_data_writer_error.value.args[0] == expected_error_message


def test_non_existing_output_directory_path_raises_exception():
    """An output directory path that does not exist on disk must be rejected by both writers."""
    input_file = create_temp_input_file("some content")
    input_file_path = Path(input_file.name)

    non_existing_output_directory_path = (
        Path.cwd() / "some" / "non-existing" / "directory"
    )
    expected_error_message = (
        f"Output directory path '{non_existing_output_directory_path}' does not exist"
    )

    with pytest.raises(Exception) as model_writer_error:
        LanguageModelFilesWriter.create_and_write_language_model_files(
            input_file_path=input_file_path,
            output_directory_path=non_existing_output_directory_path,
            language=Language.ENGLISH,
            char_class="\\p{L}",
        )
    assert model_writer_error.value.args[0] == expected_error_message

    with pytest.raises(Exception) as test_data_writer_error:
        TestDataFilesWriter.create_and_write_test_data_files(
            input_file_path=input_file_path,
            output_directory_path=non_existing_output_directory_path,
            char_class="\\p{L}",
            maximum_lines=4,
        )
    assert test_data_writer_error.value.args[0] == expected_error_message


def test_file_as_output_directory_raises_exception():
    """A regular file passed as the output directory must be rejected by both writers."""
    input_file = create_temp_input_file("some content")
    input_file_path = Path(input_file.name)
    expected_error_message = (
        f"Output directory path '{input_file_path}' does not represent a directory"
    )

    with pytest.raises(Exception) as model_writer_error:
        LanguageModelFilesWriter.create_and_write_language_model_files(
            input_file_path=input_file_path,
            # deliberately reuse the input *file* as the output "directory"
            output_directory_path=input_file_path,
            language=Language.ENGLISH,
            char_class="\\p{L}",
        )
    assert model_writer_error.value.args[0] == expected_error_message

    with pytest.raises(Exception) as test_data_writer_error:
        TestDataFilesWriter.create_and_write_test_data_files(
            input_file_path=input_file_path,
            output_directory_path=input_file_path,
            char_class="\\p{L}",
            maximum_lines=4,
        )
    assert test_data_writer_error.value.args[0] == expected_error_message


def create_temp_input_file(content: str):
    """Create a NamedTemporaryFile holding *content* (UTF-8), rewound to the start.

    The file object itself is returned so the caller keeps it alive
    (the file is deleted when the object is garbage-collected).
    """
    temp_file = NamedTemporaryFile()
    temp_file.write(content.encode("utf-8"))
    temp_file.seek(0)
    return temp_file


def read_directory_content(directory):
    """Return the names of the entries in *directory*, sorted alphabetically."""
    return sorted(os.listdir(directory))