diff --git a/src/language.rs b/src/language.rs
index cd1a0260..ac8ab8a4 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -278,10 +278,12 @@ impl Display for Language {
 }
 
 impl Language {
+    /// Returns a set of all supported languages.
     pub fn all() -> HashSet<Language> {
         Language::iter().collect()
     }
 
+    /// Returns a set of all supported spoken languages.
     pub fn all_spoken_ones() -> HashSet<Language> {
         Language::iter()
             .filter(|it| {
@@ -294,42 +296,51 @@ impl Language {
             .collect()
     }
 
+    /// Returns a set of all languages supporting the Arabic script.
     pub fn all_with_arabic_script() -> HashSet<Language> {
         Language::iter()
             .filter(|it| it.alphabets().contains(&Alphabet::Arabic))
             .collect()
     }
 
+    /// Returns a set of all languages supporting the Cyrillic script.
     pub fn all_with_cyrillic_script() -> HashSet<Language> {
         Language::iter()
             .filter(|it| it.alphabets().contains(&Alphabet::Cyrillic))
             .collect()
     }
 
+    /// Returns a set of all languages supporting the Devanagari script.
     pub fn all_with_devanagari_script() -> HashSet<Language> {
         Language::iter()
             .filter(|it| it.alphabets().contains(&Alphabet::Devanagari))
             .collect()
     }
 
+    /// Returns a set of all languages supporting the Latin script.
     pub fn all_with_latin_script() -> HashSet<Language> {
         Language::iter()
             .filter(|it| it.alphabets().contains(&Alphabet::Latin))
             .collect()
     }
 
+    /// Returns the language associated with the ISO 639-1 code
+    /// passed to this method.
     pub fn from_iso_code_639_1(iso_code: &IsoCode639_1) -> Language {
         Language::iter()
             .find(|it| &it.iso_code_639_1() == iso_code)
             .unwrap()
     }
 
+    /// Returns the language associated with the ISO 639-3 code
+    /// passed to this method.
    pub fn from_iso_code_639_3(iso_code: &IsoCode639_3) -> Language {
         Language::iter()
             .find(|it| &it.iso_code_639_3() == iso_code)
             .unwrap()
     }
 
+    /// Returns the ISO 639-1 code of this language.
     pub fn iso_code_639_1(&self) -> IsoCode639_1 {
         match self {
             #[cfg(feature = "afrikaans")]
@@ -559,6 +570,7 @@ impl Language {
         }
     }
 
+    /// Returns the ISO 639-3 code of this language.
     pub fn iso_code_639_3(&self) -> IsoCode639_3 {
         match self {
             #[cfg(feature = "afrikaans")]
diff --git a/src/python.rs b/src/python.rs
index 4483f336..277c75d7 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -49,6 +49,15 @@ fn lingua(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     Ok(())
 }
 
+/// This class describes a language's confidence value.
+///
+/// Attributes:
+///
+///     language (Language):
+///         The language associated with this confidence value.
+///
+///     value (float):
+///         The language's confidence value which lies between 0.0 and 1.0.
 #[pyclass]
 struct ConfidenceValue {
     language: Language,
@@ -62,11 +71,15 @@ impl ConfidenceValue {
         Self { language, value }
     }
 
+    /// Return the language of the associated confidence value.
     #[getter]
     fn language(&self) -> Language {
         self.language
     }
 
+    /// Return the confidence value for the associated language.
+    ///
+    /// The confidence value is a value between 0.0 and 1.0.
     #[getter]
     fn value(&self) -> f64 {
         self.value
@@ -75,24 +88,29 @@ impl ConfidenceValue {
 
 #[pymethods]
 impl DetectionResult {
+    /// Return the start index of the identified single-language substring.
     #[pyo3(name = "start_index")]
     #[getter]
     fn py_start_index(&self) -> usize {
         self.start_index()
     }
 
+    /// Return the end index of the identified single-language substring.
     #[pyo3(name = "end_index")]
     #[getter]
     fn py_end_index(&self) -> usize {
         self.end_index()
     }
 
+    /// Return the number of words being part of the identified
+    /// single-language substring.
#[pyo3(name = "word_count")] #[getter] fn py_word_count(&self) -> usize { self.word_count() } + /// Return the detected language of the identified single-language substring. #[pyo3(name = "language")] #[getter] fn py_language(&self) -> Language { @@ -100,6 +118,22 @@ impl DetectionResult { } } +#[pymethods] +impl IsoCode639_1 { + #[getter] + fn name(&self) -> String { + self.to_string() + } +} + +#[pymethods] +impl IsoCode639_3 { + #[getter] + fn name(&self) -> String { + self.to_string() + } +} + #[pymethods] impl Language { fn __hash__(&self) -> u64 { @@ -108,60 +142,78 @@ impl Language { hasher.finish() } + /// Return a set of all supported languages. #[pyo3(name = "all")] #[classmethod] fn py_all(_cls: &PyType) -> HashSet { Self::all() } + /// Return a set of all supported spoken languages. #[pyo3(name = "all_spoken_ones")] #[classmethod] fn py_all_spoken_ones(_cls: &PyType) -> HashSet { Self::all_spoken_ones() } + /// Return a set of all languages supporting the Arabic script. #[pyo3(name = "all_with_arabic_script")] #[classmethod] fn py_all_with_arabic_script(_cls: &PyType) -> HashSet { Self::all_with_arabic_script() } + /// Return a set of all languages supporting the Cyrillic script. #[pyo3(name = "all_with_cyrillic_script")] #[classmethod] fn py_all_with_cyrillic_script(_cls: &PyType) -> HashSet { Self::all_with_cyrillic_script() } + /// Return a set of all languages supporting the Devanagari script. #[pyo3(name = "all_with_devanagari_script")] #[classmethod] fn py_all_with_devanagari_script(_cls: &PyType) -> HashSet { Self::all_with_devanagari_script() } + /// Return a set of all languages supporting the Latin script. #[pyo3(name = "all_with_latin_script")] #[classmethod] fn py_all_with_latin_script(_cls: &PyType) -> HashSet { Self::all_with_latin_script() } + /// Return the language associated with the ISO 639-1 code + /// passed to this method. + /// + /// Raises: + /// ValueError: if there is no language for the given ISO code #[pyo3(name = "from_iso_code_639_1")] #[classmethod] fn py_from_iso_code_639_1(_cls: &PyType, iso_code: &IsoCode639_1) -> Self { Self::from_iso_code_639_1(iso_code) } + /// Return the language associated with the ISO 639-3 code + /// passed to this method. + /// + /// Raises: + /// ValueError: if there is no language for the given ISO code #[pyo3(name = "from_iso_code_639_3")] #[classmethod] fn py_from_iso_code_639_3(_cls: &PyType, iso_code: &IsoCode639_3) -> Self { Self::from_iso_code_639_3(iso_code) } + /// Return the ISO 639-1 code of this language. #[pyo3(name = "iso_code_639_1")] #[getter] fn py_iso_code_639_1(&self) -> IsoCode639_1 { self.iso_code_639_1() } + /// Return the ISO 639-3 code of this language. #[pyo3(name = "iso_code_639_3")] #[getter] fn py_iso_code_639_3(&self) -> IsoCode639_3 { @@ -176,42 +228,56 @@ impl Language { #[pymethods] impl LanguageDetectorBuilder { + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages. #[pyo3(name = "from_all_languages")] #[classmethod] fn py_from_all_languages(_cls: &PyType) -> Self { Self::from_all_languages() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in spoken languages. #[pyo3(name = "from_all_spoken_languages")] #[classmethod] fn py_from_all_spoken_languages(_cls: &PyType) -> Self { Self::from_all_spoken_languages() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Arabic script. 
#[pyo3(name = "from_all_languages_with_arabic_script")] #[classmethod] fn py_from_all_languages_with_arabic_script(_cls: &PyType) -> Self { Self::from_all_languages_with_arabic_script() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Cyrillic script. #[pyo3(name = "from_all_languages_with_cyrillic_script")] #[classmethod] fn py_from_all_languages_with_cyrillic_script(_cls: &PyType) -> Self { Self::from_all_languages_with_cyrillic_script() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Devanagari script. #[pyo3(name = "from_all_languages_with_devanagari_script")] #[classmethod] fn py_from_all_languages_with_devanagari_script(_cls: &PyType) -> Self { Self::from_all_languages_with_devanagari_script() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages supporting the Latin script. #[pyo3(name = "from_all_languages_with_latin_script")] #[classmethod] fn py_from_all_languages_with_latin_script(_cls: &PyType) -> Self { Self::from_all_languages_with_latin_script() } + /// Create and return an instance of LanguageDetectorBuilder + /// with all built-in languages except those passed to this method. #[pyo3(name = "from_all_languages_without")] #[pyo3(signature = (*languages))] #[classmethod] @@ -224,6 +290,8 @@ impl LanguageDetectorBuilder { } } + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages passed to this method. #[pyo3(name = "from_languages")] #[pyo3(signature = (*languages))] #[classmethod] @@ -236,6 +304,12 @@ impl LanguageDetectorBuilder { } } + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages specified by the ISO 639-1 codes passed + /// to this method. + /// + /// Raises: + /// ValueError: if less than two ISO codes are specified #[pyo3(name = "from_iso_codes_639_1")] #[pyo3(signature = (*iso_codes))] #[classmethod] @@ -248,6 +322,12 @@ impl LanguageDetectorBuilder { } } + /// Create and return an instance of LanguageDetectorBuilder + /// with the languages specified by the ISO 639-3 codes passed + /// to this method. + /// + /// Raises: + /// ValueError: if less than two ISO codes are specified #[pyo3(name = "from_iso_codes_639_3")] #[pyo3(signature = (*iso_codes))] #[classmethod] @@ -260,6 +340,27 @@ impl LanguageDetectorBuilder { } } + /// Set the desired value for the minimum relative distance measure. + /// + /// By default, Lingua returns the most likely language for a given + /// input text. However, there are certain words that are spelled the + /// same in more than one language. The word 'prologue', for instance, + /// is both a valid English and French word. Lingua would output either + /// English or French which might be wrong in the given context. + /// For cases like that, it is possible to specify a minimum relative + /// distance that the logarithmized and summed up probabilities for + /// each possible language have to satisfy. + /// + /// Be aware that the distance between the language probabilities is + /// dependent on the length of the input text. The longer the input + /// text, the larger the distance between the languages. So if you + /// want to classify very short text phrases, do not set the minimum + /// relative distance too high. Otherwise you will get most results + /// returned as None which is the return value for cases where + /// language detection is not reliably possible. 
+    ///
+    /// Raises:
+    ///     ValueError: if distance is smaller than 0.0 or greater than 0.99
     #[pyo3(name = "with_minimum_relative_distance")]
     fn py_with_minimum_relative_distance(
         mut self_: PyRefMut<Self>,
         distance: f64,
@@ -273,18 +374,41 @@ impl LanguageDetectorBuilder {
         }
     }
 
+    /// Preload all language models when creating the LanguageDetector
+    /// instance.
+    ///
+    /// By default, Lingua uses lazy-loading to load only those language
+    /// models on demand which are considered relevant by the rule-based
+    /// filter engine. For web services, for instance, it is rather
+    /// beneficial to preload all language models into memory to avoid
+    /// unexpected latency while waiting for the service response. This
+    /// method allows switching between these two loading modes.
     #[pyo3(name = "with_preloaded_language_models")]
     fn py_with_preloaded_language_models(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
         self_.with_preloaded_language_models();
         self_
     }
 
+    /// Disable the high accuracy mode in order to save memory
+    /// and increase performance.
+    ///
+    /// By default, Lingua's high detection accuracy comes at the cost
+    /// of loading large language models into memory which might not be
+    /// feasible for systems running low on resources.
+    ///
+    /// This method disables the high accuracy mode so that only a small
+    /// subset of language models is loaded into memory. The downside of
+    /// this approach is that detection accuracy for short texts consisting
+    /// of less than 120 characters will drop significantly. However,
+    /// detection accuracy for texts which are longer than 120 characters
+    /// will remain mostly unaffected.
     #[pyo3(name = "with_low_accuracy_mode")]
     fn py_with_low_accuracy_mode(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
         self_.with_low_accuracy_mode();
         self_
     }
 
+    /// Create and return the configured LanguageDetector instance.
     #[pyo3(name = "build")]
     fn py_build(&mut self) -> LanguageDetector {
         self.build()
@@ -293,21 +417,51 @@
 
 #[pymethods]
 impl LanguageDetector {
+    /// Clear all language models loaded by this LanguageDetector instance.
+    ///
+    /// This helps to free allocated memory previously consumed by the models.
     #[pyo3(name = "unload_language_models")]
     fn py_unload_language_models(&self) {
         self.unload_language_models()
     }
 
+    /// Detect the language of text.
+    ///
+    /// If the language cannot be reliably detected, None is returned.
     #[pyo3(name = "detect_language_of")]
     fn py_detect_language_of(&self, text: String) -> Option<Language> {
         self.detect_language_of(text)
     }
 
+    /// Attempt to detect multiple languages in mixed-language text.
+    ///
+    /// This feature is experimental and under continuous development.
+    ///
+    /// A list of DetectionResult is returned containing an entry for each
+    /// contiguous single-language text section as identified by the library.
+    /// Each entry consists of the identified language, a start index and an
+    /// end index. The indices denote the substring that has been identified
+    /// as a contiguous single-language text section.
     #[pyo3(name = "detect_multiple_languages_of")]
     fn py_detect_multiple_languages_of(&self, text: String) -> Vec<DetectionResult> {
         self.detect_multiple_languages_of(text)
     }
 
+    /// Compute confidence values for each language supported
+    /// by this detector for the given text.
+    ///
+    /// The confidence values denote how likely it is that the
+    /// given text has been written in any of the languages
+    /// supported by this detector.
+    ///
+    /// A list is returned containing those languages which the
+    /// calling instance of LanguageDetector has been built from.
+    /// The entries are sorted by their confidence value in
+    /// descending order. Each value is a probability between
+    /// 0.0 and 1.0. The probabilities of all languages will sum to 1.0.
+    /// If the language is unambiguously identified by the rule engine,
+    /// the value 1.0 will always be returned for this language. The
+    /// other languages will receive a value of 0.0.
     #[pyo3(name = "compute_language_confidence_values")]
     fn py_compute_language_confidence_values(&self, text: String) -> Vec<ConfidenceValue> {
         self.compute_language_confidence_values(text)
@@ -319,6 +473,14 @@ impl LanguageDetector {
             .collect()
     }
 
+    /// Compute the confidence value for the given language and input text.
+    ///
+    /// The confidence value denotes how likely it is that the given text
+    /// has been written in the given language. The value that this method
+    /// computes is a number between 0.0 and 1.0. If the language is
+    /// unambiguously identified by the rule engine, the value 1.0 will
+    /// always be returned. If the given language is not supported by this
+    /// detector instance, the value 0.0 will always be returned.
     #[pyo3(name = "compute_language_confidence")]
     fn py_compute_language_confidence(&self, text: String, language: Language) -> f64 {
         self.compute_language_confidence(text, language)
@@ -327,6 +489,23 @@ impl LanguageDetector {
 }
 
 #[pymethods]
 impl LanguageModelFilesWriter {
+    /// Create language model files and write them to a directory.
+    ///
+    /// Args:
+    ///     input_file_path: The path to a txt file used for language
+    ///         model creation. The assumed encoding of the txt file is UTF-8.
+    ///     output_directory_path: The path to an existing directory where the
+    ///         language model files are to be written.
+    ///     language: The language for which to create language models.
+    ///     char_class: A regex character class such as \\p{L} to restrict the
+    ///         set of characters that the language models are built from.
+    ///
+    /// Raises:
+    ///     Exception: if the input file path is not absolute or does not point
+    ///         to an existing txt file; if the input file's encoding is not
+    ///         UTF-8; if the output directory path is not absolute or does not
+    ///         point to an existing directory; if the character class cannot
+    ///         be compiled to a valid regular expression
     #[pyo3(name = "create_and_write_language_model_files")]
     #[classmethod]
     fn py_create_and_write_language_model_files(
@@ -349,6 +528,25 @@ impl LanguageModelFilesWriter {
 
 #[pymethods]
 impl TestDataFilesWriter {
+    /// Create test data files for accuracy report generation and
+    /// write them to a directory.
+    ///
+    /// Args:
+    ///     input_file_path: The path to a txt file used for test data
+    ///         creation. The assumed encoding of the txt file is UTF-8.
+    ///     output_directory_path: The path to an existing directory where
+    ///         the test data files are to be written.
+    ///     char_class: A regex character class such as \\p{L} to restrict
+    ///         the set of characters that the test data are built from.
+    ///     maximum_lines: The maximum number of lines each test data file
+    ///         should have.
+    ///
+    /// Raises:
+    ///     Exception: if the input file path is not absolute or does not point
+    ///         to an existing txt file; if the input file's encoding is not
+    ///         UTF-8; if the output directory path is not absolute or does not
+    ///         point to an existing directory; if the character class cannot
+    ///         be compiled to a valid regular expression
     #[pyo3(name = "create_and_write_test_data_files")]
     #[classmethod]
     fn py_create_and_write_test_data_files(
diff --git a/src/writer.rs b/src/writer.rs
index 18c268bc..23f40455 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -39,7 +39,7 @@ pub struct LanguageModelFilesWriter;
 pub struct TestDataFilesWriter;
 
 impl LanguageModelFilesWriter {
-    /// Creates language model files for accuracy report generation and writes them to a directory.
+    /// Creates language model files and writes them to a directory.
     ///
     /// `input_file_path`: The path to a txt file used for language model creation.
     /// The assumed encoding of the txt file is UTF-8.
diff --git a/tests/python/test_language.py b/tests/python/test_language.py
index d5e068a2..c67258b2 100644
--- a/tests/python/test_language.py
+++ b/tests/python/test_language.py
@@ -16,6 +16,14 @@
 from lingua import IsoCode639_1, IsoCode639_3, Language
 
 
+def test_iso_code_639_1_name():
+    assert IsoCode639_1.EN.name == "EN"
+
+
+def test_iso_code_639_3_name():
+    assert IsoCode639_3.ENG.name == "ENG"
+
+
 def test_language_name():
     assert Language.ENGLISH.name == "ENGLISH"
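For orientation, here is a minimal usage sketch of the Python API documented by the bindings above. It assumes the published `lingua` package, which also exports `LanguageDetectorBuilder` alongside `Language` and the ISO code enums; the sample sentence, the chosen languages and the 0.25 distance are illustrative only and not part of this change.

```python
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder

# The ISO code enum members expose their name, as exercised by the new tests.
assert IsoCode639_1.EN.name == "EN"
assert Language.from_iso_code_639_1(IsoCode639_1.EN).name == "ENGLISH"

# Build a detector from a fixed set of languages. A non-zero minimum relative
# distance makes ambiguous short inputs yield None instead of a low-confidence
# guess, as described in the with_minimum_relative_distance docstring.
detector = (
    LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.FRENCH, Language.GERMAN
    )
    .with_minimum_relative_distance(0.25)
    .build()
)

language = detector.detect_language_of("languages are awesome")
if language is not None:
    print(language.name)

# Confidence values are sorted in descending order and sum to 1.0.
for confidence in detector.compute_language_confidence_values("languages are awesome"):
    print(f"{confidence.language.name}: {confidence.value:.2f}")
```

The chained builder calls work because `with_minimum_relative_distance`, `with_preloaded_language_models` and `with_low_accuracy_mode` hand the builder back to the caller, as the `PyRefMut<Self>` signatures above show.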