From 205f043c37cdbd20006a58ed4db1f8bd8ec5acb8 Mon Sep 17 00:00:00 2001
From: Simon Cozens
Date: Thu, 7 Sep 2023 14:38:47 +0100
Subject: [PATCH] Add test for out-of-script sample texts

---
 tests/test_data_languages.py | 59 +++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py
index 125cb68e..8075c70d 100644
--- a/tests/test_data_languages.py
+++ b/tests/test_data_languages.py
@@ -32,7 +32,22 @@
     "Simplified Han": "Han",
     "Korean": "Hangul",
     "Odia": "Oriya",
-    "Ol Chiki": "Ol_Chiki",
+    "Makasar": "Buginese",
+    "Lanna": "Tai Tham",
+    "Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
+    "S-A Cuneiform": "Cuneiform",
+    "Pollard Phonetic": "Miao",
+    "Egyptian hieroglyphs": "Egyptian Hieroglyphs",
+    "Zanabazar": "Zanabazar Square",
+    "Nüshu": "Nushu",
+    "Mandaean": "Mandaic",
+    "N’Ko": "Nko",
+    "Varang Kshiti": "Warang Citi",
+    "Mende": "Mende Kikakui",
+    "Phags-pa": "Phags Pa",
+    "Fraser": "Lisu",
+    "Georgian Khutsuri": "Georgian",
+    "Orkhon": "Old Turkic",
 }
 
 SKIP_EXEMPLARS = {
@@ -130,3 +145,45 @@ def test_exemplars_are_in_script(lang_code):
         f": {', '.join(out_of_script.keys())}"
         f" from scripts {', '.join(set(out_of_script.values()))}"
     )
+
+
+@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
+def test_sample_texts_are_in_script(lang_code):
+    """Check that a language's sample texts only use its declared script.
+
+    Characters whose UCD script is Common or Inherited, or whose
+    Script_Extensions include the language's script, are accepted.
+    Every other character is reported, grouped by the script it belongs to.
+    """
+    lang = LANGUAGES[lang_code]
+    script_name = SCRIPTS[lang.script].name
+    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
+    # pytest.skip() raises, so no explicit return is needed after it.
+    if not lang.sample_text.ListFields():
+        pytest.skip("No sample text for language " + lang_code)
+    if lang.id in SKIP_EXEMPLARS:
+        pytest.skip(SKIP_EXEMPLARS[lang.id])
+    out_of_script = defaultdict(set)
+    for field in SampleText.fields:
+        if field.name == "note":
+            continue
+        samples = getattr(lang.sample_text, field.name)
+        for char in set(samples):
+            # Look the character up once and reuse the record for both checks.
+            ucd = youseedee.ucd_data(ord(char))
+            char_script = ucd.get("Script", "").replace("_", " ")
+            if char_script in ("Common", "Inherited", script_name):
+                continue
+            # Extensions are matched against lang.script (the SCRIPTS key).
+            if lang.script in ucd.get("Script_Extensions", "").split(" "):
+                continue
+            # No break here: collect every offender so the report is complete.
+            out_of_script[char_script].add(char)
+    msg = [
+        f"'{''.join(sorted(chars))}' ({script} != {script_name})"
+        for script, chars in out_of_script.items()
+    ]
+    assert not out_of_script, (
+        f"{lang_code} sample text contained out-of-script characters"
+        f": {', '.join(msg)}"
+    )