Skip to content

Commit

Permalink
Add test for out-of-script sample texts
Browse files Browse the repository at this point in the history
  • Loading branch information
simoncozens committed Sep 7, 2023
1 parent a017773 commit 205f043
Showing 1 changed file with 53 additions and 2 deletions.
55 changes: 53 additions & 2 deletions tests/test_data_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,23 @@
"Simplified Han": "Han",
"Korean": "Hangul",
"Odia": "Oriya",
"Ol Chiki": "Ol_Chiki",
"Makasar": "Buginese",
"Lanna": "Tai Tham",
"Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
"S-A Cuneiform": "Cuneiform",
"Pollard Phonetic": "Miao",
"Egyptian hieroglyphs": "Egyptian Hieroglyphs",
"Zanabazar": "Zanabazar Square",
"Nüshu": "Nushu",
"Mandaean": "Mandaic",
"N’Ko": "Nko",
"Varang Kshiti": "Warang Citi",
"Mende": "Mende Kikakui",
"Phags-pa": "Phags Pa",
"Fraser": "Lisu",
"Georgian Khutsuri": "Georgian",
"Orkhon": "Old Turkic",
""
}

SKIP_EXEMPLARS = {
Expand All @@ -41,7 +57,6 @@
"sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
}


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
"exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
Expand Down Expand Up @@ -130,3 +145,39 @@ def test_exemplars_are_in_script(lang_code):
f": {', '.join(out_of_script.keys())}"
f" from scripts {', '.join(set(out_of_script.values()))}"
)


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_sample_texts_are_in_script(lang_code):
    """Check that a language's sample texts only use its declared script.

    The language's script name is looked up in ``SCRIPTS`` and mapped through
    ``CLDR_SCRIPT_TO_UCD_SCRIPT`` so it can be compared against the ``Script``
    value reported by ``youseedee.ucd_data``. A character is accepted when:

    * its ``Script`` is ``Common`` or ``Inherited``;
    * its ``Script`` equals the language's script name; or
    * ``lang.script`` appears in its ``Script_Extensions``.

    The test is skipped when the language has no sample text fields set, or
    when ``lang.id`` is listed in ``SKIP_EXEMPLARS`` (the stored reason is
    used as the skip message). The ``note`` field is never checked.
    """
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)

    # pytest.skip() raises, so nothing after it runs; no explicit return needed.
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])

    out_of_script = defaultdict(set)
    for field in SampleText.fields:
        if field.name == "note":
            continue
        samples = getattr(lang.sample_text, field.name)
        # Iterate in sorted order so that the character reported for a field
        # is the same on every run (plain set order depends on string hashing).
        for char in sorted(set(samples)):
            # One UCD lookup serves both the Script and Script_Extensions reads.
            char_data = youseedee.ucd_data(ord(char))
            char_script = char_data.get("Script", "").replace("_", " ")
            if char_script in ("Common", "Inherited"):
                continue
            if char_script == script_name:
                continue
            extensions = char_data.get("Script_Extensions", "").split(" ")
            if lang.script in extensions:
                continue
            out_of_script[char_script].add(char)
            # Only the first offending character of each field is recorded.
            break

    msg = [
        f"'{''.join(sorted(chars))}' ({script} != {script_name})"
        for script, chars in out_of_script.items()
    ]
    assert not out_of_script, (
        f"{lang_code} sample text contained out-of-script characters"
        f": {', '.join(msg)}"
    )

0 comments on commit 205f043

Please sign in to comment.