From 52046172fd490cc05c86d7daeb1f329cf8500360 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Sun, 12 Nov 2023 16:44:42 +0100 Subject: [PATCH] Implement unicode script extensions with `ucd-parse` --- gen-unicode/Cargo.toml | 1 + gen-unicode/README.md | 4 + gen-unicode/src/main.rs | 49 +- gen-unicode/src/scripts.rs | 497 +- src/parse.rs | 49 +- src/unicode.rs | 50 +- src/unicodetables.rs | 6794 ++--- tests/unicode_property_escapes.rs | 39228 ++++++++++++++-------------- 8 files changed, 23152 insertions(+), 23520 deletions(-) diff --git a/gen-unicode/Cargo.toml b/gen-unicode/Cargo.toml index 1e51822..8b26c6c 100644 --- a/gen-unicode/Cargo.toml +++ b/gen-unicode/Cargo.toml @@ -8,3 +8,4 @@ edition = "2021" [dependencies] codegen = "0.1.3" +ucd-parse = "0.1.12" diff --git a/gen-unicode/README.md b/gen-unicode/README.md index d64c4e3..8ca754f 100644 --- a/gen-unicode/README.md +++ b/gen-unicode/README.md @@ -15,6 +15,10 @@ This crate generates unicode tables and code specific for regress. curl -L http://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt -o emoji-data.txt curl -L http://ftp.unicode.org/Public/UNIDATA/PropList.txt -o PropList.txt curl -L http://ftp.unicode.org/Public/UNIDATA/Scripts.txt -o Scripts.txt + mkdir /tmp/ucd-15.0.0 + cd /tmp/ucd-15.0.0 + curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip + unzip UCD.zip ``` 2. Run this crate and redirect the output in the specific rs file in the regress crate: diff --git a/gen-unicode/src/main.rs b/gen-unicode/src/main.rs index e761a9f..f82235f 100644 --- a/gen-unicode/src/main.rs +++ b/gen-unicode/src/main.rs @@ -2,8 +2,10 @@ mod binary_properties; mod case_folding; mod general_category_values; mod scripts; + use codegen::Scope; use std::{fs::OpenOptions, io::Write}; +use ucd_parse::{parse, PropertyValueAlias, Script, ScriptExtension}; // Should match unicode.rs. 
const CODE_POINT_BITS: u32 = 20; @@ -14,7 +16,25 @@ const MAX_CODE_POINT: u32 = (1 << CODE_POINT_BITS) - 1; // Our length is stored with a bias of -1, so no need to subtract 1. const MAX_LENGTH: u32 = 1 << LENGTH_BITS; +pub(crate) struct GenUnicode { + pub(crate) scope: Scope, + pub(crate) scope_tests: Scope, + pub(crate) property_value_aliases: Vec<PropertyValueAlias>, + pub(crate) scripts: Vec