diff --git a/Cargo.toml b/Cargo.toml index 6ca7c1f..6931cbf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,3 +35,6 @@ utf16 = [] [dependencies] hashbrown = "0.13.2" memchr = { version = "2.4.0", default-features = false } + +[patch.crates-io] +ucd-parse = { git = "https://github.com/raskad/ucd-generate.git", branch = "add-derived-normalization-properties" } diff --git a/gen-unicode/Cargo.toml b/gen-unicode/Cargo.toml index 8b26c6c..9f1688b 100644 --- a/gen-unicode/Cargo.toml +++ b/gen-unicode/Cargo.toml @@ -7,5 +7,5 @@ license = "MIT OR Apache-2.0" edition = "2021" [dependencies] -codegen = "0.1.3" +codegen = "0.2.0" ucd-parse = "0.1.12" diff --git a/gen-unicode/README.md b/gen-unicode/README.md index 8ca754f..4b64367 100644 --- a/gen-unicode/README.md +++ b/gen-unicode/README.md @@ -7,14 +7,6 @@ This crate generates unicode tables and code specific for regress. 1. Download the needed unicode source files: ```sh - curl -L http://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt -o CaseFolding.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/extracted/DerivedBinaryProperties.txt -o DerivedBinaryProperties.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt -o DerivedCoreProperties.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt -o DerivedGeneralCategory.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt -o DerivedNormalizationProps.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt -o emoji-data.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/PropList.txt -o PropList.txt - curl -L http://ftp.unicode.org/Public/UNIDATA/Scripts.txt -o Scripts.txt mkdir /tmp/ucd-15.0.0 cd /tmp/ucd-15.0.0 curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip diff --git a/gen-unicode/src/binary_properties.rs b/gen-unicode/src/binary_properties.rs index dffb3f4..1c5948c 100644 --- a/gen-unicode/src/binary_properties.rs +++ b/gen-unicode/src/binary_properties.rs @@ -1,412 +1,452 @@ -use crate::{chars_to_code_point_ranges, pack_adjacent_chars, parse_line}; -use std::fs::File; -use std::io::{self, BufRead}; - -use codegen::{Block, Enum, Function, Scope}; - -pub(crate) fn generate(scope: &mut Scope) { - let mut property_enum = Enum::new("UnicodePropertyBinary"); - property_enum - .vis("pub") - .derive("Debug") - .derive("Clone") - .derive("Copy"); - - let mut is_property_fn = Function::new("is_property_binary"); - is_property_fn - .vis("pub(crate)") - .arg("c", "char") - .arg("value", "&UnicodePropertyBinary") - .ret("bool") - .line("use UnicodePropertyBinary::*;"); - let mut is_property_fn_match_block = Block::new("match value"); - - let mut property_from_str_fn = Function::new("unicode_property_binary_from_str"); - property_from_str_fn - .arg("s", "&str") - .ret("Option") - .vis("pub") - .line("use UnicodePropertyBinary::*;"); - let mut property_from_str_fn_match_block = Block::new("match s"); - - for (alias, orig_name, name, ucd_file_name) in BINARY_PROPERTIES { - let file = File::open(ucd_file_name).unwrap(); - let lines = io::BufReader::new(file).lines(); - let mut chars = Vec::new(); +use crate::{chars_to_code_point_ranges, codepoints_to_range, pack_adjacent_chars, GenUnicode}; +use codegen::{Block, Enum, Function}; + +impl GenUnicode { + pub(crate) fn generate_binary_properties(&mut self) { + let mut property_enum = Enum::new("UnicodePropertyBinary"); + property_enum + .vis("pub") + .derive("Debug") + .derive("Clone") + .derive("Copy"); + + let mut is_property_fn = Function::new("is_property_binary"); + is_property_fn + .vis("pub(crate)") + .arg("c", "char") + .arg("value", "&UnicodePropertyBinary") + .ret("bool") + .line("use UnicodePropertyBinary::*;"); + let mut is_property_fn_match_block = Block::new("match value"); + + let mut property_from_str_fn = Function::new("unicode_property_binary_from_str"); + property_from_str_fn + .arg("s", "&str") + .ret("Option") + .vis("pub") + .line("use UnicodePropertyBinary::*;"); + let mut property_from_str_fn_match_block = Block::new("match s"); + + for (alias, orig_name, name, ucd_file) in BINARY_PROPERTIES { + let mut chars = ucd_file.chars(orig_name, self); + + pack_adjacent_chars(&mut chars); + + // Some properties cannot be packed into a CodePointRange. + if ["Noncharacter_Code_Point"].contains(orig_name) { + self.scope.raw(&format!( + "pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];", + orig_name.to_uppercase(), + chars.len(), + chars + .iter() + .map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1)) + .collect::>() + .join("\n ") + )); + } else { + let ranges = chars_to_code_point_ranges(&chars); + self.scope.raw(&format!( + "pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];", + orig_name.to_uppercase(), + ranges.len(), + ranges.join("\n ") + )); + } + + self.scope + .new_fn(&format!("is_{}", orig_name.to_lowercase())) + .vis("pub(crate)") + .arg("c", "char") + .ret("bool") + .line(&format!( + "{}.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()", + orig_name.to_uppercase() + )) + .doc(&format!( + "Return whether c has the '{}' Unicode property.", + orig_name + )); + + property_enum.new_variant(*name); + + is_property_fn_match_block.line(format!( + "{} => is_{}(c),", + name, + orig_name.to_lowercase() + )); - for line in lines { - parse_line(&line.unwrap(), &mut chars, orig_name); + property_from_str_fn_match_block.line(if alias.is_empty() { + format!("\"{}\" => Some({}),", orig_name, name) + } else { + format!("\"{}\" | \"{}\" => Some({}),", alias, orig_name, name) + }); } - pack_adjacent_chars(&mut chars); - - // Some properties cannot be packed into a CodePointRange. - if ["Noncharacter_Code_Point"].contains(orig_name) { - scope.raw(&format!( - "pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];", - orig_name.to_uppercase(), - chars.len(), - chars - .iter() - .map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1)) - .collect::>() - .join("\n ") - )); - } else { - let ranges = chars_to_code_point_ranges(&chars); - scope.raw(&format!( - "pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];", - orig_name.to_uppercase(), - ranges.len(), - ranges.join("\n ") - )); - } + // These are special ranges that are not in the UCD files + property_enum.new_variant("Ascii"); + property_enum.new_variant("Any"); + property_enum.new_variant("Assigned"); + + let ascii_ranges = chars_to_code_point_ranges(&[(0, 127)]); + + self.scope.raw(&format!( + "pub(crate) const ASCII: [CodePointRange; 1] = [\n {}\n];", + ascii_ranges.join("\n ") + )); - scope - .new_fn(&format!("is_{}", orig_name.to_lowercase())) + self.scope + .new_fn("is_ascii") .vis("pub(crate)") .arg("c", "char") .ret("bool") - .line(&format!( - "{}.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()", - orig_name.to_uppercase() - )) - .doc(&format!( - "Return whether c has the '{}' Unicode property.", - orig_name - )); + .line("ASCII.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()") + .doc("Return whether c has the 'ASCII' Unicode property."); + + self.scope.raw("pub(crate) const ANY: [CodePointRangeUnpacked; 1] = [\n CodePointRangeUnpacked::from(0, 1114111)\n];"); + + self.scope + .new_fn("is_any") + .vis("pub(crate)") + .arg("c", "char") + .ret("bool") + .line("ANY.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()") + .doc("Return whether c has the 'Any' Unicode property."); - property_enum.new_variant(name); + self.scope + .new_fn("is_assigned") + .vis("pub(crate)") + .arg("c", "char") + .ret("bool") + .line("UNASSIGNED.binary_search_by(|&cpr| cpr.compare(c as u32)).is_err()") + .doc("Return whether c has the 'Any' Unicode property."); - is_property_fn_match_block.line(format!("{} => is_{}(c),", name, orig_name.to_lowercase())); + is_property_fn_match_block.line("Ascii => is_ascii(c),"); + is_property_fn_match_block.line("Any => is_any(c),"); + is_property_fn_match_block.line("Assigned => is_assigned(c),"); - property_from_str_fn_match_block.line(if alias.is_empty() { - format!("\"{}\" => Some({}),", orig_name, name) - } else { - format!("\"{}\" | \"{}\" => Some({}),", alias, orig_name, name) - }); - } + property_from_str_fn_match_block.line("\"ASCII\" => Some(Ascii),"); + property_from_str_fn_match_block.line("\"Any\" => Some(Any),"); + property_from_str_fn_match_block.line("\"Assigned\" => Some(Assigned),"); - // These are special ranges that are not in the UCD files - property_enum.new_variant("Ascii"); - property_enum.new_variant("Any"); - property_enum.new_variant("Assigned"); - - let ascii_ranges = chars_to_code_point_ranges(&[(0, 127)]); - - scope.raw(&format!( - "pub(crate) const ASCII: [CodePointRange; 1] = [\n {}\n];", - ascii_ranges.join("\n ") - )); - - scope - .new_fn("is_ascii") - .vis("pub(crate)") - .arg("c", "char") - .ret("bool") - .line("ASCII.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()") - .doc("Return whether c has the 'ASCII' Unicode property."); - - scope.raw("pub(crate) const ANY: [CodePointRangeUnpacked; 1] = [\n CodePointRangeUnpacked::from(0, 1114111)\n];"); - - scope - .new_fn("is_any") - .vis("pub(crate)") - .arg("c", "char") - .ret("bool") - .line("ANY.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()") - .doc("Return whether c has the 'Any' Unicode property."); - - scope - .new_fn("is_assigned") - .vis("pub(crate)") - .arg("c", "char") - .ret("bool") - .line("UNASSIGNED.binary_search_by(|&cpr| cpr.compare(c as u32)).is_err()") - .doc("Return whether c has the 'Any' Unicode property."); - - is_property_fn_match_block.line("Ascii => is_ascii(c),"); - is_property_fn_match_block.line("Any => is_any(c),"); - is_property_fn_match_block.line("Assigned => is_assigned(c),"); - - property_from_str_fn_match_block.line("\"ASCII\" => Some(Ascii),"); - property_from_str_fn_match_block.line("\"Any\" => Some(Any),"); - property_from_str_fn_match_block.line("\"Assigned\" => Some(Assigned),"); - - is_property_fn.push_block(is_property_fn_match_block); - - property_from_str_fn_match_block.line("_ => None,"); - property_from_str_fn.push_block(property_from_str_fn_match_block); - - scope - .push_fn(is_property_fn) - .push_enum(property_enum) - .push_fn(property_from_str_fn); -} + is_property_fn.push_block(is_property_fn_match_block); -pub(crate) fn generate_tests(scope: &mut Scope) { - for (alias, orig_name, name, ucd_file_name) in BINARY_PROPERTIES { - let file = File::open(ucd_file_name).unwrap(); - let lines = io::BufReader::new(file).lines(); - let mut chars = Vec::new(); + property_from_str_fn_match_block.line("_ => None,"); + property_from_str_fn.push_block(property_from_str_fn_match_block); - for line in lines { - parse_line(&line.unwrap(), &mut chars, orig_name); - } + self.scope + .push_fn(is_property_fn) + .push_enum(property_enum) + .push_fn(property_from_str_fn); + } - scope - .new_fn(&format!( - "unicode_escape_property_binary_{}", - name.to_lowercase() - )) - .attr("test") - .line(format!( - "test_with_configs(unicode_escape_property_binary_{}_tc)", + pub(crate) fn generate_binary_properties_tests(&mut self) { + for (alias, orig_name, name, ucd_file) in BINARY_PROPERTIES { + let chars = ucd_file.chars(orig_name, self); + + self.scope_tests + .new_fn(&format!( + "unicode_escape_property_binary_{}", + name.to_lowercase() + )) + .attr("test") + .line(format!( + "test_with_configs(unicode_escape_property_binary_{}_tc)", + name.to_lowercase() + )); + + let f = self.scope_tests.new_fn(&format!( + "unicode_escape_property_binary_{}_tc", name.to_lowercase() )); - let f = scope.new_fn(&format!( - "unicode_escape_property_binary_{}_tc", - name.to_lowercase() - )); + f.arg("tc", "TestConfig"); - f.arg("tc", "TestConfig"); + let code_points: Vec = chars + .iter() + .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) + .collect(); - let code_points: Vec = chars - .iter() - .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) - .collect(); + f.line(format!( + "const CODE_POINTS: [&str; {}] = [\n {},\n];", + code_points.len(), + code_points.join(",\n ") + )); - f.line(format!( - "const CODE_POINTS: [&str; {}] = [\n {},\n];", - code_points.len(), - code_points.join(",\n ") - )); + let mut regexes = vec![format!(r#""^\\p{{{}}}+$""#, orig_name)]; - let mut regexes = vec![format!(r#""^\\p{{{}}}+$""#, orig_name)]; + if !alias.is_empty() { + regexes.push(format!(r#""^\\p{{{}}}+$""#, alias)); + } - if !alias.is_empty() { - regexes.push(format!(r#""^\\p{{{}}}+$""#, alias)); - } + f.line(format!( + "const REGEXES: [&str; {}] = [\n {},\n];", + regexes.len(), + regexes.join(",\n ") + )); - f.line(format!( - "const REGEXES: [&str; {}] = [\n {},\n];", - regexes.len(), - regexes.join(",\n ") - )); + let mut b = Block::new("for regex in REGEXES"); + b.line(r#"let regex = tc.compilef(regex, "u");"#); + + let mut bb = Block::new("for code_point in CODE_POINTS"); + bb.line("regex.test_succeeds(code_point);"); + + b.push_block(bb); + + f.push_block(b); + } + } +} - let mut b = Block::new("for regex in REGEXES"); - b.line(r#"let regex = tc.compilef(regex, "u");"#); +enum UCDFile { + CoreProperty, + Property, + EmojiProperty, + DerivedBinaryProperties, + DerivedNormalizationProperty, +} - let mut bb = Block::new("for code_point in CODE_POINTS"); - bb.line("regex.test_succeeds(code_point);"); +impl UCDFile { + fn chars(&self, property: &str, gen_unicode: &GenUnicode) -> Vec<(u32, u32)> { + let mut chars = Vec::new(); - b.push_block(bb); + match self { + Self::CoreProperty => { + for row in &gen_unicode.core_property { + if row.property == *property { + chars.push(codepoints_to_range(&row.codepoints)); + } + } + } + Self::Property => { + for row in &gen_unicode.properties { + if row.property == *property { + chars.push(codepoints_to_range(&row.codepoints)); + } + } + } + Self::EmojiProperty => { + for row in &gen_unicode.emoji_properties { + if row.property == *property { + chars.push(codepoints_to_range(&row.codepoints)); + } + } + } + Self::DerivedBinaryProperties => { + for row in &gen_unicode.derived_binary_properties { + if row.property == *property { + chars.push(codepoints_to_range(&row.codepoints)); + } + } + } + Self::DerivedNormalizationProperty => { + for row in &gen_unicode.derived_normalization_properties { + if row.property == *property { + chars.push(codepoints_to_range(&row.codepoints)); + } + } + } + } - f.push_block(b); + chars } } // Structure: (Alias, Name, CamelCaseName, UCDFileName) -const BINARY_PROPERTIES: &[(&str, &str, &str, &str); 50] = &[ - ( - "Alpha", - "Alphabetic", - "Alphabetic", - "DerivedCoreProperties.txt", - ), +const BINARY_PROPERTIES: &[(&str, &str, &str, UCDFile); 50] = &[ + ("Alpha", "Alphabetic", "Alphabetic", UCDFile::CoreProperty), ( "CI", "Case_Ignorable", "CaseIgnorable", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), - ("", "Cased", "Cased", "DerivedCoreProperties.txt"), + ("", "Cased", "Cased", UCDFile::CoreProperty), ( "CWCF", "Changes_When_Casefolded", "ChangesWhenCasefolded", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "CWCM", "Changes_When_Casemapped", "ChangesWhenCasemapped", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "CWL", "Changes_When_Lowercased", "ChangesWhenLowercased", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "CWT", "Changes_When_Titlecased", "ChangesWhenTitlecased", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "CWU", "Changes_When_Uppercased", "ChangesWhenUppercased", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "DI", "Default_Ignorable_Code_Point", "DefaultIgnorableCodePoint", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "Gr_Base", "Grapheme_Base", "GraphemeBase", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), ( "Gr_Ext", "Grapheme_Extend", "GraphemeExtend", - "DerivedCoreProperties.txt", - ), - ( - "IDC", - "ID_Continue", - "IDContinue", - "DerivedCoreProperties.txt", + UCDFile::CoreProperty, ), - ("IDS", "ID_Start", "IDStart", "DerivedCoreProperties.txt"), - ("", "Math", "Math", "DerivedCoreProperties.txt"), + ("IDC", "ID_Continue", "IDContinue", UCDFile::CoreProperty), + ("IDS", "ID_Start", "IDStart", UCDFile::CoreProperty), + ("", "Math", "Math", UCDFile::CoreProperty), + ("XIDC", "XID_Continue", "XIDContinue", UCDFile::CoreProperty), + ("XIDS", "XID_Start", "XIDStart", UCDFile::CoreProperty), ( - "XIDC", - "XID_Continue", - "XIDContinue", - "DerivedCoreProperties.txt", + "AHex", + "ASCII_Hex_Digit", + "ASCIIHexDigit", + UCDFile::Property, ), - ("XIDS", "XID_Start", "XIDStart", "DerivedCoreProperties.txt"), - ("AHex", "ASCII_Hex_Digit", "ASCIIHexDigit", "PropList.txt"), - ("Bidi_C", "Bidi_Control", "BidiControl", "PropList.txt"), - ("", "Dash", "Dash", "PropList.txt"), - ("Dep", "Deprecated", "Deprecated", "PropList.txt"), - ("Dia", "Diacritic", "Diacritic", "PropList.txt"), - ("Ext", "Extender", "Extender", "PropList.txt"), - ("Hex", "Hex_Digit", "HexDigit", "PropList.txt"), + ("Bidi_C", "Bidi_Control", "BidiControl", UCDFile::Property), + ("", "Dash", "Dash", UCDFile::Property), + ("Dep", "Deprecated", "Deprecated", UCDFile::Property), + ("Dia", "Diacritic", "Diacritic", UCDFile::Property), + ("Ext", "Extender", "Extender", UCDFile::Property), + ("Hex", "Hex_Digit", "HexDigit", UCDFile::Property), ( "IDSB", "IDS_Binary_Operator", "IDSBinaryOperator", - "PropList.txt", + UCDFile::Property, ), ( "IDST", "IDS_Trinary_Operator", "IDSTrinaryOperator", - "PropList.txt", + UCDFile::Property, ), - ("Ideo", "Ideographic", "Ideographic", "PropList.txt"), - ("Join_C", "Join_Control", "JoinControl", "PropList.txt"), + ("Ideo", "Ideographic", "Ideographic", UCDFile::Property), + ("Join_C", "Join_Control", "JoinControl", UCDFile::Property), ( "LOE", "Logical_Order_Exception", "LogicalOrderException", - "PropList.txt", - ), - ( - "Lower", - "Lowercase", - "Lowercase", - "DerivedCoreProperties.txt", + UCDFile::Property, ), + ("Lower", "Lowercase", "Lowercase", UCDFile::CoreProperty), ( "NChar", "Noncharacter_Code_Point", "NoncharacterCodePoint", - "PropList.txt", + UCDFile::Property, + ), + ( + "Pat_Syn", + "Pattern_Syntax", + "PatternSyntax", + UCDFile::Property, ), - ("Pat_Syn", "Pattern_Syntax", "PatternSyntax", "PropList.txt"), ( "Pat_WS", "Pattern_White_Space", "PatternWhiteSpace", - "PropList.txt", + UCDFile::Property, ), - ("QMark", "Quotation_Mark", "QuotationMark", "PropList.txt"), - ("", "Radical", "Radical", "PropList.txt"), + ( + "QMark", + "Quotation_Mark", + "QuotationMark", + UCDFile::Property, + ), + ("", "Radical", "Radical", UCDFile::Property), ( "RI", "Regional_Indicator", "RegionalIndicator", - "PropList.txt", + UCDFile::Property, ), ( "STerm", "Sentence_Terminal", "SentenceTerminal", - "PropList.txt", + UCDFile::Property, ), - ("SD", "Soft_Dotted", "SoftDotted", "PropList.txt"), + ("SD", "Soft_Dotted", "SoftDotted", UCDFile::Property), ( "Term", "Terminal_Punctuation", "TerminalPunctuation", - "PropList.txt", + UCDFile::Property, ), ( "UIdeo", "Unified_Ideograph", "UnifiedIdeograph", - "PropList.txt", - ), - ( - "Upper", - "Uppercase", - "Uppercase", - "DerivedCoreProperties.txt", + UCDFile::Property, ), + ("Upper", "Uppercase", "Uppercase", UCDFile::CoreProperty), ( "VS", "Variation_Selector", "VariationSelector", - "PropList.txt", + UCDFile::Property, ), - ("space", "White_Space", "WhiteSpace", "PropList.txt"), - ("", "Emoji", "Emoji", "emoji-data.txt"), + ("space", "White_Space", "WhiteSpace", UCDFile::Property), + ("", "Emoji", "Emoji", UCDFile::EmojiProperty), ( "EComp", "Emoji_Component", "EmojiComponent", - "emoji-data.txt", + UCDFile::EmojiProperty, + ), + ( + "EMod", + "Emoji_Modifier", + "EmojiModifier", + UCDFile::EmojiProperty, ), - ("EMod", "Emoji_Modifier", "EmojiModifier", "emoji-data.txt"), ( "EBase", "Emoji_Modifier_Base", "EmojiModifierBase", - "emoji-data.txt", + UCDFile::EmojiProperty, ), ( "EPres", "Emoji_Presentation", "EmojiPresentation", - "emoji-data.txt", + UCDFile::EmojiProperty, ), ( "ExtPict", "Extended_Pictographic", "ExtendedPictographic", - "emoji-data.txt", + UCDFile::EmojiProperty, ), ( "CWKCF", "Changes_When_NFKC_Casefolded", "ChangesWhenNFKCCasefolded", - "DerivedNormalizationProps.txt", + UCDFile::DerivedNormalizationProperty, ), ( "Bidi_M", "Bidi_Mirrored", "BidiMirrored", - "DerivedBinaryProperties.txt", + UCDFile::DerivedBinaryProperties, ), ]; diff --git a/gen-unicode/src/case_folding.rs b/gen-unicode/src/case_folding.rs index ec01b1e..838dfe1 100644 --- a/gen-unicode/src/case_folding.rs +++ b/gen-unicode/src/case_folding.rs @@ -1,7 +1,5 @@ -use crate::MAX_LENGTH; -use codegen::Scope; -use std::fs::File; -use std::io::{self, BufRead}; +use crate::{GenUnicode, MAX_LENGTH}; +use ucd_parse::CaseStatus; type CodePoint = u32; @@ -82,69 +80,43 @@ impl DeltaBlock { } } -fn create_delta_blocks(fps: &[FoldPair]) -> Vec { - let mut blocks: Vec = Vec::new(); - for &fp in fps { - match blocks.last_mut() { - Some(ref mut db) if db.can_append(fp) => db.append(fp), - _ => blocks.push(DeltaBlock::create(fp)), - } - } - blocks -} +impl GenUnicode { + pub(crate) fn generate_case_folds(&mut self) { + let mut fold_pairs = Vec::new(); -fn format_delta_blocks(scope: &mut Scope, dbs: &[DeltaBlock]) { - let mut lines = Vec::new(); - for db in dbs { - lines.push(format!( - "FoldRange::from({start:#04X}, {length}, {delta}, {modulo}),", - start = db.first().orig, - length = db.length(), - delta = db.delta(), - modulo = db.stride().unwrap_or(1), - )); - } - - scope.raw(&format!( - "pub(crate) const FOLDS: [FoldRange; {}] = [\n {}\n];", - dbs.len(), - lines.join("\n ") - )); -} - -/// Parse a CaseFolding line if it is of Common type. -/// Example line: "0051; C; 0071; # LATIN CAPITAL LETTER Q" -fn process_simple_fold(s: &str) -> Option { - // Trim trailing #s which are comments. - if let Some(s) = s.trim().split('#').next() { - let fields: Vec<&str> = s.split(';').map(str::trim).collect(); - if fields.len() != 4 { - return None; - } - let status = fields[1]; - if status != "C" && status != "S" { - return None; + for case_fold in &self.case_folds { + if case_fold.status != CaseStatus::Common && case_fold.status != CaseStatus::Simple { + continue; + } + fold_pairs.push(FoldPair { + orig: case_fold.codepoint.value(), + folded: case_fold.mapping[0].value(), + }); } - let from_hex = |s: &str| u32::from_str_radix(s, 16).unwrap(); - let (orig, folded) = (from_hex(fields[0]), from_hex(fields[2])); - return Some(FoldPair { orig, folded }); - } - None -} -pub(crate) fn generate_folds(scope: &mut Scope) { - let file = File::open("CaseFolding.txt").expect("could not open CaseFolding.txt"); - let lines = io::BufReader::new(file).lines(); - - let mut fold_pairs = Vec::new(); - for line in lines { - if let Some(s) = line.unwrap().as_str().trim().split('#').next() { - if let Some(fp) = process_simple_fold(s) { - fold_pairs.push(fp); + let mut delta_blocks: Vec = Vec::new(); + for fp in &fold_pairs { + match delta_blocks.last_mut() { + Some(ref mut db) if db.can_append(*fp) => db.append(*fp), + _ => delta_blocks.push(DeltaBlock::create(*fp)), } } - } - let delta_blocks = create_delta_blocks(&fold_pairs); - format_delta_blocks(scope, &delta_blocks) + let mut lines = Vec::new(); + for db in &delta_blocks { + lines.push(format!( + "FoldRange::from({start:#04X}, {length}, {delta}, {modulo}),", + start = db.first().orig, + length = db.length(), + delta = db.delta(), + modulo = db.stride().unwrap_or(1), + )); + } + + self.scope.raw(&format!( + "pub(crate) const FOLDS: [FoldRange; {}] = [\n {}\n];", + delta_blocks.len(), + lines.join("\n ") + )); + } } diff --git a/gen-unicode/src/general_category_values.rs b/gen-unicode/src/general_category_values.rs index cd3c492..e035dc3 100644 --- a/gen-unicode/src/general_category_values.rs +++ b/gen-unicode/src/general_category_values.rs @@ -1,294 +1,300 @@ -use crate::{chars_to_code_point_ranges, pack_adjacent_chars, parse_line}; +use crate::{chars_to_code_point_ranges, codepoints_to_range, pack_adjacent_chars, GenUnicode}; +use codegen::{Block, Enum, Function}; use std::collections::HashMap; -use std::fs::File; -use std::io::{self, BufRead}; - -use codegen::{Block, Enum, Function, Scope}; - -pub(crate) fn generate(scope: &mut Scope) { - let mut property_enum = Enum::new("UnicodePropertyValueGeneralCategory"); - property_enum - .vis("pub") - .derive("Debug") - .derive("Clone") - .derive("Copy"); - - let mut is_property_fn = Function::new("is_property_value_general_category"); - is_property_fn - .vis("pub(crate)") - .arg("c", "char") - .arg("value", "&UnicodePropertyValueGeneralCategory") - .ret("bool") - .line("use UnicodePropertyValueGeneralCategory::*;"); - let mut is_property_fn_match_block = Block::new("match value"); - - let mut property_from_str_fn = - Function::new("unicode_property_value_general_category_from_str"); - property_from_str_fn - .arg("s", "&str") - .ret("Option") - .vis("pub") - .line("use UnicodePropertyValueGeneralCategory::*;"); - let mut property_from_str_fn_match_block = Block::new("match s"); - - for (alias0, alias1, orig_name, name) in GENERAL_CATEGORY_VALUES { - let file = File::open("DerivedGeneralCategory.txt") - .expect("could not open DerivedGeneralCategory.txt"); - let lines = io::BufReader::new(file).lines(); - let mut chars = Vec::new(); - - for line in lines { - parse_line(&line.unwrap(), &mut chars, alias1); - } - - pack_adjacent_chars(&mut chars); - - // Some properties cannot be packed into a CodePointRange. - if ["Unassigned", "Private_Use"].contains(orig_name) { - scope.raw(&format!( - "pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];", - orig_name.to_uppercase(), - chars.len(), - chars - .iter() - .map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1)) - .collect::>() - .join("\n ") - )); - } else { - let ranges = chars_to_code_point_ranges(&chars); - scope.raw(&format!( - "pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];", - orig_name.to_uppercase(), - ranges.len(), - ranges.join("\n ") - )); - } - scope - .new_fn(&format!("is_{}", orig_name.to_lowercase())) +impl GenUnicode { + pub(crate) fn generate_general_category(&mut self) { + let mut property_enum = Enum::new("UnicodePropertyValueGeneralCategory"); + property_enum + .vis("pub") + .derive("Debug") + .derive("Clone") + .derive("Copy"); + + let mut is_property_fn = Function::new("is_property_value_general_category"); + is_property_fn .vis("pub(crate)") .arg("c", "char") + .arg("value", "&UnicodePropertyValueGeneralCategory") .ret("bool") - .line(&format!( - "{}.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()", - orig_name.to_uppercase() - )) - .doc(&format!( - "Return whether c has the '{}' Unicode property.", - orig_name - )); - - property_enum.new_variant(name); + .line("use UnicodePropertyValueGeneralCategory::*;"); + let mut is_property_fn_match_block = Block::new("match value"); + + let mut property_from_str_fn = + Function::new("unicode_property_value_general_category_from_str"); + property_from_str_fn + .arg("s", "&str") + .ret("Option") + .vis("pub") + .line("use UnicodePropertyValueGeneralCategory::*;"); + let mut property_from_str_fn_match_block = Block::new("match s"); + + for (alias0, alias1, orig_name, name) in GENERAL_CATEGORY_VALUES { + let mut chars = Vec::new(); + + for row in &self.derived_general_category { + if row.general_category == *alias1 { + chars.push(codepoints_to_range(&row.codepoints)); + } + } - is_property_fn_match_block.line(format!("{} => is_{}(c),", name, orig_name.to_lowercase())); + pack_adjacent_chars(&mut chars); + + // Some properties cannot be packed into a CodePointRange. + if ["Unassigned", "Private_Use"].contains(orig_name) { + self.scope.raw(&format!( + "pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];", + orig_name.to_uppercase(), + chars.len(), + chars + .iter() + .map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1)) + .collect::>() + .join("\n ") + )); + } else { + let ranges = chars_to_code_point_ranges(&chars); + self.scope.raw(&format!( + "pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];", + orig_name.to_uppercase(), + ranges.len(), + ranges.join("\n ") + )); + } - property_from_str_fn_match_block.line(if alias0.is_empty() { - format!("\"{}\" | \"{}\" => Some({}),", alias1, orig_name, name) - } else { - format!( - "\"{}\" | \"{}\" | \"{}\" => Some({}),", - alias0, alias1, orig_name, name - ) - }); - } + self.scope + .new_fn(&format!("is_{}", orig_name.to_lowercase())) + .vis("pub(crate)") + .arg("c", "char") + .ret("bool") + .line(&format!( + "{}.binary_search_by(|&cpr| cpr.compare(c as u32)).is_ok()", + orig_name.to_uppercase() + )) + .doc(&format!( + "Return whether c has the '{}' Unicode property.", + orig_name + )); + + property_enum.new_variant(*name); + + is_property_fn_match_block.line(format!( + "{} => is_{}(c),", + name, + orig_name.to_lowercase() + )); - for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED { - let value_name_ifs: Vec = value_names_str - .split(',') - .map(|name| format!("is_{}(c)", name.to_lowercase())) - .collect(); + property_from_str_fn_match_block.line(if alias0.is_empty() { + format!("\"{}\" | \"{}\" => Some({}),", alias1, orig_name, name) + } else { + format!( + "\"{}\" | \"{}\" | \"{}\" => Some({}),", + alias0, alias1, orig_name, name + ) + }); + } - scope - .new_fn(&format!("is_{}", orig_name.to_lowercase())) - .vis("pub(crate)") - .arg("c", "char") - .ret("bool") - .line(value_name_ifs.join(" || ")) - .doc(&format!( - "Return whether c has the '{}' Unicode property.", - orig_name + for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED { + let value_name_ifs: Vec = value_names_str + .split(',') + .map(|name| format!("is_{}(c)", name.to_lowercase())) + .collect(); + + self.scope + .new_fn(&format!("is_{}", orig_name.to_lowercase())) + .vis("pub(crate)") + .arg("c", "char") + .ret("bool") + .line(value_name_ifs.join(" || ")) + .doc(&format!( + "Return whether c has the '{}' Unicode property.", + orig_name + )); + + property_enum.new_variant(*name); + + is_property_fn_match_block.line(format!( + "{} => is_{}(c),", + name, + orig_name.to_lowercase() )); - property_enum.new_variant(name); + property_from_str_fn_match_block.line(if alias0.is_empty() { + format!("\"{}\" | \"{}\" => Some({}),", alias1, orig_name, name) + } else { + format!( + "\"{}\" | \"{}\" | \"{}\" => Some({}),", + alias0, alias1, orig_name, name + ) + }); + } - is_property_fn_match_block.line(format!("{} => is_{}(c),", name, orig_name.to_lowercase())); + is_property_fn.push_block(is_property_fn_match_block); - property_from_str_fn_match_block.line(if alias0.is_empty() { - format!("\"{}\" | \"{}\" => Some({}),", alias1, orig_name, name) - } else { - format!( - "\"{}\" | \"{}\" | \"{}\" => Some({}),", - alias0, alias1, orig_name, name - ) - }); - } + property_from_str_fn_match_block.line("_ => None,"); + property_from_str_fn.push_block(property_from_str_fn_match_block); - is_property_fn.push_block(is_property_fn_match_block); + self.scope + .push_fn(is_property_fn) + .push_enum(property_enum) + .push_fn(property_from_str_fn); + } - property_from_str_fn_match_block.line("_ => None,"); - property_from_str_fn.push_block(property_from_str_fn_match_block); + pub(crate) fn generate_general_category_tests(&mut self) { + let mut char_map: HashMap<&str, Vec<(u32, u32)>> = HashMap::new(); - scope - .push_fn(is_property_fn) - .push_enum(property_enum) - .push_fn(property_from_str_fn); -} + for (alias0, alias1, orig_name, name) in GENERAL_CATEGORY_VALUES { + // We skip surrogates, as rust does not allow them as chars. + if *name == "Surrogate" { + continue; + } -pub(crate) fn generate_tests(scope: &mut Scope) { - let mut char_map: HashMap<&str, Vec<(u32, u32)>> = HashMap::new(); + let mut chars = Vec::new(); - for (alias0, alias1, orig_name, name) in GENERAL_CATEGORY_VALUES { - // We skip surrogates, as rust does not allow them as chars. - if *name == "Surrogate" { - continue; - } + for row in &self.derived_general_category { + if row.general_category == *alias1 { + chars.push(codepoints_to_range(&row.codepoints)); + } + } - let file = File::open("DerivedGeneralCategory.txt").unwrap(); - let lines = io::BufReader::new(file).lines(); - let mut chars = Vec::new(); + char_map.insert(orig_name, chars.clone()); + + self.scope_tests + .new_fn(&format!( + "unicode_escape_property_gc_{}", + name.to_lowercase() + )) + .attr("test") + .line(format!( + "test_with_configs(unicode_escape_property_gc_{}_tc)", + name.to_lowercase() + )); + + let f = self.scope_tests.new_fn(&format!( + "unicode_escape_property_gc_{}_tc", + name.to_lowercase() + )); - for line in lines { - parse_line(&line.unwrap(), &mut chars, alias1); - } + f.arg("tc", "TestConfig"); - char_map.insert(orig_name, chars.clone()); + let code_points: Vec = chars + .iter() + .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) + .collect(); - scope - .new_fn(&format!( - "unicode_escape_property_gc_{}", - name.to_lowercase() - )) - .attr("test") - .line(format!( - "test_with_configs(unicode_escape_property_gc_{}_tc)", - name.to_lowercase() + f.line(format!( + "const CODE_POINTS: [&str; {}] = [\n {},\n];", + code_points.len(), + code_points.join(",\n ") )); - let f = scope.new_fn(&format!( - "unicode_escape_property_gc_{}_tc", - name.to_lowercase() - )); - - f.arg("tc", "TestConfig"); - - let code_points: Vec = chars - .iter() - .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) - .collect(); - - f.line(format!( - "const CODE_POINTS: [&str; {}] = [\n {},\n];", - code_points.len(), - code_points.join(",\n ") - )); - - let mut regexes = vec![ - format!(r#""^\\p{{General_Category={}}}+$""#, orig_name), - format!(r#""^\\p{{gc={}}}+$""#, orig_name), - format!(r#""^\\p{{{}}}+$""#, orig_name), - ]; - - if !alias0.is_empty() { - regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias0)); - regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias0)); - regexes.push(format!(r#""^\\p{{{}}}+$""#, alias0)); - } + let mut regexes = vec![ + format!(r#""^\\p{{General_Category={}}}+$""#, orig_name), + format!(r#""^\\p{{gc={}}}+$""#, orig_name), + format!(r#""^\\p{{{}}}+$""#, orig_name), + ]; - if !alias1.is_empty() { - regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias1)); - regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias1)); - regexes.push(format!(r#""^\\p{{{}}}+$""#, alias1)); - } + if !alias0.is_empty() { + regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias0)); + regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias0)); + regexes.push(format!(r#""^\\p{{{}}}+$""#, alias0)); + } - f.line(format!( - "const REGEXES: [&str; {}] = [\n {},\n];", - regexes.len(), - regexes.join(",\n ") - )); + if !alias1.is_empty() { + regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias1)); + regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias1)); + regexes.push(format!(r#""^\\p{{{}}}+$""#, alias1)); + } - let mut b = Block::new("for regex in REGEXES"); - b.line(r#"let regex = tc.compilef(regex, "u");"#); + f.line(format!( + "const REGEXES: [&str; {}] = [\n {},\n];", + regexes.len(), + regexes.join(",\n ") + )); - let mut bb = Block::new("for code_point in CODE_POINTS"); - bb.line("regex.test_succeeds(code_point);"); + let mut b = Block::new("for regex in REGEXES"); + b.line(r#"let regex = tc.compilef(regex, "u");"#); - b.push_block(bb); + let mut bb = Block::new("for code_point in CODE_POINTS"); + bb.line("regex.test_succeeds(code_point);"); - f.push_block(b); - } + b.push_block(bb); - for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED { - let mut chars = Vec::new(); + f.push_block(b); + } + + for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED { + let mut chars = Vec::new(); - for value_name in value_names_str.split(',') { - if let Some(cs) = char_map.get(value_name) { - chars.append(&mut cs.clone()); + for value_name in value_names_str.split(',') { + if let Some(cs) = char_map.get(value_name) { + chars.append(&mut cs.clone()); + } } - } - scope - .new_fn(&format!( - "unicode_escape_property_gc_{}", - name.to_lowercase() - )) - .attr("test") - .line(format!( - "test_with_configs(unicode_escape_property_gc_{}_tc)", + self.scope_tests + .new_fn(&format!( + "unicode_escape_property_gc_{}", + name.to_lowercase() + )) + .attr("test") + .line(format!( + "test_with_configs(unicode_escape_property_gc_{}_tc)", + name.to_lowercase() + )); + + let f = self.scope_tests.new_fn(&format!( + "unicode_escape_property_gc_{}_tc", name.to_lowercase() )); - let f = scope.new_fn(&format!( - "unicode_escape_property_gc_{}_tc", - name.to_lowercase() - )); - - f.arg("tc", "TestConfig"); - - let code_points: Vec = chars - .iter() - .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) - .collect(); - - f.line(format!( - "const CODE_POINTS: [&str; {}] = [\n {},\n];", - code_points.len(), - code_points.join(",\n ") - )); - - let mut regexes = vec![ - format!(r#""^\\p{{General_Category={}}}+$""#, orig_name), - format!(r#""^\\p{{gc={}}}+$""#, orig_name), - format!(r#""^\\p{{{}}}+$""#, orig_name), - ]; - - if !alias0.is_empty() { - regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias0)); - regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias0)); - regexes.push(format!(r#""^\\p{{{}}}+$""#, alias0)); - } + f.arg("tc", "TestConfig"); - if !alias1.is_empty() { - regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias1)); - regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias1)); - regexes.push(format!(r#""^\\p{{{}}}+$""#, alias1)); - } + let code_points: Vec = chars + .iter() + .map(|c| format!("\"\\u{{{:x}}}\"", c.0)) + .collect(); + + f.line(format!( + "const CODE_POINTS: [&str; {}] = [\n {},\n];", + code_points.len(), + code_points.join(",\n ") + )); + + let mut regexes = vec![ + format!(r#""^\\p{{General_Category={}}}+$""#, orig_name), + format!(r#""^\\p{{gc={}}}+$""#, orig_name), + format!(r#""^\\p{{{}}}+$""#, orig_name), + ]; - f.line(format!( - "const REGEXES: [&str; {}] = [\n {},\n];", - regexes.len(), - regexes.join(",\n ") - )); + if !alias0.is_empty() { + regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias0)); + regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias0)); + regexes.push(format!(r#""^\\p{{{}}}+$""#, alias0)); + } + + if !alias1.is_empty() { + regexes.push(format!(r#""^\\p{{General_Category={}}}+$""#, alias1)); + regexes.push(format!(r#""^\\p{{gc={}}}+$""#, alias1)); + regexes.push(format!(r#""^\\p{{{}}}+$""#, alias1)); + } - let mut b = Block::new("for regex in REGEXES"); - b.line(r#"let regex = tc.compilef(regex, "u");"#); + f.line(format!( + "const REGEXES: [&str; {}] = [\n {},\n];", + regexes.len(), + regexes.join(",\n ") + )); + + let mut b = Block::new("for regex in REGEXES"); + b.line(r#"let regex = tc.compilef(regex, "u");"#); - let mut bb = Block::new("for code_point in CODE_POINTS"); - bb.line("regex.test_succeeds(code_point);"); + let mut bb = Block::new("for code_point in CODE_POINTS"); + bb.line("regex.test_succeeds(code_point);"); - b.push_block(bb); + b.push_block(bb); - f.push_block(b); + f.push_block(b); + } } } diff --git a/gen-unicode/src/main.rs b/gen-unicode/src/main.rs index f82235f..9709669 100644 --- a/gen-unicode/src/main.rs +++ b/gen-unicode/src/main.rs @@ -5,7 +5,11 @@ mod scripts; use codegen::Scope; use std::{fs::OpenOptions, io::Write}; -use ucd_parse::{parse, PropertyValueAlias, Script, ScriptExtension}; +use ucd_parse::{ + extracted::{DerivedBinaryProperties, DerivedGeneralCategory}, + parse, CaseFold, Codepoints, CoreProperty, DerivedNormalizationProperty, EmojiProperty, + Property, PropertyValueAlias, Script, ScriptExtension, +}; // Should match unicode.rs. const CODE_POINT_BITS: u32 = 20; @@ -16,23 +20,40 @@ const MAX_CODE_POINT: u32 = (1 << CODE_POINT_BITS) - 1; // Our length is stored with a bias of -1, so no need to subtract 1. const MAX_LENGTH: u32 = 1 << LENGTH_BITS; +pub(crate) const UCD_PATH: &str = "/tmp/ucd-15.0.0"; + pub(crate) struct GenUnicode { pub(crate) scope: Scope, pub(crate) scope_tests: Scope, + pub(crate) case_folds: Vec, pub(crate) property_value_aliases: Vec, pub(crate) scripts: Vec