diff --git a/glyphs2fontir/src/glyphdata.rs b/glyphs2fontir/src/glyphdata.rs index ffa209a92..31d2f3ef4 100644 --- a/glyphs2fontir/src/glyphdata.rs +++ b/glyphs2fontir/src/glyphdata.rs @@ -4,8 +4,701 @@ use std::collections::HashSet; -pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool { - // Try first by name +#[inline(always)] +pub(crate) fn might_be_a_nonspacing_mark_name(name: &str) -> bool { + // shove the first 4 chars into a u32 and see if they could possibly be a nonspacing mark + let head = name + .chars() + .enumerate() + .take(4) + .fold(0u32, |acc, (i, ch)| acc | (ch as u32) << (i * 8)); + matches!( + head, + 3232598 + | 3298134 + | 3363670 + | 3429206 + | 3494742 + | 3560278 + | 3625814 + | 3691350 + | 3756886 + | 761357412 + | 761358190 + | 761358434 + | 761358435 + | 761358436 + | 761358439 + | 761358442 + | 761358443 + | 761358448 + | 761358451 + | 761358452 + | 761359468 + | 761359982 + | 761361010 + | 761361267 + | 761361524 + | 761362798 + | 761621615 + | 762208626 + | 762275694 + | 762275952 + | 762275956 + | 762277486 + | 762280302 + | 762605412 + | 762798451 + | 762935666 + | 808538966 + | 809841013 + | 812215925 + | 825241973 + | 825307509 + | 825316182 + | 826552693 + | 826618229 + | 827545158 + | 828993141 + | 842093398 + | 844322374 + | 845770357 + | 858870614 + | 861099590 + | 862547573 + | 875647830 + | 892425046 + | 909202262 + | 960835957 + | 1094938219 + | 1095003755 + | 1095200363 + | 1095265899 + | 1095396971 + | 1095528043 + | 1097428597 + | 1097687404 + | 1110847861 + | 1130979693 + | 1145335403 + | 1147692402 + | 1164534125 + | 1164799595 + | 1180788590 + | 1181314677 + | 1196322411 + | 1212378731 + | 1212837483 + | 1214801266 + | 1214865773 + | 1231908459 + | 1265197421 + | 1280077419 + | 1281453683 + | 1281646963 + | 1281778033 + | 1298494837 + | 1313762923 + | 1348563828 + | 1381133931 + | 1382638444 + | 1397452395 + | 1398891620 + | 1398892398 + | 1398892642 + | 1398892643 + | 1398892644 + | 1398892647 + | 1398892651 + | 1398892656 + | 1398892659 + | 1398892660 + | 1398892666 + | 1398894190 + | 1398895475 + | 1398895476 + | 1398895732 + | 1398897006 + | 1398897252 + | 1399415149 + | 1400139620 + | 1414819435 + | 1415930732 + | 1416192365 + | 1450538341 + | 1498312299 + | 1516861806 + | 1631809131 + | 1631811172 + | 1632138852 + | 1632330092 + | 1632461153 + | 1632461164 + | 1632461173 + | 1632462181 + | 1632462191 + | 1632462197 + | 1632463201 + | 1632463209 + | 1632463215 + | 1632464751 + | 1632466273 + | 1632466293 + | 1632466532 + | 1632467321 + | 1632794212 + | 1633772661 + | 1633841010 + | 1633906550 + | 1633968499 + | 1633969249 + | 1634165092 + | 1634230632 + | 1634230637 + | 1634230644 + | 1634231396 + | 1634231673 + | 1634235252 + | 1634235504 + | 1634235508 + | 1634237028 + | 1634296929 + | 1634296943 + | 1634296949 + | 1634413941 + | 1634427243 + | 1634427250 + | 1634427257 + | 1634429294 + | 1634492776 + | 1634492784 + | 1634493028 + | 1634495596 + | 1634496360 + | 1634558321 + | 1634558329 + | 1634559333 + | 1634560353 + | 1634561903 + | 1634563425 + | 1634563445 + | 1634625889 + | 1634628705 + | 1634628709 + | 1634628973 + | 1634695028 + | 1634755940 + | 1634885995 + | 1634886003 + | 1634887033 + | 1634888054 + | 1634938209 + | 1634938213 + | 1634938217 + | 1634938223 + | 1634938229 + | 1634951534 + | 1634953590 + | 1634956139 + | 1635017064 + | 1635017072 + | 1635018082 + | 1635018867 + | 1635019110 + | 1635020644 + | 1635020649 + | 1635020917 + | 1635086693 + | 1635086707 + | 1650553461 + | 1650553717 + | 1651007860 + | 1651339107 + | 1651797860 + | 1651856225 + | 1651863396 + | 1665234532 + | 1667328115 + | 1667330678 + | 1667591791 + | 1667785076 + | 1667789669 + | 1667789925 + | 1667850605 + | 1667854198 + | 1667855220 + | 1668113767 + | 1668241004 + | 1668241010 + | 1668246627 + | 1668249196 + | 1668249202 + | 1668249460 + | 1668442467 + | 1668511097 + | 1668575076 + | 1668639605 + | 1668640367 + | 1668643425 + | 1668835699 + | 1683322468 + | 1684105331 + | 1684300141 + | 1684366179 + | 1684632162 + | 1684826484 + | 1684955491 + | 1684957538 + | 1684957555 + | 1685026676 + | 1685217640 + | 1685284961 + | 1685416556 + | 1685417569 + | 1699506529 + | 1699570024 + | 1699570028 + | 1699570029 + | 1699570030 + | 1699570039 + | 1699903076 + | 1700882788 + | 1700950889 + | 1700952184 + | 1701064050 + | 1701078390 + | 1701079415 + | 1701082474 + | 1701208434 + | 1701260649 + | 1701273956 + | 1701345897 + | 1701405037 + | 1701588331 + | 1701588332 + | 1701588333 + | 1701588334 + | 1701588336 + | 1701588338 + | 1701588340 + | 1701673590 + | 1701734766 + | 1701736308 + | 1701865844 + | 1701867637 + | 1701929338 + | 1701994864 + | 1701995879 + | 1701996660 + | 1702129253 + | 1702130529 + | 1702252385 + | 1702258028 + | 1702258034 + | 1702258035 + | 1702259046 + | 1702260329 + | 1702326124 + | 1702326134 + | 1702453616 + | 1702519152 + | 1717922913 + | 1718185057 + | 1731027304 + | 1731027314 + | 1731027318 + | 1734436211 + | 1734437991 + | 1734440545 + | 1734700399 + | 1734830446 + | 1734830455 + | 1734955881 + | 1734955885 + | 1734955893 + | 1734962287 + | 1734962293 + | 1735156324 + | 1735287161 + | 1735288172 + | 1735289202 + | 1735290732 + | 1735292264 + | 1735356537 + | 1735549284 + | 1751218034 + | 1751477359 + | 1751607653 + | 1751607656 + | 1751607660 + | 1751607666 + | 1751736685 + | 1751867764 + | 1751869806 + | 1752000867 + | 1752196466 + | 1752198241 + | 1752392046 + | 1752392048 + | 1752397169 + | 1752457570 + | 1752457574 + | 1752462433 + | 1766220146 + | 1766220151 + | 1766220153 + | 1766744428 + | 1767072097 + | 1767072108 + | 1767073135 + | 1767073141 + | 1767073646 + | 1767074149 + | 1767074153 + | 1767077217 + | 1768187235 + | 1768320617 + | 1768449380 + | 1768515169 + | 1768518260 + | 1768710767 + | 1768711540 + | 1768714099 + | 1768777062 + | 1768777063 + | 1768777075 + | 1768843629 + | 1769103975 + | 1769104752 + | 1769104761 + | 1769105507 + | 1769105768 + | 1769105782 + | 1769109108 + | 1769169252 + | 1769171314 + | 1769172587 + | 1769174381 + | 1769234809 + | 1769235833 + | 1769238113 + | 1769238377 + | 1769366898 + | 1769366900 + | 1785622371 + | 1798136163 + | 1798136164 + | 1798136166 + | 1798136167 + | 1798136168 + | 1798136170 + | 1798136171 + | 1798136172 + | 1798136177 + | 1798136180 + | 1798137189 + | 1798137199 + | 1798137201 + | 1798138225 + | 1798139747 + | 1798139748 + | 1798139755 + | 1798139756 + | 1798139757 + | 1798139758 + | 1798139760 + | 1798139764 + | 1798139766 + | 1798141297 + | 1798142322 + | 1801544801 + | 1801547379 + | 1802204530 + | 1802332515 + | 1802465128 + | 1802658157 + | 1802659181 + | 1802855785 + | 1816227433 + | 1818323059 + | 1818323300 + | 1818390375 + | 1818452847 + | 1818583666 + | 1818583670 + | 1818583672 + | 1818850160 + | 1819042404 + | 1819047280 + | 1819108730 + | 1819500919 + | 1819507053 + | 1819568500 + | 1819766636 + | 1835101306 + | 1835295088 + | 1835295089 + | 1835361642 + | 1835361645 + | 1835884900 + | 1835888483 + | 1836012386 + | 1836012388 + | 1836012391 + | 1836012394 + | 1836012397 + | 1836012403 + | 1836012406 + | 1836016455 + | 1836016460 + | 1836016461 + | 1836016462 + | 1836016466 + | 1836016481 + | 1836016482 + | 1836016483 + | 1836016484 + | 1836016485 + | 1836016486 + | 1836016487 + | 1836016488 + | 1836016489 + | 1836016491 + | 1836016492 + | 1836016493 + | 1836016494 + | 1836016495 + | 1836016496 + | 1836016498 + | 1836016499 + | 1836016500 + | 1836016501 + | 1836016502 + | 1836016503 + | 1836016504 + | 1836016506 + | 1836017778 + | 1836213606 + | 1849783908 + | 1851868005 + | 1851876197 + | 1851877492 + | 1851879284 + | 1852138867 + | 1852141411 + | 1852141669 + | 1852141679 + | 1852270963 + | 1852399994 + | 1852401273 + | 1852401776 + | 1852401779 + | 1852787813 + | 1852787817 + | 1852794735 + | 1852796782 + | 1852927851 + | 1852989809 + | 1852993384 + | 1853189987 + | 1853321060 + | 1865245026 + | 1865245027 + | 1865245028 + | 1865245031 + | 1865245032 + | 1865245034 + | 1865245035 + | 1865245036 + | 1865245037 + | 1865245038 + | 1865245040 + | 1865245042 + | 1865245043 + | 1865245044 + | 1865245046 + | 1865245049 + | 1867276641 + | 1867934057 + | 1867934828 + | 1867935596 + | 1867936370 + | 1867937125 + | 1867937141 + | 1868718437 + | 1868718454 + | 1868718456 + | 1868783979 + | 1868783982 + | 1868783984 + | 1868783986 + | 1868784993 + | 1868785253 + | 1868786038 + | 1868786041 + | 1868787305 + | 1868787553 + | 1868787577 + | 1868789113 + | 1868789345 + | 1869047155 + | 1869108069 + | 1869116261 + | 1869308267 + | 1869308275 + | 1869311856 + | 1869374052 + | 1869442145 + | 1869442920 + | 1869506938 + | 1869767011 + | 1869768058 + | 1869770603 + | 1869771361 + | 1869771891 + | 1869832557 + | 1870031201 + | 1870031980 + | 1870033522 + | 1870034293 + | 1885432929 + | 1885434234 + | 1885434471 + | 1885958772 + | 1886351972 + | 1886415220 + | 1886614899 + | 1887007331 + | 1903321466 + | 1917285988 + | 1918979429 + | 1918979433 + | 1918979439 + | 1918979445 + | 1918979449 + | 1919053668 + | 1919115629 + | 1919248740 + | 1919248996 + | 1919251316 + | 1919252079 + | 1919313505 + | 1919643504 + | 1919903859 + | 1919905640 + | 1920098658 + | 1920164203 + | 1920230770 + | 1920234337 + | 1920298854 + | 1932353889 + | 1935762024 + | 1935762034 + | 1935763310 + | 1935764595 + | 1935766373 + | 1936026226 + | 1936089447 + | 1936550243 + | 1936617315 + | 1936876903 + | 1936940907 + | 1937009010 + | 1937075312 + | 1937075809 + | 1937077613 + | 1952533857 + | 1952533861 + | 1952533865 + | 1952533871 + | 1952533877 + | 1952539765 + | 1952541798 + | 1952542069 + | 1952543585 + | 1952867692 + | 1952870259 + | 1953063277 + | 1953066601 + | 1953067622 + | 1953199470 + | 1953327459 + | 1953390946 + | 1953460850 + | 1953654134 + | 1953719660 + | 1953719673 + | 1953787758 + | 1953850209 + | 1968074347 + | 1968076388 + | 1968398689 + | 1968398690 + | 1968398691 + | 1968398692 + | 1968398695 + | 1968398696 + | 1968398698 + | 1968398699 + | 1968398700 + | 1968398701 + | 1968398702 + | 1968398704 + | 1968398706 + | 1968398707 + | 1968398708 + | 1968398711 + | 1968398713 + | 1968398714 + | 1969368425 + | 1969368437 + | 1969386865 + | 1969775469 + | 1969976691 + | 1970039155 + | 1970170221 + | 1970171457 + | 1970234476 + | 1970365806 + | 1970431336 + | 1970431353 + | 1970499177 + | 1984913761 + | 1986097767 + | 1986357363 + | 1986359906 + | 1986619514 + | 1986814564 + | 1986883179 + | 1986947446 + | 1987013747 + | 1987342180 + | 2003329907 + | 2003531124 + | 2003785317 + | 2003785321 + | 2003785333 + | 2003793509 + | 2003793525 + | 2036425069 + | 2036820322 + | 2037085538 + | 2037211507 + | 2037738601 + | 2037801323 + | 2037801335 + | 2053597562 + | 2053988712 + | 2054056299 + ) +} + +pub(crate) fn is_nonspacing_mark_name(name: &str) -> bool { + if !might_be_a_nonspacing_mark_name(name) { + return false; + } + + // slow road match name { "AnnuitySymbol" => return true, "FVS1-mong" => return true, @@ -2656,6 +3349,10 @@ pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool "zqaphadotted-syriac" => return true, _ => (), } + false +} + +fn any_nonspacing_mark_codepoint(codepoints: &HashSet) -> bool { // Failing name try by codepoint for cp in codepoints { match cp { @@ -3572,3 +4269,8 @@ pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool } false } + +pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool { + // Try first by name, then codepoint + is_nonspacing_mark_name(name) || any_nonspacing_mark_codepoint(codepoints) +} diff --git a/glyphs2fontir/src/glyphdata_test.rs b/glyphs2fontir/src/glyphdata_test.rs new file mode 100644 index 000000000..aafa3e73d --- /dev/null +++ b/glyphs2fontir/src/glyphdata_test.rs @@ -0,0 +1,28 @@ +//! Handwritten tests for generated glyphdata.rs + +#[cfg(test)] +mod tests { + use crate::glyphdata::{is_nonspacing_mark_name, might_be_a_nonspacing_mark_name}; + + #[test] + fn potential_mark_a() { + assert!(!might_be_a_nonspacing_mark_name("a")); + } + + #[test] + fn potential_mark_vs1() { + assert!(might_be_a_nonspacing_mark_name("VS1")); + } + + #[test] + fn potential_mark_accutcomb() { + assert!(might_be_a_nonspacing_mark_name("acutecomb")); + assert!(is_nonspacing_mark_name("acutecomb")); + } + + #[test] + fn potential_mark_accut_whatever() { + assert!(might_be_a_nonspacing_mark_name("acutWHATEVER")); + assert!(!is_nonspacing_mark_name("acutWHATEVER")); + } +} diff --git a/glyphs2fontir/src/lib.rs b/glyphs2fontir/src/lib.rs index 5a78f19e2..7396e2126 100644 --- a/glyphs2fontir/src/lib.rs +++ b/glyphs2fontir/src/lib.rs @@ -1,3 +1,4 @@ mod glyphdata; +mod glyphdata_test; pub mod source; mod toir; diff --git a/resources/scripts/non_spacing_marks.py b/resources/scripts/non_spacing_marks.py index 94f8803f3..ed8241cbe 100644 --- a/resources/scripts/non_spacing_marks.py +++ b/resources/scripts/non_spacing_marks.py @@ -10,7 +10,7 @@ python resources/scripts/non_spacing_marks.py """ -from absl import app +#from absl import app import glyphsLib from glyphsLib.glyphdata import GlyphData from importlib import resources @@ -26,6 +26,13 @@ def glyph_data(): return GlyphData.from_files(f1, f2) +def first4_as_u32(p4): + result = 0 + for (i, ch) in enumerate(p4): + result |= ord(ch) << i * 8 + return result + + def main(_): non_spacing_marks_by_unicode = set() non_spacing_marks_by_name = set() @@ -50,18 +57,97 @@ def main(_): f.write("\n") f.write("use std::collections::HashSet;\n") + + names = sorted(non_spacing_marks_by_name) + + # We're going to assume the nonspacing mark names are ascii, confirm so + for name in names: + for ch in name: + assert ord(ch) <= 255, f"Cannot handle {name}" + + unused_first = set() + for c in range(ord('a'), ord('z') + 1): + unused_first.add(chr(c)) + unused_first.add(chr(c).upper()) + used_first = {n[0] for n in names} + unused_first = unused_first - used_first + + used_lengths = {len(n) for n in names} + unused_lengths = set(range(1, max(used_lengths) + 1)) - used_lengths + print("UNUSED:", unused_first, unused_lengths) + + unique_prefix2s = {n[:2] for n in names} + print(len(unique_prefix2s), "unique_prefix2s", sorted(unique_prefix2s)) + + unique_prefix4s = {n[:4] for n in names} + print(len(unique_prefix4s), "unique_prefix4s", sorted(unique_prefix4s)) + f.write("\n") + f.write("#[inline(always)]\n") f.write( - "pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool {\n" + "pub(crate) fn might_be_a_nonspacing_mark_name(name: &str) -> bool {\n" + ) + f.write(" // shove the first 4 chars into a u32 and see if they could possibly be a nonspacing mark\n") + f.write(" let head = name.chars().enumerate().take(4).fold(0u32, |acc, (i, ch)| acc | (ch as u32) << (i * 8));\n") + f.write(" matches!(head, ") + f.write(" | ".join(str(u32) for u32 in sorted(first4_as_u32(p4) for p4 in unique_prefix4s))) + f.write(")\n") + f.write("}\n") + + f.write("\n") + f.write( + "pub(crate) fn is_nonspacing_mark_name(name: &str) -> bool {\n" ) - f.write(" // Try first by name\n") + f.write(" if !might_be_a_nonspacing_mark_name(name) {\n") + f.write(" return false;\n") + f.write(" }\n") + + # f.write(" // fast exit: length cannot match a mark\n") + # f.write(f" if name.len() > {max(used_lengths)}") + # if unused_lengths: + # f.write(" || matches!(name.len(), ") + # f.write(" | ".join(sorted(str(l) for l in unused_lengths))) + # f.write(") {\n") + # f.write(" return false;\n") + # f.write(" }\n") + + # f.write(" // fast exit: first char indicates no match\n") + # f.write(" let Some(first) = name.chars().next() else {\n") + # f.write(" return false;\n") + # f.write(" };\n") + # f.write(" if matches!(first, ") + # f.write(" | ".join(sorted(f"'{str(c)}'" for c in unused_first))) + # f.write(") {\n") + # f.write(" return false;\n") + # f.write(" };\n") + + # f.write(" // shove the first 4 chars into a u32 and see if they could possibly be a nonspacing mark\n") + # f.write(" let head = 0u32;\n") + # f.write(" for (i, ch) in name.chars().enumerate().take(4) {\n") + # f.write(" head |= ch << i * 8;\n") + # f.write(" }\n") + # f.write(" if !matches!(head, ") + # f.write(" | ".join(str(u32) for u32 in sorted(first4_as_u32(p4) for p4 in unique_prefix4s))) + # f.write(") {\n") + # f.write(" return false;\n") + # f.write(" }\n") + + f.write("\n") + + f.write(" // slow road\n") f.write(" match name {\n") - for name in sorted(non_spacing_marks_by_name): + for name in names: f.write(f' "{name}" => return true,\n') f.write(" _ => (),\n") f.write(" }\n") + f.write(" false\n") + f.write("}\n") + f.write("\n") + f.write( + "fn any_nonspacing_mark_codepoint(codepoints: &HashSet) -> bool {\n" + ) f.write(" // Failing name try by codepoint\n") f.write(" for cp in codepoints {\n") f.write(" match cp {\n") @@ -73,6 +159,15 @@ def main(_): f.write(" false\n") f.write("}\n") + f.write("\n") + f.write( + "pub(crate) fn is_nonspacing_mark(codepoints: &HashSet, name: &str) -> bool {\n" + ) + + f.write(" // Try first by name, then codepoint\n") + f.write(" is_nonspacing_mark_name(name) || any_nonspacing_mark_codepoint(codepoints)\n") + f.write("}\n") + if __name__ == "__main__": - app.run(main) + main(None)