Skip to content

Commit

Permalink
CLDR-16346 Fix initials 2 (#2718)
Browse files Browse the repository at this point in the history
* CLDR-16346 Fix initials 2

* CLDR-16346 Fix data files also, and tests

* CLDR-16346 minor cleanup
  • Loading branch information
macchiati authored Feb 15, 2023
1 parent b73baed commit 62d07d1
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 18 deletions.
4 changes: 2 additions & 2 deletions common/main/ar.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14433,8 +14433,8 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/
<nameOrderLocales order="givenFirst">und ar</nameOrderLocales>
<nameOrderLocales order="surnameFirst">ko vi yue zh</nameOrderLocales>
<foreignSpaceReplacement xml:space="preserve"> </foreignSpaceReplacement>
<initialPattern type="initial">{0}</initialPattern>
<initialPattern type="initialSequence">{0}. {1}.</initialPattern>
<initialPattern type="initial">{0}.</initialPattern>
<initialPattern type="initialSequence">{0} {1}</initialPattern>
<personName order="givenFirst" length="long" usage="referring" formality="formal">
<namePattern>{title} {given} {given2} {surname} {generation}، {credentials}</namePattern>
</personName>
Expand Down
8 changes: 4 additions & 4 deletions common/main/en.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9318,7 +9318,7 @@ annotations.
<nameOrderLocales order="surnameFirst">ja ko vi yue zh</nameOrderLocales>
<foreignSpaceReplacement xml:space="preserve">↑↑↑</foreignSpaceReplacement>
<initialPattern type="initial">{0}.</initialPattern>
<initialPattern type="initialSequence">{0} {1}</initialPattern>
<initialPattern type="initialSequence">{0}{1}</initialPattern>
<personName order="givenFirst" length="long" usage="referring" formality="formal">
<namePattern>{title} {given} {given2} {surname} {generation}, {credentials}</namePattern>
</personName>
Expand Down Expand Up @@ -9356,7 +9356,7 @@ annotations.
<namePattern>{given-informal-monogram-allCaps}</namePattern>
</personName>
<personName order="givenFirst" length="short" usage="referring" formality="formal">
<namePattern>{given-initial} {given2-initial} {surname}</namePattern>
<namePattern>{given-initial}{given2-initial} {surname}</namePattern>
</personName>
<personName order="givenFirst" length="short" usage="referring" formality="informal">
<namePattern>{given-informal} {surname-initial}</namePattern>
Expand Down Expand Up @@ -9410,7 +9410,7 @@ annotations.
<namePattern>{given-informal-monogram-allCaps}</namePattern>
</personName>
<personName order="surnameFirst" length="short" usage="referring" formality="formal">
<namePattern>{surname} {given-initial} {given2-initial}</namePattern>
<namePattern>{surname} {given-initial}{given2-initial}</namePattern>
</personName>
<personName order="surnameFirst" length="short" usage="referring" formality="informal">
<namePattern>{surname} {given-initial}</namePattern>
Expand Down Expand Up @@ -9440,7 +9440,7 @@ annotations.
<namePattern>{surname}, {given-informal}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="formal">
<namePattern>{surname-core}, {given-initial} {given2-initial} {surname-prefix}</namePattern>
<namePattern>{surname-core}, {given-initial}{given2-initial} {surname-prefix}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="informal">
<namePattern>{surname}, {given-informal}</namePattern>
Expand Down
4 changes: 2 additions & 2 deletions common/main/nl.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19410,7 +19410,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/
<namePattern>{given-informal-monogram-allCaps}</namePattern>
</personName>
<personName order="surnameFirst" length="short" usage="referring" formality="formal">
<namePattern>{surname} {given-initial} {given2-initial}</namePattern>
<namePattern>{surname} {given-initial}{given2-initial}</namePattern>
</personName>
<personName order="surnameFirst" length="short" usage="referring" formality="informal">
<namePattern>{surname} {given-initial}</namePattern>
Expand Down Expand Up @@ -19440,7 +19440,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/
<namePattern>{surname}, {given-informal}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="formal">
<namePattern>{surname}, {given-initial} {given2-initial}</namePattern>
<namePattern>{surname}, {given-initial}{given2-initial}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="informal">
<namePattern>{surname}, {given-informal}</namePattern>
Expand Down
10 changes: 5 additions & 5 deletions common/main/yue.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11589,22 +11589,22 @@ CLDR data files are interpreted according to the LDML specification (http://unic
<namePattern>{given-informal-monogram-allCaps}</namePattern>
</personName>
<personName order="sorting" length="long" usage="referring" formality="formal">
<namePattern>{surname-core}{given}{given2}</namePattern>
<namePattern>{surname-core} {given} {given2}</namePattern>
</personName>
<personName order="sorting" length="long" usage="referring" formality="informal">
<namePattern>{surname}{given-informal}</namePattern>
<namePattern>{surname} {given-informal}</namePattern>
</personName>
<personName order="sorting" length="medium" usage="referring" formality="formal">
<namePattern>↑↑↑</namePattern>
</personName>
<personName order="sorting" length="medium" usage="referring" formality="informal">
<namePattern>{surname}{given-informal}</namePattern>
<namePattern>{surname} {given-informal}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="formal">
<namePattern>{surname-core}{given-initial}{given2-initial}</namePattern>
<namePattern>{surname-core} {given-initial} {given2-initial}</namePattern>
</personName>
<personName order="sorting" length="short" usage="referring" formality="informal">
<namePattern>{surname}{given-informal}</namePattern>
<namePattern>{surname} {given-informal}</namePattern>
</personName>
<sampleName item="nativeG">
<nameField type="given">文傑</nameField>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.unicode.cldr.test;

import java.util.ArrayList;
import java.util.List;

import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
Expand All @@ -10,9 +11,11 @@
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.XPathParts;
import org.unicode.cldr.util.personname.PersonNameFormatter;
import org.unicode.cldr.util.personname.PersonNameFormatter.NamePattern;
import org.unicode.cldr.util.personname.PersonNameFormatter.Optionality;
import org.unicode.cldr.util.personname.PersonNameFormatter.SampleType;

import com.ibm.icu.text.MessageFormat;
import com.ibm.icu.text.UnicodeSet;

public class CheckPersonNames extends CheckCLDR {
Expand All @@ -21,6 +24,7 @@ public class CheckPersonNames extends CheckCLDR {

boolean isRoot = false;
boolean hasRootParent = false;
String initialSeparator = " ";

private UnicodeSet allowedCharacters;
private boolean spacesNeededInNames;
Expand All @@ -44,6 +48,9 @@ public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, L
spacesNeededInNames = !PersonNameFormatter.LocaleSpacingData.getInstance()
.getScriptsNotNeedingSpacesInNames()
.contains(script);

String initialPatternSequence = cldrFileToCheck.getStringValue("//ldml/personNames/initialPattern[@type=\"initialSequence\"]");
initialSeparator = MessageFormat.format(initialPatternSequence, "", "");
//
return super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
}
Expand All @@ -65,14 +72,28 @@ public UnicodeSet getUnicodeSetForScript(String script) {
@Override
public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
List<CheckStatus> result) {
if (value == null
|| isRoot
if (isRoot
|| !path.startsWith("//ldml/personNames/")) {
return this;
}

XPathParts parts = XPathParts.getFrozenInstance(path);
switch(parts.getElement(2)) {
case "personName":
    // A null value has no pattern text to parse; skip it, as the sampleName case does.
    if (value == null) {
        break;
    }
    // Each failure row is (previous field, literal between the two fields, following field).
    NamePattern namePattern = NamePattern.from(0, value);
    ArrayList<List<String>> failures = namePattern.findInitialFailures(initialSeparator);
    for (List<String> row : failures) {
        String previousField = row.get(0);
        String intermediateLiteral = row.get(1);
        // The following field is the third element of the row; reading index 1 here
        // would report the literal in place of the field name.
        String followingField = row.get(2);
        result.add(new CheckStatus().setCause(this)
            .setMainType(CheckStatus.errorType)
            .setSubtype(Subtype.illegalCharactersInPattern)
            .setMessage("The gap between {0} and {2} must be the same as the pattern-initialSequence, =“{1}”",
                previousField, intermediateLiteral, followingField));
    }

    break;
case "foreignSpaceReplacement":
if (spacesNeededInNames && !" ".equals(value)) {
result.add(new CheckStatus().setCause(this)
Expand All @@ -82,6 +103,9 @@ public CheckCLDR handleCheck(String path, String fullPath, String value, Options
}
break;
case "sampleName":
if (value == null) {
break;
}
if (!allowedCharacters.containsAll(value) && !value.equals(CldrUtility.NO_INHERITANCE_MARKER)) {
UnicodeSet bad = new UnicodeSet().addAll(value).removeAll(allowedCharacters);
final Type mainType = getPhase() != Phase.BUILD ? CheckStatus.errorType : CheckStatus.warningType; // we need to be able to check this in without error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.MessageFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;

Expand Down Expand Up @@ -160,6 +162,7 @@ public enum Modifier {
prefix,
core,
;
public static final Set<Modifier> INITIALS = ImmutableSet.of(initialCap, initial);
public static final Comparator<Iterable<Modifier>> ITERABLE_COMPARE = Comparators.lexicographical(Comparator.<Modifier>naturalOrder());
public static final Comparator<Collection<Modifier>> LONGEST_FIRST = new Comparator<>() {

Expand Down Expand Up @@ -1006,6 +1009,50 @@ public String firstLiteralContaining(String item) {
}
return null;
}

/**
 * Collects every pair of adjacent initial fields whose separating literal is inconsistent
 * with the initial separator (derived from the initialPattern).
 *
 * <p>A field counts as an initial field when its modifiers share at least one member with
 * {@code Modifier.INITIALS}. Both the separator and each literal are reduced with
 * {@code finalWhitespace} before they are compared, and two fields with no literal between
 * them are compared using the empty string.
 *
 * @param _initialSeparator the separator to compare against; must not be null
 * @return one (field, literal, field) row per inconsistency, in pattern order; empty if none
 */
public ArrayList<List<String>> findInitialFailures(String _initialSeparator) {
    final String expectedGap = finalWhitespace(_initialSeparator);
    final ArrayList<List<String>> mismatches = new ArrayList<>();

    ModifiedField previous = null;
    boolean previousIsInitial = false;
    String gap = ""; // reduced literal seen since the previous field, "" if none

    for (int index = 0; index < getElementCount(); index++) {
        final ModifiedField current = getModifiedField(index);
        if (current == null) {
            // Not a field: treat the element as a literal and remember its reduced form.
            gap = finalWhitespace(getLiteral(index));
            continue;
        }

        final boolean currentIsInitial = !Collections.disjoint(current.getModifiers(), Modifier.INITIALS);
        if (previousIsInitial && currentIsInitial && !expectedGap.equals(gap)) {
            mismatches.add(ImmutableList.of(previous.toString(), gap, current.toString()));
        }

        // Reset the gap so that {field}{field} compares against the empty string.
        previous = current;
        previousIsInitial = currentIsInitial;
        gap = "";
    }
    return mismatches;
}

// Set built from the \p{whitespace} pattern and frozen after construction;
// finalWhitespace uses it to recognise a trailing whitespace code point.
static final UnicodeSet WS = new UnicodeSet("\\p{whitespace}").freeze();

/**
 * Returns the last code point of {@code string} as a string when that code point is
 * contained in {@code WS}; otherwise returns the empty string (including for empty input).
 *
 * @param string the text to inspect; must not be null
 * @return the trailing whitespace code point as a string, or ""
 */
private String finalWhitespace(String string) {
    if (string.isEmpty()) {
        return "";
    }
    // codePointBefore keeps a trailing supplementary code point as one unit.
    final int lastCodePoint = string.codePointBefore(string.length());
    return WS.contains(lastCodePoint) ? UTF16.valueOf(lastCodePoint) : "";
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.MessageFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
Expand Down Expand Up @@ -180,7 +182,7 @@ public void TestWithCLDR() {
warnln("To see the contents of the English patterns, use -DTestPersonNameFormatter.SHOW");
}

check(ENGLISH_NAME_FORMATTER, sampleNameObject1, "order=sorting; length=short", "Smith, J. B.");
check(ENGLISH_NAME_FORMATTER, sampleNameObject1, "order=sorting; length=short", "Smith, J.B.");
check(ENGLISH_NAME_FORMATTER, sampleNameObject1, "length=long; usage=referring; formality=formal", "Dr. John Bob Smith Jr, MD");

// checkFormatterData(ENGLISH_NAME_FORMATTER);
Expand Down Expand Up @@ -574,7 +576,7 @@ public void TestFallbackFormatter() {
case allCaps:
expected = "VAN BERK"; break;
case initial:
expected = "v. B."; break;
expected = "v.B."; break;
case initialCap:
expected = "Van Berk"; break;
case monogram:
Expand Down Expand Up @@ -911,6 +913,26 @@ public void TestAll() {
continue;
}
PersonNameFormatter formatter = new PersonNameFormatter(cldrFile);
String initialPatternSequence = cldrFile.getStringValue("//ldml/personNames/initialPattern[@type=\"initialSequence\"]");
final String initialSeparator = MessageFormat.format(initialPatternSequence, "", "");

NamePatternData namePatternData = formatter.getNamePatternData();
Set<NamePattern> seen = new HashSet<>();
final ImmutableCollection<Entry<FormatParameters, NamePattern>> entries = namePatternData.getMatcherToPatterns().entries();
for (Entry<FormatParameters, NamePattern> entry2 : entries) {
NamePattern pattern = entry2.getValue();
if (!seen.contains(pattern)) {
seen.add(pattern);
ArrayList<List<String>> failures = pattern.findInitialFailures(initialSeparator);
failures.forEach(x -> errln(
"Conflict with initial pattern:\t" + locale
+ "\t«" + initialPatternSequence + "»"
+ "\t{" + x.get(0) + "}"
+ "\t«" + x.get(1) + "»"
+ "\t{" + x.get(2) + "}"
));
}
}
Multimap<String, FormatParameters> formattedToParameters = TreeMultimap.create();
for (Entry<SampleType, SimpleNameObject> entry : names.entrySet()) {
final SampleType sampleType = entry.getKey();
Expand Down Expand Up @@ -1082,7 +1104,7 @@ public void testEmptyFsrWrite() {
public void testInitials() {
String[][] tests = {{
"//ldml/personNames/personName[@order=\"givenFirst\"][@length=\"short\"][@usage=\"referring\"][@formality=\"formal\"]/namePattern",
"〖<i>🟨 Native name and script:</i>〗〖❬Zendaya❭〗〖❬I.❭ ❬Adler❭〗〖❬M. S.❭ ❬H.❭ ❬Watson❭〗〖❬B. W.❭ ❬H. R.❭ ❬Wooster❭〗〖<i>🟧 Foreign name and native script:</i>〗〖❬Sinbad❭〗〖❬K.❭ ❬Müller❭〗〖❬Z.❭ ❬H.❭ ❬Stöber❭〗〖❬A. C.❭ ❬C. M.❭ ❬von Brühl❭〗〖<i>🟥 Foreign name and script:</i>〗〖❬Є.❭ ❬М.❭ ❬Шевченко❭〗〖❬太郎山田❭〗"
"〖<i>🟨 Native name and script:</i>〗〖❬Zendaya❭〗〖❬I.❭ ❬Adler❭〗〖❬M.S.H.❭ ❬Watson❭〗〖❬B.W.H.R.❭ ❬Wooster❭〗〖<i>🟧 Foreign name and native script:</i>〗〖❬Sinbad❭〗〖❬K.❭ ❬Müller❭〗〖❬Z.H.❭ ❬Stöber❭〗〖❬A.C.C.M.❭ ❬von Brühl❭〗〖<i>🟥 Foreign name and script:</i>〗〖❬Є.М.❭ ❬Шевченко❭〗〖❬太郎山田❭〗"
}};
ExampleGenerator exampleGenerator = checkExamples(ENGLISH, tests);
}
Expand Down

0 comments on commit 62d07d1

Please sign in to comment.