Regex incorrectly identifies the Unicode category of surrogate pairs #16922
Open
Description
opened on Apr 7, 2016
This defect makes Regex useless when you need to match a string using Unicode categories and the string could contain surrogate pairs:
/// <summary>
/// Reproduces the reported mismatch between the character-classification APIs
/// and <see cref="Regex"/>: <c>char.GetUnicodeCategory</c> and
/// <c>CharUnicodeInfo.GetUnicodeCategory</c> are expected to report
/// <see cref="UnicodeCategory.OtherLetter"/> for the test string at index 0,
/// yet the <c>\p{Lo}</c> pattern does not match it.
/// </summary>
[Test]
public void ClassifySurrogates_Test()
{
    // The sample text; the \p{Cs}{2} pattern below expects it to consist of exactly two surrogate chars.
    string text = "𦕒";
    Regex surrogatePairPattern = new Regex(@"^\p{Cs}{2}$");
    Regex otherLetterPattern = new Regex(@"^\p{Lo}$");

    // Both string-aware classification APIs are asserted to report OtherLetter at index 0.
    Assert.AreEqual(UnicodeCategory.OtherLetter, char.GetUnicodeCategory(text, 0));
    Assert.AreEqual(UnicodeCategory.OtherLetter, CharUnicodeInfo.GetUnicodeCategory(text, 0));

    // Regex matches the text as two consecutive surrogate (Cs) characters.
    Assert.True(surrogatePairPattern.IsMatch(text));

    // This is the assertion the report says fails: Regex does not match the text as a single \p{Lo} letter.
    Assert.True(otherLetterPattern.IsMatch(text));
}
Activity