-
-
Notifications
You must be signed in to change notification settings - Fork 31.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
bpo-38043: Move unicodedata.normalize tests into test_unicodedata. #15712
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -7,10 +7,11 @@ | |||
""" | ||||
|
||||
import hashlib | ||||
from http.client import HTTPException | ||||
import sys | ||||
import unicodedata | ||||
import unittest | ||||
from test.support import script_helper | ||||
from test.support import open_urlresource, script_helper | ||||
|
||||
|
||||
class UnicodeMethodsTest(unittest.TestCase): | ||||
|
@@ -171,13 +172,6 @@ def test_combining(self): | |||
self.assertRaises(TypeError, self.db.combining) | ||||
self.assertRaises(TypeError, self.db.combining, 'xx') | ||||
|
||||
def test_normalize(self): | ||||
self.assertRaises(TypeError, self.db.normalize) | ||||
self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx') | ||||
self.assertEqual(self.db.normalize('NFKC', ''), '') | ||||
# The rest can be found in test_normalization.py | ||||
# which requires an external file. | ||||
|
||||
def test_pr29(self): | ||||
# http://www.unicode.org/review/pr-29.html | ||||
# See issues #1054943 and #10254. | ||||
|
@@ -208,9 +202,6 @@ def test_issue29456(self): | |||
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) | ||||
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) | ||||
|
||||
# For tests of unicodedata.is_normalized / self.db.is_normalized , | ||||
# see test_normalization.py . | ||||
|
||||
def test_east_asian_width(self): | ||||
eaw = self.db.east_asian_width | ||||
self.assertRaises(TypeError, eaw, b'a') | ||||
|
@@ -315,5 +306,102 @@ def test_linebreak_7643(self): | |||
self.assertEqual(len(lines), 1, | ||||
r"\u%.4x should not be a linebreak" % i) | ||||
|
||||
class NormalizationTest(unittest.TestCase): | ||||
@staticmethod | ||||
def check_version(testfile): | ||||
hdr = testfile.readline() | ||||
return unicodedata.unidata_version in hdr | ||||
|
||||
@staticmethod | ||||
def unistr(data): | ||||
data = [int(x, 16) for x in data.split(" ")] | ||||
return "".join([chr(x) for x in data]) | ||||
|
||||
def test_normalization(self): | ||||
TESTDATAFILE = "NormalizationTest.txt" | ||||
TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}" | ||||
|
||||
# Hit the exception early | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Similar to my other recommendation, this comment doesn't seem to provide any particularly useful information or context. |
||||
try: | ||||
testdata = open_urlresource(TESTDATAURL, encoding="utf-8", | ||||
check=self.check_version) | ||||
except PermissionError: | ||||
self.skipTest(f"Permission error when downloading {TESTDATAURL} " | ||||
f"into the test data directory") | ||||
except (OSError, HTTPException): | ||||
self.fail(f"Could not retrieve {TESTDATAURL}") | ||||
|
||||
with testdata: | ||||
self.run_normalization_tests(testdata) | ||||
|
||||
def run_normalization_tests(self, testdata): | ||||
part = None | ||||
part1_data = {} | ||||
|
||||
def NFC(str): | ||||
return unicodedata.normalize("NFC", str) | ||||
|
||||
def NFKC(str): | ||||
return unicodedata.normalize("NFKC", str) | ||||
|
||||
def NFD(str): | ||||
return unicodedata.normalize("NFD", str) | ||||
|
||||
def NFKD(str): | ||||
return unicodedata.normalize("NFKD", str) | ||||
|
||||
for line in testdata: | ||||
if '#' in line: | ||||
line = line.split('#')[0] | ||||
line = line.strip() | ||||
if not line: | ||||
continue | ||||
if line.startswith("@Part"): | ||||
part = line.split()[0] | ||||
continue | ||||
c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]] | ||||
|
||||
# Perform tests | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I don't see much of a purpose in this comment, it doesn't seem to provide any additional information or context for the assertions below it. |
||||
self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) | ||||
self.assertTrue(c4 == NFC(c4) == NFC(c5), line) | ||||
self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) | ||||
self.assertTrue(c5 == NFD(c4) == NFD(c5), line) | ||||
self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \ | ||||
NFKC(c3) == NFKC(c4) == NFKC(c5), | ||||
line) | ||||
self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \ | ||||
NFKD(c3) == NFKD(c4) == NFKD(c5), | ||||
line) | ||||
|
||||
self.assertTrue(unicodedata.is_normalized("NFC", c2)) | ||||
self.assertTrue(unicodedata.is_normalized("NFC", c4)) | ||||
|
||||
self.assertTrue(unicodedata.is_normalized("NFD", c3)) | ||||
self.assertTrue(unicodedata.is_normalized("NFD", c5)) | ||||
|
||||
self.assertTrue(unicodedata.is_normalized("NFKC", c4)) | ||||
self.assertTrue(unicodedata.is_normalized("NFKD", c5)) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Trivial nit: The amount of whitespace in this section seems a bit excessive. I would recommend condensing it down to something like this:
|
||||
|
||||
# Record part 1 data | ||||
if part == "@Part1": | ||||
part1_data[c1] = 1 | ||||
|
||||
# Perform tests for all other data | ||||
for c in range(sys.maxunicode+1): | ||||
X = chr(c) | ||||
if X in part1_data: | ||||
continue | ||||
self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c) | ||||
|
||||
def test_edge_cases(self): | ||||
self.assertRaises(TypeError, unicodedata.normalize) | ||||
self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx') | ||||
self.assertEqual(unicodedata.normalize('NFKC', ''), '') | ||||
|
||||
def test_bug_834676(self): | ||||
# Check for bug 834676 | ||||
unicodedata.normalize('NFC', '\ud55c\uae00') | ||||
|
||||
|
||||
if __name__ == "__main__": | ||||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is "hdr" short for header here? I don't recall seeing this particular abbreviation used before, so it's not entirely clear to me. But, it seems to be the only thing that would make sense in this context. Unless this is a very common and well known abbreviation that I wasn't aware of, it doesn't seem to be worth saving the extra three characters. It looks like it's only used a total of 3 times within these tests, so changing the variable name shouldn't be an issue. Explicit > Implicit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, I'm pretty sure it's intended to mean "header". I'd write the full word in this context if I were writing this code new.
(General reason I've left it this way is in the top-level comment above.)
Fortunately because this variable is local to this small and simple bit of code, there isn't much the reader has to look at to see what it does. Even if the variable were named "x" or "var31", the code wouldn't be made that much harder to read. Maybe even easier, as you wouldn't be tempted to try to guess what the name meant 😉