Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-38043: Move unicodedata.normalize tests into test_unicodedata. #15712

Merged
merged 2 commits into from
Sep 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 0 additions & 117 deletions Lib/test/test_normalization.py

This file was deleted.

4 changes: 3 additions & 1 deletion Lib/test/test_ucn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from test import support
from http.client import HTTPException
from test.test_normalization import check_version

try:
from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
Expand Down Expand Up @@ -172,6 +171,9 @@ def test_named_sequences_sample(self):

def test_named_sequences_full(self):
# Check all the named sequences
def check_version(testfile):
hdr = testfile.readline()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
hdr = testfile.readline()
header = testfile.readline()

Is "hdr" short for header here? I don't recall seeing this particular abbreviation used before, so it's not entirely clear to me. But, it seems to be the only thing that would make sense in this context. Unless this is a very common and well known abbreviation that I wasn't aware of, it doesn't seem to be worth saving the extra three characters. It looks like it's only used a total of 3 times within these tests, so changing the variable name shouldn't be an issue. Explicit > Implicit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I'm pretty sure it's intended to mean "header". I'd write the full word in this context if I were writing this code new.

(General reason I've left it this way is in the top-level comment above.)

Fortunately because this variable is local to this small and simple bit of code, there isn't much the reader has to look at to see what it does. Even if the variable were named "x" or "var31", the code wouldn't be made that much harder to read. Maybe even easier, as you wouldn't be tempted to try to guess what the name meant 😉

return unicodedata.unidata_version in hdr
url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
unicodedata.unidata_version)
try:
Expand Down
110 changes: 99 additions & 11 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
"""

import hashlib
from http.client import HTTPException
import sys
import unicodedata
import unittest
from test.support import script_helper
from test.support import open_urlresource, script_helper


class UnicodeMethodsTest(unittest.TestCase):
Expand Down Expand Up @@ -171,13 +172,6 @@ def test_combining(self):
self.assertRaises(TypeError, self.db.combining)
self.assertRaises(TypeError, self.db.combining, 'xx')

def test_normalize(self):
self.assertRaises(TypeError, self.db.normalize)
self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
self.assertEqual(self.db.normalize('NFKC', ''), '')
# The rest can be found in test_normalization.py
# which requires an external file.

def test_pr29(self):
# http://www.unicode.org/review/pr-29.html
# See issues #1054943 and #10254.
Expand Down Expand Up @@ -208,9 +202,6 @@ def test_issue29456(self):
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

# For tests of unicodedata.is_normalized / self.db.is_normalized ,
# see test_normalization.py .

def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, b'a')
Expand Down Expand Up @@ -315,5 +306,102 @@ def test_linebreak_7643(self):
self.assertEqual(len(lines), 1,
r"\u%.4x should not be a linebreak" % i)

class NormalizationTest(unittest.TestCase):
@staticmethod
def check_version(testfile):
hdr = testfile.readline()
return unicodedata.unidata_version in hdr

@staticmethod
def unistr(data):
data = [int(x, 16) for x in data.split(" ")]
return "".join([chr(x) for x in data])

def test_normalization(self):
TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

# Hit the exception early
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Hit the exception early

Similar to my other recommendation, this comment doesn't seem to provide any particularly useful information or context.

try:
testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
check=self.check_version)
except PermissionError:
self.skipTest(f"Permission error when downloading {TESTDATAURL} "
f"into the test data directory")
except (OSError, HTTPException):
self.fail(f"Could not retrieve {TESTDATAURL}")

with testdata:
self.run_normalization_tests(testdata)

def run_normalization_tests(self, testdata):
part = None
part1_data = {}

def NFC(str):
return unicodedata.normalize("NFC", str)

def NFKC(str):
return unicodedata.normalize("NFKC", str)

def NFD(str):
return unicodedata.normalize("NFD", str)

def NFKD(str):
return unicodedata.normalize("NFKD", str)

for line in testdata:
if '#' in line:
line = line.split('#')[0]
line = line.strip()
if not line:
continue
if line.startswith("@Part"):
part = line.split()[0]
continue
c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]]

# Perform tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Perform tests

I don't see much of a purpose in this comment, it doesn't seem to provide any additional information or context for the assertions below it.

self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
NFKC(c3) == NFKC(c4) == NFKC(c5),
line)
self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
NFKD(c3) == NFKD(c4) == NFKD(c5),
line)

self.assertTrue(unicodedata.is_normalized("NFC", c2))
self.assertTrue(unicodedata.is_normalized("NFC", c4))

self.assertTrue(unicodedata.is_normalized("NFD", c3))
self.assertTrue(unicodedata.is_normalized("NFD", c5))

self.assertTrue(unicodedata.is_normalized("NFKC", c4))
self.assertTrue(unicodedata.is_normalized("NFKD", c5))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trivial nit: The amount of whitespace in this section seems a bit excessive. I would recommend condensing it down to something like this:

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))
            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))
            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))


# Record part 1 data
if part == "@Part1":
part1_data[c1] = 1

# Perform tests for all other data
for c in range(sys.maxunicode+1):
X = chr(c)
if X in part1_data:
continue
self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

def test_edge_cases(self):
self.assertRaises(TypeError, unicodedata.normalize)
self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
self.assertEqual(unicodedata.normalize('NFKC', ''), '')

def test_bug_834676(self):
# Check for bug 834676
unicodedata.normalize('NFC', '\ud55c\uae00')


if __name__ == "__main__":
unittest.main()
1 change: 0 additions & 1 deletion PCbuild/lib.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,6 @@
<Compile Include="test\test_netrc.py" />
<Compile Include="test\test_nis.py" />
<Compile Include="test\test_nntplib.py" />
<Compile Include="test\test_normalization.py" />
<Compile Include="test\test_ntpath.py" />
<Compile Include="test\test_numeric_tower.py" />
<Compile Include="test\test_opcodes.py" />
Expand Down