Skip to content

Commit d968ca7

Browse files
authored
allow unicode (#111)
* initial commit to allow unicode
* update version and changelog
* add the flag to the CLI
* update README.md
1 parent 07b87da commit d968ca7

File tree

6 files changed

+328
-7
lines changed

6 files changed

+328
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 6.1.0
2+
3+
- Add `allow_unicode` flag to allow unicode characters in the slug
4+
15
## 6.0.1
26

37
- Rework regex_pattern to mean the opposite (disallowed chars instead of allowed)

README.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ def slugify(
4242
stopwords=(),
4343
regex_pattern=None,
4444
lowercase=True,
45-
replacements=()
45+
replacements=(),
46+
allow_unicode=False
4647
):
4748
"""
4849
Make a slug from the given text.
@@ -58,6 +59,7 @@ def slugify(
5859
:param regex_pattern (str): regex pattern for disallowed characters
5960
:param lowercase (bool): activate case sensitivity by setting it to False
6061
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
62+
:param allow_unicode (bool): allow unicode characters
6163
:return (str): slugified text
6264
"""
6365
```
@@ -75,6 +77,10 @@ txt = '影師嗎'
7577
r = slugify(txt)
7678
self.assertEqual(r, "ying-shi-ma")
7779

80+
txt = '影師嗎'
81+
r = slugify(txt, allow_unicode=True)
82+
self.assertEqual(r, "影師嗎")
83+
7884
txt = 'C\'est déjà l\'été.'
7985
r = slugify(txt)
8086
self.assertEqual(r, "c-est-deja-l-ete")
@@ -133,6 +139,14 @@ txt = 'ÜBER Über German Umlaut'
133139
r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']])
134140
self.assertEqual(r, "ueber-ueber-german-umlaut")
135141

142+
txt = 'i love 🦄'
143+
r = slugify(txt, allow_unicode=True)
144+
self.assertEqual(r, "i-love")
145+
146+
txt = 'i love 🦄'
147+
r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
148+
self.assertEqual(r, "🦄")
149+
136150
```
137151

138152
For more examples, have a look at the [test.py](test.py) file.

slugify/__main__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def parse_args(argv):
3636
help="Activate case sensitivity")
3737
parser.add_argument("--replacements", nargs='+',
3838
help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
39+
parser.add_argument("--allow-unicode", action='store_true', default=False,
40+
help="Allow unicode characters")
3941

4042
args = parser.parse_args(argv[1:])
4143

@@ -73,7 +75,8 @@ def slugify_params(args):
7375
separator=args.separator,
7476
stopwords=args.stopwords,
7577
lowercase=args.lowercase,
76-
replacements=args.replacements
78+
replacements=args.replacements,
79+
allow_unicode=args.allow_unicode
7780
)
7881

7982

slugify/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55
__url__ = 'https://github.com/un33k/python-slugify'
66
__license__ = 'MIT'
77
__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
8-
__version__ = '6.0.1'
8+
__version__ = '6.1.0'

slugify/slugify.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
1818
QUOTE_PATTERN = re.compile(r'[\']+')
1919
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
20+
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
2021
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
2122
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
2223
DEFAULT_SEPARATOR = '-'
@@ -66,7 +67,8 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav
6667

6768
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
6869
separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
69-
replacements: typing.Iterable[typing.Iterable[str]] = ()):
70+
replacements: typing.Iterable[typing.Iterable[str]] = (),
71+
allow_unicode=False):
7072
"""
7173
Make a slug from the given text.
7274
:param text (str): initial text
@@ -81,6 +83,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
8183
:param regex_pattern (str): regex pattern for disallowed characters
8284
:param lowercase (bool): activate case sensitivity by setting it to False
8385
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
86+
:param allow_unicode (bool): allow unicode characters
8487
:return (str):
8588
"""
8689

@@ -97,7 +100,8 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
97100
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
98101

99102
# transliterate unicode to ASCII (skipped when allow_unicode is set)
100-
text = unidecode.unidecode(text)
103+
if not allow_unicode:
104+
text = unidecode.unidecode(text)
101105

102106
# ensure text is still in unicode
103107
if not isinstance(text, str):
@@ -122,7 +126,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
122126
pass
123127

124128
# translate
125-
text = unicodedata.normalize('NFKD', text)
129+
if allow_unicode:
130+
text = unicodedata.normalize('NFKC', text)
131+
else:
132+
text = unicodedata.normalize('NFKD', text)
133+
126134
if sys.version_info < (3,):
127135
text = text.encode('ascii', 'ignore')
128136

@@ -137,7 +145,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
137145
text = NUMBERS_PATTERN.sub('', text)
138146

139147
# replace all other unwanted characters
140-
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
148+
if allow_unicode:
149+
pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
150+
else:
151+
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
152+
141153
text = re.sub(pattern, DEFAULT_SEPARATOR, text)
142154

143155
# remove redundant

0 commit comments

Comments
 (0)