Skip to content

Commit 9261500

Browse files
author
Yifan Wu
committed
Add hamming distance function
1 parent 83147fd commit 9261500

File tree

6 files changed

+135
-14
lines changed

6 files changed

+135
-14
lines changed

.travis.yml

+4-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@ language: python
33
sudo: false
44

55
python:
6+
- "3.3"
7+
- "3.4"
8+
- "3.5"
69
- "3.6"
710
- "3.7"
811
- "3.8"
912

1013
install:
1114
- pip install -r requirements.txt
12-
15+
1316
script: "python3 test.py"

README.md

+25-6
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,34 @@
11
# python-text-distance
22

3+
[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)
4+
[![Generic badge](https://img.shields.io/badge/pypi%20package-0.1.1-blue.svg)](https://pypi.org/project/pytextdist/)
5+
[![Build Status](https://travis-ci.com/ywu94/python-text-distance.svg?branch=master)](https://travis-ci.com/ywu94/python-text-distance)
6+
37
A python implementation of a variety of text distance and similarity metrics.
48

5-
### Requirements
69
---
7-
Python 3.3+
810

9-
### Install
11+
## Install
12+
13+
Requirements: `python>=3.3`, `pyyaml>=5.1.2`
14+
15+
Install Command: `pip install pytextdist`
16+
1017
---
11-
```
12-
pip install pytextdistance
13-
```
1418

19+
## Modules
20+
21+
### Edit Distance
22+
23+
All edit distances listed in [Edit Distance on Wikipedia](https://en.wikipedia.org/wiki/Edit_distance) are implemented.
24+
25+
> **[Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)**: edit with insertion, deletion, and substitution
26+
27+
> **[Longest Common Subsequence Distance](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)**: edit with insertion and deletion
28+
29+
> **[Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)**: edit with insertion, deletion, substitution, and transposition of two adjacent units
30+
31+
> **[Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)**: edit with substitution
32+
33+
> **[Jaro & Jaro-Winkler Similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)**: edit with transposition
1534

pytextdist/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import yaml
55

66
__name__ = "pytextdist"
7-
__version__ = "0.0.2"
7+
__version__ = "0.1.1"
88

99
import importlib
1010

@@ -15,7 +15,6 @@
1515
from . import edit_distance
1616
importlib.reload(edit_distance)
1717

18-
1918
"""
2019
Set up logging
2120
| Default logging configuration can be edited in logging.yaml.

pytextdist/edit_distance.py

+98-3
Original file line numberDiff line numberDiff line change
@@ -151,15 +151,17 @@ def lcs_distance(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ign
151151
for j in range(1, len_2+1):
152152
manipulation[i][j] = manipulation[i-1][j-1] + 1 if l_1[i-1] == l_2[j-1] else max(manipulation[i][j-1], manipulation[i-1][j])
153153

154-
return manipulation[-1][-1]
154+
distance = len_1 + len_2 - 2 * manipulation[-1][-1]
155+
156+
return distance
155157

156158
@input_validator(str, str)
157159
def lcs_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
158160
"""
159161
Get longest common subsequence similarity between two text phrases
160162
|
161163
| Formula
162-
| | longest common subsequence / longest length among two
164+
| | 1 - (longest common subsequence / sum of lengths)
163165
|
164166
| Argument
165167
| | phrase_1, phrase_2: text phrases to compare
@@ -200,7 +202,9 @@ def lcs_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, i
200202
for j in range(1, len_2+1):
201203
manipulation[i][j] = manipulation[i-1][j-1] + 1 if l_1[i-1] == l_2[j-1] else max(manipulation[i][j-1], manipulation[i-1][j])
202204

203-
similarity = manipulation[-1][-1]/max(len_1,len_2)
205+
distance = len_1 + len_2 - 2 * manipulation[-1][-1]
206+
207+
similarity = 1 - distance/(len_1+len_2)
204208

205209
return similarity
206210

@@ -436,3 +440,94 @@ def jaro_winkler_similarity(phrase_1, phrase_2, p=0.1, grain="char", ignore_non_
436440
similarity = jaro_similarity + l_common_prefix*p*(1-jaro_similarity)
437441

438442
return similarity
443+
444+
@input_validator(str, str)
def hamming_distance(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
    """
    Get Hamming distance between two text phrases
    |
    | Formula
    | | number of positions at which the two lists of edit units differ
    |
    | Argument
    | | phrase_1, phrase_2: text phrases to compare
    |
    | Parameter
    | | grain: "char" or "word", grain for edit
    |
    | Parameter for preprocessing
    | | ignore_non_alnumspc: whether to remove all non alpha/numeric/space characters
    | | ignore_space: whether to remove all spaces (only passed on when grain is "char")
    | | ignore_numeric: whether to remove all numeric characters
    | | ignore_case: whether to convert all alpha characters to lower case
    |
    | Output
    | | distance (type: int)
    |
    | Raise
    | | ValueError: both phrases are non-empty after preprocessing but have different lengths
    """
    assert grain in ("char", "word"), "Illegal grain input: {}".format(grain)

    # Preprocess text phrase into list of edit units
    if grain == "char":
        l_1 = word_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
        l_2 = word_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
    else:
        l_1 = sentence_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
        l_2 = sentence_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
    len_1, len_2 = len(l_1), len(l_2)

    # Early exit if one of the lists is empty: the distance is then the
    # length of the other list (0 when both are empty). This check runs
    # before the length comparison, so an empty phrase never raises.
    if len_1 == 0 or len_2 == 0:
        return max(len_1, len_2)

    # Hamming distance is only defined for sequences of equal length.
    # ValueError is a subclass of Exception, so existing `except Exception`
    # handlers keep working.
    if len_1 != len_2:
        raise ValueError("Can't calculate hamming distance between phrases of different lengths")

    # Count the positions where the aligned edit units differ
    return sum(1 for unit_1, unit_2 in zip(l_1, l_2) if unit_1 != unit_2)
486+
487+
@input_validator(str, str)
def hamming_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
    """
    Get Hamming similarity between two text phrases
    |
    | Formula
    | | 1 - (Hamming distance / longest length among two)
    |
    | Argument
    | | phrase_1, phrase_2: text phrases to compare
    |
    | Parameter
    | | grain: "char" or "word", grain for edit
    |
    | Parameter for preprocessing
    | | ignore_non_alnumspc: whether to remove all non alpha/numeric/space characters
    | | ignore_space: whether to remove all spaces (only passed on when grain is "char")
    | | ignore_numeric: whether to remove all numeric characters
    | | ignore_case: whether to convert all alpha characters to lower case
    |
    | Output
    | | similarity (type: float), between 0 and 1
    |
    | Raise
    | | ValueError: both phrases are non-empty after preprocessing but have different lengths
    """
    assert grain in ("char", "word"), "Illegal grain input: {}".format(grain)

    # Preprocess text phrase into list of edit units
    if grain == "char":
        l_1 = word_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
        l_2 = word_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
    else:
        l_1 = sentence_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
        l_2 = sentence_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
    len_1, len_2 = len(l_1), len(l_2)

    # Early exit if one of the lists is empty. The previous code returned
    # max(len_1, len_2) here, which is a distance rather than a similarity
    # and could fall outside [0, 1].
    if len_1 == 0 or len_2 == 0:
        # Two empty phrases are identical (distance 0) -> similarity 1.
        # Exactly one empty phrase differs from the other at every
        # position (distance == longest length) -> similarity 0.
        return 1.0 if len_1 == len_2 else 0.0

    # Hamming distance is only defined for sequences of equal length.
    # ValueError is a subclass of Exception, so existing `except Exception`
    # handlers keep working.
    if len_1 != len_2:
        raise ValueError("Can't calculate hamming distance between phrases of different lengths")

    # Count the positions where the aligned edit units differ
    distance = sum(1 for unit_1, unit_2 in zip(l_1, l_2) if unit_1 != unit_2)

    # Both lists have the same length here, so len_1 is the longest length
    similarity = 1 - distance / len_1

    return similarity

setup.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="pytextdist",
8-
version="0.0.1",
8+
version="0.1.1",
99
author="Yifan Wu",
1010
author_email="yw693@cornell.edu",
1111
description="A python implementation of a variety of text distance and similarity metrics.",
@@ -17,6 +17,9 @@
1717
"License :: OSI Approved :: MIT License",
1818
"Operating System :: OS Independent",
1919
],
20-
python_requires='>=3.3'
20+
python_requires='>=3.3',
21+
install_requires=[
22+
'pyyaml>=5.1.2'
23+
]
2124
)
2225

test.py

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def test_preprocessing(self):
2727

2828
def test_edit_distance(self):
2929
self.assertEqual(pytextdist.edit_distance.levenshtein_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["lev_d"])
30+
self.assertEqual(pytextdist.edit_distance.hamming_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["h_d"])
3031
self.assertEqual(pytextdist.edit_distance.lcs_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["lcs_d"])
3132
self.assertEqual(pytextdist.edit_distance.damerau_levenshtein_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["d_lev_d"])
3233
self.assertEqual(round(pytextdist.edit_distance.jaro_similarity(self.kwargs["phrase_1"], self.kwargs["phrase_2"]),2), self.kwargs["d_jaro"])
@@ -41,6 +42,7 @@ def test_edit_distance(self):
4142
"phrase_1": "bededqowd",
4243
"phrase_2": "beeddqpdw",
4344
"lev_d": 5,
45+
"h_d": 5,
4446
"lcs_d": 6,
4547
"d_lev_d": 3,
4648
"d_jaro": 0.84,

0 commit comments

Comments
 (0)