Add hamming distance function

Yifan Wu · Yifan Wu · commit 9261500b4964 · 2020-01-31T16:11:42.000-08:00
diff --git a/.travis.yml b/.travis.yml
@@ -3,11 +3,14 @@ language: python
 sudo: false
 
 python:
+  - "3.3"
+  - "3.4"
+  - "3.5"
   - "3.6"
   - "3.7"
   - "3.8"
 
 install:
   - pip install -r requirements.txt
-  
+
 script: "python3 test.py"
diff --git a/README.md b/README.md
@@ -1,15 +1,34 @@
 # python-text-distance
 
+[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/) 
+[![Generic badge](https://img.shields.io/badge/pypi%20package-0.1.1-blue.svg)](https://pypi.org/project/pytextdist/)
+[![Build Status](https://travis-ci.com/ywu94/python-text-distance.svg?branch=master)](https://travis-ci.com/ywu94/python-text-distance)
+
 A python implementation of a variety of text distance and similarity metrics.
 
-### Requirements
 ---
-Python 3.3+
 
-### Install
+## Install
+
+Requirements: `py>=3.3`, `pyyaml>=5.1.2`
+
+Install Command: `pip install pytextdist`
+
 ---
-```
-pip install pytextdistance
-```
 
+## Modules
+
+### Edit Distance
+
+All edit distances listed in [Edit Distance on Wikipedia](https://en.wikipedia.org/wiki/Edit_distance) are implemented.
+
+> **[Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)**: edit with insertion, deletion, and substitution
+
+> **[Longest Common Subsequence Distance](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)**: edit with insertion and deletion 
+
+> **[Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)**: edit with insertion, deletion, substitution, and transposition of two adjacent units
+
+> **[Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)**: edit with substitution
+
+> **[Jaro & Jaro-Winkler Similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)**: edit with transposition
 
diff --git a/pytextdist/__init__.py b/pytextdist/__init__.py
@@ -4,7 +4,7 @@
 import yaml
 
 __name__ = "pytextdist"
-__version__ = "0.0.2"
+__version__ = "0.1.1"
 
 import importlib
 
@@ -15,7 +15,6 @@
 from . import edit_distance 
 importlib.reload(edit_distance)
 
-
 """
 Set up logging
 | Default logging configuration can be edited in logging.yaml.
diff --git a/pytextdist/edit_distance.py b/pytextdist/edit_distance.py
@@ -151,15 +151,17 @@ def lcs_distance(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ign
 		for j in range(1, len_2+1):
 			manipulation[i][j] = manipulation[i-1][j-1] + 1 if l_1[i-1] == l_2[j-1] else max(manipulation[i][j-1], manipulation[i-1][j])
 
-	return manipulation[-1][-1]
+	distance = len_1 + len_2 - 2 * manipulation[-1][-1]
+
+	return distance
 
 @input_validator(str, str)
 def lcs_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
 	"""
 	Get longest common subsequence similarity between two text phrases
 	|
 	| Formula
-	| | longest common subsequence / longest length among two
+	| | 1 - (longest common subsequence distance / sum of lengths)
 	|
 	| Argument
 	| | phrase_1, phrase_2: text phrases to compare
@@ -200,7 +202,9 @@ def lcs_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, i
 		for j in range(1, len_2+1):
 			manipulation[i][j] = manipulation[i-1][j-1] + 1 if l_1[i-1] == l_2[j-1] else max(manipulation[i][j-1], manipulation[i-1][j])
 
-	similarity = manipulation[-1][-1]/max(len_1,len_2)
+	distance = len_1 + len_2 - 2 * manipulation[-1][-1]
+
+	similarity = 1 - distance/(len_1+len_2)
 
 	return similarity
 
@@ -436,3 +440,94 @@ def jaro_winkler_similarity(phrase_1, phrase_2, p=0.1, grain="char", ignore_non_
 	similarity = jaro_similarity + l_common_prefix*p*(1-jaro_similarity)
 
 	return similarity
+
+@input_validator(str, str)
+def hamming_distance(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
+	"""
+	Get Hamming distance between two text phrases
+	|
+	| Argument
+	| | phrase_1, phrase_2: text phrases to compare
+	|
+	| Parameter
+	| | grain: "char" or "word", grain for edit
+	|
+	| Parameter for preprocessing
+	| | ignore_non_alnumspc: whether to remove all non alpha/numeric/space characters
+	| | ignore_space: whether to remove all spaces
+	| | ignore_numeric: whether to remove all numeric characters
+	| | ignore_case: whether to convert all alpha characters to lower case
+	|
+	| Output
+	| | distance (type: int)
+	|
+	| Raise
+	| | ValueError: both phrases are non-empty after preprocessing but differ in length
+	"""
+	assert grain in ("char", "word"), "Illegal grain input: {}".format(grain)
+
+	# Preprocess text phrase into list of edit units
+	if grain == "char":
+		l_1 = word_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
+		l_2 = word_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
+	else:
+		l_1 = sentence_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
+		l_2 = sentence_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
+	len_1, len_2 = len(l_1), len(l_2)
+
+	# Early exit if one of the lists is empty: the distance is the length of the other list
+	if len_1 == 0 or len_2 == 0: return max(len_1,len_2)
+
+	# Hamming distance is only defined for lists of equal length
+	if len_1 != len_2: raise ValueError("Can't calculate hamming distance between phrases of different lengths")
+
+	# Count the positions at which the two lists differ
+	return sum(1 for x, y in zip(l_1, l_2) if x != y)
+
+@input_validator(str, str)
+def hamming_similarity(phrase_1, phrase_2, grain="char", ignore_non_alnumspc=True, ignore_space=True, ignore_numeric=True, ignore_case=True):
+	"""
+	Get Hamming similarity between two text phrases
+	|
+	| Formula
+	| | 1 - (Hamming distance / longest length among two)
+	|
+	| Argument
+	| | phrase_1, phrase_2: text phrases to compare
+	|
+	| Parameter
+	| | grain: "char" or "word", grain for edit
+	|
+	| Parameter for preprocessing
+	| | ignore_non_alnumspc: whether to remove all non alpha/numeric/space characters
+	| | ignore_space: whether to remove all spaces
+	| | ignore_numeric: whether to remove all numeric characters
+	| | ignore_case: whether to convert all alpha characters to lower case
+	|
+	| Output
+	| | similarity (type: float)
+	|
+	| Raise
+	| | ValueError: both phrases are non-empty after preprocessing but differ in length
+	"""
+	assert grain in ("char", "word"), "Illegal grain input: {}".format(grain)
+
+	# Preprocess text phrase into list of edit units
+	if grain == "char":
+		l_1 = word_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
+		l_2 = word_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case, ignore_space=ignore_space)
+	else:
+		l_1 = sentence_preprocessing(phrase_1, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
+		l_2 = sentence_preprocessing(phrase_2, ignore_non_alnumspc=ignore_non_alnumspc, ignore_numeric=ignore_numeric, ignore_case=ignore_case)
+	len_1, len_2 = len(l_1), len(l_2)
+
+	# Early exit if one of the lists is empty: two empty lists are identical (1.0), otherwise nothing matches (0.0)
+	if len_1 == 0 or len_2 == 0: return 1.0 if len_1 == len_2 else 0.0
+
+	# Hamming distance is only defined for lists of equal length
+	if len_1 != len_2: raise ValueError("Can't calculate hamming distance between phrases of different lengths")
+
+	# Count the differing positions, then normalize by the common length (len_1 == len_2 here)
+	distance = sum(1 for x, y in zip(l_1, l_2) if x != y)
+
+	return 1 - distance/len_1
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
 	name="pytextdist",
-	version="0.0.1",
+	version="0.1.1",
 	author="Yifan Wu",
 	author_email="yw693@cornell.edu",
 	description="A python implementation of a variety of text distance and similarity metrics.",
@@ -17,6 +17,9 @@
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.3'
+    python_requires='>=3.3',
+    install_requires=[
+        'pyyaml>=5.1.2'
+    ]
 )
 
diff --git a/test.py b/test.py
@@ -27,6 +27,7 @@ def test_preprocessing(self):
 
 	def test_edit_distance(self):
 		self.assertEqual(pytextdist.edit_distance.levenshtein_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["lev_d"])
+		self.assertEqual(pytextdist.edit_distance.hamming_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["h_d"])
 		self.assertEqual(pytextdist.edit_distance.lcs_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["lcs_d"])	
 		self.assertEqual(pytextdist.edit_distance.damerau_levenshtein_distance(self.kwargs["phrase_1"], self.kwargs["phrase_2"]), self.kwargs["d_lev_d"])
 		self.assertEqual(round(pytextdist.edit_distance.jaro_similarity(self.kwargs["phrase_1"], self.kwargs["phrase_2"]),2), self.kwargs["d_jaro"])
@@ -41,6 +42,7 @@ def test_edit_distance(self):
 		"phrase_1": "bededqowd",
 		"phrase_2": "beeddqpdw",
 		"lev_d": 5,
+		"h_d": 5,
 		"lcs_d": 6,
 		"d_lev_d": 3,
 		"d_jaro": 0.84,