renamed pdq_hash_similarity to pdq_hash_similarities to have consistent naming (#2)
SamSweere authored Mar 15, 2024
1 parent 4402ca9 commit 47cbda8
Showing 8 changed files with 71 additions and 71 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ It takes the following parameters:
- `index` (string): A unique identifier.
- `url_duplicates` (list of strings or `None`): Indices with duplicate URLs.
- `pdq_hash_duplicates` (list of strings or `None`): Indices with perceptual hashes similar within the threshold.
-- `pdq_hash_similarity` (list of strings or `None`): Hash similarity scores for perceptual hashes within the threshold.
+- `pdq_hash_similarities` (list of strings or `None`): Hash similarity scores for perceptual hashes within the threshold.

### Post-processing

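As an aside (not part of the diff): a minimal sketch of what a result row with the renamed column might look like. The column names follow the README excerpt above; the index values and similarity scores are made-up illustrations.

```python
import pandas as pd

# Hypothetical output after this rename: each row lists duplicate indices
# and, in `pdq_hash_similarities`, their similarity scores.
result = pd.DataFrame(
    {
        "url_duplicates": [["img_2"], None],
        "pdq_hash_duplicates": [["img_2"], ["img_1"]],
        "pdq_hash_similarities": [[0.97], [0.97]],  # was `pdq_hash_similarity`
    },
    index=pd.Index(["img_1", "img_2"], name="index"),
)

# Downstream code should reference the plural column name from now on.
print(result["pdq_hash_similarities"])
```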
18 changes: 9 additions & 9 deletions cir_duplicate_detector/pdq_hash.py
@@ -171,16 +171,16 @@ def pdq_hash_output_formatter(
results from the duplicate detection.
Returns:
-pd.DataFrame: DataFrame with columns `pdq_hash_duplicates` and `pdq_hash_similarity`.
+pd.DataFrame: DataFrame with columns `pdq_hash_duplicates` and `pdq_hash_similarities`.
"""

# Format the duplicate_detection_results into a DataFrame of the desired output structure
pdq_hash_dup_df = pd.DataFrame(index=pdq_hash_series.index.drop_duplicates())

pdq_hash_dup_df["pdq_hash_duplicates"] = pd.NA
pdq_hash_dup_df["pdq_hash_duplicates"] = pdq_hash_dup_df["pdq_hash_duplicates"].astype(object)
-pdq_hash_dup_df["pdq_hash_similarity"] = pd.NA
-pdq_hash_dup_df["pdq_hash_similarity"] = pdq_hash_dup_df["pdq_hash_similarity"].astype(object)
+pdq_hash_dup_df["pdq_hash_similarities"] = pd.NA
+pdq_hash_dup_df["pdq_hash_similarities"] = pdq_hash_dup_df["pdq_hash_similarities"].astype(object)

for index, result in zip(pdq_hash_series.index, duplicate_detection_results, strict=True):
# Format the result into a list of indexes, if no duplicates are found, set the value to pd.NA
@@ -196,21 +196,21 @@
if isinstance(pdq_hash_dup_df["pdq_hash_duplicates"][index], list):
if item["index"] not in pdq_hash_dup_df["pdq_hash_duplicates"][index] and index != item["index"]:
pdq_hash_dup_df.loc[index, "pdq_hash_duplicates"].append(item["index"])
-pdq_hash_dup_df.loc[index, "pdq_hash_similarity"].append(dist_normalized)
+pdq_hash_dup_df.loc[index, "pdq_hash_similarities"].append(dist_normalized)
elif index != item["index"]:
pdq_hash_dup_df.loc[index, "pdq_hash_duplicates"] = [item["index"]]
-pdq_hash_dup_df.loc[index, "pdq_hash_similarity"] = [dist_normalized]
+pdq_hash_dup_df.loc[index, "pdq_hash_similarities"] = [dist_normalized]
# Update the duplicate of the duplicate index
# First check if the index exists, if not create it
if item["index"] not in pdq_hash_dup_df.index:
pdq_hash_dup_df.loc[item["index"]] = pd.NA
if isinstance(pdq_hash_dup_df["pdq_hash_duplicates"][item["index"]], list):
if index not in pdq_hash_dup_df["pdq_hash_duplicates"][item["index"]] and index != item["index"]:
pdq_hash_dup_df.loc[item["index"], "pdq_hash_duplicates"].append(index)
-pdq_hash_dup_df.loc[item["index"], "pdq_hash_similarity"].append(dist_normalized)
+pdq_hash_dup_df.loc[item["index"], "pdq_hash_similarities"].append(dist_normalized)
elif index != item["index"]:
pdq_hash_dup_df.loc[item["index"], "pdq_hash_duplicates"] = [index]
-pdq_hash_dup_df.loc[item["index"], "pdq_hash_similarity"] = [dist_normalized]
+pdq_hash_dup_df.loc[item["index"], "pdq_hash_similarities"] = [dist_normalized]

# Drop all rows that do not have duplicates
pdq_hash_dup_df = pdq_hash_dup_df.dropna()
@@ -274,9 +274,9 @@ def find_pdq_hash_duplicates(

def empty_output_df() -> pd.DataFrame:
# Create an empty output dataframe containing the correct index name and dtype
-empty_output_df = pd.DataFrame(columns=["pdq_hash_duplicates", "pdq_hash_similarity"])
+empty_output_df = pd.DataFrame(columns=["pdq_hash_duplicates", "pdq_hash_similarities"])
empty_output_df["pdq_hash_duplicates"] = empty_output_df["pdq_hash_duplicates"].astype(object)
-empty_output_df["pdq_hash_similarity"] = empty_output_df["pdq_hash_similarity"].astype(object)
+empty_output_df["pdq_hash_similarities"] = empty_output_df["pdq_hash_similarities"].astype(object)
empty_output_df.index.name = pdq_hash_series.index.name
empty_output_df.index = empty_output_df.index.astype(pdq_hash_series.index.dtype)
