Skip to content

Commit e2072d6

Browse files
committed
feat(web): add duplicate detection within/across lists (task-26)
- Created duplicate-detection utility with functions for detecting and managing duplicates
- Added calculateDuplicateStats to analyze duplicate statistics
- Added suggestDuplicateRemovals to identify entities that can be safely removed
- Integrated duplicate detection into CatalogueEntities component
- Added warning banner when duplicates are detected in a list
- Added modal to view and remove all duplicates with one click
- Duplicates detected by matching entity type and ID

Duplicate detection features:
- Automatic calculation of duplicate stats when entities change
- Visual warning with duplicate count and percentage
- One-click removal of all duplicate entities
- Preserves most recently added instance of each duplicate group
1 parent 68dba0e commit e2072d6

File tree

3 files changed

+346
-4
lines changed

3 files changed

+346
-4
lines changed

apps/web/src/components/catalogue/CatalogueEntities.tsx

Lines changed: 129 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import type { EntityType } from "@bibgraph/types";
77
import { ENTITY_METADATA } from "@bibgraph/types";
8-
import { type CatalogueEntity } from "@bibgraph/utils"
8+
import { calculateDuplicateStats, type CatalogueEntity, type DuplicateStats, suggestDuplicateRemovals } from "@bibgraph/utils"
99
import { logger } from "@bibgraph/utils";
1010
import {
1111
closestCenter,
@@ -44,6 +44,7 @@ import {
4444
} from "@mantine/core";
4545
import { notifications } from "@mantine/notifications";
4646
import {
47+
IconAlertTriangle,
4748
IconDots,
4849
IconEdit,
4950
IconExternalLink,
@@ -449,6 +450,8 @@ export const CatalogueEntities = ({ onNavigate }: CatalogueEntitiesProps) => {
449450
const [showBulkConfirm, setShowBulkConfirm] = useState(false);
450451
const [showBulkMoveModal, setShowBulkMoveModal] = useState(false);
451452
const [targetListId, setTargetListId] = useState<string | null>(null);
453+
const [showDuplicatesModal, setShowDuplicatesModal] = useState(false);
454+
const [duplicateStats, setDuplicateStats] = useState<DuplicateStats | null>(null);
452455

453456
// T075: Virtual scrolling ref for large lists
454457
const parentRef = useRef<HTMLDivElement>(null);
@@ -691,6 +694,23 @@ export const CatalogueEntities = ({ onNavigate }: CatalogueEntitiesProps) => {
691694
}
692695
};
693696

697+
// Calculate duplicate statistics when entities change
698+
React.useEffect(() => {
699+
if (entities.length > 0) {
700+
const stats = calculateDuplicateStats(entities);
701+
setDuplicateStats(stats);
702+
logger.debug("catalogue-entities", "Duplicate stats calculated", {
703+
total: stats.totalEntities,
704+
unique: stats.uniqueEntities,
705+
duplicates: stats.duplicateCount,
706+
removable: stats.removableCount,
707+
percentage: stats.duplicatePercentage.toFixed(1),
708+
});
709+
} else {
710+
setDuplicateStats(null);
711+
}
712+
}, [entities]);
713+
694714
// Get unique entity types for filter dropdown
695715
const entityTypes = [...new Set(entities.map((e) => e.entityType))];
696716

@@ -738,14 +758,50 @@ export const CatalogueEntities = ({ onNavigate }: CatalogueEntitiesProps) => {
738758
return (
739759
<Card style={{ border: BORDER_STYLE_GRAY_3 }} padding="md">
740760
<Stack gap="md">
761+
{/* Duplicate Warning Banner */}
762+
{duplicateStats && duplicateStats.removableCount > 0 && (
763+
<Alert
764+
variant="light"
765+
color="yellow"
766+
icon={<IconAlertTriangle size={ICON_SIZE.MD} />}
767+
title="Duplicates Found"
768+
>
769+
<Group justify="space-between">
770+
<Text size="sm">
771+
{duplicateStats.removableCount} duplicate {duplicateStats.removableCount === 1 ? 'entity' : 'entities'} found ({duplicateStats.duplicatePercentage.toFixed(1)}% of list)
772+
</Text>
773+
<Button
774+
size="xs"
775+
variant="light"
776+
onClick={() => setShowDuplicatesModal(true)}
777+
>
778+
View & Remove Duplicates
779+
</Button>
780+
</Group>
781+
</Alert>
782+
)}
783+
741784
{/* Header */}
742785
<Group justify="space-between">
743786
<Text size="lg" fw={500}>
744787
{entities.length} {entities.length === 1 ? "entity" : "entities"} in "{selectedList.title}"
745788
</Text>
746-
<Badge size="sm" color="blue">
747-
{selectedList.type === "bibliography" ? "Bibliography" : "List"}
748-
</Badge>
789+
<Group gap="xs">
790+
{duplicateStats && duplicateStats.removableCount > 0 && (
791+
<Button
792+
size="xs"
793+
variant="light"
794+
color="yellow"
795+
leftSection={<IconAlertTriangle size={14} />}
796+
onClick={() => setShowDuplicatesModal(true)}
797+
>
798+
{duplicateStats.removableCount} Duplicates
799+
</Button>
800+
)}
801+
<Badge size="sm" color="blue">
802+
{selectedList.type === "bibliography" ? "Bibliography" : "List"}
803+
</Badge>
804+
</Group>
749805
</Group>
750806

751807
{/* Filters */}
@@ -991,6 +1047,75 @@ export const CatalogueEntities = ({ onNavigate }: CatalogueEntitiesProps) => {
9911047
</Group>
9921048
</Stack>
9931049
</Modal>
1050+
1051+
{/* Duplicates Modal */}
1052+
<Modal
1053+
opened={showDuplicatesModal}
1054+
onClose={() => setShowDuplicatesModal(false)}
1055+
title="Duplicate Entities"
1056+
size="lg"
1057+
>
1058+
{duplicateStats && duplicateStats.removableCount > 0 ? (
1059+
<Stack gap="md">
1060+
<Text size="sm">
1061+
Found <Text fw={700}>{duplicateStats.removableCount}</Text> duplicate {duplicateStats.removableCount === 1 ? 'entity' : 'entities'} in this list.
1062+
Removing duplicates will free up <Text fw={700}>{duplicateStats.duplicatePercentage.toFixed(1)}%</Text> of the list.
1063+
</Text>
1064+
1065+
<Alert variant="light" color="blue">
1066+
<Text size="sm">
1067+
Duplicates are detected when the same entity (same type and ID) appears multiple times in the list.
1068+
Only the most recently added duplicate will be kept; earlier duplicates will be removed.
1069+
</Text>
1070+
</Alert>
1071+
1072+
<Group justify="flex-end" gap="xs">
1073+
<Button
1074+
variant="subtle"
1075+
onClick={() => setShowDuplicatesModal(false)}
1076+
>
1077+
Cancel
1078+
</Button>
1079+
<Button
1080+
color="red"
1081+
onClick={async () => {
1082+
if (!selectedList.id) return;
1083+
1084+
try {
1085+
const toRemove = suggestDuplicateRemovals(entities);
1086+
await bulkRemoveEntities(selectedList.id, toRemove);
1087+
setShowDuplicatesModal(false);
1088+
logger.info("catalogue-entities", "Duplicates removed", {
1089+
removedCount: toRemove.length,
1090+
});
1091+
notifications.show({
1092+
title: "Duplicates Removed",
1093+
message: `Removed ${toRemove.length} duplicate entities`,
1094+
color: "green",
1095+
});
1096+
} catch (error) {
1097+
logger.error("catalogue-entities", "Failed to remove duplicates", {
1098+
error,
1099+
});
1100+
notifications.show({
1101+
title: "Error",
1102+
message: "Failed to remove duplicates",
1103+
color: "red",
1104+
});
1105+
}
1106+
}}
1107+
>
1108+
Remove All Duplicates
1109+
</Button>
1110+
</Group>
1111+
</Stack>
1112+
) : (
1113+
<Stack gap="md">
1114+
<Text size="sm">No duplicates found in this list.</Text>
1115+
<Button onClick={() => setShowDuplicatesModal(false)}>Close</Button>
1116+
</Stack>
1117+
)}
1118+
</Modal>
9941119
</Card>
9951120
);
9961121
};
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
/**
2+
* Duplicate Detection Utilities
3+
*
4+
* Provides utilities for detecting duplicate entities within and across catalogue lists.
5+
* @package
6+
*/
7+
8+
import type { CatalogueEntity } from './storage/catalogue-db.js';
9+
10+
/**
 * Builds the grouping key used to decide whether two catalogue records refer
 * to the same entity.
 * @param entity - Catalogue record to derive the key from
 * @returns The record's entity type and entity ID joined as `<entityType>:<entityId>`
 */
const entityKey = ({ entityType, entityId }: CatalogueEntity): string =>
  `${entityType}:${entityId}`;
18+
19+
/**
 * One set of catalogue records that were judged to be the same entity.
 */
export interface DuplicateGroup {
  /**
   * Grouping key shared by every record in the group: `<entityType>:<entityId>`
   * by default, or the bare entity ID when cross-type detection is enabled.
   */
  key: string;
  /** Every record that mapped to `key`, in the order it was encountered. */
  entities: CatalogueEntity[];
  /** Distinct IDs of the lists in which the group's records were found. */
  listIds: string[];
  /** Number of records in the group (the length of `entities`). */
  count: number;
}
32+
33+
/**
 * Tuning knobs accepted by the duplicate detection functions.
 */
export interface DuplicateDetectionOptions {
  /**
   * When true, records are grouped by entity ID alone, so the same ID under
   * different entity types counts as a duplicate. Defaults to false.
   */
  crossType?: boolean;
  /**
   * Smallest group size that is reported as a duplicate group. Defaults to 2.
   */
  minOccurrences?: number;
}
42+
43+
/**
 * Finds groups of records that refer to the same entity within one collection
 * of catalogue records.
 *
 * Records are bucketed by `<entityType>:<entityId>` (or by entity ID alone when
 * `options.crossType` is true). Buckets holding at least
 * `options.minOccurrences` records (default 2) are returned as groups.
 * @param entities - Records to inspect
 * @param options - Detection options
 * @returns Duplicate groups ordered by size, largest first; records inside a
 *   group keep their input order
 */
export const detectDuplicatesInList = (
  entities: CatalogueEntity[],
  options: DuplicateDetectionOptions = {}
): DuplicateGroup[] => {
  const { crossType = false, minOccurrences = 2 } = options;

  // Bucket every record under its grouping key, preserving encounter order.
  const buckets = new Map<string, CatalogueEntity[]>();
  for (const record of entities) {
    const bucketKey = crossType ? record.entityId : entityKey(record);
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.push(record);
    } else {
      buckets.set(bucketKey, [record]);
    }
  }

  return [...buckets.entries()]
    // Only buckets that reach the occurrence threshold count as duplicates.
    .filter(([, members]) => members.length >= minOccurrences)
    .map(([key, members]): DuplicateGroup => ({
      key,
      entities: members,
      // Distinct list IDs; records without a list ID contribute nothing.
      listIds: [
        ...new Set(
          members.flatMap((member) => (member.listId ? [member.listId] : []))
        ),
      ],
      count: members.length,
    }))
    // Largest groups first.
    .sort((left, right) => right.count - left.count);
};
96+
97+
/**
 * Finds entities that appear in more than one list.
 *
 * Records from every list are bucketed by `<entityType>:<entityId>` (or by
 * entity ID alone when `options.crossType` is true). A bucket is reported only
 * when it holds at least `options.minOccurrences` records (default 2) and
 * those records come from two or more distinct lists. The list a record
 * belongs to is taken from the key of `listEntities`, not from the record.
 * @param listEntities - Map of list ID to the records in that list
 * @param options - Detection options
 * @returns Cross-list duplicate groups ordered by size, largest first
 */
export const detectDuplicatesAcrossLists = (
  listEntities: Map<string, CatalogueEntity[]>,
  options: DuplicateDetectionOptions = {}
): DuplicateGroup[] => {
  const { crossType = false, minOccurrences = 2 } = options;

  type Sighting = { entity: CatalogueEntity; listId: string };

  // Bucket each record, remembering which list it was seen in.
  const sightingsByKey = new Map<string, Sighting[]>();
  listEntities.forEach((records, listId) => {
    for (const entity of records) {
      const bucketKey = crossType ? entity.entityId : entityKey(entity);
      const sightings = sightingsByKey.get(bucketKey);
      if (sightings) {
        sightings.push({ entity, listId });
      } else {
        sightingsByKey.set(bucketKey, [{ entity, listId }]);
      }
    }
  });

  const groups: DuplicateGroup[] = [];
  sightingsByKey.forEach((sightings, key) => {
    const reachesThreshold = sightings.length >= minOccurrences;
    const distinctListIds = new Set(sightings.map((sighting) => sighting.listId));

    // Repeats confined to a single list are not cross-list duplicates.
    if (reachesThreshold && distinctListIds.size >= 2) {
      groups.push({
        key,
        entities: sightings.map((sighting) => sighting.entity),
        listIds: [...distinctListIds],
        count: sightings.length,
      });
    }
  });

  // Largest groups first.
  groups.sort((left, right) => right.count - left.count);
  return groups;
};
150+
151+
/**
 * Summary figures describing how many duplicate records a list contains,
 * as produced by `calculateDuplicateStats`.
 */
export interface DuplicateStats {
  /** Number of records that were analysed. */
  totalEntities: number;
  /**
   * Number of records that would remain after deduplication
   * (`totalEntities - removableCount`).
   */
  uniqueEntities: number;
  /**
   * Number of records that belong to any duplicate group, including the one
   * record per group that would be kept.
   */
  duplicateCount: number;
  /** Number of records deduplication would remove (all but one per group). */
  removableCount: number;
  /**
   * `removableCount` as a percentage of `totalEntities`; 0 when there are no
   * records.
   */
  duplicatePercentage: number;
}
168+
169+
/**
 * Summarises the duplicates found in a list using the default detection
 * settings of `detectDuplicatesInList`.
 * @param entities - Records to analyse
 * @returns Totals for the list: how many records belong to duplicate groups,
 *   how many could be removed, and what share of the list that represents
 */
export const calculateDuplicateStats = (
  entities: CatalogueEntity[]
): DuplicateStats => {
  const totalEntities = entities.length;

  let duplicateCount = 0;
  let removableCount = 0;
  for (const { count } of detectDuplicatesInList(entities)) {
    // Every member of a group is a duplicate; all but one can be removed.
    duplicateCount += count;
    removableCount += count - 1;
  }

  // Guard the division so an empty list reports 0% rather than NaN.
  const duplicatePercentage =
    totalEntities > 0 ? (removableCount / totalEntities) * 100 : 0;

  return {
    totalEntities,
    uniqueEntities: totalEntities - removableCount,
    duplicateCount,
    removableCount,
    duplicatePercentage,
  };
};
193+
194+
/**
 * Reads a record's `addedAt` value as epoch milliseconds for ordering.
 * Values that cannot be parsed as a date rank as older than any real timestamp.
 * NOTE(review): relies on `CatalogueEntity` exposing `addedAt` — confirm
 * against the type in storage/catalogue-db.
 */
const addedAtMillis = (entity: CatalogueEntity): number => {
  const millis = new Date(entity.addedAt).getTime();
  return Number.isNaN(millis) ? Number.NEGATIVE_INFINITY : millis;
};

/**
 * Suggests which records to delete so that every entity appears only once.
 *
 * Records are grouped with the default settings of `detectDuplicatesInList`
 * (same entity type and ID). Within each group the most recently added record
 * is kept and the others are suggested for removal. When timestamps tie or
 * cannot be read, the earliest record in input order is kept.
 * @param entities - Records to analyse
 * @returns Record IDs (`CatalogueEntity.id`) that can be removed; records
 *   without an `id` are never included
 */
export const suggestDuplicateRemovals = (
  entities: CatalogueEntity[]
): string[] => {
  const toRemove: string[] = [];

  for (const group of detectDuplicatesInList(entities)) {
    // Pick the keeper by position rather than by reference so that the same
    // object appearing twice in the input is still treated as two records.
    let keepIndex = 0;
    let keepMillis = Number.NEGATIVE_INFINITY;
    group.entities.forEach((entity, index) => {
      const millis = addedAtMillis(entity);
      // Strictly newer wins; ties leave the earlier record as the keeper.
      if (index === 0 || millis > keepMillis) {
        keepIndex = index;
        keepMillis = millis;
      }
    });

    group.entities.forEach((entity, index) => {
      if (index !== keepIndex && entity.id) {
        toRemove.push(entity.id);
      }
    });
  }

  return toRemove;
};

0 commit comments

Comments
 (0)