forked from openfoodfacts/robotoff
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_duplicates.py
70 lines (62 loc) · 2.48 KB
/
remove_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from peewee import fn
from robotoff.models import ImageModel, ImagePrediction, LogoAnnotation, db
# One-off maintenance script: for every source_image referenced by more than
# one ImageModel row, keep the row with the smallest ID and delete the other
# rows together with the image predictions and logo annotations attached to
# them.

# Pass 1: find every source_image that appears on more than one row.
with db.connection_context(), db.atomic():
    duplicated_groups = (
        ImageModel.select(
            ImageModel.source_image,
            fn.COUNT(ImageModel.id).alias("count"),
        )
        .group_by(ImageModel.source_image)
        .having(fn.COUNT(ImageModel.id) > 1)
        .dicts()
        .iterator()
    )
    counts = {row["source_image"]: row["count"] for row in duplicated_groups}

print(f"duplicated groups: {len(counts)}")

# Pass 2: clean up each duplicated group inside its own atomic block.
with db.connection_context():
    for source_image in counts:
        with db.atomic():
            matching_images = (
                ImageModel.select(ImageModel.id)
                .where(ImageModel.source_image == source_image)
                .tuples()
            )
            image_ids = sorted({row[0] for row in matching_images})
            print(f"Image {source_image}, image IDs: {image_ids}")
            if len(image_ids) <= 1:
                # The group no longer has duplicates; nothing to remove.
                continue

            # The smallest ID is kept; every other row of the group goes.
            image_ids_to_delete = image_ids[1:]

            # Predictions attached to the images being removed.
            image_prediction_ids_to_delete = [
                row[0]
                for row in ImagePrediction.select(ImagePrediction.id)
                .where(ImagePrediction.image_id.in_(image_ids_to_delete))
                .tuples()
            ]

            # Logo annotations attached to those predictions.
            logo_annotation_ids_to_delete = [
                row[0]
                for row in LogoAnnotation.select(LogoAnnotation.id)
                .where(
                    LogoAnnotation.image_prediction_id.in_(
                        image_prediction_ids_to_delete
                    )
                )
                .tuples()
            ]

            # Referencing rows are deleted before the rows they point to
            # (logo annotation -> image prediction -> image), presumably so
            # no dangling reference exists at any point in the transaction.
            # Empty ID lists are skipped so no DELETE is issued for them.
            for model, ids in (
                (LogoAnnotation, logo_annotation_ids_to_delete),
                (ImagePrediction, image_prediction_ids_to_delete),
                (ImageModel, image_ids_to_delete),
            ):
                if ids:
                    model.delete().where(model.id.in_(ids)).execute()