Skip to content

Commit f612bd3

Browse files
authored
Fix automatic title for file resources (#2775)
1 parent 200c7bd commit f612bd3

File tree

1 file changed

+44
-6
lines changed

1 file changed

+44
-6
lines changed

nucliadb/src/nucliadb/ingest/orm/resource.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
from nucliadb.common import datamanagers
2929
from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
30-
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
30+
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
3131
from nucliadb.common.maindb.driver import Transaction
3232
from nucliadb.ingest.fields.base import Field
3333
from nucliadb.ingest.fields.conversation import Conversation
@@ -49,6 +49,7 @@
4949
ExtractedVectorsWrapper,
5050
FieldClassifications,
5151
FieldComputedMetadataWrapper,
52+
FieldFile,
5253
FieldID,
5354
FieldMetadata,
5455
FieldQuestionAnswerWrapper,
@@ -682,15 +683,52 @@ async def _apply_file_extracted_data(self, file_extracted_data: FileExtractedDat
682683
maybe_update_basic_icon(self.basic, file_extracted_data.icon)
683684
maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
684685

686+
async def _should_update_resource_title_from_file_metadata(self) -> bool:
    """
    Decide whether the resource title may be replaced by a title coming
    from file metadata.

    Returns True when the resource has no basic metadata, when its title is
    empty, when its title equals the resource uuid, or when its title equals
    the filename of any file field of the resource. Returns False otherwise.
    """
    basic = await self.get_basic()
    if basic is None:
        return True

    title = basic.title
    # An empty title or one that is just the resource uuid can be replaced
    if title == "" or title == self.uuid:
        return True

    # Gather the filenames of every file field (fields are force-reloaded)
    all_fields = await self.get_fields(force=True)
    known_filenames = set()
    for field_key, field in all_fields.items():
        kind, _ = field_key
        if kind != FieldType.FILE:
            continue
        value: Optional[FieldFile] = await field.get_value()
        if value is None:
            continue
        filename = value.file.filename
        if filename not in ("", None):
            known_filenames.add(filename)

    # A title that matches one of the filenames can be replaced as well
    return title in known_filenames
713+
685714
async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
    """
    Set the resource title from the first entry in the message's file
    extracted data that carries a non-empty title.

    Nothing is changed when _should_update_resource_title_from_file_metadata
    returns False or when no entry has a title.
    """
    may_update = await self._should_update_resource_title_from_file_metadata()
    if may_update:
        for extracted in message.file_extracted_data:
            new_title = extracted.title
            if new_title != "":
                field_id = FieldId.from_pb(
                    rid=self.uuid, field_type=FieldType.FILE, key=extracted.field
                )
                log_extra = {
                    "kbid": self.kb.kbid,
                    "field": field_id.full(),
                    "new_title": new_title,
                }
                logger.info("Updating resource title from file extracted data", extra=log_extra)
                await self.update_resource_title(new_title)
                # Only the first file with a title is used
                return
694732

695733
async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
696734
assert self.basic is not None

0 commit comments

Comments
 (0)