|
27 | 27 |
|
28 | 28 | from nucliadb.common import datamanagers
|
29 | 29 | from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
|
30 |
| -from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR |
| 30 | +from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId |
31 | 31 | from nucliadb.common.maindb.driver import Transaction
|
32 | 32 | from nucliadb.ingest.fields.base import Field
|
33 | 33 | from nucliadb.ingest.fields.conversation import Conversation
|
|
49 | 49 | ExtractedVectorsWrapper,
|
50 | 50 | FieldClassifications,
|
51 | 51 | FieldComputedMetadataWrapper,
|
| 52 | + FieldFile, |
52 | 53 | FieldID,
|
53 | 54 | FieldMetadata,
|
54 | 55 | FieldQuestionAnswerWrapper,
|
@@ -682,15 +683,52 @@ async def _apply_file_extracted_data(self, file_extracted_data: FileExtractedDat
|
682 | 683 | maybe_update_basic_icon(self.basic, file_extracted_data.icon)
|
683 | 684 | maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
|
684 | 685 |
|
async def _should_update_resource_title_from_file_metadata(self) -> bool:
    """
    Tell whether the resource title may be replaced by a title extracted from a file.

    The title is only replaced when any of the following holds:
      - the resource has no basic metadata,
      - the title is empty,
      - the title is equal to the resource uuid,
      - the title is equal to the filename of any file field in the resource.

    Returns:
        True if the current title can be overwritten, False otherwise.
    """
    basic = await self.get_basic()
    if basic is None:
        return True
    current_title = basic.title
    if current_title in ("", self.uuid):
        # An empty title or one that is just the resource uuid can always be replaced
        return True
    # NOTE(review): force=True presumably bypasses any cached field listing -- confirm
    fields = await self.get_fields(force=True)
    for (field_type, _), field_obj in fields.items():
        if field_type != FieldType.FILE:
            continue
        field_value: Optional[FieldFile] = await field_obj.get_value()
        # current_title is known to be non-empty here, so an empty filename can never
        # match. Returning on the first match avoids reading the remaining file fields.
        if field_value is not None and field_value.file.filename == current_title:
            return True
    return False
| 713 | + |
async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
    """
    Update the resource title with the first file that has a title extracted.

    The candidate title is picked first, so the eligibility check (which loads the
    resource basic metadata and its fields) only runs when there is a title to apply.
    The title is left untouched when no file in the message has an extracted title or
    when `_should_update_resource_title_from_file_metadata` returns False.

    Args:
        message: broker message whose `file_extracted_data` entries are inspected, in order.
    """
    # Only the first file with a non-empty extracted title is ever considered
    fed = next((fed for fed in message.file_extracted_data if fed.title != ""), None)
    if fed is None:
        return
    if not await self._should_update_resource_title_from_file_metadata():
        return
    fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
    logger.info(
        "Updating resource title from file extracted data",
        extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
    )
    await self.update_resource_title(fed.title)
694 | 732 |
|
695 | 733 | async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
|
696 | 734 | assert self.basic is not None
|
|
0 commit comments