Skip to content

Commit e43f553

Browse files
authored
metadata for unstructured files (#446)
1 parent c972a2a commit e43f553

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

backend/src/document_sources/local_file.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,13 +43,17 @@ def get_documents_from_file_by_path(file_path,file_name):
4343

4444
if page.metadata['page_number']>page_number:
4545
page_number+=1
46+
if not metadata:
47+
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
4648
pages.append(Document(page_content = page_content, metadata=metadata))
4749
page_content=''
4850

4951
if page == unstructured_pages[-1]:
52+
if not metadata:
53+
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
5054
pages.append(Document(page_content = page_content, metadata=metadata))
5155

52-
elif page.metadata['category']=='PageBreak':
56+
elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
5357
page_number+=1
5458
pages.append(Document(page_content = page_content, metadata=metadata))
5559
page_content=''
@@ -65,5 +69,4 @@ def get_documents_from_file_by_path(file_path,file_name):
6569
else:
6670
logging.info(f'File {file_name} does not exist')
6771
raise Exception(f'File {file_name} does not exist')
68-
6972
return file_name, pages , file_extension

0 commit comments

Comments
 (0)