Skip to content

Commit

Permalink
normalize parser struct for all file types (Mintplex-Labs#321)
Browse files Browse the repository at this point in the history
  • Loading branch information
timothycarambat authored and franzbischoff committed Nov 4, 2023
1 parent a5ccd3d commit c0cc9a2
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 5 deletions.
2 changes: 1 addition & 1 deletion collector/scripts/watch/convert/as_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def as_odt(**kwargs):
'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}",
'author': 'Unknown', # TODO: Find a better author
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'ODT Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
Expand Down
5 changes: 1 addition & 4 deletions collector/scripts/watch/convert/as_mbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,8 @@ def as_mbox(**kwargs):
"docAuthor": message["From"],
"description": f"email from {message['From']} to {message['To']}",
"docSource": "mbox file uploaded by the user.",
"chunkSource": subject,
"published": file_creation_time(fullpath),
"sender": message["From"],
"recipient": message["To"],
"subject": subject,
"date_sent": date_sent,
"wordCount": len(content),
"pageContent": content,
"token_count_estimate": len(tokenize(content)),
Expand Down
1 change: 1 addition & 0 deletions collector/scripts/watch/convert/as_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def as_text(**kwargs):
'title': f"{filename}{ext}",
'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'a text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath),
'wordCount': len(content),
Expand Down

0 comments on commit c0cc9a2

Please sign in to comment.