forked from uttamo/kindle-clippings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
notes_parser.py
106 lines (84 loc) · 4.13 KB
/
notes_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os.path
from pathlib import Path
from typing import Tuple
import datetime as dt
import re
# TODO why not Path.cwd()?
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def parse_file(path: str, title_reg=re.compile('.*'), starting_time = '1986-09-29', ending_time = dt.datetime.now().isoformat()) -> dict:
"Given a clippings.txt file return its contents in the /proper/ form"
# TODO perform path check
full_path = os.path.join(MODULE_DIR, path)
with open(full_path, 'r', encoding='utf-8-sig') as clippings_file:
content = clippings_file.read()
_unsuitable_sections = []
notes_data = {}
_highlights = {}
# TODO fix vars' names (deStart, deEnd)
deStart = dt.datetime.fromisoformat(starting_time)
deEnd = dt.datetime.fromisoformat(ending_time)
sections = content.split('==========')
for section in sections:
if not section or section == '\n':
continue
elif 'Your Highlight' in section or 'Your Note' in section:
# TODO Notes should be linked with highlights most of the time, for now, just differentiating
pass
elif 'Your Bookmark' in section:
continue
else:
raise NotImplementedError(f'Unsupported note type: {section}')
lines = [line for line in section.split('\n') if line]
if len(lines) != 3:
_unsuitable_sections.append(lines)
continue
# Non-empty lines
# line 1 - book title (author)
# line 2 - highlight info
# line 3 - highlighted text
# Title and author
book_title, author_name = parse_title_and_author_name(lines[0])
# title check
if not title_reg.search(book_title):
continue
location, time_added = parse_highlight_info(lines[1])
# time check
if time_added < deStart or time_added > deEnd:
continue
# TODO, fix the naming standards here, highlights and notes are not the same
# TODO, fix the dictionary structure here, I think it can be improved ( to allow for quicklier searching and grouping )
# - instead of title first, author first maybe?
# Sometimes the clippings file has duplicated notes, so we need to check if a particular note has already been
# encountered
highlighted_text = lines[2]
existing_highlights_for_author = _highlights.get(book_title, [])
# FIXME I must have broken this functionality
if highlighted_text in existing_highlights_for_author:
continue
else:
if book_title in _highlights:
_highlights[book_title].append(highlighted_text)
else:
_highlights[book_title] = [highlighted_text]
note_dict = {'location': location, 'time_added': time_added, 'highlighted_text': highlighted_text}
if book_title in notes_data:
notes_data[book_title]['notes'].append(note_dict)
else:
notes_data[book_title] = {'author': author_name, 'notes': [note_dict]}
return notes_data
def parse_title_and_author_name(first_line: str) -> Tuple[str, str]:
"Given the proper line of the clippings.txt section, return the book's title and the author's name"
_author_bracket_index = first_line.rfind('(') # find rightmost '(' in case book title contains '('
author_name = first_line[_author_bracket_index:][1:-1].strip()
book_title = first_line[:_author_bracket_index].strip()
book_title = book_title.replace('\ufeff', '') # todo - find better solution
return book_title, author_name
def parse_highlight_info(second_line: str) -> Tuple[str, dt.datetime]:
"Given the proper line of the clippings.txt section, return the snippet's location, and the date in which the note was taken"
location_str = re.findall(r'(?<=.location ).+(?= \|)', second_line)[0]
_time_added_str = re.findall('(?<=.Added on ).+', second_line)[0]
time_added = dt.datetime.strptime(_time_added_str, '%A, %d %B %Y %H:%M:%S')
return location_str, time_added
def parse_highlighted_text(third_line: str) -> str:
"Probably obsolete function here"
return third_line.rstrip(',')