Skip to content

Commit 385765f

Browse files
committed
Modified hierarchy logic
1 parent e6adefb commit 385765f

File tree

8 files changed

+145
-5
lines changed

8 files changed

+145
-5
lines changed

m_aux/outputs.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
import shutil
34

45

@@ -40,3 +41,12 @@ def prepare_output_folder(folder_path):
4041
print(f"Folder '{folder_path}' was emptied and recreated.")
4142
except Exception as e:
4243
print(f"An error occurred while preparing the folder: {e}")
44+
45+
46+
def normalize_string(name):
    """Normalize a name so it is safe to use for files and directories.

    The name is lowercased, whitespace runs become a single dash, every
    character that is not a word character (``\\w``) or a dash is removed,
    repeated dashes are collapsed, and leading/trailing dashes are trimmed.

    Args:
        name: The raw name to normalize.

    Returns:
        The normalized, lowercase name. May be an empty string when the input
        contains nothing but whitespace, dashes and special characters.
    """
    name = re.sub(r"\s+", "-", name)  # Whitespace runs to a single dash
    name = re.sub(r"[^\w\-]", "", name)  # Drop non-word characters except dashes
    # Removing characters can leave adjacent dashes behind ("a & b" -> "a--b"),
    # so collapse them before trimming.
    name = re.sub(r"-{2,}", "-", name)
    return name.strip("-").lower()

m_parse/block_models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ class EmbedBlock(BaseModel):
8181
url: str
8282

8383

84+
class BulletedListItemBlock(BaseModel):
    """Type-specific payload carried by a block in its ``bulleted_list_item`` field."""

    # Rich-text segments that make up the list item's text.
    rich_text: List[RichText]
    # Color name as a string -- presumably Notion's block color (e.g. "default"); TODO confirm.
    color: str
87+
88+
8489
class Block(BaseModel):
8590
object: str
8691
id: str
@@ -96,6 +101,7 @@ class Block(BaseModel):
96101
image: Optional[ImageBlock] = None
97102
bookmark: Optional[BookmarkBlock] = None
98103
embed: Optional[EmbedBlock] = None
104+
bulleted_list_item: Optional[BulletedListItemBlock] = None
99105
# Add other fields and types as necessary
100106
dynamic_parents: Dict[str, ParentReference] = Field(default_factory=dict)
101107

m_parse/markdown_processing.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from m_parse.block_models import (
33
Block,
44
BookmarkBlock,
5+
BulletedListItemBlock,
56
ChildPageBlock,
67
CodeBlock,
78
EmbedBlock,
@@ -15,6 +16,7 @@
1516
validate_block,
1617
)
1718
from m_parse.markdown_processing_helpers import (
19+
markdown_bullet,
1820
markdown_code_block,
1921
markdown_convert_paragraph_styles,
2022
markdown_headings,
@@ -232,7 +234,7 @@ def parse_child_page(block: ChildPageBlock):
232234
)
233235
)
234236
page_processed_blocks.append(
235-
parsing_block_return(block.id, changelog, block.type, path_hierarchy)
237+
parsing_block_return(block.id, changelog, "changelog", path_hierarchy)
236238
)
237239
return page_processed_blocks
238240

@@ -288,6 +290,21 @@ def parse_embed(block: Block) -> dict:
288290
)
289291

290292

293+
@validate_block(BulletedListItemBlock)
def parse_bulleted_list_item(block: Block, indent_level: int = 0) -> dict:
    """Parse a bulleted list item block into a single Markdown bullet.

    The block's ``rich_text`` entries are consecutive runs of the *same* list
    item (each run carrying its own annotations), so they are styled one by
    one and concatenated onto one bullet line instead of being emitted as
    separate bullets.

    Args:
        block: The block whose ``bulleted_list_item`` payload is rendered.
        indent_level: Indentation level forwarded to ``markdown_bullet``.

    Returns:
        The value produced by ``parsing_block_return`` for this block. The
        Markdown is an empty string when the item has no rich text.
    """
    segments = block.bulleted_list_item.rich_text

    if segments:
        first, *rest = segments
        # Let markdown_bullet own the bullet prefix and indentation; it also
        # styles the first run.
        md = markdown_bullet(first.plain_text, first.annotations, indent=indent_level)
        # Remaining runs belong to the same line: style and append them.
        md += "".join(
            markdown_convert_paragraph_styles(segment.plain_text, segment.annotations)
            for segment in rest
        )
    else:
        md = ""

    return parsing_block_return(block.id, md, block.type, calculate_path_on_hierarchy(block))
306+
307+
291308
@validate_block(LinkToPageBlock)
292309
def parse_link_to_page(block: LinkToPageBlock):
293310
# TODO: Implement this function

m_parse/markdown_processing_helpers.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,13 @@ def markdown_image(image_url: str, caption: str = "") -> str:
180180
- str: The formatted Markdown image link string.
181181
"""
182182
return f"![{caption}]({image_url})"
183+
184+
185+
def markdown_bullet(content: str, annotations: dict, indent: int = 0) -> str:
    """Build one Markdown bullet line from styled content.

    Parameters:
    - content (str): The text of the bullet.
    - annotations (dict): Style annotations passed to
      ``markdown_convert_paragraph_styles`` together with the content.
    - indent (int): Number of times the indentation unit is repeated before
      the bullet marker.

    Returns:
    - str: The indentation, a ``- `` marker and the styled content.
    """
    text = markdown_convert_paragraph_styles(content, annotations)
    prefix = " " * indent
    return "{}- {}".format(prefix, text)

m_search/notion_blocks.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ def fetch_and_process_block_hierarchy(root_block_id):
1414
- list: A list of all processed blocks, each with added parent hierarchy information.
1515
"""
1616
processed_blocks = []
17+
root_block = fetch_block_details(root_block_id)
18+
root_block_parent = root_block.get("parent", None)
19+
root_block_parent_id = (
20+
(root_block_parent.get("block_id") or root_block_parent.get("page_id")).strip()
21+
if root_block_parent
22+
else None
23+
)
1724

1825
def process_block(block_id, parent_hierarchy=[]):
1926
"""Recursively processes a block and its children, adding parent hierarchy information.
@@ -28,7 +35,9 @@ def process_block(block_id, parent_hierarchy=[]):
2835
return
2936

3037
# Add parent hierarchy information to the current block
31-
add_parent_hierarchy(current_block, parent_hierarchy.copy())
38+
add_parent_hierarchy(
39+
current_block, parent_hierarchy.copy(), root_block_id, root_block_parent_id
40+
)
3241

3342
# Add the processed block to the list
3443
processed_blocks.append(current_block)
@@ -49,7 +58,9 @@ def process_block(block_id, parent_hierarchy=[]):
4958
return processed_blocks
5059

5160

52-
def add_parent_hierarchy(block, parent_hierarchy=[]):
61+
def add_parent_hierarchy(
62+
block, parent_hierarchy=[], root_block_id=None, root_block_parent_id=None
63+
):
5364
"""Adds parent hierarchy identifiers to a block, ensuring no duplications and starting labeling
5465
from c_parent_1.
5566
@@ -62,6 +73,9 @@ def add_parent_hierarchy(block, parent_hierarchy=[]):
6273
parent_id = parent_id.strip() if parent_id else None
6374
parent_type = block.get("type")
6475

76+
if root_block_id and block["id"] != root_block_id and root_block_parent_id:
77+
parent_hierarchy.insert(0, {"block_id": root_block_parent_id, "type": "root"})
78+
6579
# Normalize block_id before comparison
6680
normalized_parent_id = parent_id.replace("-", "") if parent_id else None
6781

m_write/notion_processed_blocks.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Module for processing and writing Notion blocks to Markdown files."""
2+
3+
import os
4+
from m_write.write_helpers import ensure_dir, write_md_file, find_page_title
5+
from m_aux.outputs import normalize_string
6+
from m_aux.pretty_print import pretty_print
7+
8+
# def process_blocks(blocks, root_dir, parent_id=None):
9+
# """Processes and writes blocks to the appropriate .md files and directories, with improvements."""
10+
# for block in blocks:
11+
# # Handle 'child_page' blocks differently to use their title for naming
12+
# if block['type'] == 'child_page':
13+
# page_title = find_page_title(blocks, block['id'])
14+
# # Normalize the page title for use in file and directory names
15+
# normalized_title = normalize_string(page_title)
16+
# output_path = os.path.join(root_dir, block['path'].replace('/', os.sep), normalized_title)
17+
# ensure_dir(output_path) # Ensure the directory for the page exists
18+
# output_file = os.path.join(output_path, normalized_title + ".md")
19+
# else:
20+
# # Find the title of the parent page for non-'child_page' blocks
21+
# if parent_id:
22+
# parent_title = find_page_title(blocks, parent_id)
23+
# normalized_title = normalize_string(parent_title)
24+
# output_path = os.path.join(root_dir, block['path'].replace('/', os.sep), normalized_title)
25+
# else:
26+
# output_path = os.path.join(root_dir, block['path'].replace('/', os.sep))
27+
# ensure_dir(output_path)
28+
# output_file = os.path.join(output_path, normalize_string(block['md'][:50]) + ".md") # Use first 50 chars of md as filename
29+
30+
# write_md_file(output_file, block['md'])
31+
32+
# # Recursively process subpages for 'child_page' blocks
33+
# if block['type'] == 'child_page':
34+
# subpages = [b for b in blocks if b['path'].startswith(block['path']) and b['id'] != block['id']]
35+
# if subpages:
36+
# process_blocks(subpages, output_path, block['id'])
37+
38+
39+
def process_blocks(blocks, root_dir):
    """Pretty-print the processed blocks under the label "Processed blocks".

    Args:
        blocks: Processed blocks to display; passed straight to ``pretty_print``.
        root_dir: Currently unused by this function. NOTE(review): presumably
            the output directory Markdown files will eventually be written
            to -- confirm against the caller.
    """
    pretty_print(blocks, "Processed blocks")

m_write/write_helpers.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import os
2+
from m_aux.outputs import normalize_string
3+
4+
def ensure_dir(directory):
    """Ensure that a directory exists, creating it and any missing parents.

    Uses ``exist_ok=True`` rather than a separate existence check, so there
    is no window in which another process can create the directory between
    the check and the creation.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    os.makedirs(directory, exist_ok=True)
8+
9+
def write_md_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)
13+
14+
def find_page_title(blocks, block_id):
    """Return the normalized title of the first block matching ``block_id``.

    Only blocks that carry an ``'md'`` entry are considered. The Markdown is
    stripped of surrounding ``#`` and space characters, then passed through
    ``normalize_string``. Returns ``None`` when no such block is found.
    """
    for candidate in blocks:
        if candidate['id'] != block_id or 'md' not in candidate:
            continue
        heading = candidate['md'].strip('# ').strip()
        return normalize_string(heading)
    return None
20+
21+
def get_md_content(blocks, block_id):
    """Return the ``'md'`` value of the first block whose id is ``block_id``.

    Returns ``None`` when no block has that id.
    """
    matching_md = (entry['md'] for entry in blocks if entry['id'] == block_id)
    return next(matching_md, None)
27+
28+
def is_page(blocks, block_id):
    """Tell whether the first block with id ``block_id`` has type 'child_page'.

    Returns ``False`` when no block has that id.
    """
    verdicts = (entry['type'] == 'child_page' for entry in blocks if entry['id'] == block_id)
    return next(verdicts, False)
34+
35+
def get_pages(blocks):
    """Return the 'child_page' blocks ordered from shallowest to deepest path.

    Depth is the number of '/'-separated parts in a block's ``'path'``; pages
    of equal depth are ordered by the path string itself. The input list is
    left untouched.
    """
    def depth_then_path(page):
        route = page['path']
        return (len(route.split('/')), route)

    child_pages = (entry for entry in blocks if entry['type'] == 'child_page')
    return sorted(child_pages, key=depth_then_path)

main.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
get_all_children_blocks,
1414
)
1515
from m_search.notion_pages import fetch_page_details
16+
from m_write.notion_processed_blocks import process_blocks
1617

1718

1819
def main():
@@ -68,9 +69,10 @@ def main():
6869
# children_blocks = get_all_children_blocks(args.page_id)
6970
# pretty_print(children_blocks)
7071
blocks = fetch_and_process_block_hierarchy(args.page_id)
71-
pretty_print(blocks)
72+
pretty_print(blocks, "Fetched blocks")
7273
processed_blocks = dispatch_blocks_parsing(blocks)
73-
pretty_print(processed_blocks)
74+
# pretty_print(processed_blocks)
75+
process_blocks(processed_blocks, args.outputs_dir)
7476
# pretty_print(children_blocks)
7577
# page_details = fetch_page_details(notion, args.page_id)
7678
# pretty_print(page_details)

0 commit comments

Comments
 (0)