Skip to content

Commit

Permalink
Adds perf boost: Only reads beginning of the file. Adds date metadata…
Browse files Browse the repository at this point in the history
… if not found.
  • Loading branch information
menganha committed Dec 9, 2022
1 parent d567e7c commit bfa17a1
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 55 deletions.
94 changes: 48 additions & 46 deletions pyblog/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@


class Post:
MANDATORY_LABELS = ['draft', 'date']
MANDATORY_LABELS = ['draft']
DEFAULT_TAG = 'blog'
INVALID_LABELS = ['_metadata', 'target_path', 'source_path', 'title']

TITLE_REGEXP = re.compile(r'^\s?#\s(.*)', flags=re.MULTILINE)
METADATA_REGEXP = re.compile(r'^\s?(\w+):\s(.+)', flags=re.MULTILINE)
TITLE_REGEXP = re.compile(r'^\s*#\s*(.*?)\s*$', flags=re.MULTILINE)
METADATA_REGEXP = re.compile(r'^\s*(\w+)\s*:\s*(.+?)\s*$', flags=re.MULTILINE)

def __init__(self, source_path: Path, target_path: Path):
self.source_path = source_path
Expand Down Expand Up @@ -52,56 +52,58 @@ def parse_metadata(self) -> dict[str, str]:
"""
Gets all the labels like "label: value" at the beginning of the post and also retrieve the title following
this label
TODO: Make it so that it doesn't read the whole file, iterating over each line individually
"""
metadata = {}
with self.source_path.open() as file:
raw_text = file.read().strip()

metadata_matches = []
prev_match_end_pos = 0
for idx, match in enumerate(Post.METADATA_REGEXP.finditer(raw_text)):
if idx == 0:
metadata_matches.append(match)
prev_match_end_pos = match.end()
continue
text_in_between = raw_text[prev_match_end_pos:match.start()].strip()
if not text_in_between:
metadata_matches.append(match)
prev_match_end_pos = match.end()
else:
break
for raw_line in file:
line = raw_line.strip()
if line:
if match := self.METADATA_REGEXP.match(line):
metadata.update({match.group(1).lower(): match.group(2).lower()})
elif match := self.TITLE_REGEXP.match(line):
metadata.update({'title': match.group(1)})
break
else:
break

# Add default tag if nothing is found
if 'tags' not in metadata:
metadata.update({'tags': [Post.DEFAULT_TAG]})

metadata = {}
for match in metadata_matches:
key = match.group(1).lower()
value = match.group(2).lower()
# Add default date if not found and prepend it to the file
if 'date' not in metadata:
today = dt.date.today()
metadata.update({'date': today})
with self.source_path.open() as file:
content = file.read()
with self.source_path.open(mode='w') as file:
file.write(f'date: {today.isoformat()}\n')
file.write(content)
else:
metadata.update({'date': dt.date.fromisoformat(metadata['date'])})

# Check for errors
missing_mandatory_labels = set(Post.MANDATORY_LABELS).difference(set(metadata))
if missing_mandatory_labels:
raise ValueError(f'The following mandatory label(s) is missing: {missing_mandatory_labels}')
elif 'title' not in metadata:
raise ValueError('No title found after the data labels')

# Convert into the correct type the value of the keys
for key, value in metadata.items():
if key in Post.INVALID_LABELS:
print(f'Invalid metadata label entry: {key}: {value}')
print(f'Invalid metadata label entry: "{key}". Ignoring...')
continue
if key == 'date':
value = dt.date.fromisoformat(value)
elif key == 'tags' and '[' not in value:
value = [value]
elif '[' in value:
value = [list_element.strip() for list_element in value.strip(' []').split(',')]
metadata.update({key: value})

# add default tag if nothing is found
if 'tags' not in metadata:
metadata.update({'tags': [Post.DEFAULT_TAG]})

# Parse title which is also part of the metadata
last_metadata_end_pos = metadata_matches[-1].end() if metadata_matches else 0
match_title = Post.TITLE_REGEXP.search(raw_text)
if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
actual_value = [list_element.strip() for list_element in metadata[key].strip(' []').split(',')]
else:
actual_value = value

if not match_title or raw_text[last_metadata_end_pos:match_title.start()].strip():
raise ValueError('No title found or text or the title does not follow directly the metadata labels')
else:
metadata.update({'title': match_title.group(1)})
if not Post.METADATA_REGEXP.match(raw_text):
raise ValueError('No metadata label found at the beginning of the text')
if not set(Post.MANDATORY_LABELS).issubset(set(metadata)):
raise ValueError(f'Not all mandatory labels {Post.MANDATORY_LABELS} found not found in metadata')
if key == 'tags' and isinstance(actual_value, str):
actual_value = [actual_value]

metadata.update({key: actual_value})

return metadata

Expand Down
53 changes: 44 additions & 9 deletions tests/test_post.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import datetime as dt
from inspect import cleandoc
from pathlib import Path
from textwrap import dedent

import pytest

Expand All @@ -9,10 +9,12 @@

@pytest.fixture()
def valid_text_path(tmp_path) -> Path:
text = dedent("""
text = cleandoc("""
draft: yes
tags: [lifestyle, manana]
another_label: another_value
weird_label: sola
paco : perfume
date: 2022-12-11
# My first post
Expand All @@ -28,9 +30,24 @@ def valid_text_path(tmp_path) -> Path:
return post_path


@pytest.fixture()
def valid_text_no_date_path(tmp_path) -> Path:
    """Create a post file that has a ``draft`` label and a title, but no ``date`` label.

    The file is written as ``post.md`` inside the ``tmp_path`` fixture directory
    and its path is returned.
    """
    post_path = tmp_path / 'post.md'
    # cleandoc drops the leading/trailing blank lines and the common indentation,
    # so the written file starts directly with the "draft" label.
    post_path.write_text(cleandoc("""
        draft: yes
        # My no date post
        This post does not have a date but it should!
    """))
    return post_path


@pytest.fixture()
def invalid_text_path_1(tmp_path) -> Path:
text = dedent("""
text = cleandoc("""
This is my first invalid post.
""")
Expand All @@ -41,7 +58,7 @@ def invalid_text_path_1(tmp_path) -> Path:

@pytest.fixture()
def invalid_text_path_2(tmp_path) -> Path:
text = dedent("""
text = cleandoc("""
a_label: yes
# Title
Expand All @@ -55,7 +72,7 @@ def invalid_text_path_2(tmp_path) -> Path:

@pytest.fixture()
def invalid_text_path_3(tmp_path) -> Path:
text = dedent("""
text = cleandoc("""
draft: yes
some other irrelevant text
Expand All @@ -68,7 +85,7 @@ def invalid_text_path_3(tmp_path) -> Path:

@pytest.fixture()
def invalid_text_path_4(tmp_path) -> Path:
text = dedent("""
text = cleandoc("""
source_path: some_path
some other irrelevant text
Expand All @@ -86,21 +103,39 @@ def dummy_target_path(tmp_path) -> Path:

def test_parse_metadata(valid_text_path, dummy_target_path):
post = Post(valid_text_path, dummy_target_path)
expected_dict = {'draft': 'yes', 'tags': ['lifestyle', 'manana'], 'another_label': 'another_value', 'title': 'My first post',
'date': dt.date(2022, 12, 11)}
expected_dict = {'draft': 'yes', 'tags': ['lifestyle', 'manana'], 'another_label': 'another_value',
'title': 'My first post', 'paco': 'perfume', 'weird_label': 'sola', 'date': dt.date(2022, 12, 11)}
assert post._metadata == expected_dict


def test_parse_markdown(valid_text_path, dummy_target_path):
post = Post(valid_text_path, dummy_target_path)
expected_html = dedent("""
expected_html = cleandoc("""
<p>This is my firs valid post. If there's a text that seems like a tag within the post</p>
<p>definition: not a tag</p>
<p>it shouldn't be considered as such.</p>
""").strip()
assert post.get_content_in_html() == expected_html


def test_add_date_to_file(valid_text_no_date_path, dummy_target_path):
    """Check that a post without a ``date`` label gets today's date.

    The date must show up both in the parsed metadata and as a new first line
    of the post's source file.
    """
    post = Post(valid_text_no_date_path, dummy_target_path)
    today = dt.date.today()
    assert post._metadata['date'] == today

    # cleandoc removes the indentation used here for readability, so the
    # expected text begins with the "date" label followed by the original content.
    expected_content = cleandoc(f"""
        date: {today.isoformat()}
        draft: yes
        # My no date post
        This post does not have a date but it should!
    """)
    actual_content = post.source_path.read_text()
    assert expected_content == actual_content


def test_invalid_initializer_1(invalid_text_path_1, dummy_target_path):
with pytest.raises(ValueError):
Post(invalid_text_path_1, dummy_target_path)
Expand Down

0 comments on commit bfa17a1

Please sign in to comment.