Adds perf boost: only reads the beginning of the file. Adds date metadata if not found.
menganha · Dec 9, 2022 · bfa17a1 · bfa17a1
1 parent d567e7c
commit bfa17a1
Show file tree

Hide file tree

Showing 2 changed files with 92 additions and 55 deletions.
diff --git a/pyblog/post.py b/pyblog/post.py
@@ -13,12 +13,12 @@
 
 
 class Post:
-    MANDATORY_LABELS = ['draft', 'date']
+    MANDATORY_LABELS = ['draft']
     DEFAULT_TAG = 'blog'
     INVALID_LABELS = ['_metadata', 'target_path', 'source_path', 'title']
 
-    TITLE_REGEXP = re.compile(r'^\s?#\s(.*)', flags=re.MULTILINE)
-    METADATA_REGEXP = re.compile(r'^\s?(\w+):\s(.+)', flags=re.MULTILINE)
+    TITLE_REGEXP = re.compile(r'^\s*#\s*(.*?)\s*$', flags=re.MULTILINE)
+    METADATA_REGEXP = re.compile(r'^\s*(\w+)\s*:\s*(.+?)\s*$', flags=re.MULTILINE)
 
     def __init__(self, source_path: Path, target_path: Path):
         self.source_path = source_path
@@ -52,56 +52,58 @@ def parse_metadata(self) -> dict[str, str]:
         """
         Gets all the labels like "label: value" at the beginning of the post and also retrieve the title following
         this label
-        TODO: Make it so that it doesn't read the whole file, iterating over each line individually
         """
+        metadata = {}
         with self.source_path.open() as file:
-            raw_text = file.read().strip()
-
-        metadata_matches = []
-        prev_match_end_pos = 0
-        for idx, match in enumerate(Post.METADATA_REGEXP.finditer(raw_text)):
-            if idx == 0:
-                metadata_matches.append(match)
-                prev_match_end_pos = match.end()
-                continue
-            text_in_between = raw_text[prev_match_end_pos:match.start()].strip()
-            if not text_in_between:
-                metadata_matches.append(match)
-                prev_match_end_pos = match.end()
-            else:
-                break
+            for raw_line in file:
+                line = raw_line.strip()
+                if line:
+                    if match := self.METADATA_REGEXP.match(line):
+                        metadata.update({match.group(1).lower(): match.group(2).lower()})
+                    elif match := self.TITLE_REGEXP.match(line):
+                        metadata.update({'title': match.group(1)})
+                        break
+                    else:
+                        break
+
+        # Add default tag if nothing is found
+        if 'tags' not in metadata:
+            metadata.update({'tags': [Post.DEFAULT_TAG]})
 
-        metadata = {}
-        for match in metadata_matches:
-            key = match.group(1).lower()
-            value = match.group(2).lower()
+        # Add default date if not found and prepend it to the file
+        if 'date' not in metadata:
+            today = dt.date.today()
+            metadata.update({'date': today})
+            with self.source_path.open() as file:
+                content = file.read()
+            with self.source_path.open(mode='w') as file:
+                file.write(f'date: {today.isoformat()}\n')
+                file.write(content)
+        else:
+            metadata.update({'date': dt.date.fromisoformat(metadata['date'])})
+
+        # Check for errors
+        missing_mandatory_labels = set(Post.MANDATORY_LABELS).difference(set(metadata))
+        if missing_mandatory_labels:
+            raise ValueError(f'The following mandatory label(s) is missing: {missing_mandatory_labels}')
+        elif 'title' not in metadata:
+            raise ValueError('No title found after the data labels')
+
+        # Convert into the correct type the value of the keys
+        for key, value in metadata.items():
             if key in Post.INVALID_LABELS:
-                print(f'Invalid metadata label entry: {key}: {value}')
+                print(f'Invalid metadata label entry: "{key}". Ignoring...')
                 continue
-            if key == 'date':
-                value = dt.date.fromisoformat(value)
-            elif key == 'tags' and '[' not in value:
-                value = [value]
-            elif '[' in value:
-                value = [list_element.strip() for list_element in value.strip(' []').split(',')]
-            metadata.update({key: value})
-
-        # add default tag if nothing is found
-        if 'tags' not in metadata:
-            metadata.update({'tags': [Post.DEFAULT_TAG]})
 
-        # Parse title which is also part of the metadata
-        last_metadata_end_pos = metadata_matches[-1].end() if metadata_matches else 0
-        match_title = Post.TITLE_REGEXP.search(raw_text)
+            if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
+                actual_value = [list_element.strip() for list_element in metadata[key].strip(' []').split(',')]
+            else:
+                actual_value = value
 
-        if not match_title or raw_text[last_metadata_end_pos:match_title.start()].strip():
-            raise ValueError('No title found or text or the title does not follow directly the metadata labels')
-        else:
-            metadata.update({'title': match_title.group(1)})
-        if not Post.METADATA_REGEXP.match(raw_text):
-            raise ValueError('No metadata label found at the beginning of the text')
-        if not set(Post.MANDATORY_LABELS).issubset(set(metadata)):
-            raise ValueError(f'Not all mandatory labels {Post.MANDATORY_LABELS} found not found in metadata')
+            if key == 'tags' and isinstance(actual_value, str):
+                actual_value = [actual_value]
+
+            metadata.update({key: actual_value})
 
         return metadata
 

diff --git a/tests/test_post.py b/tests/test_post.py
@@ -1,6 +1,6 @@
 import datetime as dt
+from inspect import cleandoc
 from pathlib import Path
-from textwrap import dedent
 
 import pytest
 
@@ -9,10 +9,12 @@
 
 @pytest.fixture()
 def valid_text_path(tmp_path) -> Path:
-    text = dedent("""
+    text = cleandoc("""
     draft: yes
     tags: [lifestyle, manana]
     another_label: another_value
+       weird_label: sola   
+    paco  :   perfume
     date: 2022-12-11
     
     # My first post
@@ -28,9 +30,24 @@ def valid_text_path(tmp_path) -> Path:
     return post_path
 
 
+@pytest.fixture()
+def valid_text_no_date_path(tmp_path) -> Path:
+    text = cleandoc("""
+    draft: yes
+
+    # My no date post
+    
+    This post does not have a date but it should!
+
+    """)
+    post_path = tmp_path / 'post.md'
+    post_path.write_text(text)
+    return post_path
+
+
 @pytest.fixture()
 def invalid_text_path_1(tmp_path) -> Path:
-    text = dedent("""
+    text = cleandoc("""
     
     This is my first invalid post.
     """)
@@ -41,7 +58,7 @@ def invalid_text_path_1(tmp_path) -> Path:
 
 @pytest.fixture()
 def invalid_text_path_2(tmp_path) -> Path:
-    text = dedent("""
+    text = cleandoc("""
     a_label: yes
     
     # Title
@@ -55,7 +72,7 @@ def invalid_text_path_2(tmp_path) -> Path:
 
 @pytest.fixture()
 def invalid_text_path_3(tmp_path) -> Path:
-    text = dedent("""
+    text = cleandoc("""
     draft: yes
    
     some other irrelevant text 
@@ -68,7 +85,7 @@ def invalid_text_path_3(tmp_path) -> Path:
 
 @pytest.fixture()
 def invalid_text_path_4(tmp_path) -> Path:
-    text = dedent("""
+    text = cleandoc("""
     source_path: some_path
    
     some other irrelevant text 
@@ -86,21 +103,39 @@ def dummy_target_path(tmp_path) -> Path:
 
 def test_parse_metadata(valid_text_path, dummy_target_path):
     post = Post(valid_text_path, dummy_target_path)
-    expected_dict = {'draft': 'yes', 'tags': ['lifestyle', 'manana'], 'another_label': 'another_value', 'title': 'My first post',
-                     'date': dt.date(2022, 12, 11)}
+    expected_dict = {'draft': 'yes', 'tags': ['lifestyle', 'manana'], 'another_label': 'another_value',
+                     'title': 'My first post', 'paco': 'perfume', 'weird_label': 'sola', 'date': dt.date(2022, 12, 11)}
     assert post._metadata == expected_dict
 
 
 def test_parse_markdown(valid_text_path, dummy_target_path):
     post = Post(valid_text_path, dummy_target_path)
-    expected_html = dedent("""
+    expected_html = cleandoc("""
     <p>This is my firs valid post. If there's a text that seems like a tag within the post</p>
     <p>definition: not a tag</p>
     <p>it shouldn't be considered as such.</p>
     """).strip()
     assert post.get_content_in_html() == expected_html
 
 
+def test_add_date_to_file(valid_text_no_date_path, dummy_target_path):
+    post = Post(valid_text_no_date_path, dummy_target_path)
+    todays_date = dt.date.today()
+    expected_post_file_content = cleandoc(f"""
+    date: {todays_date.isoformat()}
+    draft: yes
+
+    # My no date post
+    
+    This post does not have a date but it should!
+    
+    """)
+    assert post._metadata['date'] == todays_date
+    with post.source_path.open() as file:
+        post_file_content = file.read()
+    assert expected_post_file_content == post_file_content
+
+
 def test_invalid_initializer_1(invalid_text_path_1, dummy_target_path):
     with pytest.raises(ValueError):
         Post(invalid_text_path_1, dummy_target_path)