Skip to content

Commit

Permalink
updated path handling to use pathlib; improved type handling
Browse files Browse the repository at this point in the history
  • Loading branch information
tholzheim committed May 10, 2024
1 parent 2f2a7c9 commit ea3321b
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 34 deletions.
15 changes: 7 additions & 8 deletions ceurws/ceur_ws.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import calendar
import datetime
import os
import re
from pathlib import Path
from typing import Optional, Union
Expand All @@ -26,11 +25,11 @@ class CEURWS:
"""

URL = "http://ceur-ws.org"
home = str(Path.home())
CACHE_DIR = "%s/.ceurws" % home
CACHE_FILE = f"{CACHE_DIR}/ceurws.db"
CACHE_HTML = f"{CACHE_DIR}/index.html"
CONFIG = StorageConfig(cacheFile=CACHE_FILE)
home = Path.home()
CACHE_DIR = home.joinpath(".ceurws")
CACHE_FILE = CACHE_DIR.joinpath("ceurws.db")
CACHE_HTML = CACHE_DIR.joinpath("index.html")
CONFIG = StorageConfig(cacheFile=str(CACHE_FILE))


class Volume(JSONAble):
Expand Down Expand Up @@ -534,13 +533,13 @@ def getIndexHtml(self, force: bool = False):
get the index html
"""
cacheHtml = CEURWS.CACHE_HTML
if os.path.isfile(cacheHtml) and not force:
if cacheHtml.is_file() and not force:
with open(cacheHtml) as file:
html_page = file.read()
else:
req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
html_page = urlopen(req).read().decode()
Path(CEURWS.CACHE_DIR).mkdir(parents=True, exist_ok=True)
CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
with open(cacheHtml, "w") as htmlFile:
print(html_page, file=htmlFile)
return html_page
Expand Down
4 changes: 2 additions & 2 deletions ceurws/models/ceur.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sqlmodel import Field, SQLModel


class Volume(SQLModel, table=True):
class Volume(SQLModel, table=True): # type: ignore
"""
a single CEUR-WS Volume
"""
Expand Down Expand Up @@ -58,7 +58,7 @@ class Volume(SQLModel, table=True):
tdtitle: Optional[str] = Field(default=None)


class Paper(SQLModel, table=True):
class Paper(SQLModel, table=True): # type: ignore
"""
Represents a paper with details such as authors, volume number, and title.
"""
Expand Down
10 changes: 5 additions & 5 deletions ceurws/models/dblp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sqlmodel import Field, SQLModel


class Scholar(SQLModel, table=True):
class Scholar(SQLModel, table=True): # type: ignore
"""
Represents a scholar with information fetched from DBLP and possibly other sources.
"""
Expand All @@ -21,7 +21,7 @@ class Scholar(SQLModel, table=True):
gnd_id: Optional[str] = None


class Paper(SQLModel, table=True):
class Paper(SQLModel, table=True): # type: ignore
"""
A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.
"""
Expand All @@ -33,7 +33,7 @@ class Paper(SQLModel, table=True):
pdf_url: Optional[str] = None


class Proceeding(SQLModel, table=True):
class Proceeding(SQLModel, table=True): # type: ignore
"""
A proceeding indexed in DBLP with additional details.
"""
Expand All @@ -44,7 +44,7 @@ class Proceeding(SQLModel, table=True):
dblp_event_id: Optional[str] = None


class Editorship(SQLModel, table=True):
class Editorship(SQLModel, table=True): # type: ignore
"""
Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.
"""
Expand All @@ -53,7 +53,7 @@ class Editorship(SQLModel, table=True):
dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)


class Authorship(SQLModel, table=True):
class Authorship(SQLModel, table=True): # type: ignore
"""
Represents the relationship between a scholar and a paper, capturing the authorship details.
"""
Expand Down
22 changes: 11 additions & 11 deletions ceurws/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
"""

import gzip
import os
import shutil
import time
import urllib
import urllib.request
from pathlib import Path
from typing import Optional


Expand All @@ -32,7 +32,7 @@ def getFileContent(path: str):
return content

@staticmethod
def needsDownload(filePath: str, force: bool = False) -> bool:
def needsDownload(filePath: Path, force: bool = False) -> bool:
"""
check if a download of the given filePath is necessary that is the file
does not exist has a size of zero or the download should be forced
Expand All @@ -44,10 +44,10 @@ def needsDownload(filePath: str, force: bool = False) -> bool:
Return:
bool: True if a download for this file needed
"""
if not os.path.isfile(filePath):
if not filePath.is_file():
result = True
else:
stats = os.stat(filePath)
stats = filePath.stat()
size = stats.st_size
result = force or size == 0
return result
Expand All @@ -56,7 +56,7 @@ def needsDownload(filePath: str, force: bool = False) -> bool:
def downloadBackupFile(
url: str,
fileName: str,
targetDirectory: str,
targetDirectory: Path,
force: bool = False,
profile: bool = True,
):
Expand All @@ -73,19 +73,19 @@ def downloadBackupFile(
Returns:
Name of the extracted file with path to the backup directory
"""
extractTo = f"{targetDirectory}/{fileName}"
extractTo = targetDirectory.joinpath(fileName)
zipped = targetDirectory.joinpath(f"{fileName}.gz")
# we might want to check whether a new version is available
if Download.needsDownload(extractTo, force=force):
if not os.path.isdir(targetDirectory):
os.makedirs(targetDirectory)
zipped = f"{extractTo}.gz"
if not targetDirectory.is_dir():
targetDirectory.parent.mkdir(parents=True, exist_ok=True)
msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
profiler = Profiler(msg=msg, profile=profile)
urllib.request.urlretrieve(url, zipped)
profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
shutil.copyfileobj(gzipped, unzipped)
if not os.path.isfile(extractTo):
if not extractTo.is_file():
raise Exception(f"could not extract {fileName} from {zipped}")
return extractTo

Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,6 @@ legacy_tox_ini = """

[[tool.mypy.overrides]]
module = "yaml,dateutil.parser"
ignore_missing_imports = true
ignore_missing_imports = true
follow_imports = "skip"
strict_optional = true
7 changes: 4 additions & 3 deletions tests/test_papertocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,24 +54,25 @@ def check_paper_toc_parser(
Returns:
list: a list of paper records
"""
paper_records = []
try:
record, soup = self.volumeParser.parse_volume(vol_number)
if debug:
print(json.dumps(record, indent=2))
if soup:
ptp = PaperTocParser(number=vol_number, soup=soup, debug=debug)
ptp = PaperTocParser(number=str(vol_number), soup=soup, debug=debug)
paper_records = ptp.parsePapers()
if debug:
print(json.dumps(paper_records, indent=2))
for paper_record in paper_records:
for key in paper_record:
counter[key] += 1
return paper_records
except Exception as ex:
counter["failure"] += 1
if show_failure:
print(f"{vol_number} paper toc parsing fails with {str(ex)})")
return []
finally:
return paper_records

def test_volExamples(self):
"""
Expand Down
3 changes: 2 additions & 1 deletion tests/test_volume_neo4j.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import unittest
from typing import Union

from ceurws.location import LocationLookup
from ceurws.volume_neo4j import Editor, Neo4j, Volume
Expand Down Expand Up @@ -29,7 +30,7 @@ def test_neo4j_available(self):
available = Neo4j.is_port_available(self.neo4j.host, port)
self.assertTrue(available, f"{service} service at {port}")

def create_test_volume(self, year: int = 2023) -> int:
def create_test_volume(self, year: int = 2023) -> Union[int, None]:
"""
Creates a test Volume node for the given year.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_volumeparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_parseEditors_stressTest(self):
count_homepages,
)

@unittest.skip
@unittest.skip("Only for manual editor extraction and debugging")
def test_parseEditor(self):
total = 3354
end = 0
Expand Down
4 changes: 2 additions & 2 deletions tests/test_wikidatasync.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import dataclasses
import pprint
import time
import typing
from typing import Union
import unittest

from ez_wikidata.wdproperty import PropertyMapping, WdDatatype
Expand Down Expand Up @@ -599,7 +599,7 @@ def test_getEventsOfProceedingsByVolnumber(self):

@dataclasses.dataclass
class TestParam:
volumenumber: typing.Union[int, str]
volumenumber: Union[int, str]
expected_qids: list[str]

test_params = [
Expand Down

0 comments on commit ea3321b

Please sign in to comment.