Skip to content

Commit

Permalink
updated path handling to use pathlib; improved type handling
Browse files Browse the repository at this point in the history
  • Loading branch information
tholzheim committed May 10, 2024
1 parent 2f2a7c9 commit ea3321b
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 34 deletions.
15 changes: 7 additions & 8 deletions ceurws/ceur_ws.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import calendar
import datetime
import os
import re
from pathlib import Path
from typing import Optional, Union
Expand All @@ -26,11 +25,11 @@ class CEURWS:
"""

URL = "http://ceur-ws.org"
home = str(Path.home())
CACHE_DIR = "%s/.ceurws" % home
CACHE_FILE = f"{CACHE_DIR}/ceurws.db"
CACHE_HTML = f"{CACHE_DIR}/index.html"
CONFIG = StorageConfig(cacheFile=CACHE_FILE)
home = Path.home()
CACHE_DIR = home.joinpath(".ceurws")
CACHE_FILE = CACHE_DIR.joinpath("ceurws.db")
CACHE_HTML = CACHE_DIR.joinpath("index.html")
CONFIG = StorageConfig(cacheFile=str(CACHE_FILE))


class Volume(JSONAble):
Expand Down Expand Up @@ -534,13 +533,13 @@ def getIndexHtml(self, force: bool = False):
get the index html
"""
cacheHtml = CEURWS.CACHE_HTML
if os.path.isfile(cacheHtml) and not force:
if cacheHtml.is_file() and not force:
with open(cacheHtml) as file:
html_page = file.read()
else:
req = Request(CEURWS.URL, headers={"User-Agent": "pyCEURMake"})
html_page = urlopen(req).read().decode()
Path(CEURWS.CACHE_DIR).mkdir(parents=True, exist_ok=True)
CEURWS.CACHE_DIR.mkdir(parents=True, exist_ok=True)
with open(cacheHtml, "w") as htmlFile:
print(html_page, file=htmlFile)
return html_page
Expand Down
4 changes: 2 additions & 2 deletions ceurws/models/ceur.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sqlmodel import Field, SQLModel


class Volume(SQLModel, table=True):
class Volume(SQLModel, table=True): # type: ignore
"""
a single CEUR-WS Volume
"""
Expand Down Expand Up @@ -58,7 +58,7 @@ class Volume(SQLModel, table=True):
tdtitle: Optional[str] = Field(default=None)


class Paper(SQLModel, table=True):
class Paper(SQLModel, table=True): # type: ignore
"""
Represents a paper with details such as authors, volume number, and title.
"""
Expand Down
10 changes: 5 additions & 5 deletions ceurws/models/dblp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sqlmodel import Field, SQLModel


class Scholar(SQLModel, table=True):
class Scholar(SQLModel, table=True): # type: ignore
"""
Represents a scholar with information fetched from DBLP and possibly other sources.
"""
Expand All @@ -21,7 +21,7 @@ class Scholar(SQLModel, table=True):
gnd_id: Optional[str] = None


class Paper(SQLModel, table=True):
class Paper(SQLModel, table=True): # type: ignore
"""
A paper indexed in DBLP with additional details. The paper URL is used as the unique identifier.
"""
Expand All @@ -33,7 +33,7 @@ class Paper(SQLModel, table=True):
pdf_url: Optional[str] = None


class Proceeding(SQLModel, table=True):
class Proceeding(SQLModel, table=True): # type: ignore
"""
A proceeding indexed in DBLP with additional details.
"""
Expand All @@ -44,7 +44,7 @@ class Proceeding(SQLModel, table=True):
dblp_event_id: Optional[str] = None


class Editorship(SQLModel, table=True):
class Editorship(SQLModel, table=True): # type: ignore
"""
Represents the relationship between a scholar and a proceeding, indicating the scholar's role as an editor.
"""
Expand All @@ -53,7 +53,7 @@ class Editorship(SQLModel, table=True):
dblp_author_id: str = Field(foreign_key="scholar.dblp_author_id", primary_key=True)


class Authorship(SQLModel, table=True):
class Authorship(SQLModel, table=True): # type: ignore
"""
Represents the relationship between a scholar and a paper, capturing the authorship details.
"""
Expand Down
22 changes: 11 additions & 11 deletions ceurws/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
"""

import gzip
import os
import shutil
import time
import urllib
import urllib.request
from pathlib import Path
from typing import Optional


Expand All @@ -32,7 +32,7 @@ def getFileContent(path: str):
return content

@staticmethod
def needsDownload(filePath: str, force: bool = False) -> bool:
def needsDownload(filePath: Path, force: bool = False) -> bool:
"""
check if a download of the given filePath is necessary that is the file
does not exist has a size of zero or the download should be forced
Expand All @@ -44,10 +44,10 @@ def needsDownload(filePath: str, force: bool = False) -> bool:
Return:
bool: True if a download for this file needed
"""
if not os.path.isfile(filePath):
if not filePath.is_file():
result = True
else:
stats = os.stat(filePath)
stats = filePath.stat()
size = stats.st_size
result = force or size == 0
return result
Expand All @@ -56,7 +56,7 @@ def needsDownload(filePath: str, force: bool = False) -> bool:
def downloadBackupFile(
url: str,
fileName: str,
targetDirectory: str,
targetDirectory: Path,
force: bool = False,
profile: bool = True,
):
Expand All @@ -73,19 +73,19 @@ def downloadBackupFile(
Returns:
Name of the extracted file with path to the backup directory
"""
extractTo = f"{targetDirectory}/{fileName}"
extractTo = targetDirectory.joinpath(fileName)
zipped = targetDirectory.joinpath(f"{fileName}.gz")
# we might want to check whether a new version is available
if Download.needsDownload(extractTo, force=force):
if not os.path.isdir(targetDirectory):
os.makedirs(targetDirectory)
zipped = f"{extractTo}.gz"
if not targetDirectory.is_dir():
targetDirectory.parent.mkdir(parents=True, exist_ok=True)
msg = f"Downloading {zipped} from {url} ... this might take a few seconds ..."
profiler = Profiler(msg=msg, profile=profile)
urllib.request.urlretrieve(url, zipped)
profiler.time(extraMsg=f" unzipping {extractTo} from {zipped}")
with gzip.open(zipped, "rb") as gzipped, open(extractTo, "wb") as unzipped:
shutil.copyfileobj(gzipped, unzipped)
if not os.path.isfile(extractTo):
if not extractTo.is_file():
raise Exception(f"could not extract {fileName} from {zipped}")
return extractTo

Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,6 @@ legacy_tox_ini = """

[[tool.mypy.overrides]]
module = "yaml,dateutil.parser"
ignore_missing_imports = true
ignore_missing_imports = true
follow_imports = "skip"
strict_optional = true
7 changes: 4 additions & 3 deletions tests/test_papertocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,24 +54,25 @@ def check_paper_toc_parser(
Returns:
list: a list of paper records
"""
paper_records = []
try:
record, soup = self.volumeParser.parse_volume(vol_number)
if debug:
print(json.dumps(record, indent=2))
if soup:
ptp = PaperTocParser(number=vol_number, soup=soup, debug=debug)
ptp = PaperTocParser(number=str(vol_number), soup=soup, debug=debug)
paper_records = ptp.parsePapers()
if debug:
print(json.dumps(paper_records, indent=2))
for paper_record in paper_records:
for key in paper_record:
counter[key] += 1
return paper_records
except Exception as ex:
counter["failure"] += 1
if show_failure:
print(f"{vol_number} paper toc parsing fails with {str(ex)})")
return []
finally:
return paper_records

def test_volExamples(self):
"""
Expand Down
3 changes: 2 additions & 1 deletion tests/test_volume_neo4j.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import unittest
from typing import Union

from ceurws.location import LocationLookup
from ceurws.volume_neo4j import Editor, Neo4j, Volume
Expand Down Expand Up @@ -29,7 +30,7 @@ def test_neo4j_available(self):
available = Neo4j.is_port_available(self.neo4j.host, port)
self.assertTrue(available, f"{service} service at {port}")

def create_test_volume(self, year: int = 2023) -> int:
def create_test_volume(self, year: int = 2023) -> Union[int, None]:
"""
Creates a test Volume node for the given year.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_volumeparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_parseEditors_stressTest(self):
count_homepages,
)

@unittest.skip
@unittest.skip("Only for manual editor extraction and debugging")
def test_parseEditor(self):
total = 3354
end = 0
Expand Down
4 changes: 2 additions & 2 deletions tests/test_wikidatasync.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import dataclasses
import pprint
import time
import typing
from typing import Union
import unittest

from ez_wikidata.wdproperty import PropertyMapping, WdDatatype
Expand Down Expand Up @@ -599,7 +599,7 @@ def test_getEventsOfProceedingsByVolnumber(self):

@dataclasses.dataclass
class TestParam:
volumenumber: typing.Union[int, str]
volumenumber: Union[int, str]
expected_qids: list[str]

test_params = [
Expand Down

0 comments on commit ea3321b

Please sign in to comment.