Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79,484 changes: 40,113 additions & 39,371 deletions src/paperqa/clients/client_data/journal_quality.csv

Large diffs are not rendered by default.

152 changes: 147 additions & 5 deletions src/paperqa/clients/journal_quality.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
from __future__ import annotations

import asyncio
import csv
import logging
import os
import tempfile
from collections.abc import Awaitable, Callable, Sequence
from pathlib import Path
from typing import Any, ClassVar

import anyio
import httpx
from pydantic import ValidationError
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TextColumn,
TimeElapsedColumn,
)

from paperqa.types import DocDetails

from .client_models import JournalQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)

# Bundled snapshot of journal quality data, shipped with the package and
# overwritten in place when this module is run as a refresh script (see main()).
DEFAULT_JOURNAL_QUALITY_CSV_PATH = (
    Path(__file__).parent / "client_data" / "journal_quality.csv"
)

# TODO: refresh script for journal quality data

Expand All @@ -25,13 +41,11 @@ class JournalQualityPostProcessor(MetadataPostProcessor[JournalQuery]):
def __init__(self, journal_quality_path: os.PathLike | str | None = None) -> None:
if journal_quality_path is None:
# Construct the path relative to module
self.journal_quality_path = str(
os.path.join(
os.path.dirname(__file__), "client_data", "journal_quality.csv"
)
self.journal_quality_path: str | os.PathLike = (
DEFAULT_JOURNAL_QUALITY_CSV_PATH
)
else:
self.journal_quality_path = str(journal_quality_path)
self.journal_quality_path = journal_quality_path
self.data: dict[str, Any] | None = None

def load_data(self) -> None:
Expand Down Expand Up @@ -72,3 +86,131 @@ def query_creator(self, doc_details: DocDetails, **kwargs) -> JournalQuery | Non
"Must have a valid journal name to query journal quality data."
)
return None


# SEE: https://en.wikipedia.org/wiki/JUFO
# Bulk CSV export endpoint of the JUFO portal. Each repeated `col` query
# parameter selects one column to include in the download; `isActive=true`
# restricts the export to active entries.
JUFO_PORTAL_DOWNLOAD_QUALITY_URL = (
    "https://jfp.csc.fi/jufoportal_base/api/download?query=&isActive=true&col=Jufo_ID"
    "&col=Name&col=Abbreviation&col=Level&col=ISSNL&col=ISSN1&col=ISSN2&col=ISBN"
    "&col=Other_Title&col=Title_details&col=Continues&col=Continued_by&col=Website"
    "&col=Country&col=country_code&col=Publisher&col=Language&col=lang_code3"
    "&col=lang_code2&col=Year_Start&col=Year_End&col=isScientific&col=isProfessional"
    "&col=isGeneral&col=Type_fi&col=Type_sv&col=Type_en&col=Jufo_History"
)


async def download_file(
    dest_path: str | os.PathLike,
    url: str = JUFO_PORTAL_DOWNLOAD_QUALITY_URL,
    client: httpx.AsyncClient | None = None,
) -> Path:
    """Stream the file at ``url`` to ``dest_path``, with a rich progress bar.

    Args:
        dest_path: Destination file path, created or overwritten.
        url: Source URL, defaulting to the JUFO portal quality export.
        client: Optional pre-configured async HTTP client. When omitted, a
            short-lived client is created for and closed after this download.

    Returns:
        The destination path as a ``Path``.
    """
    target = Path(dest_path)

    async def _stream_to_disk(http_client: httpx.AsyncClient) -> None:
        bar = Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TimeElapsedColumn(),
        )
        async with http_client.stream("GET", url, timeout=60) as response:
            response.raise_for_status()
            # Content-Length may be absent or zero; None makes the bar indeterminate
            total_bytes = int(response.headers.get("Content-Length", 0)) or None
            task = bar.add_task(f"Downloading {target.name}", total=total_bytes)
            with bar:
                async with await anyio.open_file(target, "wb") as out_file:
                    async for chunk in response.aiter_bytes(chunk_size=2048):
                        # Skip keep-alive empty chunks
                        if chunk:
                            await out_file.write(chunk)
                            bar.update(task, advance=len(chunk))

    if client is not None:
        await _stream_to_disk(client)
    else:
        async with httpx.AsyncClient() as owned_client:
            await _stream_to_disk(owned_client)
    return target


async def process_csv(
    file_path: str | os.PathLike,
    override_allowlist: Sequence[tuple[str, int]] | None = (
        ("annual review of pathology", 2),
        ("annual review of pathology: mechanisms of disease", 2),
        ("biochimica et biophysica acta (bba) - bioenergetics", 1),
        ("biochimica et biophysica acta (bba) - biomembranes", 1),
        ("biochimica et biophysica acta (bba) - gene regulatory mechanisms", 1),
        ("biochimica et biophysica acta (bba) - general subjects", 1),
        (
            "biochimica et biophysica acta (bba) - molecular and cell biology of lipids",
            1,
        ),
        ("biochimica et biophysica acta (bba) - molecular basis of disease", 1),
        ("biochimica et biophysica acta (bba) - molecular cell research", 1),
        ("biochimica et biophysica acta (bba) - proteins and proteomics", 1),
        ("biochimica et biophysica acta (bba) - reviews on cancer", 1),
        ("bmc evolutionary biology", 2),
        ("pnas", 3),
        ("proceedings of the national academy of sciences", 3),
    ),
    override_blocklist: Sequence[tuple[str, int]] | None = (("scientific reports", 0),),
    records_callback: Callable[[Sequence[tuple[str, int]]], Awaitable] | None = None,
) -> list[tuple[str, int]]:
    """Parse a JUFO quality CSV into sorted ``(journal name, level)`` records.

    Rows are deduplicated on (lowercased name, level) — for ties, the last
    occurrence's original casing wins. Allowlist entries are then inserted,
    and blocklist entries removed, using the same case-insensitive key.

    Args:
        file_path: Path to a CSV with at least ``Name`` and ``Level`` columns.
        override_allowlist: Extra ``(name, level)`` records to force-include.
        override_blocklist: ``(name, level)`` records to force-exclude.
        records_callback: Optional awaitable invoked with the final records.

    Returns:
        Records sorted by their case-insensitive key.
    """
    async with await anyio.open_file(file_path, encoding="utf-8") as handle:
        raw_text = await handle.read()

    rows = raw_text.splitlines()
    bar = Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TimeElapsedColumn(),
        MofNCompleteColumn(),
    )
    # Exclude the header row from the progress total
    task = bar.add_task("Processing", total=len(rows) - 1)
    # Keys are case-insensitive, values retain original casing
    deduped: dict[tuple[str, int], tuple[str, int]] = {}
    with bar:
        for entry in csv.DictReader(rows):
            # Non-numeric levels (e.g. blank) fall back to the undefined sentinel
            quality = (
                int(entry["Level"])
                if str(entry.get("Level", "")).isdigit()
                else DocDetails.UNDEFINED_JOURNAL_QUALITY
            )
            record = (entry["Name"], quality)
            deduped[record[0].lower(), record[1]] = record
            bar.update(task, advance=1)
    if override_allowlist:
        for name, level in override_allowlist:
            deduped[name.lower(), level] = (name, level)
    if override_blocklist:
        for name, level in override_blocklist:
            deduped.pop((name.lower(), level), None)
    sorted_records = [deduped[key] for key in sorted(deduped)]

    if records_callback is not None:
        await records_callback(sorted_records)
    return sorted_records


async def main() -> None:
    """Refresh the bundled journal quality CSV from the JUFO portal.

    Downloads the current JUFO export into a temporary directory, processes
    it into (name, level) records, and rewrites the packaged CSV with
    lowercased names under a ``clean_name,quality`` header.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        source_csv = await download_file(
            dest_path=Path(tmpdir) / "journal_quality.csv"
        )
        quality_records = await process_csv(source_csv)

    with open(  # noqa: ASYNC230
        DEFAULT_JOURNAL_QUALITY_CSV_PATH, "w", encoding="utf-8"
    ) as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["clean_name", "quality"])
        # Names are normalized to lowercase in the shipped data
        writer.writerows((name.lower(), quality) for name, quality in quality_records)


# Allow running this module directly as the refresh script for the CSV data.
if __name__ == "__main__":
    asyncio.run(main())

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading