241 changes: 241 additions & 0 deletions scielo_usage_counter/translator/books.py
@@ -0,0 +1,241 @@
import re

from urllib.parse import urlparse, parse_qsl

from scielo_usage_counter.values import (
MEDIA_LANGUAGE_UNDEFINED,
MEDIA_FORMAT_HTML,
MEDIA_FORMAT_PDF,
MEDIA_FORMAT_XML,
MEDIA_FORMAT_UNDEFINED,
CONTENT_TYPE_ABSTRACT,
CONTENT_TYPE_FULL_TEXT,
CONTENT_TYPE_UNDEFINED,
DEFAULT_SCIELO_ISSN,
)


# Patterns to support parameter extraction for SciELO Books
# Pattern for PDF files: /id/{book_id}/pdf/{filename}.pdf
REGEX_BOOKS_SITE_PDF = re.compile(r'/id/(?P<book_id>\w+)/pdf/(?P<filename>[\w\-]+\.pdf)', re.IGNORECASE)
# Pattern for chapter pages: /id/{book_id}/{chapter_number}
REGEX_BOOKS_SITE_CHAPTER = re.compile(r'/id/(?P<book_id>\w+)/(?P<chapter_id>\d+)(?:[?#]|$)', re.IGNORECASE)
# Pattern for book landing pages: /id/{book_id}
REGEX_BOOKS_SITE_BOOK = re.compile(r'/id/(?P<book_id>\w+)(?:[?#]|$)', re.IGNORECASE)
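# Examples (the PDF and book URLs below appear in tests/fixtures/usage.books.log;
# the chapter URL is illustrative only):
#   /id/y742k/pdf/magalhaes-9788578791889-18.pdf -> book_id='y742k', filename='magalhaes-9788578791889-18.pdf'
#   /id/q7gtd                                    -> book_id='q7gtd'
#   /id/q7gtd/3                                  -> book_id='q7gtd', chapter_id='3' (hypothetical)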


class URLTranslatorBooksSite:
"""
Translator for SciELO Livros (Books) URLs.

This class handles URL translation for the SciELO Books platform, extracting
relevant metadata such as book IDs, chapter IDs, media formats, and content types.
"""

def __init__(self, journals_metadata, articles_metadata):
"""
Initialize the URLTranslatorBooksSite.

:param journals_metadata: Dictionary containing journal metadata
:param articles_metadata: Dictionary containing article/book metadata
"""
self.journals_metadata = journals_metadata
self.articles_metadata = articles_metadata

def pipeline_translate(self, url):
"""
Execute the complete translation pipeline for a SciELO Books URL.

:param url: URL string to translate
:return: Dictionary containing extracted metadata
"""
self.url_params = self.extract_url_params(url)

book_id, chapter_id, filename = self.extract_identifiers(url)
pid_generic = self._build_pid_generic(book_id, chapter_id)

media_format = self.extract_media_format(url, filename)
media_language = self.extract_media_language(pid_generic)
content_type = self.extract_content_type(url, chapter_id, filename)
scielo_issn = self.extract_issn(book_id)

return {
'scielo_issn': scielo_issn,
'journal_main_title': self.journals_metadata.get('issn_to_title', {}).get(scielo_issn),
'journal_subject_area_capes': self.journals_metadata.get('issn_to_subject_area_capes', {}).get(scielo_issn),
'journal_subject_area_wos': self.journals_metadata.get('issn_to_subject_area_wos', {}).get(scielo_issn),
'journal_publisher_name': self.journals_metadata.get('issn_to_publisher_name', {}).get(scielo_issn),
'journal_acronym': self.journals_metadata.get('issn_to_acronym', {}).get(scielo_issn),
'pid_v2': None,
'pid_v3': None,
'pid_generic': pid_generic,
'book_id': book_id,
'chapter_id': chapter_id,
'media_format': media_format,
'media_language': media_language,
'content_type': content_type,
'year_of_publication': self.articles_metadata.get('pid_generic_to_publication_date', {}).get(pid_generic),
}

def extract_url_params(self, url):
"""
        Extract query parameters from the URL, mapping 'lang' to media_language and 'format' to media_format.

:param url: URL string
:return: Dictionary of URL parameters
"""
url_params = {
'book_id': '',
'chapter_id': '',
'media_format': '',
'media_language': '',
}

url_parsed = urlparse(url)
params = dict(parse_qsl(url_parsed.query))

for k, v in params.items():
if k == 'lang':
url_params['media_language'] = v
elif k == 'format':
url_params['media_format'] = v
else:
url_params[k] = v

return url_params

def extract_identifiers(self, url):
"""
Extract book ID and chapter ID from URL.

Supports SciELO Books URL patterns:
- /id/{book_id}/pdf/{filename}.pdf - PDF download
- /id/{book_id}/{chapter_number} - Chapter page
- /id/{book_id} - Book landing page

:param url: URL string
:return: Tuple of (book_id, chapter_id, filename)
"""
# Try to match PDF pattern first (most specific)
match = re.search(REGEX_BOOKS_SITE_PDF, url)
if match:
book_id = match.group('book_id')
filename = match.group('filename')
# Extract chapter number from filename if present
# Pattern: author-ISBN-CHAPTER.pdf where ISBN is 10 or 13 digits
# Examples: magalhaes-9788578791889-18.pdf (chapter 18), sadek-9788579820342.pdf (no chapter)
chapter_match = re.search(r'-(\d{10,13})-(\d+)\.pdf$', filename)
chapter_id = chapter_match.group(2) if chapter_match else None
return book_id, chapter_id, filename

# Try to match chapter pattern
match = re.search(REGEX_BOOKS_SITE_CHAPTER, url)
if match:
return match.group('book_id'), match.group('chapter_id'), None

# Try to match book landing page pattern
match = re.search(REGEX_BOOKS_SITE_BOOK, url)
if match:
return match.group('book_id'), None, None

return None, None, None

def _build_pid_generic(self, book_id, chapter_id):
"""
Build a generic PID from book and chapter IDs.

:param book_id: Book identifier
:param chapter_id: Chapter identifier (optional)
:return: Generic PID string or None
"""
if not book_id:
return None

if chapter_id:
return f"book:{book_id}/chapter:{chapter_id}"

return f"book:{book_id}"

def extract_media_format(self, url, filename=None):
"""
Determine the media format from URL.

:param url: URL string
:param filename: Optional filename from URL
:return: Media format string (html, pdf, xml, etc.)
"""
# Check for explicit format in query params
if self.url_params.get('media_format'):
return self.url_params['media_format']

# Check if it's a PDF file
if filename and filename.endswith('.pdf'):
return MEDIA_FORMAT_PDF

# Check URL patterns for PDF
if re.search(REGEX_BOOKS_SITE_PDF, url):
return MEDIA_FORMAT_PDF

# Default to HTML for book/chapter landing pages
return MEDIA_FORMAT_HTML

def extract_media_language(self, pid_generic):
"""
Extract media language from metadata or URL parameters.

:param pid_generic: Generic PID
:return: Language code string
"""
media_language = self.url_params.get('media_language')
if media_language:
return media_language

# Try to get from metadata if available
# Note: Using pid_v2_to_default_lang as a fallback for compatibility,
# though books use pid_generic format. This allows for potential
# future metadata storage if book language metadata becomes available.
if pid_generic:
stored_lang = self.articles_metadata.get('pid_v2_to_default_lang', {}).get(pid_generic)
if stored_lang:
return stored_lang

return MEDIA_LANGUAGE_UNDEFINED

def extract_content_type(self, url, chapter_id, filename=None):
"""
Determine content type from URL and identifiers.

:param url: URL string
:param chapter_id: Chapter identifier (optional)
:param filename: Optional filename from URL
:return: Content type string
"""
        # Ensure URL parameters have been extracted (in case pipeline_translate was not run)
if not hasattr(self, 'url_params'):
self.url_params = self.extract_url_params(url)

# PDF files are full text
if filename and filename.endswith('.pdf'):
return CONTENT_TYPE_FULL_TEXT

# Check URL patterns for PDF
if re.search(REGEX_BOOKS_SITE_PDF, url):
return CONTENT_TYPE_FULL_TEXT

# Chapter pages are considered full text
if chapter_id:
return CONTENT_TYPE_FULL_TEXT

# Book landing pages without chapter are abstracts
return CONTENT_TYPE_ABSTRACT

def extract_issn(self, book_id):
"""
        Extract the ISSN for a book. Books do not carry ISSNs the way journals do, so a default SciELO ISSN is returned.

:param book_id: Book identifier
:return: ISSN string (defaults to DEFAULT_SCIELO_ISSN)
"""
# Books typically don't have ISSNs (journals do)
# Using default ISSN for books platform
return DEFAULT_SCIELO_ISSN
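
A minimal usage sketch of the new translator, assuming empty metadata dicts (so the journal fields and year_of_publication resolve to None), run against one of the PDF URLs from tests/fixtures/usage.books.log below:

from scielo_usage_counter.translator.books import URLTranslatorBooksSite

# Empty metadata dicts: title, subject area, publisher and publication date lookups fall back to None.
translator = URLTranslatorBooksSite(journals_metadata={}, articles_metadata={})
result = translator.pipeline_translate('/id/y742k/pdf/magalhaes-9788578791889-18.pdf')

# Expected, given the regexes defined in books.py:
#   book_id      -> 'y742k'
#   chapter_id   -> '18' (from the '-9788578791889-18' suffix of the filename)
#   pid_generic  -> 'book:y742k/chapter:18'
#   media_format -> MEDIA_FORMAT_PDF, content_type -> CONTENT_TYPE_FULL_TEXT
print(result['pid_generic'], result['media_format'], result['content_type'])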
9 changes: 9 additions & 0 deletions scielo_usage_counter/url_translator.py
@@ -8,6 +8,7 @@
from scielo_usage_counter.translator.opac_alpha import URLTranslatorOPACAlphaSite
from scielo_usage_counter.translator.dataverse import URLTranslatorDataverseSite
from scielo_usage_counter.translator.preprints import URLTranslatorPreprintsSite
from scielo_usage_counter.translator.books import URLTranslatorBooksSite


# Patterns to support identify a URL as a Classic Site URL
@@ -52,6 +53,13 @@
re.compile(r'/?plugins/generic/(hypothesis|pdfJsViewer)/', re.IGNORECASE),
]

# Patterns to support identifying a URL as a Books Site URL
PATTERNS_BOOKS_SITE = [
re.compile(r'/id/\w+/pdf/[\w\-]+\.pdf', re.IGNORECASE), # /id/{book_id}/pdf/{filename}.pdf
re.compile(r'/id/\w+/\d+(?:[?#]|$)', re.IGNORECASE), # /id/{book_id}/{chapter_number}
re.compile(r'/id/\w+(?:[?#]|$)', re.IGNORECASE), # /id/{book_id}
]


class URLTranslationManager:
def __init__(self, journals_metadata, articles_metadata, translator=None):
@@ -187,6 +195,7 @@ def identify_translator_class(self, url):
parsed_url = urlparse(url)

for pattern, url_translator_class in [
(PATTERNS_BOOKS_SITE, URLTranslatorBooksSite),
(PATTERNS_CLASSIC_SITE, URLTranslatorClassicSite),
(PATTERNS_OPAC_SITE, URLTranslatorOPACSite),
(PATTERNS_PREPRINTS_SITE, URLTranslatorPreprintsSite),
10 changes: 10 additions & 0 deletions tests/fixtures/usage.books.log
@@ -0,0 +1,10 @@
45.229.23.28 - - [20/May/2023:23:59:49 -0300] "GET /id/y742k/pdf/magalhaes-9788578791889-18.pdf HTTP/2.0" 200 342341 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
66.249.72.48 - - [20/May/2023:23:59:50 -0300] "GET /id/82r9t/pdf/sadek-9788579820342.pdf HTTP/1.1" 302 239 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.5672.126 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
179.0.70.35 - - [20/May/2023:23:59:53 -0300] "GET /id/5v9s3/pdf/rivera-9788575413036.pdf HTTP/2.0" 200 1873485 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
179.0.70.35 - - [21/May/2023:00:00:13 -0300] "GET /id/sq6d8/pdf/deslandes-9788575413296.pdf HTTP/2.0" 200 1073438 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
181.67.82.230 - - [21/May/2023:00:00:17 -0300] "GET /id/yjxdq/pdf/mororo-9788574554938-01.pdf HTTP/2.0" 200 408451 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
201.8.90.180 - - [21/May/2023:00:00:25 -0300] "GET /id/q7gtd HTTP/2.0" 200 4298 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
187.19.244.135 - - [21/May/2023:00:00:28 -0300] "GET /id/4ndgv HTTP/2.0" 200 4112 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
187.19.244.135 - - [21/May/2023:00:00:42 -0300] "GET /id/4ndgv/pdf/paim-9788575413593-05.pdf HTTP/2.0" 200 195911 "https://books.scielo.org/id/4ndgv" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
192.168.1.96 - - [21/May/2023:00:00:39 -0300] "GET /search/index.php?output=rss&site=livros&col=&lang=en&sort=publication_date+desc HTTP/1.1" 200 34222 "-" "WordPress/6.2.1; https://books.scielo.org"
192.168.169.125 - suporte.scielo [21/May/2023:00:00:40 -0300] "GET /id/gbvb4 HTTP/1.1" 200 3934 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36"