Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ geoip2==4.4.0
mysqlclient==2.0.3
requests==2.26.0
reverse_geocoder==1.4
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.4.0#egg=scielo_log_validator
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.5.1#egg=scielo_log_validator
sqlalchemy==1.4.26
wget==3.2
65 changes: 55 additions & 10 deletions scielo_usage_counter/log_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,16 @@ def timedelta_from_timezone(self, timezone):
return datetime.timedelta(hours=hours, minutes=minutes)

def format_date(self, date, timezone):
# Check if date is Unix timestamp (bunnynet format)
if timezone is None and date and date.isdigit():
try:
unix_ts = int(date)
dt_obj = datetime.datetime.utcfromtimestamp(unix_ts)
return dt_obj.strftime('%Y-%m-%d %H:%M:%S')
except (ValueError, OSError):
return None

# Standard Apache date format
try:
date = datetime.datetime.strptime(date, '%d/%b/%Y:%H:%M:%S')
date -= self.timedelta_from_timezone(timezone)
Expand All @@ -377,6 +387,18 @@ def format_client_version(self, device):
return device.client_version() or device.UNKNOWN

def match_with_best_pattern(self, line):
# Detect bunnynet pipe-delimited format by pipe count
pipe_count = line.count('|')
if pipe_count >= 11:
bunny_match = re.match(values.PATTERN_BUNNYCDN_LOG_FORMAT, line)
if bunny_match:
extracted = bunny_match.groupdict()
ip_addr = extracted.get('ip', '')
ip_type = self.get_ip_origin_type(ip_addr)
if ip_type != IP_ORIGIN_UNKNOWN:
return bunny_match, ip_addr

# Try standard Apache patterns
patterns = [
values.PATTERN_NCSA_EXTENDED_LOG_FORMAT,
values.PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN,
Expand Down Expand Up @@ -446,22 +468,51 @@ def parse_line(self, line):
}

data = match.groupdict()

# Check if this is bunnynet format (has unix_ts field)
is_bunnynet = 'unix_ts' in data

if is_bunnynet:
# Bunnynet logs don't have explicit method, assume GET
processed_line['http_method'] = 'GET'
processed_line['http_response_status'] = data.get('status')
processed_line['user_agent'] = self.format_user_agent(data.get('user_agent'))
processed_line['url'] = data.get('path')
processed_line['ip_address'] = ip_value

# Bunnynet provides country code directly
processed_line['country_code'] = data.get('geo_code')
if not processed_line['country_code']:
processed_line['country_code'] = self.geoip.ip_to_country_code(processed_line['ip_address'])

# Handle Unix timestamp
unix_ts = data.get('unix_ts')
processed_line['local_datetime'] = self.format_date(unix_ts, None)
else:
# Standard Apache log format
processed_line['http_method'] = data.get('method')
processed_line['http_response_status'] = data.get('status')
processed_line['user_agent'] = self.format_user_agent(data.get('user_agent'))
processed_line['url'] = data.get('path')
processed_line['ip_address'] = ip_value
processed_line['country_code'] = self.geoip.ip_to_country_code(processed_line['ip_address'])

date = data.get('date')
timezone = data.get('timezone')
processed_line['local_datetime'] = self.format_date(date, timezone)

processed_line['http_method'] = data.get('method')
# Validation checks
if not self.has_valid_method(processed_line['http_method']):
self.stats.increment('ignored_lines_invalid_method')
processed_line['is_valid'] = False

processed_line['http_response_status'] = data.get('status')
if not self.has_valid_status(processed_line['http_response_status']):
if self.status_is_redirect(processed_line['http_response_status']):
self.stats.increment('ignored_lines_http_redirects')
elif self.status_is_error(processed_line['http_response_status']):
self.stats.increment('ignored_lines_http_errors')
processed_line['is_valid'] = False

processed_line['user_agent'] = self.format_user_agent(data.get('user_agent'))

if self.user_agent_is_bot(processed_line['user_agent']):
self.stats.increment('ignored_lines_bot')
processed_line['is_valid'] = False
Expand All @@ -484,20 +535,14 @@ def parse_line(self, line):
self.stats.increment('ignored_lines_invalid_client_version')
processed_line['is_valid'] = False

processed_line['url'] = data.get('path')
if not self.has_supported_url(processed_line['url']):
self.stats.increment('ignored_lines_static_resources')
processed_line['is_valid'] = False

processed_line['ip_address'] = ip_value
processed_line['country_code'] = self.geoip.ip_to_country_code(processed_line['ip_address'])
if not processed_line['country_code']:
self.stats.increment('ignored_lines_invalid_country_code')
processed_line['is_valid'] = False

date = data.get('date')
timezone = data.get('timezone')
processed_line['local_datetime'] = self.format_date(date, timezone)
if not processed_line['local_datetime']:
self.stats.increment('ignored_lines_invalid_local_datetime')
processed_line['is_valid'] = False
Expand Down
7 changes: 7 additions & 0 deletions scielo_usage_counter/translator/opac_bunnynet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .opac import URLTranslatorOPACSite


class BunnynetOPACBridge(URLTranslatorOPACSite):
    """Subclass of ``URLTranslatorOPACSite`` that adds no behaviour of its own.

    Construction is forwarded unchanged to the parent class.
    """

    def __init__(self, jrnl_data, artcl_data):
        # Explicit two-argument super(); resolves to the same parent
        # initialiser as the zero-argument form.
        super(BunnynetOPACBridge, self).__init__(jrnl_data, artcl_data)
16 changes: 16 additions & 0 deletions scielo_usage_counter/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@
r'(?P<domain>.*?)\s' + PATTERN_COMMON_LOG_FORMAT_WITH_IP_LIST + r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)

# Bunny.net (BunnyCDN) log format: 12 pipe-delimited fields, in this order:
#   cache status | status code | timestamp | bytes sent | pull zone ID |
#   remote IP | referer | URL | edge location | user agent | request ID |
#   country code
# The 9th field is the code of the edge server location that served the
# request, NOT the visitor's country. The visitor's country code is the 12th
# field, so that is the one exposed under the ``geo_code`` group name.
# The character classes are unchanged, so the same lines match as before.
PATTERN_BUNNYCDN_LOG_FORMAT = (
    r'(?P<cache_result>[A-Z]+)\|'
    r'(?P<status>\d+)\|'
    r'(?P<unix_ts>\d+)\|'
    r'(?P<length>\d+)\|'
    r'(?P<zone_identifier>\d+)\|'
    r'(?P<ip>[\w*.:-]+)\|'
    r'(?P<referrer>[^|]*)\|'
    r'(?P<path>[^|]+)\|'
    r'(?P<edge_location>[A-Z]{2})\|'
    r'(?P<user_agent>[^|]+)\|'
    r'(?P<request_identifier>[a-f0-9]+)\|'
    r'(?P<geo_code>[A-Z]{2})'
)

# https://github.com/matomo-org/matomo-log-analytics/blob/4.x-dev/import_logs.py
EXTENSIONS_STATIC = set([
'gif',
Expand Down
4 changes: 4 additions & 0 deletions tests/fixtures/usage.bunnynet.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
HIT|200|1757548785|9146|4339610|240e:3b0:a00e:10a3::|-|https://www.scielo.br/media/images/FAPESP.png|CN|Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36|7caaeff65a64c5235f863868a7c94d69|CN
HIT|200|1757548786|5432|4339610|186.225.0.1|-|https://www.scielo.br/j/neco/a/dqLRqnpmnncSmnzMCB8bzPG/|BR|Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36|8dbbeef65a64c5235f863868a7c94d70|BR
MISS|200|1757548787|12345|4339610|177.52.0.1|https://www.google.com|https://www.scielo.br/j/psoc/a/hbSYnTbyNfzxcWT3FpXrL5G/?lang=es|BR|Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36|9eccfef65a64c5235f863868a7c94d71|BR
HIT|200|1757548788|7890|4339610|200.144.0.1|-|https://www.scielo.br/j/rbz/a/cKnLLBn5NnshCX93Y6qYpHv/?format=pdf|BR|Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0|afdd0ef65a64c5235f863868a7c94d72|BR
2 changes: 1 addition & 1 deletion tests/fixtures/usage.cub.log.processed.summary
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ignored_lines_static_resources ignored_lines_bot ignored_lines_invalid_method ignored_lines_invalid_user_agent ignored_lines_invalid_client_name ignored_lines_invalid_client_version ignored_lines_invalid_country_code ignored_lines_invalid_local_datetime ignored_lines_http_redirects ignored_lines_http_errors total_ignored_lines total_imported_lines lines_parsed total_time
36 16 0 0 0 0 0 0 0 0 46 2 48 0.03392291069030762
36 16 0 0 0 0 0 0 0 0 46 2 48 0.021204233169555664
2 changes: 1 addition & 1 deletion tests/fixtures/usage.dat.processed.summary
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ignored_lines_static_resources ignored_lines_bot ignored_lines_invalid_method ignored_lines_invalid_user_agent ignored_lines_invalid_client_name ignored_lines_invalid_client_version ignored_lines_invalid_country_code ignored_lines_invalid_local_datetime ignored_lines_http_redirects ignored_lines_http_errors total_ignored_lines total_imported_lines lines_parsed total_time
1172 19284 290 0 0 0 12035 0 50 670 32518 3954 36472 13.707395792007446
1172 19284 290 0 0 0 12035 0 50 670 32518 3954 36472 7.313845872879028
2 changes: 1 addition & 1 deletion tests/fixtures/usage.esp.log.processed.summary
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ignored_lines_static_resources ignored_lines_bot ignored_lines_invalid_method ignored_lines_invalid_user_agent ignored_lines_invalid_client_name ignored_lines_invalid_client_version ignored_lines_invalid_country_code ignored_lines_invalid_local_datetime ignored_lines_http_redirects ignored_lines_http_errors total_ignored_lines total_imported_lines lines_parsed total_time
46 1 0 0 0 0 0 0 0 0 47 17 64 0.11307644844055176
46 1 0 0 0 0 0 0 0 0 47 17 64 0.06390190124511719
2 changes: 1 addition & 1 deletion tests/fixtures/usage.log.processed.summary
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ignored_lines_static_resources ignored_lines_bot ignored_lines_invalid_method ignored_lines_invalid_user_agent ignored_lines_invalid_client_name ignored_lines_invalid_client_version ignored_lines_invalid_country_code ignored_lines_invalid_local_datetime ignored_lines_http_redirects ignored_lines_http_errors total_ignored_lines total_imported_lines lines_parsed total_time
185 3 2 0 0 0 2 1 4 3 187 13 200 0.19219541549682617
185 3 2 0 0 0 2 1 4 3 187 13 200 0.10813069343566895
43 changes: 43 additions & 0 deletions tests/test_log_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,3 +566,46 @@ def test_increment(self):
self.assertEqual(self.stats.total_ignored_lines, 50)
self.assertEqual(self.stats.total_imported_lines, 50)
self.assertEqual(self.stats.lines_parsed, 100)


class TestBunnynetLogParsing(unittest.TestCase):
    """Checks that LogParser handles Bunny.net pipe-delimited log lines."""

    @classmethod
    def setUpClass(cls):
        # A single parser instance is shared by every test in this class.
        cls.maxDiff = None
        cls.lp = log_handler.LogParser(
            mmdb_path='tests/fixtures/map.mmdb',
            robots_path='tests/fixtures/counter-robots.txt'
        )

    def test_bunnynet_log_format_date(self):
        """Test Unix timestamp conversion for bunnynet logs"""
        unix_ts = '1757548785'
        timezone = None
        result = self.lp.format_date(unix_ts, timezone)
        self.assertEqual(result, '2025-09-10 23:59:45')

    def test_bunnynet_log_pattern_match(self):
        """Test bunnynet pipe-delimited log pattern matching"""
        log_line = 'HIT|200|1757548786|5432|4339610|186.225.0.1|-|https://www.scielo.br/j/neco/a/test/|BR|Mozilla/5.0|8dbbeef65a64c5235f863868a7c94d70|BR'
        match, ip = self.lp.match_with_best_pattern(log_line)
        self.assertIsNotNone(match)
        self.assertEqual(ip, '186.225.0.1')

        data = match.groupdict()
        self.assertEqual(data.get('status'), '200')
        self.assertEqual(data.get('unix_ts'), '1757548786')
        self.assertEqual(data.get('path'), 'https://www.scielo.br/j/neco/a/test/')

    def test_bunnynet_full_line_parse(self):
        """Test full bunnynet log line parsing"""
        log_line = 'HIT|200|1757548786|5432|4339610|186.225.0.1|-|https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1806-37132013000500595|BR|Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36|8dbbeef65a64c5235f863868a7c94d70|BR'
        self.lp.output_mode = 'dict'
        result = self.lp.parse_line(log_line)

        # Previously the assertions sat inside ``if result:``, so the test
        # passed silently whenever parse_line returned nothing. Report that
        # case as a skip so it shows up in the test run.
        # NOTE(review): tighten to assertIsNotNone once it is confirmed that
        # parse_line always returns a record for this sample line.
        if not result:
            self.skipTest('parse_line returned no record for the bunnynet sample line')

        self.assertEqual(result['http_response_status'], '200')
        self.assertEqual(result['ip_address'], '186.225.0.1')
        self.assertEqual(result['country_code'], 'BR')
        self.assertEqual(result['local_datetime'], '2025-09-10 23:59:46')

112 changes: 112 additions & 0 deletions tests/translator/test_opac_bunnynet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import unittest

from scielo_usage_counter.url_translator import URLTranslationManager
from scielo_usage_counter.translator.opac_bunnynet import BunnynetOPACBridge
from scielo_usage_counter.values import (
MEDIA_FORMAT_HTML,
MEDIA_FORMAT_PDF,
CONTENT_TYPE_FULL_TEXT,
)


class TestBunnynetTranslator(unittest.TestCase):
    """Exercise URLTranslationManager.translate with OPAC-style article URLs."""

    @staticmethod
    def _journal(acronym, issn, title, publisher, subject_area, wos_subject_area):
        """Build one journal metadata record (single ISSN, single subject)."""
        return {
            'acronym': acronym,
            'scielo_issn': issn,
            'issns': [issn],
            'title': title,
            'publisher_name': publisher,
            'subject_areas': [subject_area],
            'wos_subject_areas': [wos_subject_area],
        }

    @staticmethod
    def _article(pid_v3, default_lang, text_langs, issn, year):
        """Build one article metadata record with an empty pid_v2 and no files."""
        return {
            'pid_v2': '',
            'pid_v3': pid_v3,
            'default_lang': default_lang,
            'text_langs': list(text_langs),
            'scielo_issn': issn,
            'publication_year': year,
            'files': [],
        }

    def setUp(self):
        self.journals_metadata = [
            self._journal(
                'neco', '0103-6351', 'Nova Economia',
                'Universidade Federal de Minas Gerais',
                'Economics', 'ECONOMICS',
            ),
            self._journal(
                'psoc', '1807-0310', 'Psicologia & Sociedade',
                'Universidade Federal do Rio Grande do Sul',
                'Psychology', 'PSYCHOLOGY, SOCIAL',
            ),
            self._journal(
                'rbz', '1516-3598', 'Revista Brasileira de Zootecnia',
                'Sociedade Brasileira de Zootecnia',
                'Agricultural Sciences', 'AGRICULTURE, DAIRY & ANIMAL SCIENCE',
            ),
        ]
        self.articles_metadata = [
            self._article('dqLRqnpmnncSmnzMCB8bzPG', 'pt', ('pt', 'en'), '0103-6351', '2021'),
            self._article('hbSYnTbyNfzxcWT3FpXrL5G', 'es', ('es', 'en'), '1807-0310', '2019'),
            self._article('cKnLLBn5NnshCX93Y6qYpHv', 'en', ('en', 'es'), '1516-3598', '2020'),
        ]
        self.tm = URLTranslationManager(self.journals_metadata, self.articles_metadata)

    def _assert_full_text(self, url, issn, pid_v3, media_format, language):
        """Translate *url* and check every field the tests care about."""
        translated = self.tm.translate(url)

        self.assertEqual(translated['scielo_issn'], issn)
        self.assertIsNone(translated['pid_v2'])
        self.assertEqual(translated['pid_v3'], pid_v3)
        self.assertEqual(translated['media_format'], media_format)
        self.assertEqual(translated['media_language'], language)
        self.assertEqual(translated['content_type'], CONTENT_TYPE_FULL_TEXT)

    def test_bunnynet_translator_article_url(self):
        self._assert_full_text(
            '/j/neco/a/dqLRqnpmnncSmnzMCB8bzPG/',
            issn='0103-6351',
            pid_v3='dqLRqnpmnncSmnzMCB8bzPG',
            media_format=MEDIA_FORMAT_HTML,
            language='pt',
        )

    def test_bunnynet_translator_article_with_lang(self):
        self._assert_full_text(
            '/j/psoc/a/hbSYnTbyNfzxcWT3FpXrL5G/?lang=es',
            issn='1807-0310',
            pid_v3='hbSYnTbyNfzxcWT3FpXrL5G',
            media_format=MEDIA_FORMAT_HTML,
            language='es',
        )

    def test_bunnynet_translator_pdf_url(self):
        self._assert_full_text(
            '/j/rbz/a/cKnLLBn5NnshCX93Y6qYpHv/?format=pdf',
            issn='1516-3598',
            pid_v3='cKnLLBn5NnshCX93Y6qYpHv',
            media_format=MEDIA_FORMAT_PDF,
            language='en',
        )


# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
unittest.main()
Loading