Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
behave==1.2.5
factory-boy==2.8.1
fake-factory==0.7.2
httpretty==0.8.14
ipdb
ipython
pytest-benchmark==3.0.0
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
'org.plos = share.transformers.org_plos:PLoSTransformer',
'org.psyarxiv = share.transformers.org_psyarxiv:PsyarxivTransformer',
'org.socarxiv = share.transformers.org_socarxiv:SocarxivTransformer',
'org.swbiodiversity = share.transformers.org_swbiodiversity:SWTransformer',
'v1_push = share.transformers.v1_push:V1Transformer',
],
'share.harvesters': [
Expand Down Expand Up @@ -91,6 +92,7 @@
'org.ncar = share.harvesters.org_ncar:NCARHarvester',
'org.neurovault = share.harvesters.org_neurovault:NeuroVaultHarvester',
'org.plos = share.harvesters.org_plos:PLOSHarvester',
'org.swbiodiversity = share.harvesters.org_swbiodiversity:SWHarvester',
]
}
)
64 changes: 64 additions & 0 deletions share/harvesters/org_swbiodiversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import itertools
import logging
import re

from bs4 import BeautifulSoup, Comment
from furl import furl

from share.harvest import BaseHarvester


logger = logging.getLogger(__name__)


class SWHarvester(BaseHarvester):
    """Harvester for SEINet collection profiles on swbiodiversity.org.

    Scrapes the collection-profile listing page and yields one raw HTML
    record per collection (see ``fetch_records``).
    """
    VERSION = 1

    def _do_fetch(self, start, end, **kwargs):
        # NOTE(review): start/end are logged but never used to filter —
        # every collection is re-harvested on each run. The source site
        # appears to expose no date filtering; confirm this is intentional.
        end_date = end.date()
        start_date = start.date()
        logger.info('Harvesting swbiodiversity %s - %s', start_date, end_date)
        return self.fetch_records()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to disregard date ranges. Did we discuss this at some point?


def fetch_records(self):
    """Yield ``(identifier, html)`` pairs for every collection profile.

    Fetches the listing page at ``kwargs['list_url']``, collects the
    distinct ``collid`` values from its anchors, then fetches each
    collection page and yields its ``innertext`` element (stripped of
    scripts, styles, images and comments) as a string.
    """
    response = self.requests.get(self.kwargs['list_url'])
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Gather unique collection ids, preserving first-seen order.
    # dict.fromkeys-style de-dupe is O(1) per membership test, vs. the
    # previous O(n) ``not in list`` scan per anchor.
    seen = {}
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # Anchors without an href would crash re.findall(None).
            continue
        match = re.search(r'collid=(\d+)', href)  # raw string: \d is a regex escape
        if match:
            seen.setdefault(match.group(1), None)
    record_list = list(seen)
    total = len(record_list)

    # Was ``logging.info`` (root logger); use the module logger instead.
    logger.info('Found %d results from swbiodiversity', total)

    for count, identifier in enumerate(record_list):
        logger.info('On collection %d of %d (%d%%)', count, total, (count / total) * 100)

        collection_page = furl(self.kwargs['list_url'])
        collection_page.args['collid'] = identifier
        response = self.requests.get(collection_page.url)
        response.raise_for_status()

        raw_data = BeautifulSoup(response.content, 'html.parser')
        # Peel out script tags and css things to minimize size of HTML
        for el in itertools.chain(
                raw_data('img'),
                raw_data('link', rel=('stylesheet', 'dns-prefetch')),
                raw_data('link', {'type': re.compile('.')}),
                raw_data('noscript'),
                raw_data('script'),
                raw_data(string=lambda x: isinstance(x, Comment)),
        ):
            el.extract()

        record = raw_data.find(id='innertext')

        yield identifier, str(record)
Binary file added share/sources/org.swbiodiversity/icon.ico
Binary file not shown.
16 changes: 16 additions & 0 deletions share/sources/org.swbiodiversity/source.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Source definition for SEINet (swbiodiversity.org): registers the
# org.swbiodiversity harvester/transformer pair for the SHARE pipeline.
configs:
- base_url: http://swbiodiversity.org/seinet/
  disabled: false
  earliest_date: null
  harvester: org.swbiodiversity
  harvester_kwargs:
    # Collection-profile listing page scraped by SWHarvester.fetch_records.
    list_url: http://swbiodiversity.org/seinet/collections/misc/collprofiles.php
  label: org.swbiodiversity
  rate_limit_allowance: 1  # at most 1 request per 2-second period
  rate_limit_period: 2
  transformer: org.swbiodiversity
  transformer_kwargs: {}
home_page: http://swbiodiversity.org/seinet/
long_title: SEINet - Arizona Chapter Collections
name: org.swbiodiversity
user: providers.org.swbiodiversity
138 changes: 138 additions & 0 deletions share/transformers/org_swbiodiversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import re

from bs4 import BeautifulSoup

from share.transform.chain import ctx
from share.transform.chain import links as tools
from share.transform.chain.parsers import Parser
from share.transform.chain.soup import SoupXMLTransformer


class AgentIdentifier(Parser):
    # Normalize the raw value (a contact email, via Person) into an IRI.
    uri = tools.IRI(ctx)


class WorkIdentifier(Parser):
    # Normalize a raw work identifier string into an IRI.
    uri = tools.IRI(ctx)


class Organization(Parser):
    # The delegated context value is used directly as the organization name.
    name = ctx


class Publisher(Parser):
    # NOTE(review): not referenced by Dataset below — possibly reserved
    # for future use; confirm before removing.
    agent = tools.Delegate(Organization, ctx)


class Institution(Parser):
    # The delegated context value is used directly as the institution name.
    name = ctx


class IsAffiliatedWith(Parser):
    # NOTE(review): unlike the other Delegate uses in this file, no
    # context argument is passed here — confirm that is intended.
    related = tools.Delegate(Institution)


class Person(Parser):
    # Split the free-text contact name into given/family parts.
    given_name = tools.ParseName(tools.Try(ctx.name)).first
    family_name = tools.ParseName(tools.Try(ctx.name)).last
    # The contact email (when present) becomes the agent's identifier.
    identifiers = tools.Map(tools.Delegate(AgentIdentifier), tools.Try(ctx.email))


class Creator(Parser):
    # Wrap the profile contact (a {'name', 'email'} dict) as a Person agent.
    agent = tools.Delegate(Person, ctx)


class Dataset(Parser):
    """Chain parser mapping the dict built by ``SWTransformer.unwrap_data``
    onto a SHARE Dataset work.
    """

    title = tools.Try(ctx['title'])
    description = tools.Try(ctx['description'])

    # Combine access and usage rights (either may be absent) into one string.
    rights = tools.Try(
        tools.Join(
            tools.Concat(
                tools.Try(ctx['access-rights']),
                tools.Try(ctx['usage-rights'])
            )
        )
    )

    # The single profile contact becomes the dataset's creator.
    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.contact))

    class Extra:
        # Raw profile fields preserved alongside the normalized record.
        access_rights = tools.Try(ctx['access-rights'])
        usage_rights = tools.Try(ctx['usage-rights'])
        collection_statistics = tools.Try(ctx['collection-statistics'])
        management = tools.Try(ctx['management'])
        collection_type = tools.Try(ctx['collection-type'])
        last_update = tools.ParseDate(tools.Try(ctx['last-update']))


class SWTransformer(SoupXMLTransformer):
    """Transform harvested swbiodiversity collection-profile HTML into the
    plain dict consumed by the ``Dataset`` chain parser.
    """

    VERSION = 1
    root_parser = Dataset

    def unwrap_data(self, input_data):
        """Parse a collection-profile page into a flat dict of fields.

        Keys produced (all optional): title, description, contact,
        collection-type, management, last-update, usage-rights,
        access-rights, collection-statistics.
        """
        record = BeautifulSoup(input_data, 'lxml').html
        data = {}

        title = self.extract_text(record.h1)
        if title:
            data['title'] = title

        start = record.div.div
        if start:
            # Bug fix: ``start.find_next()`` was previously called *before*
            # the ``if start`` guard, so a page without the expected nested
            # divs raised AttributeError instead of returning partial data.
            description = self.extract_text(start.find_next())
            if description:
                data['description'] = description

            for entry in map(self.extract_text, start.find_all_next(style='margin-top:5px;')):
                self._parse_entry(entry, data)

            collection_statistics = [self.extract_text(li) for li in start.find_all_next('li')]
            data['collection-statistics'] = self.process_collection_stat(collection_statistics)

        return data

    def _parse_entry(self, entry, data):
        """Populate ``data`` from one labelled line of the profile body."""
        if 'Contact:' in entry:
            contact = entry.replace('Contact:', '').strip()
            # Email is the parenthesized part; name is everything before '('.
            contact_email = contact[contact.find("(") + 1:contact.find(")")]
            contact_name = contact.split('(', 1)[0].strip()
            if ', Curator' in contact_name:
                contact_name = contact_name.replace(', Curator', '').strip()
            contact_dict = {}
            # Keep the email only if it looks like a plausible address.
            if contact and contact_email and re.match(r"[^@]+@[^@]+\.[^@]+", contact_email):
                contact_dict['email'] = contact_email
            if contact_name:
                contact_dict['name'] = contact_name
            if contact_dict:
                data['contact'] = contact_dict

        if 'Collection Type:' in entry:
            data['collection-type'] = entry.replace('Collection Type: ', '')

        if 'Management:' in entry:
            management = entry.replace('Management: ', '')
            # 'Last Update:' is rendered inside the Management block.
            if 'Last Update:' in management:
                management, _, last_update = management.partition('Last Update:')
                if last_update:
                    data['last-update'] = last_update.strip()
            data['management'] = management.strip()

        if 'Usage Rights:' in entry:
            data['usage-rights'] = entry.replace('Usage Rights: ', '')

        if 'Access Rights' in entry or 'Rights Holder:' in entry:
            data['access-rights'] = entry.replace('Access Rights: ', '').replace('Rights Holder: ', '')

    def extract_text(self, text):
        """Return the stripped text content of a BeautifulSoup node."""
        return text.text.strip()

    def process_collection_stat(self, list_values):
        """Map statistic labels to their leading count.

        e.g. '4,868 specimen records' -> {'specimen records': '4,868'}.
        """
        stat = {}
        for item in list_values:
            # Bug fix: split once instead of str.replace, which removed
            # *every* occurrence of the leading value from the label and
            # raised IndexError on an empty item.
            parts = item.split(maxsplit=1)
            if len(parts) == 2:
                stat[parts[1].strip()] = parts[0]
        return stat
144 changes: 144 additions & 0 deletions tests/share/harvesters/test_swbiodiversity_harvester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from datetime import timedelta

from furl import furl
from httpretty import httpretty, httprettified
import pendulum
import pytest

from share.models import SourceConfig

# Fixture: the collection-profile *listing* page, containing one
# collection link (collid=223). Served for GET <list_url>.
main_page = '''
<html>
<head>
<title>SEINet - Arizona Chapter Collection Profiles</title>
</head>
<body>
<table>
<div id="innertext">
<h1>SEINet - Arizona Chapter Collections </h1>
<div>
Select a collection to see full details.
</div>
<table style='margin:10px;'>
<tr>
<td>
<h3>
<a href='collprofiles.php?collid=223'>
A. Michael Powell Herbarium
</a>
</h3>
<div style='margin:10px;'>
<div>Sample description</div>
<div style='margin-top:5px;'>
<b>Contact:</b>
Test Author (author@email.com)
</div>
</div>
</td>
</tr>
</table>
</div>
</table>
</body>
</html>
'''

# Fixture: an individual collection-profile page (collid=223).
# NOTE(review): the ``</title`` tag below is missing its closing '>' —
# lenient HTML parsers cope, but it looks like a typo in the fixture.
collection_page = '''
<html>
<head>
<title>SEINet - Arizona Chapter A. Michael Powell Herbarium Collection Profiles</title
</head>
<body>
<table>
<!-- This is inner text! -->
<div id="innertext">
<h1>A. Michael Powell Herbarium (SRSC)</h1>
<div style='margin:10px;'>
<div>
Sample description
</div>
<div style='margin-top:5px;'>
<b>Contact:</b> Test Author (author@email.com)
</div>
<div style="margin-top:5px;">
</div>
<div style="margin-top:5px;">
<b>Collection Type: </b>Preserved Specimens
</div>
<div style="margin-top:5px;">
<b>Management: </b>Data snapshot of local collection database <div style="margin-top:5px;"><b>Last Update:</b> 1 October 2016</div>
</div>
<div style="margin-top:5px;">
<b>Usage Rights:</b> <a href="http://creativecommons.org/licenses/by-nc/3.0/" target="_blank">CC BY-NC (Attribution-Non-Commercial)</a>
</div>
<div style="margin-top:5px;">
<b>Rights Holder:</b> Sul Ross University
</div>
<div style="clear:both;margin-top:5px;">
<div style="font-weight:bold;">Collection Statistics:</div>
<ul style="margin-top:5px;">
<li>4,868 specimen records</li>
<li>1,195 (25%) georeferenced</li>
<li>2,954 (61%) with images</li><li>2,849 (59%) identified to species</li>
<li>104 families</li>
<li>361 genera</li>
<li>661 species</li>
<li>762 total taxa (including subsp. and var.)</li>
</ul>
</div>
</div>
</table>
</body>
</html>
'''


@pytest.mark.django_db
@httprettified
def test_swbiodiversity_harvester():
    # End-to-end harvest: listing page -> one collection -> innertext HTML.
    # NOTE(review): @httprettified already enables/disables httpretty, so
    # the explicit enable()/disable() calls here are redundant.
    httpretty.enable()
    httpretty.allow_net_connect = False

    config = SourceConfig.objects.get(label=('org.swbiodiversity'))
    url = config.harvester_kwargs['list_url']
    harvester = config.get_harvester()

    # Exact-match registration for the bare listing URL.
    httpretty.register_uri(httpretty.GET, url,
                           body=main_page, content_type='text/html', match_querystring=True)
    collection = furl(url)
    collection.args['collid'] = 223
    # NOTE(review): ``collection`` is built but never used below, and a
    # plain *string* URI is not treated as a pattern by httpretty — a
    # compiled regex (re.compile) is required for pattern matching.
    # Confirm which registration the collid request actually hits: the
    # assertion below compares against the *main page* innertext.
    httpretty.register_uri(httpretty.GET, url + ';collid=(\d+)',
                           body=collection_page, content_type='text/html', match_querystring=True)
    start = pendulum.utcnow() - timedelta(days=3)
    end = pendulum.utcnow()
    result = harvester._do_fetch(start, end)
    for data in result:
        assert data[0] == '223'
        # Whitespace-insensitive comparison of the harvested innertext.
        assert "".join(data[1].split()) == "".join('''
        <div id="innertext">
        <h1>SEINet - Arizona Chapter Collections </h1>
        <div>
        Select a collection to see full details.
        </div>
        <table style="margin:10px;">
        <tr>
        <td>
        <h3>
        <a href="collprofiles.php?collid=223">
        A. Michael Powell Herbarium
        </a>
        </h3>
        <div style="margin:10px;">
        <div>Sample description</div>
        <div style="margin-top:5px;">
        <b>Contact:</b>
        Test Author (author@email.com)
        </div>
        </div>
        </td>
        </tr>
        </table>
        </div>
        '''"".split())

    httpretty.disable()
Loading