Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
behave==1.2.5
factory-boy==2.8.1
fake-factory==0.7.2
httpretty==0.8.14
ipdb
ipython
pytest-benchmark==3.0.0
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
'org.plos = share.transformers.org_plos:PLoSTransformer',
'org.psyarxiv = share.transformers.org_psyarxiv:PsyarxivTransformer',
'org.socarxiv = share.transformers.org_socarxiv:SocarxivTransformer',
'org.swbiodiversity = share.transformers.org_swbiodiversity:SWTransformer',
'v1_push = share.transformers.v1_push:V1Transformer',
],
'share.harvesters': [
Expand Down Expand Up @@ -91,6 +92,7 @@
'org.ncar = share.harvesters.org_ncar:NCARHarvester',
'org.neurovault = share.harvesters.org_neurovault:NeuroVaultHarvester',
'org.plos = share.harvesters.org_plos:PLOSHarvester',
'org.swbiodiversity = share.harvesters.org_swbiodiversity:SWHarvester',
]
}
)
64 changes: 64 additions & 0 deletions share/harvesters/org_swbiodiversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import itertools
import logging
import re

from bs4 import BeautifulSoup, Comment
from furl import furl

from share.harvest import BaseHarvester


logger = logging.getLogger(__name__)


class SWHarvester(BaseHarvester):
    """Harvester for SEINet collection profiles on swbiodiversity.org.

    Scrapes the collection-profile listing page and yields one raw HTML
    record per collection (see ``fetch_records``).
    """
    VERSION = 1

    def _do_fetch(self, start, end, **kwargs):
        # NOTE(review): start/end are logged but never used to filter —
        # every collection is re-harvested on each run. The source site
        # appears to expose no date filtering; confirm this is intentional.
        end_date = end.date()
        start_date = start.date()
        logger.info('Harvesting swbiodiversity %s - %s', start_date, end_date)
        return self.fetch_records()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to disregard date ranges. Did we discuss this at some point?


def fetch_records(self):
    """Yield ``(identifier, html)`` pairs for every collection profile.

    Fetches the listing page at ``kwargs['list_url']``, collects the
    distinct ``collid`` values from its anchors, then fetches each
    collection page and yields its ``innertext`` element (stripped of
    scripts, styles, images and comments) as a string.
    """
    response = self.requests.get(self.kwargs['list_url'])
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Gather unique collection ids, preserving first-seen order.
    # dict.fromkeys-style de-dupe is O(1) per membership test, vs. the
    # previous O(n) ``not in list`` scan per anchor.
    seen = {}
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # Anchors without an href would crash re.findall(None).
            continue
        match = re.search(r'collid=(\d+)', href)  # raw string: \d is a regex escape
        if match:
            seen.setdefault(match.group(1), None)
    record_list = list(seen)
    total = len(record_list)

    # Was ``logging.info`` (root logger); use the module logger instead.
    logger.info('Found %d results from swbiodiversity', total)

    for count, identifier in enumerate(record_list):
        logger.info('On collection %d of %d (%d%%)', count, total, (count / total) * 100)

        collection_page = furl(self.kwargs['list_url'])
        collection_page.args['collid'] = identifier
        response = self.requests.get(collection_page.url)
        response.raise_for_status()

        raw_data = BeautifulSoup(response.content, 'html.parser')
        # Peel out script tags and css things to minimize size of HTML
        for el in itertools.chain(
                raw_data('img'),
                raw_data('link', rel=('stylesheet', 'dns-prefetch')),
                raw_data('link', {'type': re.compile('.')}),
                raw_data('noscript'),
                raw_data('script'),
                raw_data(string=lambda x: isinstance(x, Comment)),
        ):
            el.extract()

        record = raw_data.find(id='innertext')

        yield identifier, str(record)
Binary file added share/sources/org.swbiodiversity/icon.ico
Binary file not shown.
16 changes: 16 additions & 0 deletions share/sources/org.swbiodiversity/source.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Source definition for SEINet (swbiodiversity.org): registers the
# org.swbiodiversity harvester/transformer pair for the SHARE pipeline.
configs:
- base_url: http://swbiodiversity.org/seinet/
  disabled: false
  earliest_date: null
  harvester: org.swbiodiversity
  harvester_kwargs:
    # Collection-profile listing page scraped by SWHarvester.fetch_records.
    list_url: http://swbiodiversity.org/seinet/collections/misc/collprofiles.php
  label: org.swbiodiversity
  rate_limit_allowance: 1  # at most 1 request per 2-second period
  rate_limit_period: 2
  transformer: org.swbiodiversity
  transformer_kwargs: {}
home_page: http://swbiodiversity.org/seinet/
long_title: SEINet - Arizona Chapter Collections
name: org.swbiodiversity
user: providers.org.swbiodiversity
138 changes: 138 additions & 0 deletions share/transformers/org_swbiodiversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import re

from bs4 import BeautifulSoup

from share.transform.chain import ctx
from share.transform.chain import links as tools
from share.transform.chain.parsers import Parser
from share.transform.chain.soup import SoupXMLTransformer


class AgentIdentifier(Parser):
    # Normalize the raw value (a contact email, via Person) into an IRI.
    uri = tools.IRI(ctx)


class WorkIdentifier(Parser):
    # Normalize a raw work identifier string into an IRI.
    uri = tools.IRI(ctx)


class Organization(Parser):
    # The delegated context value is used directly as the organization name.
    name = ctx


class Publisher(Parser):
    # NOTE(review): not referenced by Dataset below — possibly reserved
    # for future use; confirm before removing.
    agent = tools.Delegate(Organization, ctx)


class Institution(Parser):
    # The delegated context value is used directly as the institution name.
    name = ctx


class IsAffiliatedWith(Parser):
    # NOTE(review): unlike the other Delegate uses in this file, no
    # context argument is passed here — confirm that is intended.
    related = tools.Delegate(Institution)


class Person(Parser):
    # Split the free-text contact name into given/family parts.
    given_name = tools.ParseName(tools.Try(ctx.name)).first
    family_name = tools.ParseName(tools.Try(ctx.name)).last
    # The contact email (when present) becomes the agent's identifier.
    identifiers = tools.Map(tools.Delegate(AgentIdentifier), tools.Try(ctx.email))


class Creator(Parser):
    # Wrap the profile contact (a {'name', 'email'} dict) as a Person agent.
    agent = tools.Delegate(Person, ctx)


class Dataset(Parser):
    """Chain parser mapping the dict built by ``SWTransformer.unwrap_data``
    onto a SHARE Dataset work.
    """

    title = tools.Try(ctx['title'])
    description = tools.Try(ctx['description'])

    # Combine access and usage rights (either may be absent) into one string.
    rights = tools.Try(
        tools.Join(
            tools.Concat(
                tools.Try(ctx['access-rights']),
                tools.Try(ctx['usage-rights'])
            )
        )
    )

    # The single profile contact becomes the dataset's creator.
    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.contact))

    class Extra:
        # Raw profile fields preserved alongside the normalized record.
        access_rights = tools.Try(ctx['access-rights'])
        usage_rights = tools.Try(ctx['usage-rights'])
        collection_statistics = tools.Try(ctx['collection-statistics'])
        management = tools.Try(ctx['management'])
        collection_type = tools.Try(ctx['collection-type'])
        last_update = tools.ParseDate(tools.Try(ctx['last-update']))


class SWTransformer(SoupXMLTransformer):
    """Transform harvested swbiodiversity collection-profile HTML into the
    plain dict consumed by the ``Dataset`` chain parser.
    """

    VERSION = 1
    root_parser = Dataset

    def unwrap_data(self, input_data):
        """Parse a collection-profile page into a flat dict of fields.

        Keys produced (all optional): title, description, contact,
        collection-type, management, last-update, usage-rights,
        access-rights, collection-statistics.
        """
        record = BeautifulSoup(input_data, 'lxml').html
        data = {}

        title = self.extract_text(record.h1)
        if title:
            data['title'] = title

        start = record.div.div
        if start:
            # Bug fix: ``start.find_next()`` was previously called *before*
            # the ``if start`` guard, so a page without the expected nested
            # divs raised AttributeError instead of returning partial data.
            description = self.extract_text(start.find_next())
            if description:
                data['description'] = description

            for entry in map(self.extract_text, start.find_all_next(style='margin-top:5px;')):
                self._parse_entry(entry, data)

            collection_statistics = [self.extract_text(li) for li in start.find_all_next('li')]
            data['collection-statistics'] = self.process_collection_stat(collection_statistics)

        return data

    def _parse_entry(self, entry, data):
        """Populate ``data`` from one labelled line of the profile body."""
        if 'Contact:' in entry:
            contact = entry.replace('Contact:', '').strip()
            # Email is the parenthesized part; name is everything before '('.
            contact_email = contact[contact.find("(") + 1:contact.find(")")]
            contact_name = contact.split('(', 1)[0].strip()
            if ', Curator' in contact_name:
                contact_name = contact_name.replace(', Curator', '').strip()
            contact_dict = {}
            # Keep the email only if it looks like a plausible address.
            if contact and contact_email and re.match(r"[^@]+@[^@]+\.[^@]+", contact_email):
                contact_dict['email'] = contact_email
            if contact_name:
                contact_dict['name'] = contact_name
            if contact_dict:
                data['contact'] = contact_dict

        if 'Collection Type:' in entry:
            data['collection-type'] = entry.replace('Collection Type: ', '')

        if 'Management:' in entry:
            management = entry.replace('Management: ', '')
            # 'Last Update:' is rendered inside the Management block.
            if 'Last Update:' in management:
                management, _, last_update = management.partition('Last Update:')
                if last_update:
                    data['last-update'] = last_update.strip()
            data['management'] = management.strip()

        if 'Usage Rights:' in entry:
            data['usage-rights'] = entry.replace('Usage Rights: ', '')

        if 'Access Rights' in entry or 'Rights Holder:' in entry:
            data['access-rights'] = entry.replace('Access Rights: ', '').replace('Rights Holder: ', '')

    def extract_text(self, text):
        """Return the stripped text content of a BeautifulSoup node."""
        return text.text.strip()

    def process_collection_stat(self, list_values):
        """Map statistic labels to their leading count.

        e.g. '4,868 specimen records' -> {'specimen records': '4,868'}.
        """
        stat = {}
        for item in list_values:
            # Bug fix: split once instead of str.replace, which removed
            # *every* occurrence of the leading value from the label and
            # raised IndexError on an empty item.
            parts = item.split(maxsplit=1)
            if len(parts) == 2:
                stat[parts[1].strip()] = parts[0]
        return stat
144 changes: 144 additions & 0 deletions tests/share/harvesters/test_swbiodiversity_harvester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from datetime import timedelta

from furl import furl
from httpretty import httpretty, httprettified
import pendulum
import pytest

from share.models import SourceConfig

# Fixture: the collection-profile *listing* page, containing one
# collection link (collid=223). Served for GET <list_url>.
main_page = '''
<html>
<head>
<title>SEINet - Arizona Chapter Collection Profiles</title>
</head>
<body>
<table>
<div id="innertext">
<h1>SEINet - Arizona Chapter Collections </h1>
<div>
Select a collection to see full details.
</div>
<table style='margin:10px;'>
<tr>
<td>
<h3>
<a href='collprofiles.php?collid=223'>
A. Michael Powell Herbarium
</a>
</h3>
<div style='margin:10px;'>
<div>Sample description</div>
<div style='margin-top:5px;'>
<b>Contact:</b>
Test Author (author@email.com)
</div>
</div>
</td>
</tr>
</table>
</div>
</table>
</body>
</html>
'''

# Fixture: an individual collection-profile page (collid=223).
# NOTE(review): the ``</title`` tag below is missing its closing '>' —
# lenient HTML parsers cope, but it looks like a typo in the fixture.
collection_page = '''
<html>
<head>
<title>SEINet - Arizona Chapter A. Michael Powell Herbarium Collection Profiles</title
</head>
<body>
<table>
<!-- This is inner text! -->
<div id="innertext">
<h1>A. Michael Powell Herbarium (SRSC)</h1>
<div style='margin:10px;'>
<div>
Sample description
</div>
<div style='margin-top:5px;'>
<b>Contact:</b> Test Author (author@email.com)
</div>
<div style="margin-top:5px;">
</div>
<div style="margin-top:5px;">
<b>Collection Type: </b>Preserved Specimens
</div>
<div style="margin-top:5px;">
<b>Management: </b>Data snapshot of local collection database <div style="margin-top:5px;"><b>Last Update:</b> 1 October 2016</div>
</div>
<div style="margin-top:5px;">
<b>Usage Rights:</b> <a href="http://creativecommons.org/licenses/by-nc/3.0/" target="_blank">CC BY-NC (Attribution-Non-Commercial)</a>
</div>
<div style="margin-top:5px;">
<b>Rights Holder:</b> Sul Ross University
</div>
<div style="clear:both;margin-top:5px;">
<div style="font-weight:bold;">Collection Statistics:</div>
<ul style="margin-top:5px;">
<li>4,868 specimen records</li>
<li>1,195 (25%) georeferenced</li>
<li>2,954 (61%) with images</li><li>2,849 (59%) identified to species</li>
<li>104 families</li>
<li>361 genera</li>
<li>661 species</li>
<li>762 total taxa (including subsp. and var.)</li>
</ul>
</div>
</div>
</table>
</body>
</html>
'''


@pytest.mark.django_db
@httprettified
def test_swbiodiversity_harvester():
    # End-to-end harvest: listing page -> one collection -> innertext HTML.
    # NOTE(review): @httprettified already enables/disables httpretty, so
    # the explicit enable()/disable() calls here are redundant.
    httpretty.enable()
    httpretty.allow_net_connect = False

    config = SourceConfig.objects.get(label=('org.swbiodiversity'))
    url = config.harvester_kwargs['list_url']
    harvester = config.get_harvester()

    # Exact-match registration for the bare listing URL.
    httpretty.register_uri(httpretty.GET, url,
                           body=main_page, content_type='text/html', match_querystring=True)
    collection = furl(url)
    collection.args['collid'] = 223
    # NOTE(review): ``collection`` is built but never used below, and a
    # plain *string* URI is not treated as a pattern by httpretty — a
    # compiled regex (re.compile) is required for pattern matching.
    # Confirm which registration the collid request actually hits: the
    # assertion below compares against the *main page* innertext.
    httpretty.register_uri(httpretty.GET, url + ';collid=(\d+)',
                           body=collection_page, content_type='text/html', match_querystring=True)
    start = pendulum.utcnow() - timedelta(days=3)
    end = pendulum.utcnow()
    result = harvester._do_fetch(start, end)
    for data in result:
        assert data[0] == '223'
        # Whitespace-insensitive comparison of the harvested innertext.
        assert "".join(data[1].split()) == "".join('''
        <div id="innertext">
        <h1>SEINet - Arizona Chapter Collections </h1>
        <div>
        Select a collection to see full details.
        </div>
        <table style="margin:10px;">
        <tr>
        <td>
        <h3>
        <a href="collprofiles.php?collid=223">
        A. Michael Powell Herbarium
        </a>
        </h3>
        <div style="margin:10px;">
        <div>Sample description</div>
        <div style="margin-top:5px;">
        <b>Contact:</b>
        Test Author (author@email.com)
        </div>
        </div>
        </td>
        </tr>
        </table>
        </div>
        '''"".split())

    httpretty.disable()
Loading