
Commit

449 bring branch up to date with master
novellac committed Jul 17, 2018
2 parents fe270ad + 08c203e commit 82301ae
Showing 11 changed files with 6,843 additions and 465 deletions.
city_scrapers/mixins/wayne_commission.py (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@ def _parse_status(self, item, data):
Postponed meetings will be considered cancelled.
"""

status_str = item.xpath('.//td[4]/text() | .//td[4]/a/text() | .//td[4]/p/a/text()').extract_first()
status_str = item.xpath('.//td[4]/a/text() | .//td[4]/text()').extract_first()
# If the agenda column text contains "postponed," we consider it cancelled.
if re.search(r'postpone', status_str, re.IGNORECASE):
return 'cancelled'
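
For context on this change, the narrowed XPath has to match agenda cells that are either a plain-text node or a link; a minimal parsel sketch of that alternation (the sample row HTML is invented):

from parsel import Selector

# Invented committee row; the fourth cell holds the agenda text, here wrapped in a link.
html = '<table><tr><td>May 2</td><td>10:00 AM</td><td>Room 700</td><td><a href="#">Postponed</a></td></tr></table>'
row = Selector(text=html).xpath('//tr')[0]
status_str = row.xpath('.//td[4]/a/text() | .//td[4]/text()').extract_first()
print(status_str)  # Postponed, which the regex check above then maps to 'cancelled'
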
city_scrapers/pipelines/__init__.py (2 changes: 0 additions & 2 deletions)
@@ -4,7 +4,6 @@
from .travis import TravisValidationPipeline
from .csv import CsvPipeline
from .item import CityScrapersItemPipeline
from .s3_item import CityScrapersS3ItemPipeline


__all__ = (
@@ -14,5 +13,4 @@
'TravisValidationPipeline',
'CsvPipeline',
'CityScrapersItemPipeline',
'CityScrapersS3ItemPipeline',
)
city_scrapers/settings/prod.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@
# disabled until we can rebuild it on another provider
#'city_scrapers.pipelines.GeocoderPipeline': 200,
'city_scrapers.pipelines.CityScrapersItemPipeline': 200,
'city_scrapers.pipelines.CityScrapersS3ItemPipeline': 300,
'city_scrapers.pipelines.s3_item.CityScrapersS3ItemPipeline': 300,
'city_scrapers.pipelines.AirtablePipeline': 400
}
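
Since the CityScrapersS3ItemPipeline re-export was dropped from city_scrapers/pipelines/__init__.py above, the production setting now has to spell out the full module path. Roughly, Scrapy imports each key of this dict as a dotted path at startup; a standard-library sketch of that resolution, assuming the package is importable (illustrative only, not Scrapy's actual loader code):

from importlib import import_module

dotted = 'city_scrapers.pipelines.s3_item.CityScrapersS3ItemPipeline'
module_path, class_name = dotted.rsplit('.', 1)
# The full module path still resolves; the old package-level path would now raise
# AttributeError because the re-export was removed from pipelines/__init__.py.
pipeline_cls = getattr(import_module(module_path), class_name)
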

city_scrapers/spiders/cook_hospitals.py (36 changes: 25 additions & 11 deletions)
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

from datetime import datetime

from datetime import datetime, timedelta
from dateutil.parser import parse
from city_scrapers.spider import Spider


@@ -24,7 +24,6 @@ def parse(self, response):
data = {
'_type': 'event',
'name': self._parse_name(item),
'end': {'date': None, 'time': None, 'note': ''},
'all_day': False,
'sources': [{'url': response.url, 'note': ''}],
}
@@ -36,11 +35,11 @@ def parse(self, response):
new_item = {
# TODO unsure where this should come from
'event_description': self._parse_description(subitem),
'start': self._parse_start(subitem),
'location': self._parse_location(subitem),
'documents': self._parse_documents(subitem)
}
new_item.update(data)
new_item.update(self._parse_times(subitem))
new_item['status'] = self._generate_status(new_item, '')
new_item['id'] = self._generate_id(new_item)
new_item['classification'] = self._parse_classification(new_item['name'])
@@ -91,14 +90,29 @@ def _parse_description(subitem):
return ''

@staticmethod
def _parse_start(subitem):
def _parse_times(subitem):
"""
Combine start time with year, month, and day.
"""
start_time = subitem.xpath('text()').extract_first().strip()
dt = datetime.strptime(start_time, '%B %d, %Y - %H:%M %p')
return {
'date': dt.date(),
'time': dt.time(),
'note': ''
tokens = subitem.xpath('text()').extract_first().strip().split(' - ')
date = parse(tokens[0])
time = parse(tokens[1])
times = {
'start': {
'date': date.date(),
'time': time.time(),
'note': ''
},
'end' : {
'date': date.date()
}
}

if len(tokens) > 2:
times['end']['time'] = parse(tokens[2]).time()
times['end']['note'] = ''
else:
times['end']['time'] = (time + timedelta(hours=3)).time()
times['end']['note'] = 'End time is estimated to be 3 hours after the start time'

return times
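
For reference, the new _parse_times splits the cell text on ' - ' and leans on dateutil instead of a fixed strptime format; a small sketch of that behavior (the sample string is hypothetical):

from datetime import timedelta
from dateutil.parser import parse

raw = 'July 27, 2018 - 9:00 AM'            # hypothetical text from subitem.xpath('text()')
tokens = raw.strip().split(' - ')           # ['July 27, 2018', '9:00 AM']
date = parse(tokens[0])                     # 2018-07-27 00:00:00
time = parse(tokens[1])                     # today's date at 09:00
print(date.date(), time.time())             # 2018-07-27 09:00:00
# With no explicit end token, the spider assumes a three-hour meeting:
print((time + timedelta(hours=3)).time())   # 12:00:00
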
city_scrapers/spiders/il_labor.py (41 changes: 16 additions & 25 deletions)
@@ -4,8 +4,8 @@
specification (http://docs.opencivicdata.org/en/latest/data/event.html).
"""

from datetime import datetime
from pytz import timezone
from datetime import datetime, timedelta
from dateutil.parser import parse

from city_scrapers.spider import Spider

@@ -39,22 +39,24 @@ def parse(self, response):
If the date/time info can't be parsed, assume that it is a `no meeting`
notice.
"""
start_time = self._parse_start(item)
if start_time is None:
start_datetime = self._parse_start(item)
if start_datetime is None:
continue

name = self._parse_name(item)
data = {
'_type': 'event',
'name': name,
'description': self._parse_description(item),
'event_description': self._parse_description(response),
'classification': self._parse_classification(item),
'start_time': start_time if start_time else None,
'end_time': None,
'start': { 'date': start_datetime.date(), 'time': start_datetime.time() },
'end': { 'date': start_datetime.date(), 'time': (start_datetime + timedelta(hours=3)).time() },
'all_day': self._parse_all_day(item),
'timezone': self.event_timezone,
'status': self._parse_status(item),
'location': self._parse_location(item),
'sources': self._parse_sources(response)
'sources': self._parse_sources(response),
'documents': None
}
data['id'] = self._generate_id(data)
yield data
@@ -89,21 +91,17 @@ def _parse_location(self, item):
"""
childs_siblings = item.xpath('child::*')
if len(childs_siblings) > 1:
addresses = item.xpath('child::div/div/p[position()>1]/text()').extract()
addresses = item.xpath('child::div/div/p/text()').extract()
address = ' Or '.join([a.strip() for a in addresses])
else:
address = item.xpath('following-sibling::div[1]/div/p/text()').extract_first()
if address:
address = address.strip()

return {
'url': None,
'url': '',
'address': address,
'name': None,
'coordinates': {
'latitude': None,
'longitude': None,
},
'name': ''
}

def _parse_all_day(self, item):
@@ -118,25 +116,18 @@ def _parse_name(self, item):
"""
return item.css('strong::text').extract_first().capitalize()

def _parse_description(self, item):
def _parse_description(self, response):
"""
No meeting-specific description, so use a generic description from page.
"""
return ("The Illinois Public Labor Relations Act (Act) governs labor relations "
"between most public employers in Illinois and their employees. Throughout "
"the State, the Illinois Labor Relations Board regulates the designation of "
"employee representatives; the negotiation of wages, hours, and other conditions "
"of employment; and resolves, or if necessary, adjudicates labor disputes.")
return response.css('#ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField p::text').extract_first().strip()

def _parse_start(self, item):
"""
Parse start date and time from the second `<strong>`
"""
time_string = item.css('strong:nth-of-type(2)::text').extract_first().replace('.', '')
try:
naive = datetime.strptime(time_string, '%A, %B %d, %Y at %I:%M %p')
tz = timezone(self.event_timezone)
return tz.localize(naive)
return parse(item.css('strong:nth-of-type(2)::text').extract_first())
except ValueError:
return None

city_scrapers/spiders/wayne_audit.py (66 changes: 66 additions & 0 deletions)
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

# WE ARE BRINGING IN A MIXIN WHICH IMPORTS OTHER LIBRARIES.
# MIXINS ARE STORED IN /city-scrapers/city-scrapers/mixins
# YOU CAN TAKE THE DEFINITIONS OUT OF THE MIXIN AND ADD THEM HERE IF THEY ARE
# UNIQUE.

from city_scrapers.spider import Spider
from city_scrapers.mixins.wayne_commission import Wayne_commission


class Wayne_auditSpider(Wayne_commission, Spider):
name = 'wayne_audit'
long_name = 'Wayne County Audit Committee'
agency_id = 'Wayne County Audit Committee'
start_urls = ['https://www.waynecounty.com/elected/commission/audit.aspx']

def parse(self, response):
"""
`parse` should always `yield` a dict that follows the Event Schema
<https://city-bureau.github.io/city-scrapers/06_event_schema.html>.
Change the `_parse_id`, `_parse_name`, etc methods to fit your scraping
needs.
"""

entries = response.xpath('//tbody/tr')

for item in entries:
data = {
'_type': 'event',
'name': 'Audit Committee',
'event_description': self._parse_description(item),
'classification': 'Committee',
'start': self._parse_start(item),
'end': {'date': None, 'time': None, 'note': ''},
'all_day': False,
'location': self._parse_location(),
'documents': self._parse_documents(item, response.url),
'sources': [{'url': response.url, 'note': ''}]
}
data['id'] = self._generate_id(data)
data['status'] = self._parse_status(item, data)

yield data

@staticmethod
def _parse_description(response):
"""
Event description taken from static text at top of page.
"""
desc_xpath = '//h2[contains(text(), "Audit")]/following-sibling::div/section/p/text()'
desc = response.xpath(desc_xpath).extract_first()
return desc

@staticmethod
def _parse_location():
"""
Location hardcoded. Text on the URL claims meetings are all held at
the same location.
"""
return {
'name': '7th floor meeting room, Guardian Building',
'address': '500 Griswold St, Detroit, MI 48226',
'neighborhood': '',
}
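
As a quick check on desc_xpath, walking from the "Audit" heading to its following sibling section finds the intro paragraph on markup of roughly this shape; the HTML below is invented for illustration:

from parsel import Selector

html = ('<h2>Audit Committee</h2>'
        '<div><section><p>The Audit Committee reviews county finances.</p></section></div>')
desc_xpath = '//h2[contains(text(), "Audit")]/following-sibling::div/section/p/text()'
print(Selector(text=html).xpath(desc_xpath).extract_first())
# The Audit Committee reviews county finances.
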
