
Commit

449 bring branch up to date with master
novellac committed Jul 17, 2018
2 parents fe270ad + 08c203e commit 82301ae
Showing 11 changed files with 6,843 additions and 465 deletions.
city_scrapers/mixins/wayne_commission.py (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@ def _parse_status(self, item, data):
Postponed meetings will be considered cancelled.
"""

status_str = item.xpath('.//td[4]/text() | .//td[4]/a/text() | .//td[4]/p/a/text()').extract_first()
status_str = item.xpath('.//td[4]/a/text() | .//td[4]/text()').extract_first()
# If the agenda column text contains "postponed," we consider it cancelled.
if re.search(r'postpone', status_str, re.IGNORECASE):
return 'cancelled'
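
For context on this change, the narrowed XPath has to match agenda cells that are either a plain-text node or a link; a minimal parsel sketch of that alternation (the sample row HTML is invented):

from parsel import Selector

# Invented committee row; the fourth cell holds the agenda text, here wrapped in a link.
html = '<table><tr><td>May 2</td><td>10:00 AM</td><td>Room 700</td><td><a href="#">Postponed</a></td></tr></table>'
row = Selector(text=html).xpath('//tr')[0]
status_str = row.xpath('.//td[4]/a/text() | .//td[4]/text()').extract_first()
print(status_str)  # Postponed, which the regex check above then maps to 'cancelled'
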
city_scrapers/pipelines/__init__.py (2 changes: 0 additions & 2 deletions)
@@ -4,7 +4,6 @@
from .travis import TravisValidationPipeline
from .csv import CsvPipeline
from .item import CityScrapersItemPipeline
from .s3_item import CityScrapersS3ItemPipeline


__all__ = (
@@ -14,5 +13,4 @@
'TravisValidationPipeline',
'CsvPipeline',
'CityScrapersItemPipeline',
'CityScrapersS3ItemPipeline',
)
city_scrapers/settings/prod.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@
# disabled until we can rebuild it on another provider
#'city_scrapers.pipelines.GeocoderPipeline': 200,
'city_scrapers.pipelines.CityScrapersItemPipeline': 200,
'city_scrapers.pipelines.CityScrapersS3ItemPipeline': 300,
'city_scrapers.pipelines.s3_item.CityScrapersS3ItemPipeline': 300,
'city_scrapers.pipelines.AirtablePipeline': 400
}
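
Since the CityScrapersS3ItemPipeline re-export was dropped from city_scrapers/pipelines/__init__.py above, the production setting now has to spell out the full module path. Roughly, Scrapy imports each key of this dict as a dotted path at startup; a standard-library sketch of that resolution, assuming the package is importable (illustrative only, not Scrapy's actual loader code):

from importlib import import_module

dotted = 'city_scrapers.pipelines.s3_item.CityScrapersS3ItemPipeline'
module_path, class_name = dotted.rsplit('.', 1)
# The full module path still resolves; the old package-level path would now raise
# AttributeError because the re-export was removed from pipelines/__init__.py.
pipeline_cls = getattr(import_module(module_path), class_name)
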

city_scrapers/spiders/cook_hospitals.py (36 changes: 25 additions & 11 deletions)
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

from datetime import datetime

from datetime import datetime, timedelta
from dateutil.parser import parse
from city_scrapers.spider import Spider


@@ -24,7 +24,6 @@ def parse(self, response):
data = {
'_type': 'event',
'name': self._parse_name(item),
'end': {'date': None, 'time': None, 'note': ''},
'all_day': False,
'sources': [{'url': response.url, 'note': ''}],
}
@@ -36,11 +35,11 @@ def parse(self, response):
new_item = {
# TODO unsure where this should come from
'event_description': self._parse_description(subitem),
'start': self._parse_start(subitem),
'location': self._parse_location(subitem),
'documents': self._parse_documents(subitem)
}
new_item.update(data)
new_item.update(self._parse_times(subitem))
new_item['status'] = self._generate_status(new_item, '')
new_item['id'] = self._generate_id(new_item)
new_item['classification'] = self._parse_classification(new_item['name'])
@@ -91,14 +90,29 @@ def _parse_description(subitem):
return ''

@staticmethod
def _parse_start(subitem):
def _parse_times(subitem):
"""
Combine start time with year, month, and day.
"""
start_time = subitem.xpath('text()').extract_first().strip()
dt = datetime.strptime(start_time, '%B %d, %Y - %H:%M %p')
return {
'date': dt.date(),
'time': dt.time(),
'note': ''
tokens = subitem.xpath('text()').extract_first().strip().split(' - ')
date = parse(tokens[0])
time = parse(tokens[1])
times = {
'start': {
'date': date.date(),
'time': time.time(),
'note': ''
},
'end' : {
'date': date.date()
}
}

if len(tokens) > 2:
times['end']['time'] = parse(tokens[2]).time()
times['end']['note'] = ''
else:
times['end']['time'] = (time + timedelta(hours=3)).time()
times['end']['note'] = 'End time is estimated to be 3 hours after the start time'

return times
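
For reference, the new _parse_times splits the cell text on ' - ' and leans on dateutil instead of a fixed strptime format; a small sketch of that behavior (the sample string is hypothetical):

from datetime import timedelta
from dateutil.parser import parse

raw = 'July 27, 2018 - 9:00 AM'            # hypothetical text from subitem.xpath('text()')
tokens = raw.strip().split(' - ')           # ['July 27, 2018', '9:00 AM']
date = parse(tokens[0])                     # 2018-07-27 00:00:00
time = parse(tokens[1])                     # today's date at 09:00
print(date.date(), time.time())             # 2018-07-27 09:00:00
# With no explicit end token, the spider assumes a three-hour meeting:
print((time + timedelta(hours=3)).time())   # 12:00:00
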
city_scrapers/spiders/il_labor.py (41 changes: 16 additions & 25 deletions)
@@ -4,8 +4,8 @@
specification (http://docs.opencivicdata.org/en/latest/data/event.html).
"""

from datetime import datetime
from pytz import timezone
from datetime import datetime, timedelta
from dateutil.parser import parse

from city_scrapers.spider import Spider

@@ -39,22 +39,24 @@ def parse(self, response):
If the date/time info can't be parsed, assume that it is a `no meeting`
notice.
"""
start_time = self._parse_start(item)
if start_time is None:
start_datetime = self._parse_start(item)
if start_datetime is None:
continue

name = self._parse_name(item)
data = {
'_type': 'event',
'name': name,
'description': self._parse_description(item),
'event_description': self._parse_description(response),
'classification': self._parse_classification(item),
'start_time': start_time if start_time else None,
'end_time': None,
'start': { 'date': start_datetime.date(), 'time': start_datetime.time() },
'end': { 'date': start_datetime.date(), 'time': (start_datetime + timedelta(hours=3)).time() },
'all_day': self._parse_all_day(item),
'timezone': self.event_timezone,
'status': self._parse_status(item),
'location': self._parse_location(item),
'sources': self._parse_sources(response)
'sources': self._parse_sources(response),
'documents': None
}
data['id'] = self._generate_id(data)
yield data
@@ -89,21 +91,17 @@ def _parse_location(self, item):
"""
childs_siblings = item.xpath('child::*')
if len(childs_siblings) > 1:
addresses = item.xpath('child::div/div/p[position()>1]/text()').extract()
addresses = item.xpath('child::div/div/p/text()').extract()
address = ' Or '.join([a.strip() for a in addresses])
else:
address = item.xpath('following-sibling::div[1]/div/p/text()').extract_first()
if address:
address = address.strip()

return {
'url': None,
'url': '',
'address': address,
'name': None,
'coordinates': {
'latitude': None,
'longitude': None,
},
'name': ''
}

def _parse_all_day(self, item):
@@ -118,25 +116,18 @@ def _parse_name(self, item):
"""
return item.css('strong::text').extract_first().capitalize()

def _parse_description(self, item):
def _parse_description(self, response):
"""
No meeting-specific description, so use a generic description from page.
"""
return ("The Illinois Public Labor Relations Act (Act) governs labor relations "
"between most public employers in Illinois and their employees. Throughout "
"the State, the Illinois Labor Relations Board regulates the designation of "
"employee representatives; the negotiation of wages, hours, and other conditions "
"of employment; and resolves, or if necessary, adjudicates labor disputes.")
return response.css('#ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField p::text').extract_first().strip()

def _parse_start(self, item):
"""
Parse start date and time from the second `<strong>`
"""
time_string = item.css('strong:nth-of-type(2)::text').extract_first().replace('.', '')
try:
naive = datetime.strptime(time_string, '%A, %B %d, %Y at %I:%M %p')
tz = timezone(self.event_timezone)
return tz.localize(naive)
return parse(item.css('strong:nth-of-type(2)::text').extract_first())
except ValueError:
return None

city_scrapers/spiders/wayne_audit.py (66 changes: 66 additions & 0 deletions)
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

# WE ARE BRINGING IN A MIXIN WHICH IMPORTS OTHER LIBRARIES.
# MIXINS ARE STORED IN /city-scrapers/city-scrapers/mixins
# YOU CAN TAKE THE DEFINITIONS OUT OF THE MIXIN AND ADD THEM HERE IF THEY ARE
# UNIQUE.

from city_scrapers.spider import Spider
from city_scrapers.mixins.wayne_commission import Wayne_commission


class Wayne_auditSpider(Wayne_commission, Spider):
name = 'wayne_audit'
long_name = 'Wayne County Audit Committee'
agency_id = 'Wayne County Audit Committee'
start_urls = ['https://www.waynecounty.com/elected/commission/audit.aspx']

def parse(self, response):
"""
`parse` should always `yield` a dict that follows the Event Schema
<https://city-bureau.github.io/city-scrapers/06_event_schema.html>.
Change the `_parse_id`, `_parse_name`, etc methods to fit your scraping
needs.
"""

entries = response.xpath('//tbody/tr')

for item in entries:
data = {
'_type': 'event',
'name': 'Audit Committee',
'event_description': self._parse_description(item),
'classification': 'Committee',
'start': self._parse_start(item),
'end': {'date': None, 'time': None, 'note': ''},
'all_day': False,
'location': self._parse_location(),
'documents': self._parse_documents(item, response.url),
'sources': [{'url': response.url, 'note': ''}]
}
data['id'] = self._generate_id(data)
data['status'] = self._parse_status(item, data)

yield data

@staticmethod
def _parse_description(response):
"""
Event description taken from static text at top of page.
"""
desc_xpath = '//h2[contains(text(), "Audit")]/following-sibling::div/section/p/text()'
desc = response.xpath(desc_xpath).extract_first()
return desc

@staticmethod
def _parse_location():
"""
Location hardcoded. Text on the URL claims meetings are all held at
the same location.
"""
return {
'name': '7th floor meeting room, Guardian Building',
'address': '500 Griswold St, Detroit, MI 48226',
'neighborhood': '',
}
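
As a quick check on desc_xpath, walking from the "Audit" heading to its following sibling section finds the intro paragraph on markup of roughly this shape; the HTML below is invented for illustration:

from parsel import Selector

html = ('<h2>Audit Committee</h2>'
        '<div><section><p>The Audit Committee reviews county finances.</p></section></div>')
desc_xpath = '//h2[contains(text(), "Audit")]/following-sibling::div/section/p/text()'
print(Selector(text=html).xpath(desc_xpath).extract_first())
# The Audit Committee reviews county finances.
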
