Merge pull request #508 from mkrump/spider-det_city_council
Create Detroit City Council spider

Showing 4 changed files with 2,870 additions and 0 deletions.

city_scrapers/spiders/det_city_council.py (new file, 121 additions):

```python
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import scrapy
from dateutil.parser import parse

from city_scrapers.spider import Spider


class DetCityCouncilSpider(Spider):
    name = 'det_city_council'
    agency_id = 'Detroit City Council'
    timezone = 'America/Detroit'
    allowed_domains = ['www.detroitmi.gov']
    start_urls = ['http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions']

    def parse(self, response):
        """
        `parse` should always `yield` a dict that follows the Event Schema
        <https://city-bureau.github.io/city-scrapers/06_event_schema.html>.
        """
        months_crawled = response.meta.get('months_crawled', 0)
        if months_crawled < 12:
            yield from self._next_month(response, months_crawled)
        yield from self._generate_requests(response)

    @staticmethod
    def _next_month(response, months_crawled):
        form_params_xpath = "//td[@class='EventNextPrev'][2]/a/@href"
        event_target, event_argument = response.xpath(form_params_xpath).re(r'\'(.*?)\'')
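        # The calendar is an ASP.NET WebForms control whose "next" link fires
        # __doPostBack(target, argument), so paging means POSTing those two
        # hidden form fields back to the same URL rather than following a href.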
        yield scrapy.FormRequest(
            url=response.url,
            formdata={'__EVENTTARGET': event_target, '__EVENTARGUMENT': event_argument},
            meta={'months_crawled': months_crawled + 1},
        )

    def _generate_requests(self, response):
        anchors = response.xpath("//a[contains(@id, 'ctlEvents')]")
        anchors = [anchor for anchor in anchors if not self._is_recess_event(anchor)]
        for a in anchors:
            yield response.follow(a, self._parse_item)

    @staticmethod
    def _is_recess_event(anchor):
        return 'RECESS' in anchor.xpath('text()').extract_first('').upper()

    def _parse_item(self, response):
        name = self._parse_name(response)
        description = self._parse_description(response)
        start = self._get_date(response, "Start Date")
        end = self._get_date(response, "End Date")
        location = self._get_location(response)
        documents = self._parse_documents(response)

        data = {
            '_type': 'event',
            'name': name,
            'event_description': description,
            'classification': 'Committee',
            'start': start,
            'end': end,
            'all_day': False,
            'location': location,
            'documents': documents,
            'sources': [{'url': response.url, 'note': ''}],
        }
        data['id'] = self._generate_id(data)
        data['status'] = self._generate_status(data, text='')
        yield data

    @staticmethod
    def _parse_description(response):
        description_xpath = '//div[span[contains(., "Description")]]/following-sibling::div//p/text()'
        description = response.xpath(description_xpath).extract_first('').strip()
        return description

    @staticmethod
    def _parse_name(response):
        name_xpath = '//span[@class="Head"]/text()'
        # Default to '' so a missing heading doesn't raise on .split()
        name_text = response.xpath(name_xpath).extract_first('')
        name_value = name_text.split('-')[0].strip()
        return name_value

    def _get_location(self, response):
        location_xpath = '//div[span[contains(., "Location")]]/following-sibling::div[1]/span/a/text()'
        # Default to '' so a missing location doesn't raise on .upper()
        location_text = response.xpath(location_xpath).extract_first('')
        return self._choose_location(location_text)

    @staticmethod
    def _choose_location(location_text):
        if 'YOUNG MUNICIPAL CENTER' in location_text.upper():
            return {
                'neighborhood': '',
                'name': 'Coleman A. Young Municipal Center',
                'address': '2 Woodward Detroit, MI 48226'
            }
        return {'neighborhood': '', 'name': '', 'address': location_text}

    @staticmethod
    def _get_date(response, contains):
        date_xpath = '//div[span[contains(., "{}")]]/following-sibling::div[1]/span[1]/text()'.format(contains)
        date_text = response.xpath(date_xpath).extract_first()
        if date_text:
            dt = parse(date_text)
            return {'date': dt.date(), 'time': dt.time(), 'note': ''}
        return {'date': None, 'time': None, 'note': ''}

    @staticmethod
    def _parse_documents(response):
        documents_selector = '//div[span[contains(., "Description")]]/following-sibling::div//a'
        anchors = response.xpath(documents_selector)
        documents = []
        for a in anchors:
            documents_text = a.xpath('text()').extract_first()
            documents_link = a.xpath('@href').extract_first()
            url = urljoin(response.url, documents_link)
            documents.append({'url': url, 'note': documents_text})
        return documents
```
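
The location normalization is easy to sanity-check by hand. A quick interactive sketch, assuming the package is importable from a City Scrapers checkout (the venue strings below are illustrative, not taken from the fixtures):

```python
from city_scrapers.spiders.det_city_council import DetCityCouncilSpider

spider = DetCityCouncilSpider()

# Any venue mentioning the Young Municipal Center is normalized to the
# canonical name and address; anything else passes through as the address.
spider._choose_location('13th Floor, Young Municipal Center')
# {'neighborhood': '', 'name': 'Coleman A. Young Municipal Center',
#  'address': '2 Woodward Detroit, MI 48226'}
spider._choose_location('Some other venue')
# {'neighborhood': '', 'name': '', 'address': 'Some other venue'}
```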

The two remaining files, the saved calendar and event-detail pages used as test fixtures (files/det_city_council.html and files/det_city_council_detail.html), are large diffs and are not rendered.

tests/test_det_city_council.py (new file, 127 additions):

```python
from datetime import date, time
from urllib.parse import parse_qsl, unquote

import pytest
import scrapy

from city_scrapers.spiders.det_city_council import DetCityCouncilSpider
from tests.utils import file_response

test_response = file_response('files/det_city_council.html')
spider = DetCityCouncilSpider()


def test_request_count():
    requests = list(spider.parse(test_response))
    number_next_page_request = 1
    event_requests = 34
    assert len(requests) == number_next_page_request + event_requests

    all_calendar_events = {unquote(request.url) for request in requests if 'Details' in request.url}
    select_calendar_events = {
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6552/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6580/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6573/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6585/mctl/EventDetails",
    }
    assert select_calendar_events.issubset(all_calendar_events)

    form_requests = [request for request in requests if isinstance(request, scrapy.FormRequest)]
    assert len(form_requests) == 1

    form_request = form_requests[0]
    months_crawled = form_request.meta.get('months_crawled')
    params = parse_qsl(form_request.body.decode(form_request.encoding))

    assert months_crawled == 1
    # ASP.NET paging has to be done via a form request,
    # so make sure the updated form params are in the request body
    assert ('__EVENTTARGET', 'dnn$ctr8319$Events$EventMonth$EventCalendar') in params
    assert ('__EVENTARGUMENT', 'V6787') in params


test_detail = file_response(
    'files/det_city_council_detail.html',
    'http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6556/mctl/EventDetails'
)
parsed_items = [item for item in spider._parse_item(test_detail) if isinstance(item, dict)]


def test_name():
    assert parsed_items[0]['name'] == 'Planning & Economic Development'


def test_description():
    assert parsed_items[0]['event_description'] == \
        'The Detroit City Council has scheduled an PLANNING & ECONOMIC ' \
        'DEVELOPMENT COMMITTEE to be held on Thursday, July 5, ' \
        '2018 at 10:00 a.m. in the Committee of the Whole Room, ' \
        '13th floor, Coleman A. Young Municipal Center.'


def test_start():
    assert parsed_items[0]['start'] == {
        'date': date(2018, 7, 5), 'time': time(10, 00), 'note': ''
    }


def test_end():
    assert parsed_items[0]['end'] == {
        'date': None, 'time': None, 'note': ''
    }


def test_id():
    assert parsed_items[0]['id'] == 'det_city_council/201807051000/x/planning_economic_development'


def test_status():
    assert parsed_items[0]['status'] == 'passed'


def test_location():
    assert parsed_items[0]['location'] == {
        'neighborhood': '',
        'name': 'Coleman A. Young Municipal Center',
        'address': '2 Woodward Detroit, MI 48226'
    }


def test_sources():
    assert parsed_items[0]['sources'] == [{
        'url': 'http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6556/mctl/EventDetails',
        'note': ''
    }]


def test_documents():
    assert parsed_items[0]['documents'] == [
        {
            'url': "http://www.detroitmi.gov/Portals/0/docs/City Clerk/Council 2018/Planning Economic/CAL 07-5-18 PED.pdf?ver=2018-07-03-164858-537",
            'note': 'PLANNING & ECONOMIC DEVELOPMENT Agenda'
        }
    ]


def test_choose_location():
    assert spider._choose_location("Other") == {'neighborhood': '', 'name': '', 'address': "Other"}
    assert spider._choose_location("Young Municipal Center and stuff") == {
        'neighborhood': '',
        'name': 'Coleman A. Young Municipal Center',
        'address': '2 Woodward Detroit, MI 48226'
    }


def test_classification():
    assert parsed_items[0]['classification'] == 'Committee'


@pytest.mark.parametrize('item', parsed_items)
def test_all_day(item):
    assert item['all_day'] is False


@pytest.mark.parametrize('item', parsed_items)
def test__type(item):
    assert item['_type'] == 'event'
```
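
A note on the offline setup: `file_response` from `tests.utils` wraps a saved HTML page in a Scrapy response object so the spider callbacks can be exercised without hitting the live site. A minimal sketch of the same idea, using only Scrapy's own classes (the helper name and default URL here are illustrative, not the project's actual implementation):

```python
from scrapy.http import HtmlResponse, Request

def fake_response(path, url='http://www.example.com'):
    # Read a saved page from disk and wrap it so callbacks like
    # spider.parse() and spider._parse_item() can run against it offline.
    with open(path, 'rb') as f:
        body = f.read()
    return HtmlResponse(url=url, request=Request(url=url), body=body)
```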