Commit

Merge pull request #508 from mkrump/spider-det_city_council
Create Detroit City Council spider
mkrump authored Aug 15, 2018
2 parents 47f0df1 + 13b6dd0 commit 722f741
Showing 4 changed files with 2,870 additions and 0 deletions.
121 changes: 121 additions & 0 deletions city_scrapers/spiders/det_city_council.py
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import scrapy
from dateutil.parser import parse

from city_scrapers.spider import Spider


class DetCityCouncilSpider(Spider):
    name = 'det_city_council'
    agency_id = 'Detroit City Council'
    timezone = 'America/Detroit'
    allowed_domains = ['www.detroitmi.gov']
    start_urls = ['http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions']

    def parse(self, response):
        """
        `parse` should always `yield` a dict that follows the Event Schema
        <https://city-bureau.github.io/city-scrapers/06_event_schema.html>.

        Crawls up to 12 months of the sessions calendar, paging to the next
        month via an ASP.NET postback and following each non-recess event to
        its detail page.
        """
        months_crawled = response.meta.get('months_crawled', 0)
        if months_crawled < 12:
            yield from self._next_month(response, months_crawled)
        yield from self._generate_requests(response)

    @staticmethod
    def _next_month(response, months_crawled):
        # The calendar's "next month" arrow is an ASP.NET __doPostBack link;
        # extract its target/argument and replay them as a form submission.
        form_params_xpath = "//td[@class='EventNextPrev'][2]/a/@href"
        event_target, event_argument = response.xpath(form_params_xpath).re(r'\'(.*?)\'')
        yield scrapy.FormRequest(
            url=response.url,
            formdata={'__EVENTTARGET': event_target, '__EVENTARGUMENT': event_argument},
            meta={'months_crawled': months_crawled + 1},
        )

    def _generate_requests(self, response):
        anchors = response.xpath("//a[contains(@id, 'ctlEvents')]")
        anchors = [anchor for anchor in anchors if not self._is_recess_event(anchor)]
        for a in anchors:
            yield response.follow(a, self._parse_item)

    @staticmethod
    def _is_recess_event(anchor):
        return 'RECESS' in anchor.xpath('text()').extract_first('').upper()

    def _parse_item(self, response):
        name = self._parse_name(response)
        description = self._parse_description(response)
        start = self._get_date(response, "Start Date")
        end = self._get_date(response, "End Date")
        location = self._get_location(response)
        documents = self._parse_documents(response)

        data = {
            '_type': 'event',
            'name': name,
            'event_description': description,
            'classification': 'Committee',
            'start': start,
            'end': end,
            'all_day': False,
            'location': location,
            'documents': documents,
            'sources': [{'url': response.url, 'note': ''}],
        }
        data['id'] = self._generate_id(data)
        data['status'] = self._generate_status(data, text='')
        yield data

    @staticmethod
    def _parse_description(response):
        description_xpath = '//div[span[contains(., "Description")]]/following-sibling::div//p/text()'
        description = response.xpath(description_xpath).extract_first('').strip()
        return description

    @staticmethod
    def _parse_name(response):
        name_xpath = '//span[@class="Head"]/text()'
        name_text = response.xpath(name_xpath).extract_first('')
        # The heading may include a dash-separated suffix; keep only the part
        # before the first dash.
        name_value = name_text.split('-')[0].strip()
        return name_value

    def _get_location(self, response):
        location_xpath = '//div[span[contains(., "Location")]]/following-sibling::div[1]/span/a/text()'
        location_text = response.xpath(location_xpath).extract_first('')
        return self._choose_location(location_text)

    @staticmethod
    def _choose_location(location_text):
        if 'YOUNG MUNICIPAL CENTER' in location_text.upper():
            return {
                'neighborhood': '',
                'name': 'Coleman A. Young Municipal Center',
                'address': '2 Woodward Detroit, MI 48226'
            }
        return {'neighborhood': '', 'name': '', 'address': location_text}

    @staticmethod
    def _get_date(response, contains):
        date_xpath = '//div[span[contains(., "{}")]]/following-sibling::div[1]/span[1]/text()'.format(contains)
        date_text = response.xpath(date_xpath).extract_first()
        if date_text:
            dt = parse(date_text)
            return {'date': dt.date(), 'time': dt.time(), 'note': ''}
        return {'date': None, 'time': None, 'note': ''}

    @staticmethod
    def _parse_documents(response):
        documents_selector = '//div[span[contains(., "Description")]]/following-sibling::div//a'
        anchors = response.xpath(documents_selector)
        documents = []
        for a in anchors:
            documents_text = a.xpath('text()').extract_first()
            documents_link = a.xpath('@href').extract_first()
            url = urljoin(response.url, documents_link)
            documents.append({'url': url, 'note': documents_text})
        return documents
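
For reference, `_get_date` leans entirely on `dateutil` for parsing. A minimal standalone sketch of the start dict it builds (the sample string '7/5/2018 10:00 AM' is an assumed example of the calendar's format, not taken from the fixtures; any common US-style date/time string parses the same way):

    from datetime import date, time
    from dateutil.parser import parse

    # hypothetical "Start Date" text from an event detail page
    dt = parse('7/5/2018 10:00 AM')
    assert dt.date() == date(2018, 7, 5)
    assert dt.time() == time(10, 0)
    start = {'date': dt.date(), 'time': dt.time(), 'note': ''}  # the shape asserted in test_start below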
1,313 changes: 1,313 additions & 0 deletions tests/files/det_city_council.html

Large diffs are not rendered by default.

1,309 changes: 1,309 additions & 0 deletions tests/files/det_city_council_detail.html

Large diffs are not rendered by default.
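
The two files above are offline fixtures: a saved copy of the sessions calendar page and of one event detail page. The `file_response` helper imported in the tests wraps such a file in a Scrapy response so spider callbacks can run without network access. A minimal sketch of such a helper, assuming the project's actual version in `tests/utils.py` behaves similarly (the default url is a placeholder):

    import os

    from scrapy.http import HtmlResponse, Request

    def file_response(file_name, url='http://www.example.com'):
        # Read a saved HTML fixture and wrap it in an HtmlResponse; attaching
        # a Request makes response.meta usable inside spider callbacks.
        path = os.path.join(os.path.dirname(__file__), file_name)
        with open(path, 'rb') as f:
            body = f.read()
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=Request(url=url))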

127 changes: 127 additions & 0 deletions tests/test_det_city_council.py
@@ -0,0 +1,127 @@
from datetime import date, time
from urllib.parse import parse_qsl, unquote

import pytest
import scrapy

from city_scrapers.spiders.det_city_council import DetCityCouncilSpider
from tests.utils import file_response

test_response = file_response('files/det_city_council.html')
spider = DetCityCouncilSpider()


def test_request_count():
    requests = list(spider.parse(test_response))
    number_next_page_request = 1
    event_requests = 34
    assert len(requests) == number_next_page_request + event_requests

    all_calendar_events = {
        unquote(request.url) for request in requests if 'Details' in request.url
    }
    select_calendar_events = {
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6552/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6580/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6573/mctl/EventDetails",
        "http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6585/mctl/EventDetails",
    }
    assert select_calendar_events.issubset(all_calendar_events)

    form_requests = [request for request in requests if isinstance(request, scrapy.FormRequest)]
    assert len(form_requests) == 1

    form_request = form_requests[0]
    months_crawled = form_request.meta.get('months_crawled')
    params = parse_qsl(form_request.body.decode(form_request.encoding))

    assert months_crawled == 1
    # ASP.NET paging has to be done via a form request,
    # so make sure the updated form params are in the request
    assert ('__EVENTTARGET', 'dnn$ctr8319$Events$EventMonth$EventCalendar') in params
    assert ('__EVENTARGUMENT', 'V6787') in params


test_detail = file_response(
    'files/det_city_council_detail.html',
    'http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6556/mctl/EventDetails'
)
parsed_items = [item for item in spider._parse_item(test_detail) if isinstance(item, dict)]


def test_name():
    assert parsed_items[0]['name'] == 'Planning & Economic Development'


def test_description():
    # the description (including 'an PLANNING') is asserted verbatim as scraped
    assert parsed_items[0]['event_description'] == \
        'The Detroit City Council has scheduled an PLANNING & ECONOMIC ' \
        'DEVELOPMENT COMMITTEE to be held on Thursday, July 5, ' \
        '2018 at 10:00 a.m. in the Committee of the Whole Room, ' \
        '13th floor, Coleman A. Young Municipal Center.'


def test_start():
    assert parsed_items[0]['start'] == {
        'date': date(2018, 7, 5), 'time': time(10, 00), 'note': ''
    }


def test_end():
    assert parsed_items[0]['end'] == {
        'date': None, 'time': None, 'note': ''
    }


def test_id():
    assert parsed_items[0]['id'] == 'det_city_council/201807051000/x/planning_economic_development'


def test_status():
    assert parsed_items[0]['status'] == 'passed'


def test_location():
    assert parsed_items[0]['location'] == {
        'neighborhood': '',
        'name': 'Coleman A. Young Municipal Center',
        'address': '2 Woodward Detroit, MI 48226'
    }


def test_sources():
    assert parsed_items[0]['sources'] == [{
        'url': 'http://www.detroitmi.gov/Government/City-Council/City-Council-Sessions/ModuleID/8319/ItemID/6556/mctl/EventDetails',
        'note': ''
    }]


def test_documents():
    assert parsed_items[0]['documents'] == [
        {
            'url': "http://www.detroitmi.gov/Portals/0/docs/City Clerk/Council 2018/Planning Economic/CAL 07-5-18 PED.pdf?ver=2018-07-03-164858-537",
            'note': 'PLANNING & ECONOMIC DEVELOPMENT Agenda'
        }
    ]


def test_choose_location():
    assert spider._choose_location("Other") == {'neighborhood': '', 'name': '', 'address': "Other"}
    assert spider._choose_location("Young Municipal Center and stuff") == {
        'neighborhood': '',
        'name': 'Coleman A. Young Municipal Center',
        'address': '2 Woodward Detroit, MI 48226'
    }


def test_classification():
    assert parsed_items[0]['classification'] == 'Committee'


@pytest.mark.parametrize('item', parsed_items)
def test_all_day(item):
    assert item['all_day'] is False


@pytest.mark.parametrize('item', parsed_items)
def test__type(item):
    assert item['_type'] == 'event'
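
The id asserted in `test_id` is produced by `_generate_id` on the shared `Spider` base class, which is not part of this diff. Judging from the asserted value, it composes the spider name, the start timestamp, a site-provided unique id ('x' when the site has none), and a slugified event name. A rough sketch of that composition, inferred from the test (`sketch_generate_id` is a hypothetical name):

    import re
    from datetime import date, time

    def sketch_generate_id(name, start_date, start_time, spider_name='det_city_council'):
        # slugify the event name: lowercase, keep alphanumeric runs, join with underscores
        slug = '_'.join(re.findall(r'[a-z0-9]+', name.lower()))
        stamp = '{:%Y%m%d}{:%H%M}'.format(start_date, start_time)
        # 'x' stands in for a site-provided unique id, which this calendar lacks
        return '{}/{}/x/{}'.format(spider_name, stamp, slug)

    assert sketch_generate_id(
        'Planning & Economic Development', date(2018, 7, 5), time(10, 0)
    ) == 'det_city_council/201807051000/x/planning_economic_development'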
