diff --git a/antara/antara/spiders/antara_spider.py b/antara/antara/spiders/antara_spider.py index 67dec0d..b2912f4 100644 --- a/antara/antara/spiders/antara_spider.py +++ b/antara/antara/spiders/antara_spider.py @@ -22,9 +22,9 @@ def parse(self, response): for indek in indeks: item = AntaraItem() - item['title'] = indek.xpath('div/div[@class="bxpd"]/h3/a/text()').extract()[0] - item['link'] = "http://www.antaranews.com" + indek.xpath('div/div[@class="bxpd"]/h3/a/@href').extract()[0] - item['images'] = indek.xpath('div/div[@class="imgpg"]/a/img/@src').extract()[0] + item['title'] = indek.xpath('div/div[@class="bxpd"]/h3/a/text()').extract_first() + item['link'] = response.urljoin(indek.xpath('div/div[@class="bxpd"]/h3/a/@href').extract_first()) + item['images'] = indek.xpath('div/div[@class="imgpg"]/a/img/@src').extract_first() item['category'] = "" item['date'] = time.strftime("%d/%m/%Y") item['desc'] = "" diff --git a/kompas/kompas/spiders/kompas_spider.py b/kompas/kompas/spiders/kompas_spider.py index d340787..215f0c7 100644 --- a/kompas/kompas/spiders/kompas_spider.py +++ b/kompas/kompas/spiders/kompas_spider.py @@ -21,11 +21,11 @@ def parse(self, response): for indek in indeks: item = KompasItem() - item['title'] = indek.xpath('div[@class="article__list__title"]/h3/a/text()').extract()[0] - item['link'] = indek.xpath('div[@class="article__list__title"]/h3/a/@href').extract()[0] - item['images'] = indek.xpath('div[@class="article__list__asset clearfix"]/div/img/@src').extract()[0] - item['category'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__subtitle article__subtitle--inline"]/text()').extract()[0] - item['date'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__date"]/text()').extract()[0] + item['title'] = indek.xpath('div[@class="article__list__title"]/h3/a/text()').extract_first() + item['link'] = indek.xpath('div[@class="article__list__title"]/h3/a/@href').extract_first() + item['images'] = 
indek.xpath('div[@class="article__list__asset clearfix"]/div/img/@src').extract_first() + item['category'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__subtitle article__subtitle--inline"]/text()').extract_first() + item['date'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__date"]/text()').extract_first() item['desc'] = "" yield item diff --git a/liputan6/liputan6/spiders/liputan6_spider.py b/liputan6/liputan6/spiders/liputan6_spider.py index d5e55b4..93814e8 100644 --- a/liputan6/liputan6/spiders/liputan6_spider.py +++ b/liputan6/liputan6/spiders/liputan6_spider.py @@ -21,11 +21,11 @@ def parse(self, response): for indek in indeks: item = Liputan6Item() - item['title'] = indek.xpath('aside/header/h4/a/@title').extract()[0].strip() - item['link'] = indek.xpath('aside/header/h4/a/@href').extract()[0].strip() + item['title'] = indek.xpath('aside/header/h4/a/@title').extract_first(default='').strip() + item['link'] = indek.xpath('aside/header/h4/a/@href').extract_first(default='').strip() item['images'] = "" - item['category'] = indek.xpath('aside/header/a/text()').extract()[0].strip() - item['date'] = indek.xpath('aside/header/span/time/@datetime').extract()[0].strip() - item['desc'] = indek.xpath('aside/div/text()').extract()[0].strip() + item['category'] = indek.xpath('aside/header/a/text()').extract_first(default='').strip() + item['date'] = indek.xpath('aside/header/span/time/@datetime').extract_first(default='').strip() + item['desc'] = indek.xpath('aside/div/text()').extract_first(default='').strip() yield item diff --git a/merdeka/merdeka/spiders/merdeka_spider.py b/merdeka/merdeka/spiders/merdeka_spider.py index 63e77ee..c3c5c18 100644 --- a/merdeka/merdeka/spiders/merdeka_spider.py +++ b/merdeka/merdeka/spiders/merdeka_spider.py @@ -22,10 +22,10 @@ def parse(self, response): for indek in indeks: item = MerdekaItem() - item['title'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/text()').extract()[0] - item['link'] = 
"https://www.merdeka.com" + indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/@href').extract()[0] - item['images'] = indek.xpath('div[@class="mdk-tag-contln-l"]/a/img/@src').extract()[0] - item['category'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-date"]/span/text()').extract()[0] + item['title'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/text()').extract_first() + item['link'] = response.urljoin(indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/@href').extract_first()) + item['images'] = indek.xpath('div[@class="mdk-tag-contln-l"]/a/img/@src').extract_first() + item['category'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-date"]/span/text()').extract_first() item['date'] = time.strftime("%d/%m/%Y") item['desc'] = "" diff --git a/okezone/okezone/spiders/okezone_spider.py b/okezone/okezone/spiders/okezone_spider.py index 8a98fff..6003ec2 100644 --- a/okezone/okezone/spiders/okezone_spider.py +++ b/okezone/okezone/spiders/okezone_spider.py @@ -1,6 +1,7 @@ +from __future__ import print_function import scrapy import time import sys from bs4 import BeautifulSoup from scrapy.selector import Selector from scrapy.http.request import Request @@ -35,7 +36,7 @@ def parse(self, response): yield detail_request def parse_detail(self, response): - print "Crawling detail news" + print("Crawling detail news") item = response.meta['item'] selector = Selector(response) description = selector.xpath('//*[@id="contentx"]').extract_first() diff --git a/republika/republika/spiders/republika_spider.py b/republika/republika/spiders/republika_spider.py index 352bac5..52bd50b 100644 --- a/republika/republika/spiders/republika_spider.py +++ b/republika/republika/spiders/republika_spider.py @@ -18,7 +18,7 @@ def parse(self, response): @url http://www.republika.co.id/indeks @returns items """ - print "Crawling list of 
news" + print("Crawling list of news") indeks = Selector(response).xpath('//div[@class="wp-indeks"]') indeks_length = len(indeks) if float(indeks_length) > 0: @@ -51,7 +51,7 @@ def parse(self, response): ) def parse_detail(self, response): - print "Crawling detail news" + print("Crawling detail news") item = response.meta['item'] selector = Selector(response) description = selector.xpath('//div[@class="content-detail"]').extract_first() diff --git a/tempo/tempo/spiders/tempo_spider.py b/tempo/tempo/spiders/tempo_spider.py index 206ec08..67c728d 100644 --- a/tempo/tempo/spiders/tempo_spider.py +++ b/tempo/tempo/spiders/tempo_spider.py @@ -34,7 +34,7 @@ def parse(self, response): yield detail_request def parse_detail(self, response): - print "Crawling detail news" + print("Crawling detail news") item = response.meta['item'] selector = Selector(response) description = selector.xpath('//article').extract_first() diff --git a/tirto/tirto/spiders/tirto_spider.py b/tirto/tirto/spiders/tirto_spider.py index 3bc7dfd..356c656 100644 --- a/tirto/tirto/spiders/tirto_spider.py +++ b/tirto/tirto/spiders/tirto_spider.py @@ -37,7 +37,7 @@ def parse(self, response): yield detail_request def parse_detail(self, response): - print "Crawling detail news" + print("Crawling detail news") item = response.meta['item'] selector = Selector(response) description = selector.xpath('//div[@class="content-text-editor"]').extract_first() diff --git a/viva/viva/spiders/viva_spider.py b/viva/viva/spiders/viva_spider.py index 6881a6c..788a4d7 100644 --- a/viva/viva/spiders/viva_spider.py +++ b/viva/viva/spiders/viva_spider.py @@ -23,18 +23,18 @@ def parse(self, response): for indek in indeks: item = VivaItem() - news_link = indek.xpath('div[@class="content_center"]/span/a[2]/@href').extract()[0] - item['title'] = indek.xpath('div[@class="content_center"]/span/a[2]/h3/text()').extract()[0] + news_link = indek.xpath('div[@class="content_center"]/span/a[2]/@href').extract_first() + item['title'] = 
indek.xpath('div[@class="content_center"]/span/a[2]/h3/text()').extract_first() item['link'] = news_link - item['images'] = indek.xpath('div[@class="thumb"]/a/img/@data-original').extract()[0] - item['category'] = indek.xpath('div[@class="content_center"]/span/a[1]/h5/text()').extract()[0].strip() - item['date'] = indek.xpath('div[@class="content_center"]/span/div[@class="date"]/text()').extract()[0] + item['images'] = indek.xpath('div[@class="thumb"]/a/img/@data-original').extract_first() + item['category'] = indek.xpath('div[@class="content_center"]/span/a[1]/h5/text()').extract_first(default='').strip() + item['date'] = indek.xpath('div[@class="content_center"]/span/div[@class="date"]/text()').extract_first() detail_request = Request(news_link, callback=self.parse_detail) detail_request.meta['item'] = item yield detail_request def parse_detail(self, response): - print "Crawling detail news" + print("Crawling detail news") item = response.meta['item'] selector = Selector(response) description = selector.xpath('//div[@id="article-detail-content"]').extract_first()