Skip to content

Commit

Permalink
Merge pull request #22 from brandonmburroughs/fix_spiders
Browse files — browse the repository at this point in the history
Updating extraction in spiders
  • Loading branch information
harryandriyan authored Oct 1, 2017
2 parents b41554d + e52b398 commit 68691d6
Show file tree
Hide file tree
Showing 9 changed files with 29 additions and 28 deletions.
6 changes: 3 additions & 3 deletions antara/antara/spiders/antara_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ def parse(self, response):

for indek in indeks:
item = AntaraItem()
item['title'] = indek.xpath('div/div[@class="bxpd"]/h3/a/text()').extract()[0]
item['link'] = "http://www.antaranews.com" + indek.xpath('div/div[@class="bxpd"]/h3/a/@href').extract()[0]
item['images'] = indek.xpath('div/div[@class="imgpg"]/a/img/@src').extract()[0]
item['title'] = indek.xpath('div/div[@class="bxpd"]/h3/a/text()').extract_first()
item['link'] = response.urljoin(indek.xpath('div/div[@class="bxpd"]/h3/a/@href').extract_first())
item['images'] = indek.xpath('div/div[@class="imgpg"]/a/img/@src').extract_first()
item['category'] = ""
item['date'] = time.strftime("%d/%m/%Y")
item['desc'] = ""
Expand Down
10 changes: 5 additions & 5 deletions kompas/kompas/spiders/kompas_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def parse(self, response):

for indek in indeks:
item = KompasItem()
item['title'] = indek.xpath('div[@class="article__list__title"]/h3/a/text()').extract()[0]
item['link'] = indek.xpath('div[@class="article__list__title"]/h3/a/@href').extract()[0]
item['images'] = indek.xpath('div[@class="article__list__asset clearfix"]/div/img/@src').extract()[0]
item['category'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__subtitle article__subtitle--inline"]/text()').extract()[0]
item['date'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__date"]/text()').extract()[0]
item['title'] = indek.xpath('div[@class="article__list__title"]/h3/a/text()').extract_first()
item['link'] = indek.xpath('div[@class="article__list__title"]/h3/a/@href').extract_first()
item['images'] = indek.xpath('div[@class="article__list__asset clearfix"]/div/img/@src').extract_first()
item['category'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__subtitle article__subtitle--inline"]/text()').extract_first()
item['date'] = indek.xpath('div[@class="article__list__info"]/div[@class="article__date"]/text()').extract_first()
item['desc'] = ""

yield item
10 changes: 5 additions & 5 deletions liputan6/liputan6/spiders/liputan6_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def parse(self, response):

for indek in indeks:
item = Liputan6Item()
item['title'] = indek.xpath('aside/header/h4/a/@title').extract()[0].strip()
item['link'] = indek.xpath('aside/header/h4/a/@href').extract()[0].strip()
item['title'] = indek.xpath('aside/header/h4/a/@title').extract_first().strip()
item['link'] = indek.xpath('aside/header/h4/a/@href').extract_first().strip()
item['images'] = ""
item['category'] = indek.xpath('aside/header/a/text()').extract()[0].strip()
item['date'] = indek.xpath('aside/header/span/time/@datetime').extract()[0].strip()
item['desc'] = indek.xpath('aside/div/text()').extract()[0].strip()
item['category'] = indek.xpath('aside/header/a/text()').extract_first().strip()
item['date'] = indek.xpath('aside/header/span/time/@datetime').extract_first().strip()
item['desc'] = indek.xpath('aside/div/text()').extract_first().strip()

yield item
8 changes: 4 additions & 4 deletions merdeka/merdeka/spiders/merdeka_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ def parse(self, response):

for indek in indeks:
item = MerdekaItem()
item['title'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/text()').extract()[0]
item['link'] = "https://www.merdeka.com" + indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/@href').extract()[0]
item['images'] = indek.xpath('div[@class="mdk-tag-contln-l"]/a/img/@src').extract()[0]
item['category'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-date"]/span/text()').extract()[0]
item['title'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/text()').extract_first()
item['link'] = response.urljoin(indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-titlebar"]/a/@href').extract_first())
item['images'] = indek.xpath('div[@class="mdk-tag-contln-l"]/a/img/@src').extract_first()
item['category'] = indek.xpath('div[@class="mdk-tag-contln-r2"]/div[@class="mdk-tag-contln-date"]/span/text()').extract_first()
item['date'] = time.strftime("%d/%m/%Y")
item['desc'] = ""

Expand Down
3 changes: 2 additions & 1 deletion okezone/okezone/spiders/okezone_spider.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import scrapy
import time
import sys
from __future__ import print_function
from bs4 import BeautifulSoup
from scrapy.selector import Selector
from scrapy.http.request import Request
Expand Down Expand Up @@ -35,7 +36,7 @@ def parse(self, response):
yield detail_request

def parse_detail(self, response):
print "Crawling detail news"
print("Crawling detail news")
item = response.meta['item']
selector = Selector(response)
description = selector.xpath('//*[@id="contentx"]').extract_first()
Expand Down
4 changes: 2 additions & 2 deletions republika/republika/spiders/republika_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def parse(self, response):
@url http://www.republika.co.id/indeks
@returns items
"""
print "Crawling list of news"
print("Crawling list of news")
indeks = Selector(response).xpath('//div[@class="wp-indeks"]')
indeks_length = len(indeks)
if float(indeks_length) > 0:
Expand Down Expand Up @@ -51,7 +51,7 @@ def parse(self, response):
)

def parse_detail(self, response):
print "Crawling detail news"
print("Crawling detail news")
item = response.meta['item']
selector = Selector(response)
description = selector.xpath('//div[@class="content-detail"]').extract_first()
Expand Down
2 changes: 1 addition & 1 deletion tempo/tempo/spiders/tempo_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def parse(self, response):
yield detail_request

def parse_detail(self, response):
print "Crawling detail news"
print("Crawling detail news")
item = response.meta['item']
selector = Selector(response)
description = selector.xpath('//article').extract_first()
Expand Down
2 changes: 1 addition & 1 deletion tirto/tirto/spiders/tirto_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def parse(self, response):
yield detail_request

def parse_detail(self, response):
print "Crawling detail news"
print("Crawling detail news")
item = response.meta['item']
selector = Selector(response)
description = selector.xpath('//div[@class="content-text-editor"]').extract_first()
Expand Down
12 changes: 6 additions & 6 deletions viva/viva/spiders/viva_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ def parse(self, response):

for indek in indeks:
item = VivaItem()
news_link = indek.xpath('div[@class="content_center"]/span/a[2]/@href').extract()[0]
item['title'] = indek.xpath('div[@class="content_center"]/span/a[2]/h3/text()').extract()[0]
news_link = indek.xpath('div[@class="content_center"]/span/a[2]/@href').extract_first()
item['title'] = indek.xpath('div[@class="content_center"]/span/a[2]/h3/text()').extract_first()
item['link'] = news_link
item['images'] = indek.xpath('div[@class="thumb"]/a/img/@data-original').extract()[0]
item['category'] = indek.xpath('div[@class="content_center"]/span/a[1]/h5/text()').extract()[0].strip()
item['date'] = indek.xpath('div[@class="content_center"]/span/div[@class="date"]/text()').extract()[0]
item['images'] = indek.xpath('div[@class="thumb"]/a/img/@data-original').extract_first()
item['category'] = indek.xpath('div[@class="content_center"]/span/a[1]/h5/text()').extract_first().strip()
item['date'] = indek.xpath('div[@class="content_center"]/span/div[@class="date"]/text()').extract_first()
detail_request = Request(news_link, callback=self.parse_detail)
detail_request.meta['item'] = item
yield detail_request

def parse_detail(self, response):
print "Crawling detail news"
print("Crawling detail news")
item = response.meta['item']
selector = Selector(response)
description = selector.xpath('//div[@id="article-detail-content"]').extract_first()
Expand Down

0 comments on commit 68691d6

Please sign in to comment.