Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Change median computation: add quick-median search and extract shared price parsing into a toolkit module
Browse files Browse the repository at this point in the history
Fixes #5
  • Loading branch information
keremkoseoglu committed Jun 14, 2022
1 parent 479e0b0 commit 0bbfa97
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[MESSAGES CONTROL]
disable=C0301, R0902, R0903
15 changes: 6 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
""" Sahibinden main module """
from sahibinden.search import Search

def run_test():
""" Just a test """
search = Search("https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000")
for price in search.result.prices:
print(str(price))
print(search.result.price_median)
import test

if __name__ == "__main__":
run_test()
# test.regular_search()
# test.quick_median_single_page()
# test.quick_median_multi_page_no_offset()
# test.quick_median_multi_page_with_offset()
pass
77 changes: 77 additions & 0 deletions sahibinden/quick_median.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
""" Quick median module """
from dataclasses import dataclass
from math import ceil
import time
from luta.crawler import Crawler # pylint: disable=E0401
from sahibinden.toolkit import extract_crawler_prices, SearchInput

@dataclass
class PageAnalysis:
    """ Page analysis: layout facts derived from the first crawled result page """
    # True when the listing spans more than one result page
    is_multi_page: bool = False
    # Total number of result pages (1 for a single-page listing)
    page_count: int = 0
    # Zero-based index of the middle page; 0 when the first page is the middle
    mid_page: int = 0
    # Crawler holding the already-fetched first page (reused to avoid a second request);
    # NOTE(review): annotation says Crawler but the default is None — effectively Optional
    crawler: Crawler = None

class QuickMedian():
    """ Quick median estimator for a sahibinden.com search.

    Instead of crawling every result page, the search is sorted by price
    ascending, the middle page is fetched directly via pagingOffset, and the
    middle price of that page is taken as the median — at most two HTTP
    requests in total.
    """
    URL_SORT_SUFFIX = "sorting=price_asc"
    # sahibinden.com shows this many classifieds per result page; used to
    # translate a page index into a pagingOffset URL parameter.
    _ITEMS_PER_PAGE = 20

    def __init__(self, search_input: SearchInput):
        """ Runs the quick-median search immediately; result lands in self.median """
        self._input = search_input
        self.median = 0  # filled by _search()

        self._ensure_url_sorted_by_price()
        self._search()

    def _search(self):
        """ Determines the median price and stores it in self.median """
        analysis = self._analyse_initial_page()

        # Single page, or the middle page IS the first page: the initial
        # crawl already contains the median — no second request needed.
        if not analysis.is_multi_page or analysis.mid_page == 0:
            self.median = QuickMedian._get_mid_price(analysis.crawler)
            return

        # Throttle between the two requests if the caller asked for it
        if self._input.post_sleep > 0:
            time.sleep(self._input.post_sleep)

        url_suffix = f"&pagingOffset={QuickMedian._ITEMS_PER_PAGE * analysis.mid_page}"
        url = self._input.url + url_suffix
        mid_crawler = Crawler(url)
        self.median = QuickMedian._get_mid_price(mid_crawler)

    def _analyse_initial_page(self) -> PageAnalysis:
        """ Crawls the first result page and derives the page layout.

        Raises:
            RuntimeError: when the page count can't be parsed from the HTML
                (most likely a sahibinden.com front-end change).
        """
        result = PageAnalysis()
        result.crawler = Crawler(self._input.url)
        html = result.crawler.html

        if "sayfa içerisinde" in html:
            # Multi-page listing: "Toplam N sayfa içerisinde" carries the count
            pages_str = result.crawler.get_value_between('<p class="mbdef">Toplam',
                                                         'sayfa içerisinde')
            if pages_str == "":
                raise RuntimeError("Can't determine page count; possibly front end change")
            pages_str = pages_str.replace(" ", "")
            pages_int = int(pages_str)

            result.is_multi_page = True
            result.page_count = pages_int
            result.mid_page = ceil(pages_int / 2) - 1  # zero-based middle page
        else:
            result.is_multi_page = False
            result.page_count = 1
            result.mid_page = 0

        return result

    @staticmethod
    def _get_mid_price(crw: Crawler) -> float:
        """ Returns the middle price of the crawled page (prices come pre-sorted).

        Raises:
            RuntimeError: when the page contains no prices at all.
        """
        prices = extract_crawler_prices(crw)
        if not prices:
            raise RuntimeError("No price found")
        price_pos = ceil(len(prices) / 2) - 1
        return prices[price_pos]

    def _ensure_url_sorted_by_price(self):
        """ Appends the price-ascending sort parameter to the URL if absent """
        if QuickMedian.URL_SORT_SUFFIX in self._input.url:
            return
        separator = "&" if "?" in self._input.url else "?"
        self._input.url += f"{separator}{QuickMedian.URL_SORT_SUFFIX}"
41 changes: 12 additions & 29 deletions sahibinden/search.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,31 @@
""" Search module """
import time
from luta.crawler import Crawler
from luta.crawler import Crawler # pylint: disable=E0401
from sahibinden.search_result import SearchResult

_HOST = "http://www.sahibinden.com"
from sahibinden.toolkit import extract_crawler_prices, HOST, SearchInput

class Search():
""" Search class """
def __init__(self, url: str, post_sleep: int = 0):
self._url = url
self._post_sleep = post_sleep
def __init__(self, search_input: SearchInput):
self._input = search_input
self._html = ""
self._prices = []
self._search()
self.result = SearchResult(self._prices)

@staticmethod
def _parse_price(price: str) -> float:
if price is None:
return 0
prc = price.strip()
if len(prc) <= 0:
return 0
prc_split = prc.split(" ")
if len(prc_split) <= 0:
return 0
prc_text = prc_split[0].strip()
prc_text = prc_text.replace(".", "").replace(",", ".")
return float(prc_text)

def _search(self):
url = self._url
url = self._input.url
while True:
crw = Crawler(url)

prices = crw.get_values_between('<td class="searchResultsPriceValue">', '</div>')
prices = extract_crawler_prices(crw)
for price in prices:
float_price = Search._parse_price(price)
self._prices.append(float_price)
self._prices.append(price)

next_url = crw.get_last_value_between('<a href="', '" class="prevNextBut" title="Sonraki"')
next_url = crw.get_last_value_between('<a href="',
'" class="prevNextBut" title="Sonraki"')
if next_url == "":
return
url = _HOST + next_url
url = HOST + next_url

if self._post_sleep > 0:
time.sleep(self._post_sleep)
if self._input.post_sleep > 0:
time.sleep(self._input.post_sleep)
1 change: 1 addition & 0 deletions sahibinden/search_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self, prices:List[float]):

@property
def prices(self) -> List[float]:
""" Returns all prices """
return self._prices


Expand Down
36 changes: 36 additions & 0 deletions sahibinden/toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
""" Sahibinden toolkit """
from dataclasses import dataclass
from typing import List
from luta.crawler import Crawler # pylint: disable=E0401

HOST = "http://www.sahibinden.com"

def parse_price(price: str) -> float:
    """ Parses a raw sahibinden.com price text (e.g. "1.500.000 TL") into a float.

    Turkish number format uses "." as the thousands separator and "," as the
    decimal separator; both are normalized before conversion. The first
    whitespace-separated token is the number, the rest (currency) is ignored.

    Returns:
        The parsed price, or 0.0 for None / blank input.

    Raises:
        ValueError: when the first token is not a valid number.
    """
    if price is None:
        return 0.0
    prc = price.strip()
    if not prc:
        return 0.0
    # str.split on a non-empty string always yields at least one token,
    # so no empty-list guard is needed here.
    prc_text = prc.split(" ")[0].strip()
    prc_text = prc_text.replace(".", "").replace(",", ".")
    return float(prc_text)

def extract_crawler_prices(crw: Crawler) -> List[float]:
    """ Extracts all listing prices from a crawled result page, sorted ascending """
    raw_prices = crw.get_values_between('<td class="searchResultsPriceValue">', '</div>')
    return sorted(parse_price(raw) for raw in raw_prices)

@dataclass
class SearchInput:
    """ Search input shared by the Search and QuickMedian entry points """
    # Full sahibinden.com search URL including query parameters
    url: str
    # Seconds to sleep after each HTTP request (0 = no throttling)
    post_sleep: int = 0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setuptools.setup(
name="sahibinden-keremkoseoglu",
version="0.0.5",
version="1.0.0",
author="Kerem Koseoglu",
author_email="kerem@keremkoseoglu.com",
description="sahibinden.com Web spider",
Expand Down
29 changes: 29 additions & 0 deletions test/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
""" Tests module """
from sahibinden.search import Search
from sahibinden.quick_median import QuickMedian
from sahibinden.toolkit import SearchInput

def regular_search():
    """ Crawls all result pages of a sample search; prints every price and the median """
    sample_input = SearchInput(url="https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000")
    result = Search(sample_input).result
    for sample_price in result.prices:
        print(str(sample_price))
    print(result.price_median)

def quick_median_single_page():
    """ Quick median for a search expected to fit on a single result page """
    _quick_median_with_url("https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000")

def quick_median_multi_page_no_offset():
    """ Quick median for a multi-page search; per the name, no paging offset
    is expected to be needed (middle page = first page) — confirm at runtime """
    _quick_median_with_url("https://www.sahibinden.com/bas-gitar?query_text_mf=fender+jazz+bass&query_text=fender+jazz+bass")

def quick_median_multi_page_with_offset():
    """ Quick median for a multi-page search; per the name, a pagingOffset
    request for the middle page is expected — confirm at runtime """
    _quick_median_with_url("https://www.sahibinden.com/bas-gitar?query_text_mf=bass")

def _quick_median_with_url(url: str):
    """ Runs a quick-median search for the given URL and prints the result """
    median_value = QuickMedian(SearchInput(url=url)).median
    print(str(median_value))

0 comments on commit 0bbfa97

Please sign in to comment.