This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
8 changed files
with
164 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[MESSAGES CONTROL] | ||
disable=C0301, R0902, R0903 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,9 @@ | ||
""" Sahibinden main module """ | ||
from sahibinden.search import Search | ||
|
||
def run_test(): | ||
""" Just a test """ | ||
search = Search("https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000") | ||
for price in search.result.prices: | ||
print(str(price)) | ||
print(search.result.price_median) | ||
import test | ||
|
||
if __name__ == "__main__": | ||
run_test() | ||
# test.regular_search() | ||
# test.quick_median_single_page() | ||
# test.quick_median_multi_page_no_offset() | ||
# test.quick_median_multi_page_with_offset() | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
""" Quick median module """ | ||
from dataclasses import dataclass | ||
from math import ceil | ||
import time | ||
from luta.crawler import Crawler # pylint: disable=E0401 | ||
from sahibinden.toolkit import extract_crawler_prices, SearchInput | ||
|
||
@dataclass
class PageAnalysis:
    """ Analysis of the first result page of a search. """
    # Whether the search results span more than one page.
    is_multi_page: bool = False
    # Total number of result pages (1 when the search is single-page).
    page_count: int = 0
    # Zero-based index of the middle result page; 0 for single-page results.
    mid_page: int = 0
    # Crawler holding the already-fetched first page; None until analysed.
    # (Quoted forward reference: the field is genuinely optional.)
    crawler: "Crawler | None" = None
|
||
class QuickMedian():
    """ Estimates the median price of a search quickly.

    Instead of crawling every result page (as Search does), it sorts the
    results by ascending price, jumps straight to the middle page, and
    takes that page's middle price as the median.
    """
    URL_SORT_SUFFIX = "sorting=price_asc"

    def __init__(self, search_input: "SearchInput"):
        self._input = search_input
        self.median = 0

        self._ensure_url_sorted_by_price()
        self._search()

    def _search(self):
        """ Crawls the first page; for multi-page results, crawls the
        middle page and stores its middle price in self.median. """
        analysis = self._analyse_initial_page()

        # Single page, or the middle page IS the first page: reuse the
        # crawler we already have instead of fetching again.
        if not analysis.is_multi_page or analysis.mid_page == 0:
            self.median = QuickMedian._get_mid_price(analysis.crawler)
            return

        if self._input.post_sleep > 0:
            time.sleep(self._input.post_sleep)  # polite delay before second request

        # Results are paged 20 items per page; pagingOffset is an item offset.
        url_suffix = f"&pagingOffset={20 * analysis.mid_page}"
        url = self._input.url + url_suffix
        mid_crawler = Crawler(url)
        self.median = QuickMedian._get_mid_price(mid_crawler)

    def _analyse_initial_page(self) -> "PageAnalysis":
        """ Fetches the first result page and determines the page count. """
        result = PageAnalysis()
        result.crawler = Crawler(self._input.url)
        html = result.crawler.html

        # The pager reads "Toplam N sayfa içerisinde" ("within N pages total").
        if "sayfa içerisinde" in html:
            pages_str = result.crawler.get_value_between('<p class="mbdef">Toplam',
                                                         'sayfa içerisinde')
            if pages_str == "":
                raise Exception("Cant determine page count; possibly front end change")
            pages_str = pages_str.replace(" ", "")
            pages_int = int(pages_str)

            result.is_multi_page = True
            result.page_count = pages_int
            result.mid_page = ceil(pages_int / 2) - 1  # zero-based middle page
        else:
            result.is_multi_page = False
            result.page_count = 1
            result.mid_page = 0

        return result

    @staticmethod
    def _get_mid_price(crw: "Crawler") -> float:
        """ Returns the middle element of the page's sorted price list.

        Raises Exception when the page contains no parsable prices.
        """
        prices = extract_crawler_prices(crw)
        if len(prices) <= 0:
            raise Exception("No price found")
        price_pos = ceil(len(prices) / 2) - 1
        return prices[price_pos]

    def _ensure_url_sorted_by_price(self):
        """ Appends the price-ascending sort parameter to the URL if absent. """
        if QuickMedian.URL_SORT_SUFFIX in self._input.url:
            return
        separator = "&" if "?" in self._input.url else "?"
        self._input.url += f"{separator}{QuickMedian.URL_SORT_SUFFIX}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,31 @@ | ||
""" Search module """ | ||
import time | ||
from luta.crawler import Crawler | ||
from luta.crawler import Crawler # pylint: disable=E0401 | ||
from sahibinden.search_result import SearchResult | ||
|
||
_HOST = "http://www.sahibinden.com" | ||
from sahibinden.toolkit import extract_crawler_prices, HOST, SearchInput | ||
|
||
class Search():
    """ Exhaustive search: crawls every result page and collects all prices.

    After construction, self.result is a SearchResult built from the
    prices of every page.
    """
    def __init__(self, search_input: "SearchInput"):
        self._input = search_input
        self._html = ""
        self._prices = []
        self._search()
        self.result = SearchResult(self._prices)

    def _search(self):
        """ Follows the "Sonraki" (next) pager link until the last page,
        collecting each page's parsed prices along the way. """
        url = self._input.url
        while True:
            crw = Crawler(url)

            # extract_crawler_prices returns the page's prices as sorted floats.
            self._prices.extend(extract_crawler_prices(crw))

            next_url = crw.get_last_value_between('<a href="',
                                                  '" class="prevNextBut" title="Sonraki"')
            if next_url == "":
                return
            url = HOST + next_url

            # Polite delay between successive page requests.
            if self._input.post_sleep > 0:
                time.sleep(self._input.post_sleep)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
""" Sahibinden toolkit """ | ||
from dataclasses import dataclass | ||
from typing import List | ||
from luta.crawler import Crawler # pylint: disable=E0401 | ||
|
||
HOST = "http://www.sahibinden.com" | ||
|
||
def parse_price(price: str) -> float:
    """ Parses a sahibinden.com price string (e.g. "1.234,56 TL") into a float.

    Returns 0.0 for None, empty, or whitespace-only input.
    """
    if price is None:
        return 0.0
    prc = price.strip()
    if not prc:
        return 0.0
    # Keep only the numeric part before the currency suffix ("1.234,56 TL").
    # Note: str.split always returns at least one element, so no empty check
    # is needed here (the original had an unreachable len() <= 0 branch).
    prc_text = prc.split(" ")[0].strip()
    # Turkish formatting: "." is the thousands separator, "," the decimal mark.
    prc_text = prc_text.replace(".", "").replace(",", ".")
    return float(prc_text)
|
||
def extract_crawler_prices(crw: Crawler) -> List[float]:
    """ Pulls every price cell from the crawled page and returns the
    parsed values as an ascending-sorted list of floats. """
    raw_values = crw.get_values_between('<td class="searchResultsPriceValue">', '</div>')
    return sorted(parse_price(value) for value in raw_values)
|
||
@dataclass
class SearchInput:
    """ Search input """
    url: str  # full sahibinden.com search-results URL to crawl
    post_sleep: int = 0  # seconds to sleep between page requests (0 = no delay)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
""" Tests module """ | ||
from sahibinden.search import Search | ||
from sahibinden.quick_median import QuickMedian | ||
from sahibinden.toolkit import SearchInput | ||
|
||
def regular_search():
    """ Runs an exhaustive Search and prints every price plus the median. """
    url = "https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000"
    search = Search(SearchInput(url=url))
    for current_price in search.result.prices:
        print(str(current_price))
    print(search.result.price_median)
|
||
def quick_median_single_page():
    """ Quick median over a search whose results fit on one page. """
    target_url = "https://www.sahibinden.com/arazi-suv-pickup-nissan-qashqai/benzin/manuel?a277_max=2013&a277_min=2013&a276_min=100000&a276_max=200000"
    _quick_median_with_url(target_url)
|
||
def quick_median_multi_page_no_offset():
    """ Quick median over multi-page results where the middle page is the first. """
    target_url = "https://www.sahibinden.com/bas-gitar?query_text_mf=fender+jazz+bass&query_text=fender+jazz+bass"
    _quick_median_with_url(target_url)
|
||
def quick_median_multi_page_with_offset():
    """ Quick median over multi-page results requiring a paging offset. """
    target_url = "https://www.sahibinden.com/bas-gitar?query_text_mf=bass"
    _quick_median_with_url(target_url)
|
||
def _quick_median_with_url(url: str):
    """ Builds a SearchInput for the URL and prints its quick median. """
    estimator = QuickMedian(SearchInput(url=url))
    print(str(estimator.median))