Skip to content

Commit

Permalink
main function
Browse files Browse the repository at this point in the history
  • Loading branch information
let-robots-reign committed Aug 22, 2018
1 parent d2e662b commit ec80d6a
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 85 deletions.
27 changes: 14 additions & 13 deletions avito_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
import os
import sys

# Records at which parsing should stop. Every line of the breakpoints file is
# split on "--" into a tuple; the unpacking below needs exactly four lines and
# raises ValueError for any other line count.
with open("breakpoints/avito.txt", "r", encoding="utf8") as file:
    (
        break_apartment,
        break_cottage,
        break_land,
        break_commercial,
    ) = [tuple(row.strip().split("--")) for row in file]
print(break_apartment, break_cottage, break_land, break_commercial)
# chromedriver.exe is looked up in the current working directory
# (Windows-style path separator).
chrome_driver = os.getcwd() + "\\chromedriver.exe"


def get_html(url):
req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
Expand Down Expand Up @@ -449,6 +456,13 @@ def parse(category_url, base_url, category_name):


def main():
# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')
#

url_apartments = "https://www.avito.ru/saratovskaya_oblast/kvartiry?p=1&s=104&s_trg=3&bt=1"
base_url = "https://www.avito.ru/saratovskaya_oblast/kvartiry?"
parse(url_apartments, base_url, "apartments")
Expand All @@ -467,17 +481,4 @@ def main():


if __name__ == "__main__":
# на каких записях останавливаться
with open("breakpoints/avito.txt", "r", encoding="utf8") as file:
break_apartment, break_cottage, break_land, break_commercial = [tuple(x.strip().split("--"))
for x in file.readlines()]

# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')
#
chrome_driver = os.getcwd() + "\\chromedriver.exe"

main()
5 changes: 1 addition & 4 deletions breakpoints/avito.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
2-к квартира, 54 м², 3/9 эт. в Балаково,1000
Дом 80 м² на участке 5 сот. в Саратове,2500000
Участок 7 сот. (ИЖС) в Энгельсе,150000
Помещение свободного назначения, 31 м² в Саратове,2000000
3-к квартира, 130 м², 4/10 эт. в Саратове--6290000
3 changes: 2 additions & 1 deletion cian_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from selenium.webdriver.chrome.options import Options
import os

chrome_driver = os.getcwd() + "\\chromedriver.exe"


def get_html(url):
req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
Expand Down Expand Up @@ -419,6 +421,5 @@ def main():
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')
#
chrome_driver = os.getcwd() + "\\chromedriver.exe"

main()
55 changes: 28 additions & 27 deletions irr_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,34 @@
import base64


# Records at which parsing should stop. Every line of the breakpoints file is
# split on "--" into a tuple; the unpacking below needs exactly six lines and
# raises ValueError for any other line count.
with open("breakpoints/irr.txt", "r", encoding="utf8") as file:
    (
        break_apartment_sell,
        break_apartment_rent,
        break_commercials_sell,
        break_commercial_rent,
        break_cottage_sell,
        break_cottage_rent,
    ) = [tuple(row.strip().split("--")) for row in file]
# Build the date cut-off string in the form "<day> <month name in Russian>".
# NOTE: the list is called `yesterday`, but the offset actually applied is
# two days before today.
today = datetime.datetime.today()
_cutoff = today - datetime.timedelta(days=2)
# [year, month, day] as strings; month and day carry no leading zero.
yesterday = [str(_cutoff.year), str(_cutoff.month), str(_cutoff.day)]
# Month number (as a string without leading zero) -> Russian genitive name.
months = {
    "1": "января",
    "2": "февраля",
    "3": "марта",
    "4": "апреля",
    "5": "мая",
    "6": "июня",
    "7": "июля",
    "8": "августа",
    "9": "сентября",
    "10": "октября",
    "11": "ноября",
    "12": "декабря",
}
date_break_point = " ".join((yesterday[2], months[yesterday[1]]))


def get_html(url):
req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
return req.text.encode(req.encoding)
Expand Down Expand Up @@ -388,31 +416,4 @@ def main():


if __name__ == "__main__":
# на каких записях останавливаться
with open("breakpoints/irr.txt", "r", encoding="utf8") as file:
break_apartment_sell, break_apartment_rent, break_commercials_sell, break_commercial_rent, break_cottage_sell, break_cottage_rent = [tuple(x.strip().split("--")) for x in file.readlines()]

# получаем вчерашнюю дату
today = datetime.datetime.today()
yesterday = str(today - datetime.timedelta(days=2)).split()[0].split("-")
if yesterday[1][0] == "0":
yesterday[1] = yesterday[1][1:]
if yesterday[2][0] == "0":
yesterday[2] = yesterday[2][1:]
months = {
"1": "января",
"2": "февраля",
"3": "марта",
"4": "апреля",
"5": "мая",
"6": "июня",
"7": "июля",
"8": "августа",
"9": "сентября",
"10": "октября",
"11": "ноября",
"12": "декабря"
}
date_break_point = yesterday[2] + " " + months[yesterday[1]]

main()
24 changes: 12 additions & 12 deletions kvadrat64_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
from selenium.webdriver.chrome.options import Options
import os

# chromedriver.exe is looked up in the current working directory
# (Windows-style path separator).
chrome_driver = os.getcwd() + "\\chromedriver.exe"
# Records at which parsing should stop. Every line of the breakpoints file is
# split on "--" into a tuple; the unpacking below needs exactly nine lines and
# raises ValueError for any other line count.
with open("breakpoints/kvadrat.txt", "r", encoding="utf8") as file:
    (
        break_apartment_sell,
        break_apartment_rent,
        break_cottages_sell,
        break_cottages_rent,
        break_commercials_sell,
        break_commercials_rent,
        break_dachas_sell,
        break_saratov_lands_sell,
        break_region_lands_sell,
    ) = [tuple(row.strip().split("--")) for row in file]


def transform_date(date_str):
"""
Expand Down Expand Up @@ -511,6 +517,12 @@ def parse(category_url, category_name, sell_type):


def main():
# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')

url_apartments_sell = "http://kvadrat64.ru/sellflatbank-50-1.html"
parse(url_apartments_sell, "apartments", "Продажа")

Expand Down Expand Up @@ -540,16 +552,4 @@ def main():


if __name__ == "__main__":
# на каких записях останавливаться
with open("breakpoints/kvadrat.txt", "r", encoding="utf8") as file:
break_apartment_sell, break_apartment_rent, break_cottages_sell, break_cottages_rent, break_commercials_sell, break_commercials_rent, break_dachas_sell, break_saratov_lands_sell, break_region_lands_sell = [tuple(x.strip().split("--")) for x in file.readlines()]

# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')

chrome_driver = os.getcwd() + "\\chromedriver.exe"

main()
1 change: 0 additions & 1 deletion logs.txt

This file was deleted.

42 changes: 42 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import avito_parsing
import irr_parsing
import kvadrat64_parsing
import ya_realty_parsing
import cian_parsing
import youla_parsing
import threading
import os

# Remove the output files left over from a previous run, if they exist.
for stale_path in ("total_data.txt", "logs.txt"):
    if os.path.isfile(stale_path):
        os.remove(stale_path)

# Thread targets must be the callables themselves. Calling `main()` here
# would run every parser sequentially on the main thread and hand its return
# value to Thread, so the threads would have nothing to execute.
avito = avito_parsing.main
irr = irr_parsing.main
kvadrat = kvadrat64_parsing.main
ya = ya_realty_parsing.main
cian = cian_parsing.main
youla = youla_parsing.main

t1 = threading.Thread(target=kvadrat)
t2 = threading.Thread(target=irr)
t3 = threading.Thread(target=ya)
t4 = threading.Thread(target=youla)
t5 = threading.Thread(target=cian)
t6 = threading.Thread(target=avito)

workers = (t1, t2, t3, t4, t5, t6)

# Start every parser first, then wait for all of them to finish.
for worker in workers:
    worker.start()

for worker in workers:
    worker.join()
17 changes: 6 additions & 11 deletions total_data.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
Саратов, Производственная улица, 13--4--84 м²--https://realty.yandex.ru/offer/7550619761618484226/
Энгельс, проспект Строителей--1--38 м²--https://realty.yandex.ru/offer/5357523029750260481/
Саратов, 1-я Прокатная улица, 3--2--54 м²--https://realty.yandex.ru/offer/7782013542388112897/
поселок Прибрежный, квартал Базальт--Дом, 80 м², участок, 8 соток--https://realty.yandex.ru/offer/1381268641866934529/
Саратовская область, Саратов, р-н Кировский, ул. Большая Горная, 221--2--69 м²--https://saratov.cian.ru/sale/flat/191909312/
Саратовская область, Саратов, р-н Октябрьский, Вольская ул., 32/34--1--35 м²--https://saratov.cian.ru/rent/flat/161041732/
Саратовская область, Саратов, р-н Кировский, Техническая ул., 20--3--64,6 м²--https://saratov.cian.ru/sale/flat/190747310/
Саратовская область, Саратов, р-н Ленинский, ул. Имени С.Ф. Тархова, 38--1--39 м²--https://saratov.cian.ru/sale/flat/192142598/
Саратовская область, Саратов, р-н Волжский, ул. Им. Исаева Н.В., 6--3--79,6 м²--https://saratov.cian.ru/sale/flat/185278383/
Саратовская область, Саратов, р-н Волжский, проезд 1-й Скоморохова, 17--1--31 м²--https://saratov.cian.ru/sale/flat/189935883/
Саратов, большая садовая улица, 139/150--2 комнаты--56 м²--https://youla.ru/saratov/nedvijimost/prodaja-kvartiri/kvartira-2-komnaty-56-m2-5b7d45ca074b3e518e5ec911
НАБЕРЕЖНАЯ,Саратов, район Волжский, Валовая улица, 2/10,ВОЛЖСКИЙ Р-Н--3--130--https://avito.ru/saratov/kvartiry/3-k_kvartira_130_m_410_et._1540634631
Саратов, район Ленинский, Ленинский район--2--53--https://avito.ru/saratov/kvartiry/2-k_kvartira_53_m_49_et._869302710
Саратовская область, Балаково, улица Ленина, 76--студии--35--https://avito.ru/balakovo/kvartiry/studiya_35_m_25_et._1341066353
ул.Тархова д 18--1--41--https://avito.ru/saratov/kvartiry/1-k_kvartira_41_m_610_et._607945621
Балашов, Саратовская область, улица Ленина, 8--1--32--https://avito.ru/balashov/kvartiry/1-k_kvartira_32_m_44_et._1357663250
Саратовская область, Балашов, проспект Космонавтов--1--22--https://avito.ru/balashov/kvartiry/1-k_kvartira_22_m_55_et._1129188130
29 changes: 15 additions & 14 deletions ya_realty_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@
from selenium.webdriver.chrome.options import Options
import os

# chromedriver.exe is looked up in the current working directory
# (Windows-style path separator).
chrome_driver = os.getcwd() + "\\chromedriver.exe"
# Records at which parsing should stop. Every line of the breakpoints file is
# split on "--" into a tuple; the unpacking below needs exactly six lines and
# raises ValueError for any other line count.
with open("breakpoints/ya.txt", "r", encoding="utf8") as file:
    (
        break_apartment_sell,
        break_apartment_rent,
        break_cottage_sell,
        break_cottage_rent,
        break_commercials_sell,
        break_commercial_rent,
    ) = [tuple(row.strip().split("--")) for row in file]


def transform_date(date):
"""
Expand Down Expand Up @@ -345,7 +352,7 @@ def crawl_page(first_offer, html, category, sell_type):
if first_offer:
# сохраняем самую первую запись как точку выхода
modifier = "w" if (category == "apartments" and sell_type == "Продажа") else "a"
with open("breakpoints/ya.txt", modifier, encoding="utf8") as file:
with open("breakpoints/new_ya.txt", modifier, encoding="utf8") as file:
file.write("%s--%s\n" % (data[0], data[1]))
first_offer = False

Expand Down Expand Up @@ -385,6 +392,13 @@ def parse(category_url, category_name, sell_type):


def main():
# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')
#

url_apartments_sell = "https://realty.yandex.ru/saratovskaya_oblast/kupit/kvartira/?sort=DATE_DESC&page=0"
parse(url_apartments_sell, "apartments", "Продажа")

Expand All @@ -405,17 +419,4 @@ def main():


if __name__ == "__main__":
# на каких записях останавливаться
with open("breakpoints/ya.txt", "r", encoding="utf8") as file:
break_apartment_sell, break_apartment_rent, break_cottage_sell, break_cottage_rent, break_commercials_sell, break_commercial_rent = [tuple(x.strip().split("--"))
for x in file.readlines()]

# defining chrome options for selenium
# options = Options()
# options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# options.add_argument('--disable-gpu')
# options.add_argument('--headless')
#
chrome_driver = os.getcwd() + "\\chromedriver.exe"

main()
4 changes: 2 additions & 2 deletions youla_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from selenium.webdriver.chrome.options import Options
import os

chrome_driver = os.getcwd() + "\\chromedriver.exe"


def get_html(url):
req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
Expand Down Expand Up @@ -309,6 +311,4 @@ def main():
#options.add_argument('--disable-gpu')
#options.add_argument('--headless')

chrome_driver = os.getcwd() + "\\chromedriver.exe"

main()

0 comments on commit ec80d6a

Please sign in to comment.