Merge pull request #15 from csfyrakis/master

toby petty · web-flow · commit 77e3c0fe3b9a · 2019-04-01T21:20:35.000-04:00
Added an extra column recording when the property was added to the market or last updated
diff --git a/rightmove_webscraper.py b/rightmove_webscraper.py
@@ -91,25 +91,29 @@ def get_page(self, request_content):
         xp_addresses = """//address[@class="propertyCard-address"]//span/text()"""
         xp_weblinks = """//div[@class="propertyCard-details"]\
         //a[@class="propertyCard-link"]/@href"""
+        
         xp_agent_urls = """//div[@class="propertyCard-contactsItem"]\
         //div[@class="propertyCard-branchLogo"]\
         //a[@class="propertyCard-branchLogo-link"]/@href"""
+        xp_time_on_market = """//span[@class="propertyCard-contactsAddedOrReduced"]/text()"""
+
 
         # Create data lists from xpaths:
         price_pcm = tree.xpath(xp_prices)
         titles = tree.xpath(xp_titles)
         addresses = tree.xpath(xp_addresses)
+        time_in_market = tree.xpath(xp_time_on_market)
         base = "http://www.rightmove.co.uk"
         weblinks = ["{}{}".format(base, tree.xpath(xp_weblinks)[w]) \
                     for w in range(len(tree.xpath(xp_weblinks)))]
         agent_urls = ["{}{}".format(base, tree.xpath(xp_agent_urls)[a]) \
                       for a in range(len(tree.xpath(xp_agent_urls)))]
 
         # Store the data in a Pandas DataFrame:
-        data = [price_pcm, titles, addresses, weblinks, agent_urls]
+        data = [price_pcm, titles, addresses, weblinks, agent_urls,time_in_market]
         temp_df = pd.DataFrame(data)
         temp_df = temp_df.transpose()
-        temp_df.columns = ["price", "type", "address", "url", "agent_url"]
+        temp_df.columns = ["price", "type", "address", "url", "agent_url","time_in_market"]
 
         # Drop empty rows which come from placeholders in the html:
         temp_df = temp_df[temp_df["address"].notnull()]