Skip to content

Commit

Permalink
A great version
Browse files Browse the repository at this point in the history
  • Loading branch information
trentenwen committed Mar 28, 2023
1 parent 3a6ba73 commit cef0ab9
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 77 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea/*
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
# spiders
# spiders

## Installation

``` shell
# Install prerequisite packages
pip install rsa qrcode pillow requests pycryptodome requests_toolbelt gmssl PyExecJS
# The main crawling/login library
pip install DecryptLogin
```
91 changes: 91 additions & 0 deletions csvoutput/Hogwarts-Legacy.csv

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions webdriver_install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# selenium 4
"""Download/cache the browser drivers via webdriver-manager and verify them.

Launches one instance of each supported browser (Chrome, Edge, Firefox) with a
freshly installed driver, then closes them.  Requires the matching browsers to
be installed locally; ``ChromeDriverManager().install()`` etc. download the
driver binary on first run and reuse the cached copy afterwards.
"""
from selenium import webdriver

from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager


# Instantiating the driver both installs the driver binary and proves that it
# can actually launch the corresponding browser.
driverChrome = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

driverEdge = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))

driverFirefox = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

# Close the verification browsers: without this, three browser processes are
# left running after the script exits (resource leak in the original).
for _driver in (driverChrome, driverEdge, driverFirefox):
    _driver.quit()
Binary file not shown.
90 changes: 14 additions & 76 deletions webtry.py → webparse/steam_review_parse.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,14 @@

import numpy as np
import os
import pandas as pd

import time
from selenium import webdriver
from scrapy.selector import Selector

games_list = {
# 'Hogwarts Legacy': 'https://steamcommunity.com/app/990080/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Forza Horizon 5': 'https://steamcommunity.com/app/1551360/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Cities: Skylines': 'https://steamcommunity.com/app/255710/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Atomic Heart': 'https://steamcommunity.com/app/668580/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'It Takes Two': 'https://steamcommunity.com/app/1426210/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Sid Meier’s Civilization VI': 'https://steamcommunity.com/app/289070/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Red Dead Redemption 2': 'https://steamcommunity.com/app/1174180/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Grand Theft Auto V': 'https://steamcommunity.com/app/271590/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Overcooked! 2': 'https://steamcommunity.com/app/728880/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Monster Hunter: World': 'https://steamcommunity.com/app/582010/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Tomb Raider': 'https://steamcommunity.com/app/203160/reviews/?browsefilter=toprated&snr=1_5_100010_',
'Cyberpunk 2077': 'https://steamcommunity.com/app/1091500/reviews/?browsefilter=toprated&snr=1_5_100010_',
'LEGO Star Wars: The Skywalker Saga': 'https://steamcommunity.com/app/920210/reviews/?browsefilter=toprated&snr=1_5_100010_'
}

for title in games_list:

driver = webdriver.Chrome()
driver.get(games_list[title])

driver.implicitly_wait(1)

single_trying_time = 5 # for normal trying
keep_trying_count = 10 # retry times for one page
keep_trying_time = 10 # retrying period

counter = 0
filename = title + '.csv'
keep_trying_count_all = keep_trying_count

# print(driver.get_window_size().get("width"), driver.get_window_size().get("height"))
def steam_helper(selector: Selector, filename):

df = pd.DataFrame(columns=['author', 'home_url', 'helpful', 'funny', 'appraise', 'played_hour',
'post_month', 'post_day', 'games', 'reply', 'image', 'review'])

while True:

# Get the height of the page
last_height = driver.execute_script("return document.body.scrollHeight")

# Scroll down to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Wait for the page to load
time.sleep(single_trying_time)

# Get the new height of the page
new_height = driver.execute_script("return document.body.scrollHeight")

# print(last_height, '->', new_height)

# Keep trying mechanism
# If the height of the page has not changed, we have reached the bottom
if new_height == last_height:
if keep_trying_count > 0:
time.sleep(keep_trying_time)
keep_trying_count -= 1
continue
print(f'Arrive at bottom: {counter}')
break
else:
if keep_trying_count != keep_trying_count_all:
keep_trying_count = keep_trying_count_all
counter += 1

# Once we've generated all the dynamic content, we can use Scrapy's Selector to extract the data we need
selector = Selector(text=driver.page_source)
# Extract data here using Scrapy selectors
app_cards = selector.xpath(
'//div[contains(@class, "apphub_Card") and '
'contains(@class, "modalContentLink") and '
Expand All @@ -83,8 +17,8 @@

print(f'\nLog: number of reviews: {len(app_cards)}\n')

card_index = 0
for card in app_cards:

# Helpful & Funny
helpfulFunny = card.xpath('.//div[@class="found_helpful"]/text()').getall()
if len(helpfulFunny) == 0:
Expand All @@ -97,9 +31,11 @@
helpful = helpfulFunny[0].strip().split(' ')[0]
funny = helpfulFunny[1].strip().split(' ')[0]

# Review
# Review appraise
appraise_text = card.xpath('.//div[@class="title"]/text()').get()
appraise = 1 if appraise_text == 'Recommended' else 0

# Played hour
played_hour = card.xpath('.//div[@class="hours"]/text()').get()
if played_hour is not None:
played_hour = played_hour.split(' ')[0]
Expand All @@ -113,7 +49,7 @@
review = card.xpath('.//div[@class="apphub_CardTextContent"]/text()').getall()
review = ''.join(review).strip()

# Own games
# Number of owned games
games = card.xpath('.//div[contains(@class, "apphub_CardContentMoreLink") and contains(@class, "ellipsis")]/text()').get()
if games is not None:
games = games.split(' ')[0]
Expand All @@ -122,6 +58,7 @@

# Reply
reply = card.xpath('.//div[contains(@class, "apphub_CardCommentCount") and contains(@class, "alignNews")]/text()').get().strip()

# Head image
image = card.xpath('.//div[contains(@class, "appHubIconHolder")]/img').xpath('@src').extract_first()
image = 1 if image != 'https://avatars.cloudflare.steamstatic.com/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb.jpg' else 0
Expand Down Expand Up @@ -156,11 +93,12 @@
'image': image,
'review': review
}, index=[0])], ignore_index=True)
card_index += 1
if card_index > 2000:
break

# Finished
print(df)
df.to_csv(filename, index=True)

driver.quit()
# Save to disk
df.to_csv(os.path.join('../csvoutput', filename), index=True)
print('Save to', os.path.join('../csvoutput', filename))


53 changes: 53 additions & 0 deletions webtools/rolling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

"""Scroll a Steam community review page until ``target_page`` dynamically
loaded page blocks are present, then hand the rendered HTML to the parser.

Steam review pages load content lazily: each scroll to the bottom appends a
``<div id="pageN">`` block of reviews.  This script scrolls until the desired
page block appears (with a bounded retry budget per page), then parses the
full page source with :func:`steam_helper` and writes the result to CSV.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from scrapy.selector import Selector

from webparse.steam_review_parse import steam_helper

driver = webdriver.Chrome()

# Crawl Target
driver.get('https://steamcommunity.com/app/990080/reviews/?browsefilter=toprated&snr=1_5_100010_')

# User Configuration
target_page = 10      # number of dynamically loaded page blocks to wait for
waiting_time = 1      # implicit wait (seconds) applied to each element lookup
tolerated_times = 30  # scroll retries allowed per page before giving up

# implicitly_wait() is a persistent session setting; set it once here instead
# of re-applying it on every loop iteration as the original did.
driver.implicitly_wait(waiting_time)

# Start Crawling
page_counter = 1
current_tolerated = tolerated_times
while True:

    try:
        driver.find_element(By.ID, 'page' + str(page_counter))
    except NoSuchElementException:
        # Page block not present yet: scroll to the bottom to trigger the
        # next lazy load (one window may contain several page blocks, so we
        # only scroll when the lookup misses).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        # Spend one retry from the per-page budget.
        if current_tolerated > 0:
            current_tolerated -= 1
            continue
        else:
            # Retry budget exhausted: either the true end of the review list
            # was reached or the network stalled.
            break
    else:
        # Found a new page block: reset the retry budget for the next page.
        if current_tolerated != tolerated_times:
            current_tolerated = tolerated_times
        page_counter += 1
        # NOTE(review): this breaks after *finding* page target_page - 1, so
        # the last requested page is never confirmed loaded — verify whether
        # the intent was `page_counter > target_page`.
        if page_counter == target_page:
            break

# Start Parsing: parse the fully rendered page and write the reviews to CSV.
selector = Selector(text=driver.page_source)

steam_helper(selector, 'Hogwarts-Legacy.csv')

driver.quit()

0 comments on commit cef0ab9

Please sign in to comment.