Skip to content

Commit

Permalink
A great version
Browse files Browse the repository at this point in the history
  • Loading branch information
trentenwen committed Mar 28, 2023
1 parent 3a6ba73 commit cef0ab9
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 77 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea/*
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
# spiders
# spiders

## Installation

``` shell
# Install prerequisite packages
pip install rsa qrcode pillow requests pycryptodome requests_toolbelt gmssl PyExecJS
# The main crawling/login library
pip install DecryptLogin
```
91 changes: 91 additions & 0 deletions csvoutput/Hogwarts-Legacy.csv

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions webdriver_install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# selenium 4
"""Download/cache the browser drivers via webdriver-manager and verify them.

Launches one instance of each supported browser (Chrome, Edge, Firefox) with a
freshly installed driver, then closes them.  Requires the matching browsers to
be installed locally; ``ChromeDriverManager().install()`` etc. download the
driver binary on first run and reuse the cached copy afterwards.
"""
from selenium import webdriver

from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager


# Instantiating the driver both installs the driver binary and proves that it
# can actually launch the corresponding browser.
driverChrome = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

driverEdge = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))

driverFirefox = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

# Close the verification browsers: without this, three browser processes are
# left running after the script exits (resource leak in the original).
for _driver in (driverChrome, driverEdge, driverFirefox):
    _driver.quit()
Binary file not shown.
90 changes: 14 additions & 76 deletions webtry.py → webparse/steam_review_parse.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,14 @@

import numpy as np
import os
import pandas as pd

import time
from selenium import webdriver
from scrapy.selector import Selector

games_list = {
# 'Hogwarts Legacy': 'https://steamcommunity.com/app/990080/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Forza Horizon 5': 'https://steamcommunity.com/app/1551360/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Cities: Skylines': 'https://steamcommunity.com/app/255710/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Atomic Heart': 'https://steamcommunity.com/app/668580/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'It Takes Two': 'https://steamcommunity.com/app/1426210/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Sid Meier’s Civilization VI': 'https://steamcommunity.com/app/289070/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Red Dead Redemption 2': 'https://steamcommunity.com/app/1174180/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Grand Theft Auto V': 'https://steamcommunity.com/app/271590/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Overcooked! 2': 'https://steamcommunity.com/app/728880/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Monster Hunter: World': 'https://steamcommunity.com/app/582010/reviews/?browsefilter=toprated&snr=1_5_100010_',
# 'Tomb Raider': 'https://steamcommunity.com/app/203160/reviews/?browsefilter=toprated&snr=1_5_100010_',
'Cyberpunk 2077': 'https://steamcommunity.com/app/1091500/reviews/?browsefilter=toprated&snr=1_5_100010_',
'LEGO Star Wars: The Skywalker Saga': 'https://steamcommunity.com/app/920210/reviews/?browsefilter=toprated&snr=1_5_100010_'
}

for title in games_list:

driver = webdriver.Chrome()
driver.get(games_list[title])

driver.implicitly_wait(1)

single_trying_time = 5 # for normal trying
keep_trying_count = 10 # retry times for one page
keep_trying_time = 10 # retrying period

counter = 0
filename = title + '.csv'
keep_trying_count_all = keep_trying_count

# print(driver.get_window_size().get("width"), driver.get_window_size().get("height"))
def steam_helper(selector: Selector, filename):

df = pd.DataFrame(columns=['author', 'home_url', 'helpful', 'funny', 'appraise', 'played_hour',
'post_month', 'post_day', 'games', 'reply', 'image', 'review'])

while True:

# Get the height of the page
last_height = driver.execute_script("return document.body.scrollHeight")

# Scroll down to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Wait for the page to load
time.sleep(single_trying_time)

# Get the new height of the page
new_height = driver.execute_script("return document.body.scrollHeight")

# print(last_height, '->', new_height)

# Keep trying mechanism
# If the height of the page has not changed, we have reached the bottom
if new_height == last_height:
if keep_trying_count > 0:
time.sleep(keep_trying_time)
keep_trying_count -= 1
continue
print(f'Arrive at bottom: {counter}')
break
else:
if keep_trying_count != keep_trying_count_all:
keep_trying_count = keep_trying_count_all
counter += 1

# Once we've generated all the dynamic content, we can use Scrapy's Selector to extract the data we need
selector = Selector(text=driver.page_source)
# Extract data here using Scrapy selectors
app_cards = selector.xpath(
'//div[contains(@class, "apphub_Card") and '
'contains(@class, "modalContentLink") and '
Expand All @@ -83,8 +17,8 @@

print(f'\nLog: number of reviews: {len(app_cards)}\n')

card_index = 0
for card in app_cards:

# Helpful & Funny
helpfulFunny = card.xpath('.//div[@class="found_helpful"]/text()').getall()
if len(helpfulFunny) == 0:
Expand All @@ -97,9 +31,11 @@
helpful = helpfulFunny[0].strip().split(' ')[0]
funny = helpfulFunny[1].strip().split(' ')[0]

# Review
# Review appraise
appraise_text = card.xpath('.//div[@class="title"]/text()').get()
appraise = 1 if appraise_text == 'Recommended' else 0

# Played hour
played_hour = card.xpath('.//div[@class="hours"]/text()').get()
if played_hour is not None:
played_hour = played_hour.split(' ')[0]
Expand All @@ -113,7 +49,7 @@
review = card.xpath('.//div[@class="apphub_CardTextContent"]/text()').getall()
review = ''.join(review).strip()

# Own games
# Number of owned games
games = card.xpath('.//div[contains(@class, "apphub_CardContentMoreLink") and contains(@class, "ellipsis")]/text()').get()
if games is not None:
games = games.split(' ')[0]
Expand All @@ -122,6 +58,7 @@

# Reply
reply = card.xpath('.//div[contains(@class, "apphub_CardCommentCount") and contains(@class, "alignNews")]/text()').get().strip()

# Head image
image = card.xpath('.//div[contains(@class, "appHubIconHolder")]/img').xpath('@src').extract_first()
image = 1 if image != 'https://avatars.cloudflare.steamstatic.com/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb.jpg' else 0
Expand Down Expand Up @@ -156,11 +93,12 @@
'image': image,
'review': review
}, index=[0])], ignore_index=True)
card_index += 1
if card_index > 2000:
break

# Finished
print(df)
df.to_csv(filename, index=True)

driver.quit()
# Save to disk
df.to_csv(os.path.join('../csvoutput', filename), index=True)
print('Save to', os.path.join('../csvoutput', filename))


53 changes: 53 additions & 0 deletions webtools/rolling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

"""Scroll a Steam community review page until ``target_page`` dynamically
loaded page blocks are present, then hand the rendered HTML to the parser.

Steam review pages load content lazily: each scroll to the bottom appends a
``<div id="pageN">`` block of reviews.  This script scrolls until the desired
page block appears (with a bounded retry budget per page), then parses the
full page source with :func:`steam_helper` and writes the result to CSV.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from scrapy.selector import Selector

from webparse.steam_review_parse import steam_helper

driver = webdriver.Chrome()

# Crawl Target
driver.get('https://steamcommunity.com/app/990080/reviews/?browsefilter=toprated&snr=1_5_100010_')

# User Configuration
target_page = 10      # number of dynamically loaded page blocks to wait for
waiting_time = 1      # implicit wait (seconds) applied to each element lookup
tolerated_times = 30  # scroll retries allowed per page before giving up

# implicitly_wait() is a persistent session setting; set it once here instead
# of re-applying it on every loop iteration as the original did.
driver.implicitly_wait(waiting_time)

# Start Crawling
page_counter = 1
current_tolerated = tolerated_times
while True:

    try:
        driver.find_element(By.ID, 'page' + str(page_counter))
    except NoSuchElementException:
        # Page block not present yet: scroll to the bottom to trigger the
        # next lazy load (one window may contain several page blocks, so we
        # only scroll when the lookup misses).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        # Spend one retry from the per-page budget.
        if current_tolerated > 0:
            current_tolerated -= 1
            continue
        else:
            # Retry budget exhausted: either the true end of the review list
            # was reached or the network stalled.
            break
    else:
        # Found a new page block: reset the retry budget for the next page.
        if current_tolerated != tolerated_times:
            current_tolerated = tolerated_times
        page_counter += 1
        # NOTE(review): this breaks after *finding* page target_page - 1, so
        # the last requested page is never confirmed loaded — verify whether
        # the intent was `page_counter > target_page`.
        if page_counter == target_page:
            break

# Start Parsing: parse the fully rendered page and write the reviews to CSV.
selector = Selector(text=driver.page_source)

steam_helper(selector, 'Hogwarts-Legacy.csv')

driver.quit()

0 comments on commit cef0ab9

Please sign in to comment.