Skip to content

Commit

Permalink
New method of getting comment replies, using AJAX POST
Browse files (browse the repository at this point in the history)
  • Loading branch information
neon-ninja committed Jun 28, 2022
1 parent 10ad8b4 commit 5882a5d
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
25 changes: 21 additions & 4 deletions facebook_scraper/extractors.py
Original file line number Diff line number Diff line change
Expand Up
@@ -1098,17 +1098,34 @@ def extract_comment_replies(self, replies_url):
  if not self.options.get("progress"):
  logger.debug(f"Fetching {replies_url}")
  try:
- response = self.request(replies_url)
+ fb_dtsg = self.full_post_html.find("input[name='fb_dtsg']", first=True).attrs["value"]
+ encryptedAjaxResponseToken = re.search(
+ r'encrypted":"([^"]+)', self.full_post_html.html
+ ).group(1)
+ response = self.request(
+ replies_url,
+ post=True,
+ params={"fb_dtsg": fb_dtsg, "__a": encryptedAjaxResponseToken},
+ )
  except exceptions.TemporarilyBanned:
  raise
  except Exception as e:
  logger.error(e)
  return
- # Skip first element, as it will be this comment itself
- reply_selector = 'div[data-sigil="comment"]'
+ prefix_length = len('for (;;);')
+ data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);'
+ for action in data['payload']['actions']:
+ if action["cmd"] == "replace":
+ html = utils.make_html_element(
+ action['html'],
+ url=FB_MOBILE_BASE_URL,
+ )
+ break
+
+ reply_selector = 'div[data-sigil="comment inline-reply"]'
  if self.options.get("noscript"):
  reply_selector = '#root div[id]'
- replies = response.html.find(reply_selector)[1:]
+ replies = html.find(reply_selector)
  try:
  for reply in replies:
  yield self.parse_comment(reply)
Expand Down
8 changes: 7 additions & 1 deletion facebook_scraper/facebook_scraper.py
Original file line number Diff line number Diff line change
Expand Up
@@ -818,7 +818,13 @@ def get(self, url, **kwargs):
  if not url.startswith("http"):
  url = utils.urljoin(FB_MOBILE_BASE_URL, url)

- response = self.session.get(url=url, **self.requests_kwargs, **kwargs)
+ if kwargs.get("post"):
+ kwargs.pop("post")
+ if kwargs.get("params"):
+ self.requests_kwargs["params"].update(kwargs.pop("params"))
+ response = self.session.post(url=url, **self.requests_kwargs, **kwargs)
+ else:
+ response = self.session.get(url=url, **self.requests_kwargs, **kwargs)
  DEBUG = False
  if DEBUG:
  for filename in os.listdir("."):
Expand Down

0 comments on commit 5882a5d

Please sign in to comment.