Skip to content

Commit

Permalink
Handle translation for shared and user text, split into separate prop…
Browse files Browse the repository at this point in the history
…erties. (#940)

* Move members-search inside try-block

* Handle translation for shared and user text, split into separate properties.
  • Loading branch information
NielsOerbaek authored Dec 5, 2022
1 parent 7f00382 commit eb42d53
Showing 1 changed file with 69 additions and 42 deletions.
111 changes: 69 additions & 42 deletions facebook_scraper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urlparse
from tqdm.auto import tqdm
from collections import defaultdict

from . import utils, exceptions
from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL, FB_W3_BASE_URL
Expand Down Expand Up @@ -86,6 +87,7 @@ class PostExtractor:
bad_json_key_regex = re.compile(r'(?P<prefix>[{,])(?P<key>\w+):')

more_url_regex = re.compile(r'(?<=…\s)<a')
has_translation_regex = re.compile(r'<span.*>Rate Translation</span>')
post_story_regex = re.compile(r'href="(\/story[^"]+)" aria')

def __init__(self, element, options, request_fn, full_post_html=None):
Expand Down Expand Up @@ -271,69 +273,94 @@ def extract_text(self) -> PartialPost:

element = self.element

story_containers = element.find(".story_body_container")

has_more = self.more_url_regex.search(element.html)
if has_more and self.full_post_html:
element = self.full_post_html.find('.story_body_container', first=True)
if not element and self.full_post_html.find("div.msg", first=True):
text = self.full_post_html.find("div.msg", first=True).text
return {"text": text, "post_text": text}

nodes = element.find('p, header, span[role=presentation]')
if nodes and len(nodes) > 1:
post_text = []
shared_text = []
ended = False
index_non_header = next(
(i for i, node in enumerate(nodes) if node.tag != 'header'), 1
)
for node in nodes[index_non_header:]:
if node.tag == 'header':
ended = True

# Remove '... More'
# This button is meant to display the hidden text that is already loaded
# Not to be confused with the 'More' that opens the article in a new page
if node.tag == 'p':
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
)

texts = defaultdict(str)

if not ended:
post_text.append(node.text)
else:
shared_text.append(node.text)
for container_index, container in enumerate(story_containers):

has_translation = self.has_translation_regex.search(container.html)
if has_translation:
original = container.find('div[style="display:none"]', first=True)
translation = utils.make_html_element(
html=container.html.replace(original.html, "")
)
content_versions = [("hidden_original", original), ("translation", translation)]
else:
content_versions = [("original", container)]

# Separation between paragraphs
paragraph_separator = '\n\n'

text = paragraph_separator.join(itertools.chain(post_text, shared_text))
post_text = paragraph_separator.join(post_text)
shared_text = paragraph_separator.join(shared_text)

original_text = None
hidden_div = element.find('div[style="display:none"]', first=True)
if hidden_div:
original_text = []
for node in hidden_div.find("p,span[role=presentation]"):
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
for version, content in content_versions:
post_text = []
shared_text = []
nodes = content.find('p, header, span[role=presentation]')

if version == "hidden_original":
if container_index == 0:
post_text.append(content.text)
else:
shared_text.append(content.text)

elif nodes:
ended = False
index_non_header = next(
(i for i, node in enumerate(nodes) if node.tag != 'header'), 1
)
original_text.append(node.text)
original_text = paragraph_separator.join(original_text)
for node in nodes[index_non_header:]:
if node.tag == 'header' or container_index > 0:
ended = True

# Remove '... More'
# This button is meant to display the hidden text that is already loaded
# Not to be confused with the 'More' that opens the article in a new page
if node.tag == 'p':
node = utils.make_html_element(
html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1)
)

if not ended:
post_text.append(node.text)
else:
shared_text.append(node.text)

text = paragraph_separator.join(itertools.chain(post_text, shared_text))
post_text = paragraph_separator.join(post_text)
shared_text = paragraph_separator.join(shared_text)

if version in ["original", "hidden_original"]:
texts["text"] += text
texts["post_text"] += post_text
texts["shared_text"] += shared_text
if version == "translation":
texts["translated_text"] += text
texts["translated_post_text"] += post_text
texts["translated_shared_text"] += shared_text

if texts:
if texts["translated_text"]:
texts["original_text"] = texts["text"]
return dict(texts)

return {
'text': text,
'post_text': post_text,
'shared_text': shared_text,
'original_text': original_text,
}
elif element.find(".story_body_container>div", first=True):
text = element.find(".story_body_container>div", first=True).text
return {'text': text, 'post_text': text}
elif len(nodes) == 1:
text = nodes[0].text
return {'text': text, 'post_text': text}




return None

# TODO: Add the correct timezone
Expand Down

0 comments on commit eb42d53

Please sign in to comment.