diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 31aedb56..5922663e 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -8,6 +8,7 @@ from typing import Any, Dict, Optional from urllib.parse import parse_qs, urlparse from tqdm.auto import tqdm +from collections import defaultdict from . import utils, exceptions from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL, FB_W3_BASE_URL @@ -86,6 +87,7 @@ class PostExtractor: bad_json_key_regex = re.compile(r'(?P[{,])(?P\w+):') more_url_regex = re.compile(r'(?<=…\s)Rate Translation') post_story_regex = re.compile(r'href="(\/story[^"]+)" aria') def __init__(self, element, options, request_fn, full_post_html=None): @@ -271,6 +273,8 @@ def extract_text(self) -> PartialPost: element = self.element + story_containers = element.find(".story_body_container") + has_more = self.more_url_regex.search(element.html) if has_more and self.full_post_html: element = self.full_post_html.find('.story_body_container', first=True) @@ -278,55 +282,75 @@ def extract_text(self) -> PartialPost: text = self.full_post_html.find("div.msg", first=True).text return {"text": text, "post_text": text} - nodes = element.find('p, header, span[role=presentation]') - if nodes and len(nodes) > 1: - post_text = [] - shared_text = [] - ended = False - index_non_header = next( - (i for i, node in enumerate(nodes) if node.tag != 'header'), 1 - ) - for node in nodes[index_non_header:]: - if node.tag == 'header': - ended = True - - # Remove '... More' - # This button is meant to display the hidden text that is already loaded - # Not to be confused with the 'More' that opens the article in a new page - if node.tag == 'p': - node = utils.make_html_element( - html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1) - ) + + texts = defaultdict(str) - if not ended: - post_text.append(node.text) - else: - shared_text.append(node.text) + for container_index, container in enumerate(story_containers): + + has_translation = self.has_translation_regex.search(container.html) + if has_translation: + original = container.find('div[style="display:none"]', first=True) + translation = utils.make_html_element( + html=container.html.replace(original.html, "") + ) + content_versions = [("hidden_original", original), ("translation", translation)] + else: + content_versions = [("original", container)] # Separation between paragraphs paragraph_separator = '\n\n' - text = paragraph_separator.join(itertools.chain(post_text, shared_text)) - post_text = paragraph_separator.join(post_text) - shared_text = paragraph_separator.join(shared_text) - - original_text = None - hidden_div = element.find('div[style="display:none"]', first=True) - if hidden_div: - original_text = [] - for node in hidden_div.find("p,span[role=presentation]"): - node = utils.make_html_element( - html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1) + for version, content in content_versions: + post_text = [] + shared_text = [] + nodes = content.find('p, header, span[role=presentation]') + + if version == "hidden_original": + if container_index == 0: + post_text.append(content.text) + else: + shared_text.append(content.text) + + elif nodes: + ended = False + index_non_header = next( + (i for i, node in enumerate(nodes) if node.tag != 'header'), 1 ) - original_text.append(node.text) - original_text = paragraph_separator.join(original_text) + for node in nodes[index_non_header:]: + if node.tag == 'header' or container_index > 0: + ended = True + + # Remove '... More' + # This button is meant to display the hidden text that is already loaded + # Not to be confused with the 'More' that opens the article in a new page + if node.tag == 'p': + node = utils.make_html_element( + html=node.html.replace('>… <', '><', 1).replace('>More<', '', 1) + ) + + if not ended: + post_text.append(node.text) + else: + shared_text.append(node.text) + + text = paragraph_separator.join(itertools.chain(post_text, shared_text)) + post_text = paragraph_separator.join(post_text) + shared_text = paragraph_separator.join(shared_text) + + if version in ["original", "hidden_original"]: + texts["text"] += text + texts["post_text"] += post_text + texts["shared_text"] += shared_text + if version == "translation": + texts["translated_text"] += text + texts["translated_post_text"] += post_text + texts["translated_shared_text"] += shared_text + + if texts: + if texts["translated_text"]: + texts["original_text"] = texts["text"] + return dict(texts) - return { - 'text': text, - 'post_text': post_text, - 'shared_text': shared_text, - 'original_text': original_text, - } elif element.find(".story_body_container>div", first=True): text = element.find(".story_body_container>div", first=True).text return {'text': text, 'post_text': text} @@ -334,6 +358,9 @@ def extract_text(self) -> PartialPost: text = nodes[0].text return {'text': text, 'post_text': text} + + + return None # TODO: Add the correct timezone