Merge pull request shaikhsajid1111#107 from lrudolph333/master

Adding functionality for scraping FB groups
moda20 · Mar 30, 2024 · 2fd7993 · 2fd7993
2 parents 1873638 + 9b3f37d
commit 2fd7993
Show file tree

Hide file tree

Showing 7 changed files with 404 additions and 201 deletions.
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@
 - Internet Connection
 - Python 3.7+
 - Chrome or Firefox browser installed on your machine
-<br>
+  <br>
 
 <hr>
 <h2 id="Installation">Installation:</h2>
@@ -64,31 +64,36 @@ git clone https://github.com/shaikhsajid1111/facebook_page_scraper
 ```
 python3 setup.py install
 ```
+
 <br>
 <p id="pypiInstallation">Installing with pypi</p>
 
 ```
 pip3 install facebook-page-scraper
 ```
+
 <br>
 <hr>
 <h2 id="instantiation"> How to use? </h2>
 
-
-
 ```python
 #import Facebook_scraper class from facebook_page_scraper
 from facebook_page_scraper import Facebook_scraper
 
 #instantiate the Facebook_scraper class
 
-page_name = "metaai"
+page_or_group_name = "Meta"
 posts_count = 10
 browser = "firefox"
 proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
 timeout = 600 #600 seconds
 headless = True
-meta_ai = Facebook_scraper(page_name, posts_count, browser, proxy=proxy, timeout=timeout, headless=headless)
+# read the Facebook login credentials from environment variables (requires `import os`)
+fb_password = os.getenv('fb_password')
+fb_email = os.getenv('fb_email')
+# True if the Facebook target is a group, False if it is a page
+isGroup = False
+meta_ai = Facebook_scraper(page_or_group_name, posts_count, browser, proxy=proxy, timeout=timeout, headless=headless, isGroup=isGroup)
 
 ```
 
@@ -104,13 +109,13 @@ meta_ai = Facebook_scraper(page_name, posts_count, browser, proxy=proxy, timeout
 
 <tr>
 <td>
-page_name
+page_or_group_name
 </td>
 <td>
 String
 </td>
 <td>
-Name of the facebook page
+Name of the facebook page or group
 </td>
 </tr>
 
@@ -173,7 +178,45 @@ Whether to run browser in headless mode?. Default is True
  </code>
 </td>
 </tr>
+<tr>
+
+<td>
+isGroup
+</td>
+<td>
+Boolean
+</td>
+<td>
+Whether the Facebook target is a group (<code>True</code>) or a page (<code>False</code>). Default is <code>False</code>.
+
+</td>
+</tr>
+
+<tr>
+<td>
+username
+</td>
+<td>
+String
+</td>
+<td>
+Username used to log into Facebook when scraping (recommended to load it from a <code>.env</code> file).
+
+</td>
+</tr>
 
+<tr>
+<td>
+password
+</td>
+<td>
+String
+</td>
+<td>
+Password used to log into Facebook when scraping (recommended to load it from a <code>.env</code> file).
+
+</td>
+</tr>
 
 </table>
 <br>
@@ -184,6 +227,7 @@ Whether to run browser in headless mode?. Default is True
 <br
 
 >
+
 <h3 id="JSONWay"> For post's data in <b>JSON</b> format:</h3>
 
 ```python
@@ -224,11 +268,11 @@ Output:
 
 }
 ```
+
 <div id="jsonOutput">
 Output Structure for JSON format:
 
-
-``` javascript
+```javascript
 {
     "id": {
         "name": string,
@@ -253,14 +297,15 @@ Output Structure for JSON format:
 }
 
 ```
+
 </div>
 <br>
 <hr>
 <br>
 
 <h3 id="CSVWay"> For saving post's data directly to <b>CSV</b> file</h3>
 
-``` python
+```python
 #call scrap_to_csv(filename,directory) method
 
 
@@ -270,7 +315,8 @@ meta_ai.scrap_to_csv(filename, directory)
 
 ```
 
-content of ```data_file.csv```:
+content of `data_file.csv`:
+
 ```csv
 id,name,shares,likes,loves,wow,cares,sad,angry,haha,reactions_count,comments,content,posted_on,video,image,post_url
 2024182624425347,Meta AI,0,154,19,0,0,0,0,0,173,2,"We’ve built data2vec, the first general high-performance self-supervised algorithm for speech, vision, and text. We applied it to different modalities and found it matches or outperforms the best self-supervised algorithms. We hope this brings us closer to a world where computers can learn to solve many different tasks without supervision. Learn more and get the code:  https://ai.facebook.com/…/the-first-high-performance-self-s…",2022-01-20T22:43:35,,https://scontent-bom1-2.xx.fbcdn.net/v/t39.30808-6/s480x480/272147088_2024182621092014_6532581039236849529_n.jpg?_nc_cat=100&ccb=1-5&_nc_sid=8024bb&_nc_ohc=j4_1PAndJTIAX82OLNq&_nc_ht=scontent-bom1-2.xx&oh=00_AT9us__TvC9eYBqRyQEwEtYSit9r2UKYg0gFoRK7Efrhyw&oe=61F17B71,https://www.facebook.com/MetaAI/photos/a.360372474139712/2024182624425347/?type=3&__xts__%5B0%5D=68.ARAse4eiZmZQDOZumNZEDR0tQkE5B6g50K6S66JJPccb-KaWJWg6Yz4v19BQFSZRMd04MeBmV24VqvqMB3oyjAwMDJUtpmgkMiITtSP8HOgy8QEx_vFlq1j-UEImZkzeEgSAJYINndnR5aSQn0GUwL54L3x2BsxEqL1lElL7SnHfTVvIFUDyNfAqUWIsXrkI8X5KjoDchUj7aHRga1HB5EE0x60dZcHogUMb1sJDRmKCcx8xisRgk5XzdZKCQDDdEkUqN-Ch9_NYTMtxlchz1KfR0w9wRt8y9l7E7BNhfLrmm4qyxo-ZpA&__tn__=-R
@@ -327,8 +373,6 @@ Directory where CSV file have to be stored.
 <hr>
 <br>
 
-
-
 <h3 id="outputKeys">Keys of the outputs:</h3>
 <table>
 <th>
@@ -338,8 +382,6 @@ Directory where CSV file have to be stored.
 Key
 </td>
 
-
-
 <td>
 Type
 </td>
@@ -351,7 +393,6 @@ Description
 <tr>
 </th>
 
-
 <td>
 <tr>
 
@@ -416,7 +457,6 @@ Total reaction count of post
 </td>
 </tr>
 
-
 <tr>
 <td>
 comments
@@ -453,10 +493,9 @@ URLs of video present in that post
 </td>
 </tr>
 
-
 <tr>
 <td>
-image
+images
 </td>
 <td>
  List
@@ -490,7 +529,6 @@ URL for that post
 </td>
 </tr>
 
-
 </table>
 <br>
 

diff --git a/facebook_page_scraper/driver_utilities.py b/facebook_page_scraper/driver_utilities.py
@@ -1,14 +1,16 @@
 #!/usr/bin/env python3
 
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-from selenium.common.exceptions import NoSuchElementException, WebDriverException
-from random import randint
-from selenium.webdriver.common.keys import Keys
 import logging
 import sys
 import time
+from random import randint
+
+from selenium.common.exceptions import (NoSuchElementException,
+                                        WebDriverException)
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
 
 logger = logging.getLogger(__name__)
 format = logging.Formatter(
@@ -118,7 +120,7 @@ def __close_popup(driver):
             logger.exception("Error at close_popup method : {}".format(ex))
 
     @staticmethod
-    def __wait_for_element_to_appear(driver, layout):
+    def __wait_for_element_to_appear(driver, layout, timeout):
         """expects driver's instance, wait for posts to show.
         post's CSS class name is userContentWrapper
         """
@@ -128,22 +130,28 @@ def __wait_for_element_to_appear(driver, layout):
                 body = driver.find_element(By.CSS_SELECTOR, "body")
                 for _ in range(randint(3, 5)):
                     body.send_keys(Keys.PAGE_DOWN)
-                WebDriverWait(driver, 30).until(EC.presence_of_element_located(
+                WebDriverWait(driver, timeout).until(EC.presence_of_element_located(
                     (By.CSS_SELECTOR, '.userContentWrapper')))
+                return True
             elif layout == "new":
-                WebDriverWait(driver, 30).until(
+                WebDriverWait(driver, timeout).until(
                     EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]")))
+                print("new layout loaded")
+
+                return True
 
         except WebDriverException:
             # if it was not found,it means either page is not loading or it does not exists
             logger.critical("No posts were found!")
-            Utilities.__close_driver(driver)
-            # exit the program, because if posts does not exists,we cannot go further
-            sys.exit(1)
+            return False
+            # (optional) exit the program, because if posts does not exists,we cannot go further
+            # Utilities.__close_driver(driver)
+            # sys.exit(1)
         except Exception as ex:
             logger.exception(
                 "Error at wait_for_element_to_appear method : {}".format(ex))
-            Utilities.__close_driver(driver)
+            return False
+            # Utilities.__close_driver(driver)
 
     @staticmethod
     def __click_see_more(driver, content, selector=None):