Fixed access errors and updated documentation

ishan-surana · Aug 6, 2024 · 85b440d · 85b440d
1 parent 43c9b28
commit 85b440d
Show file tree

Hide file tree

Showing 10 changed files with 91 additions and 36 deletions.
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -20,9 +20,9 @@ sphinx:
   configuration: docs/conf.py
 
 # Optionally build your docs in additional formats such as PDF and ePub
-# formats:
-#    - pdf
-#    - epub
+formats:
+   - pdf
+   - epub
 
 # Optional but recommended, declare the Python requirements required
 # to build your documentation

diff --git a/MetaDataScraper.egg-info/PKG-INFO b/MetaDataScraper.egg-info/PKG-INFO
@@ -1,9 +1,15 @@
 Metadata-Version: 2.1
 Name: MetaDataScraper
-Version: 1.0.2
+Version: 1.0.3
 Summary: A module designed to automate the extraction of follower counts and post details from a public Facebook page.
 Author-email: Ishan Surana <ishansurana1234@gmail.com>
+Maintainer-email: Ishan Surana <ishansurana1234@gmail.com>
 Project-URL: Homepage, https://metadatascraper.readthedocs.io/en/latest/
+Project-URL: Documentation, https://metadatascraper.readthedocs.io/en/latest/
+Project-URL: Repository, https://github.com/ishan-surana/MetaDataScraper
+Project-URL: Changelog, https://github.com/ishan-surana/MetaDataScraper/releases
+Project-URL: Issues, https://github.com/ishan-surana/MetaDataScraper/issues
+Keywords: facebook,scraper,meta,selenium,webdriver-manager,automation,web-scraping,web-crawling,web-automation,facebook-scraper,facebook-web-scraper,meta-scraper
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: Microsoft :: Windows

diff --git a/MetaDataScraper/FacebookScraper.py b/MetaDataScraper/FacebookScraper.py
@@ -58,7 +58,8 @@ class LoginlessScraper:
     -------
     To scrape a Facebook page:
 
-        ```python
+    ```python
+
         from MetaDataScraper import LoginlessScraper
         scraper = LoginlessScraper("page_id")
         data = scraper.scrape()
@@ -190,7 +191,10 @@ def __extract_post_details(self):
         _c = 1
         _error_count = 0
         while True:
-            _xpath = self._xpath_first+str(c)+self._xpath_identifier_addum+self._xpath_last
+            if _c > 100:
+                print("Reached 100 posts. Exiting extraction...\n\n")
+                break
+            _xpath = self._xpath_first+str(_c)+self._xpath_identifier_addum+self._xpath_last
             if not self.driver.find_elements(By.XPATH, _xpath):
                 _error_count += 1
                 if _error_count < 3:
@@ -368,7 +372,8 @@ class LoggedInScraper:
     -------
     To scrape a Facebook page:
 
-        ```python
+    ```python
+
         from MetaDataScraper import LoggedInScraper
         scraper = LoggedInScraper("page_id", "email", "password")
         data = scraper.scrape()
@@ -422,23 +427,22 @@ def __setup_driver(self):
 
     def __login(self):
         """Logs into Facebook using the provided credentials."""
-        logged_in = False
-        while not logged_in:
-            if self.driver.find_elements(By.ID, 'not_me_link'):
-                self.driver.find_element(By.ID, 'not_me_link').click()
-            self.driver.get('https://www.facebook.com/login')
-            self.driver.find_element(By.NAME, 'email').clear()
-            self.driver.find_element(By.NAME, 'email').send_keys(self.email)
-            self.driver.find_element(By.NAME, 'pass').clear()
-            self.driver.find_element(By.NAME, 'pass').send_keys(self.password)
-            self.driver.find_element(By.ID, 'loginbutton').click()
-            # Wait until the login process is completed
-            WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login'))
-            if self.driver.current_url != 'https://www.facebook.com/?sk=welcome':
-                print("Invalid credentials. Please try again.", end='\r')
-            else:
-                print(" "*100, end='\r')
-                logged_in = True
+        self._logged_in = False
+        if self.driver.find_elements(By.ID, 'not_me_link'):
+            self.driver.find_element(By.ID, 'not_me_link').click()
+        self.driver.get('https://www.facebook.com/login')
+        self.driver.find_element(By.NAME, 'email').clear()
+        self.driver.find_element(By.NAME, 'email').send_keys(self.email)
+        self.driver.find_element(By.NAME, 'pass').clear()
+        self.driver.find_element(By.NAME, 'pass').send_keys(self.password)
+        self.driver.find_element(By.ID, 'loginbutton').click()
+        # Wait until the login process is completed
+        WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login'))
+        if self.driver.current_url != 'https://www.facebook.com/?sk=welcome':
+            raise Exception("Invalid credentials. Please try again.")
+        else:
+            print(" "*100, end='\r')
+            self._logged_in = True
 
     def __navigate_to_page(self):
         """Navigates to the specified Facebook page."""
@@ -522,7 +526,7 @@ def __extract_post_details(self):
         _c = 1
         _error_count = 0
         while True:
-            _xpath = self._xpath_first + str(c) + self._xpath_identifier_addum + self._xpath_last
+            _xpath = self._xpath_first + str(_c) + self._xpath_identifier_addum + self._xpath_last
             if not self.driver.find_elements(By.XPATH, _xpath):
                 _error_count += 1
                 if _error_count < 3:
@@ -587,6 +591,7 @@ def __extract_post_details(self):
 
     def scrape(self):
         """Initiates the scraping process and returns a dictionary with the scraped data."""
+        self._logged_in = False
         self.__setup_driver()
         self.__login()
         self.__navigate_to_page()
@@ -595,8 +600,8 @@ def scrape(self):
         self.__scroll_to_top()
         self.__get_xpath_constructor()
         self.__extract_post_details()
-        self.driver.quit()
         print("\033[A\033[A\033[A") # DevTools line deleter
+        self.driver.quit()
         return {
             'followers': self.followers,
             'post_texts': self.post_texts,

diff --git a/dist/MetaDataScraper-1.0.3-py3-none-any.whl b/dist/MetaDataScraper-1.0.3-py3-none-any.whl
diff --git a/dist/metadatascraper-1.0.3.tar.gz b/dist/metadatascraper-1.0.3.tar.gz
diff --git a/docs/README.md b/docs/README.md
@@ -26,13 +26,25 @@ To use MetaDataScraper, follow these steps:
 
 2. Initialize the scraper with the Facebook page ID:
 
+::::{tab-set}
+
+:::{tab-item} Loginless
    ```python
    page_id = "your_target_page_id"
    scraper = LoginlessScraper(page_id)
+   ```
+:::
+
+:::{tab-item} LoggedIn
+   ```python
+   page_id = "your_target_page_id"
    email = "your_facebook_email"
    password = "your_facebook_password"
    scraper = LoggedInScraper(page_id, email, password)
    ```
+:::
+
+::::
 
 3. Scrape the Facebook page to retrieve information:
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -1,14 +1,20 @@
 # Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
 
-# -- Project information
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
 project = 'MetaDataScraper'
+copyright = '2024, Ishan Surana'
 author = 'Ishan Surana'
-
+release = '1.0.3'
 repo_url = 'https://github.com/ishan-surana/MetaDataScraper/'
-version = '1.0.0'
+version = '1.0.3'
 
-# -- General configuration
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
 extensions = [
     'sphinx.ext.duration',
@@ -17,7 +23,10 @@
     'sphinx.ext.autosummary',
     'sphinx.ext.intersphinx',
     'myst_parser',
+    'sphinx_design',
+    'sphinx_copybutton',
 ]
+myst_enable_extensions = ["colon_fence"]
 
 source_suffix = {
     '.rst': 'restructuredtext',
@@ -33,9 +42,15 @@
 
 templates_path = ['_templates']
 
-# -- Options for HTML output
+exclude_patterns = []
+
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = 'sphinx_rtd_theme'
+html_theme = 'pydata_sphinx_theme'
+html_static_path = ['_static']
 
 # -- Options for EPUB output
-epub_show_urls = 'footnote'
+epub_show_urls = 'footnote'
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,3 +1,9 @@
+:html_theme.sidebar_secondary.remove:
+.. MetaDataScraper documentation master file, created by
+   sphinx-quickstart on Sun Aug  4 20:19:27 2024.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
 Welcome to the MetaDataScraper documentation!
-===================================
+=============================================
 
@@ -13,5 +19,14 @@ Contents
 --------
 
 .. toctree::
+   :maxdepth: 2
 
    README
+
+.. seealso::
+
+   Source Repository
+    `GitHub <https://github.com/ishan-surana/MetaDataScraper>`_
+
+   Sponsorship
+    `Sponsorship <https://github.com/sponsors/ishan-surana>`_
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,3 +1,5 @@
 sphinx==7.1.2
-sphinx-rtd-theme==1.3.0rc1
-myst_parser
+myst_parser
+sphinx-design
+pydata-sphinx-theme
+sphinx-copybutton
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "MetaDataScraper"
-version = "1.0.2"
+version = "1.0.3"
 authors = [
   { name="Ishan Surana", email="ishansurana1234@gmail.com" },
 ]