Skip to content

Commit

Permalink
Fixed access errors and updated documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
ishan-surana committed Aug 6, 2024
1 parent 43c9b28 commit 85b440d
Show file tree
Hide file tree
Showing 10 changed files with 91 additions and 36 deletions.
6 changes: 3 additions & 3 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ sphinx:
configuration: docs/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
formats:
- pdf
- epub

# Optional but recommended: declare the Python packages needed
# to build your documentation
Expand Down
8 changes: 7 additions & 1 deletion MetaDataScraper.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
Metadata-Version: 2.1
Name: MetaDataScraper
Version: 1.0.2
Version: 1.0.3
Summary: A module designed to automate the extraction of follower counts and post details from a public Facebook page.
Author-email: Ishan Surana <ishansurana1234@gmail.com>
Maintainer-email: Ishan Surana <ishansurana1234@gmail.com>
Project-URL: Homepage, https://metadatascraper.readthedocs.io/en/latest/
Project-URL: Documentation, https://metadatascraper.readthedocs.io/en/latest/
Project-URL: Repository, https://github.com/ishan-surana/MetaDataScraper
Project-URL: Changelog, https://github.com/ishan-surana/MetaDataScraper/releases
Project-URL: Issues, https://github.com/ishan-surana/MetaDataScraper/issues
Keywords: facebook,scraper,meta,selenium,webdriver-manager,automation,web-scraping,web-crawling,web-automation,facebook-scraper,facebook-web-scraper,meta-scraper
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: Microsoft :: Windows
Expand Down
49 changes: 27 additions & 22 deletions MetaDataScraper/FacebookScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ class LoginlessScraper:
-------
To scrape a Facebook page:
```python
```python
from MetaDataScraper import LoginlessScraper
scraper = LoginlessScraper("page_id")
data = scraper.scrape()
Expand Down Expand Up @@ -190,7 +191,10 @@ def __extract_post_details(self):
_c = 1
_error_count = 0
while True:
_xpath = self._xpath_first+str(c)+self._xpath_identifier_addum+self._xpath_last
if _c > 100:
print("Reached 100 posts. Exiting extraction...\n\n")
break
_xpath = self._xpath_first+str(_c)+self._xpath_identifier_addum+self._xpath_last
if not self.driver.find_elements(By.XPATH, _xpath):
_error_count += 1
if _error_count < 3:
Expand Down Expand Up @@ -368,7 +372,8 @@ class LoggedInScraper:
-------
To scrape a Facebook page:
```python
```python
from MetaDataScraper import LoggedInScraper
scraper = LoggedInScraper("page_id", "email", "password")
data = scraper.scrape()
Expand Down Expand Up @@ -422,23 +427,22 @@ def __setup_driver(self):

def __login(self):
"""Logs into Facebook using the provided credentials."""
logged_in = False
while not logged_in:
if self.driver.find_elements(By.ID, 'not_me_link'):
self.driver.find_element(By.ID, 'not_me_link').click()
self.driver.get('https://www.facebook.com/login')
self.driver.find_element(By.NAME, 'email').clear()
self.driver.find_element(By.NAME, 'email').send_keys(self.email)
self.driver.find_element(By.NAME, 'pass').clear()
self.driver.find_element(By.NAME, 'pass').send_keys(self.password)
self.driver.find_element(By.ID, 'loginbutton').click()
# Wait until the login process is completed
WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login'))
if self.driver.current_url != 'https://www.facebook.com/?sk=welcome':
print("Invalid credentials. Please try again.", end='\r')
else:
print(" "*100, end='\r')
logged_in = True
self._logged_in = False
if self.driver.find_elements(By.ID, 'not_me_link'):
self.driver.find_element(By.ID, 'not_me_link').click()
self.driver.get('https://www.facebook.com/login')
self.driver.find_element(By.NAME, 'email').clear()
self.driver.find_element(By.NAME, 'email').send_keys(self.email)
self.driver.find_element(By.NAME, 'pass').clear()
self.driver.find_element(By.NAME, 'pass').send_keys(self.password)
self.driver.find_element(By.ID, 'loginbutton').click()
# Wait until the login process is completed
WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login'))
if self.driver.current_url != 'https://www.facebook.com/?sk=welcome':
raise Exception("Invalid credentials. Please try again.")
else:
print(" "*100, end='\r')
self._logged_in = True

def __navigate_to_page(self):
"""Navigates to the specified Facebook page."""
Expand Down Expand Up @@ -522,7 +526,7 @@ def __extract_post_details(self):
_c = 1
_error_count = 0
while True:
_xpath = self._xpath_first + str(c) + self._xpath_identifier_addum + self._xpath_last
_xpath = self._xpath_first + str(_c) + self._xpath_identifier_addum + self._xpath_last
if not self.driver.find_elements(By.XPATH, _xpath):
_error_count += 1
if _error_count < 3:
Expand Down Expand Up @@ -587,6 +591,7 @@ def __extract_post_details(self):

def scrape(self):
"""Initiates the scraping process and returns a dictionary with the scraped data."""
self._logged_in = False
self.__setup_driver()
self.__login()
self.__navigate_to_page()
Expand All @@ -595,8 +600,8 @@ def scrape(self):
self.__scroll_to_top()
self.__get_xpath_constructor()
self.__extract_post_details()
self.driver.quit()
print("\033[A\033[A\033[A") # DevTools line deleter
self.driver.quit()
return {
'followers': self.followers,
'post_texts': self.post_texts,
Expand Down
Binary file added dist/MetaDataScraper-1.0.3-py3-none-any.whl
Binary file not shown.
Binary file added dist/metadatascraper-1.0.3.tar.gz
Binary file not shown.
12 changes: 12 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,25 @@ To use MetaDataScraper, follow these steps:

2. Initialize the scraper with the Facebook page ID (the logged-in scraper also needs your Facebook email and password):

::::{tab-set}

:::{tab-item} Loginless
```python
page_id = "your_target_page_id"
scraper = LoginlessScraper(page_id)
```
:::

:::{tab-item} LoggedIn
```python
page_id = "your_target_page_id"
email = "your_facebook_email"
password = "your_facebook_password"
scraper = LoggedInScraper(page_id, email, password)
```
:::

::::

3. Scrape the Facebook page to retrieve information:

Expand Down
29 changes: 22 additions & 7 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'MetaDataScraper'
copyright = '2024, Ishan Surana'
author = 'Ishan Surana'

release = '1.0.3'
repo_url = 'https://github.com/ishan-surana/MetaDataScraper/'
version = '1.0.0'
version = '1.0.3'

# -- General configuration
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
'sphinx.ext.duration',
Expand All @@ -17,7 +23,10 @@
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
'myst_parser',
'sphinx_design',
'sphinx_copybutton',
]
myst_enable_extensions = ["colon_fence"]

source_suffix = {
'.rst': 'restructuredtext',
Expand All @@ -33,9 +42,15 @@

templates_path = ['_templates']

# -- Options for HTML output
exclude_patterns = []



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'sphinx_rtd_theme'
html_theme = 'pydata_sphinx_theme'
html_static_path = ['_static']

# -- Options for EPUB output
epub_show_urls = 'footnote'
epub_show_urls = 'footnote'
15 changes: 15 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
:html_theme.sidebar_secondary.remove:
.. MetaDataScraper documentation master file, created by
sphinx-quickstart on Sun Aug 4 20:19:27 2024.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to the MetaDataScraper documentation!
=============================================

Expand All @@ -13,5 +19,14 @@ Contents
--------

.. toctree::
:maxdepth: 2

README

.. seealso::

Source Repository
`GitHub <https://github.com/ishan-surana/MetaDataScraper>`_

Sponsorship
`Sponsorship <https://github.com/sponsors/ishan-surana>`_
6 changes: 4 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
sphinx==7.1.2
sphinx-rtd-theme==1.3.0rc1
myst_parser
myst_parser
sphinx-design
pydata-sphinx-theme
sphinx-copybutton
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "MetaDataScraper"
version = "1.0.2"
version = "1.0.3"
authors = [
{ name="Ishan Surana", email="ishansurana1234@gmail.com" },
]
Expand Down

0 comments on commit 85b440d

Please sign in to comment.