Add: Scrapper and test file
hossam el shabory committed Mar 21, 2023
1 parent 6719623 commit bea829f
Showing 2 changed files with 245 additions and 0 deletions.
187 changes: 187 additions & 0 deletions Linkedin_Scrapper.py
@@ -0,0 +1,187 @@
# Import necessary packages for web scraping and logging
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import random
import time

# Configure logging settings
logging.basicConfig(filename="scraping.log", level=logging.INFO)


def scrape_linkedin_jobs(job_title: str, location: str, pages: int = None) -> list:
"""
Scrape job listings from LinkedIn based on job title and location.
Parameters
----------
job_title : str
The job title to search for on LinkedIn.
location : str
The location to search for jobs in on LinkedIn.
pages : int, optional
    The number of pages of job listings to scrape. Defaults to 1 if not provided.
Returns
-------
list of dict
    A list of dictionaries, where each dictionary represents a job listing
    with the following keys: 'title', 'company', 'location', 'link',
    and 'description'.
"""

# Log a message indicating that we're starting a LinkedIn job search
logging.info(f'Starting LinkedIn job scrape for "{job_title}" in "{location}"...')

# Default to scraping one page if no page count is provided
pages = pages or 1

# Set up Chrome options to maximize the window
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# Initialize the Selenium web driver with the Chrome options
driver = webdriver.Chrome(options=options)

# Navigate to the LinkedIn job search page with the given job title and location
driver.get(
f"https://www.linkedin.com/jobs/search/?keywords={job_title}&location={location}"
)

# Scroll through the requested number of pages of search results on LinkedIn
for i in range(pages):

# Log the current page number
logging.info(f"Scrolling to bottom of page {i+1}...")

# Scroll to the bottom of the page using JavaScript
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

try:
# Wait for the "Show more" button to be present on the page
element = WebDriverWait(driver, 5).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/div/main/section[2]/button")
)
)
# Click on the "Show more" button
element.click()

# Handle any exception that may occur when locating or clicking on the button
except Exception:
# Log a message indicating that the button was not found before moving on
logging.info("Show more button not found, continuing...")

# Wait for a random amount of time before scrolling to the next page
time.sleep(random.choice(list(range(3, 7))))

# Scrape the job postings
jobs = []
soup = BeautifulSoup(driver.page_source, "html.parser")
job_listings = soup.find_all(
"div",
class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
)

try:
for job in job_listings:
# Extract job details

# job title
job_title = job.find("h3", class_="base-search-card__title").text.strip()
# job company
job_company = job.find(
"h4", class_="base-search-card__subtitle"
).text.strip()
# job location
job_location = job.find(
"span", class_="job-search-card__location"
).text.strip()
# job link
apply_link = job.find("a", class_="base-card__full-link")["href"]

# Navigate to the job posting page and scrape the description
driver.get(apply_link)

# Sleep for a random interval before scraping the job description
time.sleep(random.choice(list(range(5, 11))))

# Use try-except block to handle exceptions when retrieving job description
try:
# Create a BeautifulSoup object from the webpage source
description_soup = BeautifulSoup(driver.page_source, "html.parser")

# Find the job description element and extract its text
job_description = description_soup.find(
"div", class_="description__text description__text--rich"
).text.strip()

# Handle the AttributeError exception that may occur if the element is not found
except AttributeError:
# Assign None to the job_description variable to indicate that no description was found
job_description = None

# Write a warning message to the log file
logging.warning(
"AttributeError occurred while retrieving job description."
)

# Add job details to the jobs list
jobs.append(
{
"title": job_title,
"company": job_company,
"location": job_location,
"link": apply_link,
"description": job_description,
}
)
# Log the scraped job with company and location information
logging.info(f'Scraped "{job_title}" at {job_company} in {job_location}...')

# Catch any exception that occurs during the scraping process
except Exception as e:
# Log an error message with the exception details
logging.error(f"An error occurred while scraping jobs: {str(e)}")

# Fall through so the driver is still closed and any jobs collected
# before the error are returned below

# Close the Selenium web driver
driver.quit()

# Return the jobs list
return jobs


def save_job_data(data: list) -> None:
"""
Save job data to a CSV file.
Args:
data: A list of job dictionaries, as returned by scrape_linkedin_jobs.
Returns:
None
"""

# Create a pandas DataFrame from the job data dictionary
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file without including the index column
df.to_csv("jobs.csv", index=False)

# Log a message indicating how many jobs were successfully scraped and saved to the CSV file
logging.info(f"Successfully scraped {len(data)} jobs and saved to jobs.csv")


# Guard the example run so importing this module (e.g. from the test file) does not trigger a scrape
if __name__ == "__main__":
    data = scrape_linkedin_jobs("Data analyst", "US")
    save_job_data(data)
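
Below is a minimal sketch, not part of the diff, of how the driver setup above could be made headless under Selenium 4. It assumes a local chromedriver binary and Chrome's newer --headless=new flag, and the make_headless_driver helper name is hypothetical.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def make_headless_driver(chromedriver_path: str = "chromedriver.exe") -> webdriver.Chrome:
    # Assumes a chromedriver binary at chromedriver_path and a Chrome version that supports --headless=new
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")          # run without opening a browser window
    options.add_argument("--window-size=1920,1080") # keep a desktop-sized viewport for the page layout
    service = Service(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=options)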
58 changes: 58 additions & 0 deletions test_linkedin_scrapper.py
@@ -0,0 +1,58 @@
import pytest
from Linkedin_Scrapper import scrape_linkedin_jobs


def test_scrape_linkedin_jobs_returns_list():
"""
Test that the scrape_linkedin_jobs function returns a list.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco")
assert isinstance(results, list)


def test_scrape_linkedin_jobs_returns_jobs():
"""
Test that the scrape_linkedin_jobs function returns a list of job dictionaries.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco")
assert all(isinstance(job, dict) for job in results)


def test_scrape_linkedin_jobs_job_details():
"""
Test that each job dictionary returned by scrape_linkedin_jobs contains
the keys "title", "company", "location", "link", and "description".
"""
job_keys = ["title", "company", "location", "link", "description"]
results = scrape_linkedin_jobs("software engineer", "san francisco")
for job in results:
assert all(key in job for key in job_keys)


def test_scrape_linkedin_jobs_pages():
"""
Test that the scrape_linkedin_jobs function returns at least one job
when passed the "pages" argument.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco", pages=2)
assert len(results) > 0


def test_scrape_linkedin_jobs_job_titles():
"""
Test that the titles of all jobs returned by scrape_linkedin_jobs contain the
search query passed in the "job_title" argument.
"""
job_title = "software engineer"
results = scrape_linkedin_jobs(job_title, "san francisco")
assert all(job_title.lower() in job["title"].lower() for job in results)


def test_scrape_linkedin_jobs_job_locations():
"""
Test that the locations of all jobs returned by scrape_linkedin_jobs contain the
search query passed in the "location" argument.
"""
location = "san francisco"
results = scrape_linkedin_jobs("software engineer", location)
assert all(location.lower() in job["location"].lower() for job in results)
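
A possible follow-up, not part of this commit: each test above performs its own live scrape, so a module-scoped pytest fixture could run the scrape once and share the results across tests. A minimal sketch, assuming the same search query as the existing tests:

import pytest
from Linkedin_Scrapper import scrape_linkedin_jobs

@pytest.fixture(scope="module")
def results():
    # One live scrape, reused by every test in this module
    return scrape_linkedin_jobs("software engineer", "san francisco")

def test_returns_list(results):
    assert isinstance(results, list)

def test_jobs_are_dicts(results):
    assert all(isinstance(job, dict) for job in results)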
