Add: Scrapper and test file
hossam el shabory committed Mar 21, 2023
1 parent 6719623 commit bea829f
Showing 2 changed files with 245 additions and 0 deletions.
187 changes: 187 additions & 0 deletions Linkedin_Scrapper.py
@@ -0,0 +1,187 @@
# Import necessary packages for web scraping and logging
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import random
import time

# Configure logging settings
logging.basicConfig(filename="scraping.log", level=logging.INFO)


def scrape_linkedin_jobs(job_title: str, location: str, pages: int = None) -> list:
"""
Scrape job listings from LinkedIn based on job title and location.
Parameters
----------
job_title : str
The job title to search for on LinkedIn.
location : str
The location to search for jobs in on LinkedIn.
pages : int, optional
    The number of pages of job listings to scrape. Defaults to 1 if not provided.
Returns
-------
list of dict
    A list of dictionaries, where each dictionary represents a job listing
    with the following keys: 'title', 'company', 'location', 'link',
    and 'description'.
"""

# Log a message indicating that we're starting a LinkedIn job search
logging.info(f'Starting LinkedIn job scrape for "{job_title}" in "{location}"...')

# Default to scraping one page if no page count is provided
pages = pages or 1

# Set up Chrome options to maximize the window
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# Initialize the Selenium web driver with the Chrome options
driver = webdriver.Chrome(options=options)

# Navigate to the LinkedIn job search page with the given job title and location
driver.get(
f"https://www.linkedin.com/jobs/search/?keywords={job_title}&location={location}"
)

# Scroll through the requested number of pages of search results on LinkedIn
for i in range(pages):

# Log the current page number
logging.info(f"Scrolling to bottom of page {i+1}...")

# Scroll to the bottom of the page using JavaScript
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

try:
# Wait for the "Show more" button to be present on the page
element = WebDriverWait(driver, 5).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/div/main/section[2]/button")
)
)
# Click on the "Show more" button
element.click()

# Handle any exception that may occur when locating or clicking on the button
except Exception:
# Log a message indicating that the button was not found before moving on
logging.info("Show more button not found, continuing...")

# Wait for a random amount of time before scrolling to the next page
time.sleep(random.choice(list(range(3, 7))))

# Scrape the job postings
jobs = []
soup = BeautifulSoup(driver.page_source, "html.parser")
job_listings = soup.find_all(
"div",
class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
)

try:
for job in job_listings:
# Extract job details

# job title
job_title = job.find("h3", class_="base-search-card__title").text.strip()
# job company
job_company = job.find(
"h4", class_="base-search-card__subtitle"
).text.strip()
# job location
job_location = job.find(
"span", class_="job-search-card__location"
).text.strip()
# job link
apply_link = job.find("a", class_="base-card__full-link")["href"]

# Navigate to the job posting page and scrape the description
driver.get(apply_link)

# Sleep for a random interval before scraping the job description
time.sleep(random.choice(list(range(5, 11))))

# Use try-except block to handle exceptions when retrieving job description
try:
# Create a BeautifulSoup object from the webpage source
description_soup = BeautifulSoup(driver.page_source, "html.parser")

# Find the job description element and extract its text
job_description = description_soup.find(
"div", class_="description__text description__text--rich"
).text.strip()

# Handle the AttributeError exception that may occur if the element is not found
except AttributeError:
# Assign None to the job_description variable to indicate that no description was found
job_description = None

# Write a warning message to the log file
logging.warning(
"AttributeError occurred while retrieving job description."
)

# Add job details to the jobs list
jobs.append(
{
"title": job_title,
"company": job_company,
"location": job_location,
"link": apply_link,
"description": job_description,
}
)
# Log the scraped job with company and location information
logging.info(f'Scraped "{job_title}" at {job_company} in {job_location}...')

# Catch any exception that occurs during the scraping process
except Exception as e:
# Log an error message with the exception details
logging.error(f"An error occurred while scraping jobs: {str(e)}")

# Fall through so the driver is still closed and any jobs collected
# before the error are returned below

# Close the Selenium web driver
driver.quit()

# Return the jobs list
return jobs


def save_job_data(data: list) -> None:
"""
Save job data to a CSV file.
Args:
data: A list of job dictionaries, as returned by scrape_linkedin_jobs.
Returns:
None
"""

# Create a pandas DataFrame from the job data dictionary
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file without including the index column
df.to_csv("jobs.csv", index=False)

# Log a message indicating how many jobs were successfully scraped and saved to the CSV file
logging.info(f"Successfully scraped {len(data)} jobs and saved to jobs.csv")


# Guard the example run so importing this module (e.g. from the test file) does not trigger a scrape
if __name__ == "__main__":
    data = scrape_linkedin_jobs("Data analyst", "US")
    save_job_data(data)
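
Below is a minimal sketch, not part of the diff, of how the driver setup above could be made headless under Selenium 4. It assumes a local chromedriver binary and Chrome's newer --headless=new flag, and the make_headless_driver helper name is hypothetical.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def make_headless_driver(chromedriver_path: str = "chromedriver.exe") -> webdriver.Chrome:
    # Assumes a chromedriver binary at chromedriver_path and a Chrome version that supports --headless=new
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")          # run without opening a browser window
    options.add_argument("--window-size=1920,1080") # keep a desktop-sized viewport for the page layout
    service = Service(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=options)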
58 changes: 58 additions & 0 deletions test_linkedin_scrapper.py
@@ -0,0 +1,58 @@
import pytest
from Linkedin_Scrapper import scrape_linkedin_jobs


def test_scrape_linkedin_jobs_returns_list():
"""
Test that the scrape_linkedin_jobs function returns a list.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco")
assert isinstance(results, list)


def test_scrape_linkedin_jobs_returns_jobs():
"""
Test that the scrape_linkedin_jobs function returns a list of job dictionaries.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco")
assert all(isinstance(job, dict) for job in results)


def test_scrape_linkedin_jobs_job_details():
"""
Test that each job dictionary returned by scrape_linkedin_jobs contains
the keys "title", "company", "location", "link", and "description".
"""
job_keys = ["title", "company", "location", "link", "description"]
results = scrape_linkedin_jobs("software engineer", "san francisco")
for job in results:
assert all(key in job for key in job_keys)


def test_scrape_linkedin_jobs_pages():
"""
Test that the scrape_linkedin_jobs function returns at least one job
when passed the "pages" argument.
"""
results = scrape_linkedin_jobs("software engineer", "san francisco", pages=2)
assert len(results) > 0


def test_scrape_linkedin_jobs_job_titles():
"""
Test that the titles of all jobs returned by scrape_linkedin_jobs contain the
search query passed in the "job_title" argument.
"""
job_title = "software engineer"
results = scrape_linkedin_jobs(job_title, "san francisco")
assert all(job_title.lower() in job["title"].lower() for job in results)


def test_scrape_linkedin_jobs_job_locations():
"""
Test that the locations of all jobs returned by scrape_linkedin_jobs contain the
search query passed in the "location" argument.
"""
location = "san francisco"
results = scrape_linkedin_jobs("software engineer", location)
assert all(location.lower() in job["location"].lower() for job in results)
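
A possible follow-up, not part of this commit: each test above performs its own live scrape, so a module-scoped pytest fixture could run the scrape once and share the results across tests. A minimal sketch, assuming the same search query as the existing tests:

import pytest
from Linkedin_Scrapper import scrape_linkedin_jobs

@pytest.fixture(scope="module")
def results():
    # One live scrape, reused by every test in this module
    return scrape_linkedin_jobs("software engineer", "san francisco")

def test_returns_list(results):
    assert isinstance(results, list)

def test_jobs_are_dicts(results):
    assert all(isinstance(job, dict) for job in results)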
