diff --git a/Linkedin_Scrapper.py b/Linkedin_Scrapper.py
new file mode 100644
index 0000000..2f6c67b
--- /dev/null
+++ b/Linkedin_Scrapper.py
@@ -0,0 +1,187 @@
+# Import necessary packages for web scraping and logging
+import logging
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+import random
+import time
+
+# Configure logging settings
+logging.basicConfig(filename="scraping.log", level=logging.INFO)
+
+
+def scrape_linkedin_jobs(job_title: str, location: str, pages: int = None) -> list:
+    """
+    Scrape job listings from LinkedIn based on job title and location.
+
+    Parameters
+    ----------
+    job_title : str
+        The job title to search for on LinkedIn.
+    location : str
+        The location to search for jobs in on LinkedIn.
+    pages : int, optional
+        The number of pages of job listings to scrape. Defaults to 1 if not provided.
+
+    Returns
+    -------
+    list of dict
+        A list of dictionaries, where each dictionary represents a job listing
+        with the following keys: 'title', 'company', 'location', 'link',
+        and 'description'.
+    """
+
+    # Log a message indicating that we're starting a LinkedIn job search
+    logging.info(f'Starting LinkedIn job scrape for "{job_title}" in "{location}"...')
+
+    # Default to a single page if no page count was provided
+    pages = pages or 1
+
+    # Set up Chrome options to maximize the window
+    options = webdriver.ChromeOptions()
+    options.add_argument("--start-maximized")
+
+    # Set up the Selenium web driver with the Chrome options
+    driver = webdriver.Chrome(options=options)
+
+    # Navigate to the LinkedIn job search page with the given job title and location
+    driver.get(
+        f"https://www.linkedin.com/jobs/search/?keywords={job_title}&location={location}"
+    )
+
+    # Scroll through the requested number of pages of search results
+    for i in range(pages):
+
+        # Log the current page number
+        logging.info(f"Scrolling to bottom of page {i+1}...")
+
+        # Scroll to the bottom of the page using JavaScript
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+
+        try:
+            # Wait for the "Show more" button to be present on the page
+            element = WebDriverWait(driver, 5).until(
+                EC.presence_of_element_located(
+                    (By.XPATH, "/html/body/div[1]/div/main/section[2]/button")
+                )
+            )
+            # Click on the "Show more" button
+            element.click()
+
+        # Handle any exception that may occur when locating or clicking on the button
+        except Exception:
+            # Log a message indicating that the button was not found and we're retrying
+            logging.info("Show more button not found, retrying...")
+
+        # Wait for a random amount of time before scrolling to the next page
+        time.sleep(random.choice(list(range(3, 7))))
+
+    # Scrape the job postings
+    jobs = []
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    job_listings = soup.find_all(
+        "div",
+        class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
+    )
+
+    try:
+        for job in job_listings:
+            # Extract job details
+
+            # job title
+            job_title = job.find("h3", class_="base-search-card__title").text.strip()
+            # job company
+            job_company = job.find(
+                "h4", class_="base-search-card__subtitle"
+            ).text.strip()
+            # job location
+            job_location = job.find(
+                "span", class_="job-search-card__location"
+            ).text.strip()
+            # job link
+            apply_link = job.find("a", class_="base-card__full-link")["href"]
+
+            # Navigate to the job posting page and scrape the description
+            driver.get(apply_link)
+
+            # Sleep for a random interval before parsing the job page
+            time.sleep(random.choice(list(range(5, 11))))
+
+            # Use try-except block to handle exceptions when retrieving job description
+            try:
+                # Create a BeautifulSoup object from the webpage source
+                description_soup = BeautifulSoup(driver.page_source, "html.parser")
+
+                # Find the job description element and extract its text
+                job_description = description_soup.find(
+                    "div", class_="description__text description__text--rich"
+                ).text.strip()
+
+            # Handle the AttributeError exception that may occur if the element is not found
+            except AttributeError:
+                # Assign None to job_description to indicate that no description was found
+                job_description = None
+
+                # Write a warning message to the log file
+                logging.warning(
+                    "AttributeError occurred while retrieving job description."
+                )
+
+            # Add job details to the jobs list
+            jobs.append(
+                {
+                    "title": job_title,
+                    "company": job_company,
+                    "location": job_location,
+                    "link": apply_link,
+                    "description": job_description,
+                }
+            )
+            # Log the scraped job with company and location information
+            logging.info(f'Scraped "{job_title}" at {job_company} in {job_location}...')
+
+    # Catch any exception that occurs during the scraping process
+    except Exception as e:
+        # Log an error message with the exception details
+        logging.error(f"An error occurred while scraping jobs: {str(e)}")
+
+    finally:
+        # Close the Selenium web driver, even if an error interrupted the scrape
+        driver.quit()
+
+    # Return the jobs collected so far
+    # (partial results are still returned if the scrape was interrupted by an error)
+    return jobs
+
+
+def save_job_data(data: list) -> None:
+    """
+    Save job data to a CSV file.
+
+    Args:
+        data: A list of dictionaries containing job data.
+
+    Returns:
+        None
+    """
+
+    # Create a pandas DataFrame from the list of job dictionaries
+    df = pd.DataFrame(data)
+
+    # Save the DataFrame to a CSV file without including the index column
+    df.to_csv("jobs.csv", index=False)
+
+    # Log a message indicating how many jobs were scraped and saved to the CSV file
+    logging.info(f"Successfully scraped {len(data)} jobs and saved to jobs.csv")
+
+
+if __name__ == "__main__":
+    # Run the scraper only when the script is executed directly,
+    # not when the module is imported (e.g. by the test suite)
+    data = scrape_linkedin_jobs("Data analyst", "US")
+    save_job_data(data)
diff --git a/test_linkedin_scrapper.py b/test_linkedin_scrapper.py
new file mode 100644
index 0000000..a836a5c
--- /dev/null
+++ b/test_linkedin_scrapper.py
@@ -0,0 +1,58 @@
+import pytest
+from Linkedin_Scrapper import scrape_linkedin_jobs
+
+
+def test_scrape_linkedin_jobs_returns_list():
+    """
+    Test that the scrape_linkedin_jobs function returns a list.
+    """
+    results = scrape_linkedin_jobs("software engineer", "san francisco")
+    assert isinstance(results, list)
+
+
+def test_scrape_linkedin_jobs_returns_jobs():
+    """
+    Test that the scrape_linkedin_jobs function returns a list of job dictionaries.
+    """
+    results = scrape_linkedin_jobs("software engineer", "san francisco")
+    assert all(isinstance(job, dict) for job in results)
+
+
+def test_scrape_linkedin_jobs_job_details():
+    """
+    Test that each job dictionary returned by scrape_linkedin_jobs contains
+    the keys "title", "company", "location", "link", and "description".
+    """
+    job_keys = ["title", "company", "location", "link", "description"]
+    results = scrape_linkedin_jobs("software engineer", "san francisco")
+    for job in results:
+        assert all(key in job for key in job_keys)
+
+
+def test_scrape_linkedin_jobs_pages():
+    """
+    Test that the scrape_linkedin_jobs function returns at least one job
+    when passed the "pages" argument.
+    """
+    results = scrape_linkedin_jobs("software engineer", "san francisco", pages=2)
+    assert len(results) > 0
+
+
+def test_scrape_linkedin_jobs_job_titles():
+    """
+    Test that the titles of all jobs returned by scrape_linkedin_jobs contain the
+    search query passed in the "job_title" argument.
+    """
+    job_title = "software engineer"
+    results = scrape_linkedin_jobs(job_title, "san francisco")
+    assert all(job_title.lower() in job["title"].lower() for job in results)
+
+
+def test_scrape_linkedin_jobs_job_locations():
+    """
+    Test that the locations of all jobs returned by scrape_linkedin_jobs contain the
+    search query passed in the "location" argument.
+    """
+    location = "san francisco"
+    results = scrape_linkedin_jobs("software engineer", location)
+    assert all(location.lower() in job["location"].lower() for job in results)