Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Convert Linkedin job post data to pdf. #84

Merged
merged 2 commits into from
Sep 10, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Feat: Convert Linkedin job post data to pdf.
  • Loading branch information
ILB-96 committed Sep 8, 2023
commit c74e73e3eda6f99f3cce396a308d285f6f068fa6
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ validators==0.20.0
wasabi==1.1.2
watchdog==3.0.0
zipp==3.16.2

reportlab==4.0.4
easygui==0.98.3
cohere~=4.19.2
qdrant-client
86 changes: 86 additions & 0 deletions scripts/LinkedinJobToPDF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from bs4 import BeautifulSoup
import requests
import easygui
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from os import listdir
from os.path import isfile, join
import logging


'''
This script takes a LinkedIn job posting URL
and converts the description to a PDF file.
The PDF file is saved in the Data/JobDescription folder.
The name will be outputX.pdf, where X is the number of files in the folder.

IMPORTANT: Make sure the URL points to the actual job description page,
and not to the job search page.
'''


def split_string(s: str, max_len: int = 82) -> list[str]:
    """Greedily wrap *s* into lines of at most ``max_len`` characters.

    Words are never broken; a word longer than ``max_len`` ends up on a
    line by itself. Returns the wrapped lines as a list of strings
    (empty input yields an empty list).
    """
    lines: list[str] = []
    buf: list[str] = []   # words accumulated for the line being built
    width = 0             # length of the line so far, counting one space per word

    for word in s.split():
        if width + len(word) + 1 > max_len:
            # Current line is full: flush it and start a fresh one.
            lines.append(" ".join(buf))
            buf = []
            width = 0
        buf.append(word)
        width += len(word) + 1

    if buf:
        lines.append(" ".join(buf))

    return lines


def linkedin_to_pdf():
    """Prompt for a LinkedIn job-posting URL, scrape its description,
    and save it as a PDF via save_to_pdf().

    Returns:
        The saved PDF's base file name (without the ".pdf" extension).

    Raises:
        SystemExit: if the page cannot be fetched or parsed.
    """
    url = easygui.enterbox("Enter the URL of the LinkedIn Job Posting:")
    try:
        # Timeout keeps the script from hanging forever on a dead host.
        page = requests.get(url, timeout=30)
        # Fail fast on 4xx/5xx instead of parsing an error page.
        page.raise_for_status()

        soup = BeautifulSoup(page.text, "lxml")

        # LinkedIn renders the job description inside this div; anything
        # after "Primary Location" is location boilerplate we drop.
        description = (
            soup.find("div", class_="show-more-less-html__markup")
            .get_text(strip=True, separator="\n")
            .split("Primary Location")[0]
            .strip()
        )
        logging.info("Description: \n%s", description)

        return save_to_pdf(description)
    except Exception as e:
        logging.error("Could not get the description from the URL:\n%s", url)
        logging.error(e)
        # Was a bare exit(); exit with a non-zero status so callers/CI
        # can detect the failure.
        raise SystemExit(1)


def save_to_pdf(description: str) -> str:
    """Render *description* into Data/JobDescription/outputN.pdf.

    N is the number of files already in the folder, so each run gets a
    fresh name instead of overwriting a previous one.

    Args:
        description: Newline-separated job-description text.

    Returns:
        The file name without its ".pdf" extension (e.g. "output3").
    """
    job_path = "Data/JobDescription/"
    paragraphs = description.split("\n")
    files_number = len([f for f in listdir(job_path) if isfile(join(job_path, f))])
    file_name = f"output{files_number}.pdf"
    # Use join() (already imported) rather than raw string concatenation
    # so the path is assembled portably.
    pdf_path = join(job_path, file_name)

    c = canvas.Canvas(pdf_path, pagesize=letter)

    y = 780  # start near the top of a letter page (792pt tall)
    for paragraph in paragraphs:
        # Wrap long paragraphs so each drawn line fits the page width.
        for line in split_string(paragraph):
            if y < 20:  # ran off the bottom: start a new page
                c.showPage()
                y = 780
            c.drawString(72, y, line)
            y -= 20

    c.save()
    logging.info("PDF saved to %s", pdf_path)

    return file_name[:-4]


linkedin_to_pdf()
8 changes: 6 additions & 2 deletions scripts/similarity/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@
# note this is sensitive data
# handle with care
# DO NOT COMMIT THIS FILE WITH YOUR API KEY
api_keys:
qdrant: YOUR_API_KEY

cohere:
api_key: COHERE_API_KEY
qdrant:
api_key: QDRANT_API_KEY
url: QDRANT_CLUSTER_URL
92 changes: 0 additions & 92 deletions scripts/similarity/qdrant_search.py

This file was deleted.