-
Notifications
You must be signed in to change notification settings - Fork 0
/
job_scraper.rb
102 lines (84 loc) · 2.9 KB
/
job_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# frozen_string_literal: true
require 'httparty'
require 'nokogiri'
# Keywords a comment must contain (case-sensitive substring match) to be kept as a job posting.
KEYWORDS = %w[Ruby Rails].freeze
# Default path of the text file the formatted job postings are written to.
JOB_POSTINGS_FILE = 'job_postings.txt'
# Number of thread pages to request, starting at page 1.
PAGES_TO_SCRAPE = 5
# PageRetriever downloads a URL and hands back the parsed HTML document.
class PageRetriever
  class << self
    # Fetches +url+ over HTTP and parses the response body as HTML.
    #
    # Any StandardError raised on the way (including the one raised here for a
    # non-successful response) is reported on standard output rather than
    # propagated.
    #
    # @param url [String] address of the page to download
    # @return [Nokogiri::HTML::Document, nil] the parsed page, or nil on failure
    def fetch(url)
      page = HTTParty.get(url)
      return Nokogiri::HTML(page.body) if page.success?

      raise "Unable to fetch the page, response code: #{page.code}"
    rescue StandardError => error
      puts "Error occurred while fetching jobs: #{error.message}"
      # Callers treat nil as "no document"; make that result explicit.
      nil
    end
  end
end
# JobScraper scrapes the comment rows of one Hacker News thread, keeps those
# that mention any of KEYWORDS, and writes a formatted report to a text file.
class JobScraper
  # TextHelper holds stateless helpers for tidying scraped text and HTML.
  class TextHelper
    # Collapses every run of whitespace into a single space and trims both ends.
    #
    # @param text [String]
    # @return [String]
    def self.clean_text(text)
      text.gsub(/\s+/, ' ').strip
    end

    # Deletes everything that looks like a tag (`<...>`). Nothing is inserted in
    # a tag's place, so the text on either side of it is joined directly, and
    # HTML entities are left untouched.
    #
    # @param html [String]
    # @return [String]
    def self.clean_html(html)
      html.gsub(/<[^>]*>/, '')
    end
  end

  # JobFilter selects the comment rows that mention at least one keyword.
  class JobFilter
    # @param document [#css, nil] parsed page, or nil when the fetch failed
    # @return [Array] the `tr.athing.comtr` rows whose text contains a keyword;
    #   empty when +document+ is nil
    def self.filter(document)
      return [] unless document

      document.css('tr.athing.comtr').select { |job_post| keyword_present_in_text?(job_post.text) }
    end

    # @param text [String]
    # @return [Boolean] true when +text+ contains any entry of KEYWORDS
    #   (case-sensitive substring match)
    def self.keyword_present_in_text?(text)
      KEYWORDS.any? { |keyword| text.include?(keyword) }
    end
  end

  # JobDetailsExtractor turns one comment row into a plain hash of strings.
  class JobDetailsExtractor
    # @param comment [#css] a comment row as returned by JobFilter.filter
    # @return [Hash{Symbol => String}] with keys
    #   :title   - text of the `.hnuser` element,
    #   :time    - text of the `.age a` element,
    #   :details - inner HTML of `.commtext` with tags removed,
    #   each with whitespace normalised
    def self.extract(comment)
      raw_details = TextHelper.clean_html(comment.css('.commtext').inner_html)

      {
        title: TextHelper.clean_text(comment.css('.hnuser').text),
        time: TextHelper.clean_text(comment.css('.age a').text),
        details: TextHelper.clean_text(raw_details)
      }
    end
  end

  # JobFormatter renders extracted postings as the text written to the report.
  class JobFormatter
    # @param job_postings [Array<Hash>] hashes with :title, :time and :details
    # @return [String] a summary line followed by one numbered entry per posting
    def self.format(job_postings)
      header = "Found #{job_postings.count} job postings"
      entries = job_postings.each_with_index.map do |posting, index|
        "Job posting #{index + 1}: #{posting[:title]} | #{posting[:time]}\n#{posting[:details]}\n"
      end

      [header, *entries].join("\n")
    end
  end

  def initialize
    @all_jobs = []
  end

  # Fetches every page, extracts the matching postings and saves the report to
  # the default file.
  def scrape_jobs
    retrieve_all_jobs
    cleaned_jobs = @all_jobs.map { |job| JobDetailsExtractor.extract(job) }
    save_jobs(JobFormatter.format(cleaned_jobs))
  end

  private

  # Builds the URL of the given page of the thread.
  def page_url(page)
    "https://news.ycombinator.com/item?id=36573871&p=#{page}"
  end

  # Downloads pages 1..PAGES_TO_SCRAPE and stores the matching rows in @all_jobs.
  def retrieve_all_jobs
    puts "Scraping #{PAGES_TO_SCRAPE} pages for jobs with keywords: #{KEYWORDS.join(', ')}"
    # Rebuild the list from scratch so that calling #scrape_jobs again on the
    # same instance does not append a second copy of every posting.
    @all_jobs = (1..PAGES_TO_SCRAPE).flat_map do |page|
      JobFilter.filter(PageRetriever.fetch(page_url(page)))
    end
  end

  # Writes the formatted report to +job_file+ and prints a confirmation.
  # Errors while writing are reported on standard output, not raised.
  #
  # @param jobs [String] formatted report text
  # @param job_file [String] destination path
  def save_jobs(jobs, job_file = JOB_POSTINGS_FILE)
    File.write(job_file, jobs)
    # Report the path that was actually written, not the default constant.
    puts "Found #{@all_jobs.count} jobs and saved them to #{job_file}"
  rescue StandardError => e
    puts "Error occurred while saving jobs: #{e.message}"
  end
end
# Script entry point: runs a full scrape as soon as this file is loaded.
JobScraper.new.scrape_jobs