Skip to content

Commit

Permalink
Move sponsor level data to json file
Browse files Browse the repository at this point in the history
  • Loading branch information
ShaneCurcuru committed Feb 5, 2024
1 parent 24db8df commit 981da48
Showing 1 changed file with 98 additions and 74 deletions.
172 changes: 98 additions & 74 deletions assets/ruby/sponsor_utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
module SponsorUtils
DESCRIPTION = <<-HEREDOC
SponsorUtils: good-enough scrapers and detectors of FOSS sponsors.
See also: scraping rules and sponsorship levels in sponsor_listing.json
HEREDOC
module_function
require 'csv'
Expand All @@ -14,95 +15,118 @@ module SponsorUtils
'Microsoft' => 'https://microsoft.com', # TODO determine exact normalization rules for sponsor links
'Google' => 'https://google.com'
}
# TODO: sponsor levels as an enum for consistency?
# NOTE Sponsor names are in a separate table

# @return a good enough normalized url
# FIXME .strip.downcase.sub('www.','').sub('http://', 'https://') # Some cheap normalization
# Resolve a raw link target against its page URL into an absolute URL string.
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw link target from the document
# @return [String, nil] absolute URL, or nil for missing links and bare fragments
def normalize_href(base, href)
  # Bare fragments ("#" or "/#") are in-page anchors, not sponsor links.
  return nil if href.nil? || /\/?#\z/.match(href)
  candidate = URI(href)
  candidate.absolute? ? candidate.to_s : URI(base).merge(href).to_s
end
# Custom parsing data
# Note OWASP parsing css may be fragile; relies on nth-of-type
DRUPAL_SPONSOR_PAGE = '.org-link a' # Sponsor is kept on a separate page

# CSS selector maps: approximate sponsorship level => selector yielding the
# sponsor link nodes on each foundation's sponsor page. Frozen: constants
# should not be mutated at runtime.
# Listing as of 2023: ul id="platinum" ul li a, gold, silver, bronze; targetedplatinum, etc.
ASF_SPONSOR_CSS = {
  'first' => '#platinum li a',
  'second' => '#gold li a',
  'third' => '#silver li a',
  'fourth' => '#bronze li a',
  'firstinkind' => '#targetedplatinum li a',
  'secondinkind' => '#targetedgold li a',
  'thirdinkind' => '#targetedsilver li a',
  'fourthinkind' => '#targetedbronze li a'
}.freeze
NUMFOCUS_SPONSOR_CSS = {
  'first' => '.et_pb_section_1 .et_pb_row_1 div > a',
  'second' => '.et_pb_section_1 .et_pb_row_4 div > a',
  'third' => '.et_pb_section_1 .et_pb_row_7 div > a',
  'community' => '.et_pb_section_1 .et_pb_row_10 div > a',
  'startuppartners' => '.et_pb_section_1 .et_pb_row_13 a',
  'grants' => '.et_pb_section_2 a'
}.freeze
OSGEO_SPONSOR_CSS = {
  'first' => '.Diamond-sponsors a',
  'second' => '.Platinum-sponsors a',
  'third' => '.Gold-sponsors a',
  'fourth' => '.Silver-sponsors a',
  'fifth' => '.Bronze-sponsors a'
}.freeze
# TODO sponsor links are on separate page (see DRUPAL_SPONSOR_PAGE above)
DRUPAL_SPONSOR_CSS = {
  'first' => '.sponsors--signature a',
  'second' => '.view-display-id-attachment_6 a',
  'third' => '.view-display-id-attachment_3 a',
  'fourth' => '.view-display-id-attachment_1 a',
  'community' => '.view-display-id-attachment_9 a'
}.freeze
# NOTE: duplicate DRUPAL_SPONSOR_PAGE definition removed here; it is already
# defined earlier in the file (re-assignment triggers a constant warning).
# TODO Uses ethicalads.io to disintermediate sponsor links/logos; requires custom processing
PYTHON_SPONSOR_CSS = {
  'first' => 'div[title="visionary Sponsors"] div[data-internal-year]',
  'second' => 'div[title="sustainability Sponsors"] div[data-internal-year]',
  'third' => 'div[title="maintaining Sponsors"] div[data-internal-year]',
  'fourth' => 'div[title="contributing Sponsors"] div[data-internal-year]',
  'fifth' => 'div[title="supporting Sponsors"] div[data-internal-year]',
  'sixth' => 'div[title="partner Sponsors"] div[data-internal-year]',
  'seventh' => 'div[title="participating Sponsors"] div[data-internal-year]',
  'eighth' => 'div[title="associate Sponsors"] div[data-internal-year]'
}.freeze
# foundation shortname => [level-to-selector map, node attribute holding the link]
FOUNDATION_MAP = {
  'asf' => [ASF_SPONSOR_CSS, 'href'],
  'numfocus' => [NUMFOCUS_SPONSOR_CSS, 'href'],
  'osgeo' => [OSGEO_SPONSOR_CSS, 'href'],
  'drupal' => [DRUPAL_SPONSOR_CSS, 'href'],
  'python' => [PYTHON_SPONSOR_CSS, 'id']
}.freeze
# Return a normalized domain name for mapping to a sponsor org.
# Lowercases the authority and strips a leading "www."; keeps any explicit port.
# @param href [String, nil] absolute URL of a sponsor link
# @return [String, nil] normalized authority ("host" or "host:port"), or nil
#   when href is nil or has no authority component (e.g. mailto:)
def normalize_href(href)
  return nil if href.nil?
  authority = URI(href.strip).authority
  # Anchor "www." at the start so hosts that merely contain the substring
  # (e.g. "nowww.example.com") are left intact; only the authority is
  # downcased, not the whole URL.
  authority&.downcase&.sub(/\Awww\./, '')
end

# Offsets into each level's entry in the scraping map (sponsor_listing.json).
# NOTE(review): scrape_bycss reads selectors[SELECTOR_OFFSET] and
# selectors[ATTR_OFFSET], so entries presumably look like
# [?, ?, css_selector, attribute_name] — confirm against the json schema.
SELECTOR_OFFSET = 2
ATTR_OFFSET = 3
# Attribute whose value is a link; such values get normalize_href treatment.
ATTR_HREF = 'href'
# Scrape sponsor listing defined by css selectors
# @param io input stream of html to parse
# @param shortname of foundation map to parse
# @return hash of sponsors by approximate map-defined levels
def scrape_bycss(io, foundation)
def scrape_bycss(io, orgmap)
sponsors = {}
cssmap = FOUNDATION_MAP.fetch(foundation, nil)
normalize = orgmap.fetch('normalize', nil)
cssmap = orgmap['levels']
doc = Nokogiri::HTML5(io)
body = doc.xpath('/html/body')
cssmap[0].each do | key, selector |
nodelist = body.css(selector)
cssmap.each do | key, selectors |
nodelist = body.css(selectors[SELECTOR_OFFSET])
sponsors[key] = []
attr = selectors[ATTR_OFFSET]
nodelist.each do | node |
sponsors[key] << node[cssmap[1]]
if ATTR_HREF.eql?(attr) && normalize
sponsors[key] << normalize_href(node[attr])
else
sponsors[key] << node[attr]
end
end
end
return sponsors
end
end

# ### #### ##### ######
# Main method for command line use
if __FILE__ == $PROGRAM_NAME
filename = '../../../sponsors-asf.html'
baseurl = ''
io = File.open(filename)
sponsors = SponsorUtils.scrape_bycss(io, 'asf')
puts JSON.pretty_generate(sponsors)
# Hack for Python, which merely stores ethicalads.io ids rather than sponsor
# links; maps known ad ids to obvious sponsor domains. Frozen: constants
# should not be mutated at runtime.
PYTHON_MAP = {
  'google' => 'google.com' # etc. map ids to obvious sponsor domains
}.freeze
# Cleanup python sponsor list, since they use separate files
# @param sponsors hash of detected sponsor links
# @return sponsors hash normalized to domain names
# def cleanup_python(links)
# sponsors = {}
# links.each do | level, ary |
# ary.each do | itm |
# Un-PYTHON_MAP itm
# end
# end
# return sponsors
# end

# Rough count of number of times different urls appear at levels
# @param sponsors hash returned from scrape_bycss or cleanup
# @return hash of counts of how often domain names appear
def report_counts(sponsors)
  # One counter per known level plus an 'all' rollup; Hash.new(0) defaults
  # missing keys to zero so we can increment without checking presence.
  # (Previous version assigned counts['all'] twice; once is enough.)
  counts = { 'all' => Hash.new(0) }
  %w[ first second third fourth fifth sixth seventh eighth community firstinkind secondinkind thirdinkind fourthinkind startuppartners grants ].each do | lvl |
    counts[lvl] = Hash.new(0)
  end
  # Tally every url once into its level and once into the rollup.
  sponsors.each do | _org, sponsorhash |
    sponsorhash.each do | level, ary |
      ary.each do | url |
        counts['all'][url] += 1
        counts[level][url] += 1
      end
    end
  end
  # Order the rollup by descending frequency for convenience.
  counts['all'] = counts['all'].sort_by { |_url, n| -n }.to_h
  counts
end

# ### #### ##### ######
# Main method for command line use
# Reads per-foundation scraping maps from sponsor_levels.json, scrapes each
# foundation's sponsor page, then writes raw sponsor links plus frequency
# counts out to json files.
if __FILE__ == $PROGRAM_NAME
infile = 'sponsor_levels.json'
outfile = 'sponsor_utils.json'
io = nil
sponsors = {}
maps = JSON.parse(File.read(infile))
maps.each do | org, map |
map = map[0] # HACK: just use first map on list; by date for future use historical scans
# Debug toggle: flip to true to scrape from a local html snapshot instead of the network.
if false
filename = "../../../sponsors-#{org}.html"
baseurl = ''
io = File.open(filename)
else
sponsorurl = map['sponsorurl']
begin
# NOTE(review): URI.open comes from open-uri — presumably required in the
# file header (not visible here); confirm.
io = URI.open(sponsorurl).read
rescue StandardError => e
# Log and skip foundations whose page cannot be fetched; keep scraping the rest.
puts "ERROR: #{sponsorurl}: #{e.message}\n\n#{e.backtrace.join("\n\t")}"
next
end
end
sponsors[org] = SponsorUtils.scrape_bycss(io, map)
end
File.open(outfile, "w") do |f|
f.write(JSON.pretty_generate(sponsors))
end
# Summarize how often each url appears across all orgs and levels.
counts = report_counts(sponsors)
puts JSON.pretty_generate(counts)
File.open('sponsor_metacount.json', "w") do |f|
f.write(JSON.pretty_generate(counts))
end
end
end

0 comments on commit 981da48

Please sign in to comment.