diff --git a/_data/lf_map.json b/_data/lf_map.json new file mode 100644 index 0000000..5015a3e --- /dev/null +++ b/_data/lf_map.json @@ -0,0 +1,25 @@ +{ + "ericsson": "ericsson.com", + "fujitsu-limited": "fujitsu.com", + "hitachi-ltd": "hitachi.com", + "huawei-technologies-co-ltd": "huawei.com", + "intel-corporation": "intel.com", + "microsoft-corporation": "microsoft.com", + "nec-corporation": "nec.com", + "oracle-america-inc": "oracle.com", + "qualcomm-inc": "qualcomm.com", + "red-hat-inc": "redhat.com", + "samsung-electronics-co-ltd": "samsung.com", + "tencent-holdings-limited": "tencent.com", + "accenture-global-solutions-limited": "accenture.com", + "beijing-baidu-netcom-science-technology-co-ltd": "baidu.com", + "cisco-systems-inc": "cisco.com", + "dell-technologies": "dell.com", + "google-llc": "google.com", + "panasonic-holdings-corporation": "panasonic.com", + "renesas-electronics-corporation": "renesas.com", + "sony-group-corporation": "sony.com", + "toshiba-corporation": "toshiba.com", + "toyota-motor-corporation": "toyota.com", + "webank-co-ltd": "webank.com" +} \ No newline at end of file diff --git a/_data/python_map.json b/_data/python_map.json new file mode 100644 index 0000000..1b67977 --- /dev/null +++ b/_data/python_map.json @@ -0,0 +1,45 @@ +{ + "google": "google.com", + "bloomberg": "bloomberg.com", + "ansys": "ansys.com", + "capital-one": "capitalone.com", + "indeed": "indeed.com", + "red-hat": "redhat.com", + "openedg-python-institute": "openedg.org", + "pydantic": "pydantic.dev", + "jane-street": "janestreet.com", + "sentry": "sentry.io", + "jetbrains": "jetbrains.com", + "haystack-by-deepset": "deepset.ai", + "temporal": "temporal.io", + "reuven-lerner-python-training": "lerner.co.il", + "elastic": "elastic.co", + "infobip": "infobip.com", + "us-digital-service": "usds.gov", + "no-starch-press": "nostarch.com", + "datadog": "datadog.com", + "hudson-river-trading": "hudsonrivertrading.com", + "auth0-by-okta": "okta.com", + "coherence": 
"withcoherence.com", + "hennge-japan": "hennge.com", + "akamai-cloud-computing-services": "akamai.com", + "telemetryhub": "telemetryhub.com", + "noteable": "noteable.io", + "real-python": "realpython.com", + "cape-privacy": "capeprivacy.com", + "six-feet-up": "sixfeetup.com", + "python-engineer-development-association": "pythonic-exam.com", + "sticker-mule": "stickermule.com", + "netflix": "netflix.com", + "lincoln-loop": "lincolnloop.com", + "healthvana": "healthvana.com", + "bluevine": "bluevine.com", + "guidebook": "guidebook.com", + "two-sigma": "twosigma.com", + "quansight": "quansight.com", + "dmtech-gmbh": "dm-jobs.com", + "minimumdepositcasinosorg": "minimumdepositcasinos.org", + "easeus-software": "easeus.com", + "adimianbe-sprl": "adimian.com", + "roboflow": "roboflow.com" +} \ No newline at end of file diff --git a/_sponsorships/cncf.json b/_sponsorships/cncf.json new file mode 100644 index 0000000..ddf1b99 --- /dev/null +++ b/_sponsorships/cncf.json @@ -0,0 +1,52 @@ +{ + "20240101": { + "sponsorurl": "https://github.com/cncf/landscape/blob/master/landscape.yml", + "levelurl": "https://www.cncf.io/about/join/", + "normalize": "true", + "selector": "CNCF Members", + "levels": { + "first": { + "name": "Platinum", + "amount": "370000", + "selector": "", + "attr": "homepage_url", + "benefits": "TODO: this should be structured data" + }, + "second": { + "name": "Gold", + "amount": "120000", + "selector": "", + "attr": "homepage_url", + "benefits": "TODO: this should be structured data" + }, + "third": { + "name": "Silver", + "amount": "50000", + "selector": "", + "attr": "homepage_url", + "benefits": "TODO: this should be structured data" + }, + "academic": { + "name": "Academic", + "amount": "1000", + "selector": "", + "attr": "homepage_url", + "benefits": "TODO: this should be structured data" + }, + "community": { + "name": "Nonprofit", + "amount": "1000", + "selector": "", + "attr": "homepage_url", + "benefits": "TODO: this should be structured data" + }, 
+ "enduser": { + "name": "End User Supporter", + "amount": "0", + "selector": "", + "attr": "", + "benefits": "TODO: this should be structured data" + } + } + } +} \ No newline at end of file diff --git a/_sponsorships/sponsor_levels.json b/_sponsorships/sponsor_levels.json new file mode 100644 index 0000000..b28e29c --- /dev/null +++ b/_sponsorships/sponsor_levels.json @@ -0,0 +1,310 @@ +{ + "asf": [ + { + "startDate": "20240101", + "sponsorurl": "https://apache.org/foundation/sponsors", + "levelurl": "https://apache.org/foundation/sponsorship", + "normalize": "true", + "levels": { + "first": [ + "platinum", + "125000", + "#platinum li a", + "href" + ], + "second": [ + "gold", + "50000", + "#gold li a", + "href" + ], + "third": [ + "silver", + "25000", + "#silver li a", + "href" + ], + "fourth": [ + "bronze", + "6000", + "#bronze li a", + "href" + ], + "firstinkind": [ + "targeted platinum", + "125000", + "#targetedplatinum li a", + "href" + ], + "secondinkind": [ + "targeted gold", + "50000", + "#targetedgold li a", + "href" + ], + "thirdinkind": [ + "targeted silver", + "25000", + "#targetedsilver li a", + "href" + ], + "fourthinkind": [ + "targeted bronze", + "6000", + "#targetedbronze li a", + "href" + ] + } + } + ], + "numfocus": [ + { + "startDate": "20240101", + "sponsorurl": "https://numfocus.org/sponsors", + "levelurl": "https://numfocus.org/sponsors/become-a-sponsor", + "normalize": "true", + "levels": { + "first": [ + "principal", + "100000", + ".et_pb_section_1 .et_pb_row_1 div > a", + "href" + ], + "second": [ + "sustaining", + "50000", + ".et_pb_section_1 .et_pb_row_4 div > a", + "href" + ], + "third": [ + "contributing", + "10000", + ".et_pb_section_1 .et_pb_row_7 div > a", + "href" + ], + "community": [ + "platinum", + "1000", + ".et_pb_section_1 .et_pb_row_10 div > a", + "href" + ], + "startuppartners": [ + "startuppartners", + "0", + ".et_pb_section_1 .et_pb_row_13 a", + "href" + ], + "grants": [ + "grants", + "0", + ".et_pb_section_2 a", + 
"href" + ] + } + } + ], + "osgeo": [ + { + "startDate": "20240101", + "sponsorurl": "https://www.osgeo.org/sponsors/", + "levelurl": "https://www.osgeo.org/sponsors/", + "normalize": "true", + "levels": { + "first": [ + "diamond", + "30000", + ".Diamond-sponsors a", + "href" + ], + "second": [ + "platinum", + "20000", + ".Platinum-sponsors a", + "href" + ], + "third": [ + "gold", + "10000", + ".Gold-sponsors a", + "href" + ], + "fourth": [ + "silver", + "3000", + ".Silver-sponsors a", + "href" + ], + "fifth": [ + "bronze", + "500", + ".Bronze-sponsors a", + "href" + ] + } + } + ], + "drupal": [ + { + "startDate": "20240101", + "sponsorurl": "https://www.drupal.org/supporters/partners", + "levelurl": "https://www.drupal.org/association/supporters", + "levels": { + "first": [ + "enterprise", + "25000", + ".sponsors--signature a", + "href" + ], + "second": [ + "signature", + "15000", + ".view-display-id-attachment_6 a", + "href" + ], + "third": [ + "premium", + "7500", + ".view-display-id-attachment_3 a", + "href" + ], + "fourth": [ + "classic", + "2500", + ".view-display-id-attachment_1 a", + "href" + ], + "community": [ + "community", + "1000", + ".view-display-id-attachment_9 a", + "href" + ] + } + } + ], + "python": [ + { + "startDate": "20240101", + "sponsorurl": "https://www.python.org/psf/sponsors/", + "levelurl": "https://www.python.org/sponsors/application/", + "levels": { + "first": [ + "visionary", + "150000", + "div[title=\"visionary Sponsors\"] div[data-internal-year]", + "id" + ], + "second": [ + "sustainability", + "90000", + "div[title=\"sustainability Sponsors\"] div[data-internal-year]", + "id" + ], + "third": [ + "maintaining", + "60000", + "div[title=\"maintaining Sponsors\"] div[data-internal-year]", + "id" + ], + "fourth": [ + "contributing", + "30000", + "div[title=\"contributing Sponsors\"] div[data-internal-year]", + "id" + ], + "fifth": [ + "supporting", + "15000", + "div[title=\"supporting Sponsors\"] div[data-internal-year]", + "id" + ], + 
"sixth": [ + "partner", + "10000", + "div[title=\"partner Sponsors\"] div[data-internal-year]", + "id" + ], + "seventh": [ + "participating", + "3750", + "div[title=\"participating Sponsors\"] div[data-internal-year]", + "id" + ], + "eighth": [ + "associate", + "1500", + "div[title=\"associate Sponsors\"] div[data-internal-year]", + "id" + ] + } + } + ], + "owasp": [ + { + "startDate": "20240101", + "sponsorurl": "https://owasp.org/supporters/list", + "levelurl": "https://owasp.org/supporters/packages", + "normalize": "true", + "levels": { + "first": [ + "diamond", + "25000", + "section#sec-main ul:first-of-type a", + "href" + ], + "second": [ + "platinum", + "25000", + "section#sec-main ul:nth-of-type(2) a", + "href" + ], + "third": [ + "gold", + "15000", + "section#sec-main ul:nth-of-type(3) a", + "href" + ], + "fourth": [ + "silver", + "5000", + "section#sec-main ul:nth-of-type(4) a", + "href" + ] + } + } + ], + "lf": [ + { + "startDate": "20240101", + "sponsorurl": "https://lf-landscape.netlify.app/pages/members", + "levelurl": "https://www.linuxfoundation.org/hubfs/lf_member_benefits_122723a.pdf?hsLang=en", + "normalize": "true", + "levels": { + "first": [ + "platinum", + "500000", + "div[data-section-id=\"lf-members-platinum\"] > div", + "data-id" + ], + "second": [ + "gold", + "100000", + "div[data-section-id=\"lf-members-gold\"] > div", + "data-id" + ], + "third": [ + "silver", + "20000", + "div[data-section-id=\"lf-members-silver\"] > div", + "data-id" + ], + "fourth": [ + "associate", + "TBD", + "div[data-section-id=\"lf-members-platinum\"] > div", + "data-id" + ] + } + } + ] +} diff --git a/assets/ruby/sponsor_reports.rb b/assets/ruby/sponsor_reports.rb new file mode 100644 index 0000000..f0a105a --- /dev/null +++ b/assets/ruby/sponsor_reports.rb @@ -0,0 +1,64 @@ +#!/usr/bin/env ruby +module SponsorReports + DESCRIPTION = <<-HEREDOC + SponsorReports: Build simple reports from SponsorUtils data. + Default to run from project root directory. 
# Report total (approximate) cash outlay by sponsors across all orgs.
#
# Every sponsor listed at a level is counted at that level's full amount,
# both in the per-org totals and in the per-sponsor totals.
# TODO: map any non-hostnames intelligently
# TODO: discount /inkind/ levels by INKIND_DISCOUNT in the sponsor totals
#
# @param orglist [Hash] org => { level => [sponsor hostnames] }, the output
#   of sponsor_utils listing the sponsors scraped per org
# @param levels [Hash] org => [ { 'levels' => { level => [name, amount,
#   selector, attr] } } ]; only the first entry per org is consulted
# @return [Hash] 'orgtotal' => { org => { level => amount } } and
#   'sponsortotal' => { sponsor => amount }, ordered by descending amount
def sponsor_totals(orglist, levels)
  org_totals = {}
  # Keep the zero default for the whole accumulation; the result is only
  # converted to a plain, sorted hash once every org has been added.
  by_sponsor = Hash.new(0)
  orglist.each do |org, sponsors|
    org_totals[org] = {}
    sponsors.each do |lvl, ary|
      # Skip non-list entries (e.g. an 'error' message string from a parser)
      next unless ary.is_a?(Array)

      # Unknown orgs/levels and non-numeric amounts ("TBD") count as 0
      lvlamt = levels.dig(org, 0, 'levels', lvl, 1).to_i
      org_totals[org][lvl] = lvlamt * ary.size
      ary.each { |sponsorurl| by_sponsor[sponsorurl] += lvlamt }
    end
  end
  {
    'orgtotal' => org_totals,
    'sponsortotal' => by_sponsor.sort_by { |_url, amt| -amt }.to_h
  }
end
+ levelfile = 'sponsor_levels.json' + orgfile = 'sponsor_utils.json' + levels = JSON.parse(File.read(levelfile)) + orglist = JSON.parse(File.read(orgfile)) + report = sponsor_totals(orglist, levels) + File.open('sponsor_report.json', "w") do |f| + f.write(JSON.pretty_generate(report)) + end + end +end diff --git a/assets/ruby/sponsor_utils.rb b/assets/ruby/sponsor_utils.rb index 1cd0c52..529f4df 100644 --- a/assets/ruby/sponsor_utils.rb +++ b/assets/ruby/sponsor_utils.rb @@ -2,7 +2,7 @@ module SponsorUtils DESCRIPTION = <<-HEREDOC SponsorUtils: good-enough scrapers and detectors of FOSS sponsors. - See also: scraping rules and sponsorship levels in sponsor_listing.json + Default to run from project root directory. HEREDOC module_function require 'csv' @@ -11,27 +11,40 @@ module SponsorUtils require 'open-uri' require 'nokogiri' - SPONSOR_MAP = { # Good enough map for common sponsors - 'Microsoft' => 'https://microsoft.com', # TODO determine exact normalization rules for sponsor links - 'Google' => 'https://google.com' - } + # NOTE OWASP parsing css may be fragile; relies on nth-of-type + # TODO: Eclipse dom parsing: + # div.eclipsefdn-members-list ... a with href and title that has sponsor name + # Member page: div.member-detail a - # Custom parsing data - # Note OWASP parsing css may be fragile; relies on nth-of-type - DRUPAL_SPONSOR_PAGE = '.org-link a' # Sponsor is kept on a separate page + # Map all sponsorships to common-ish levels + # - Ordinals are cash sponsorships in order + # - inkind is services donations (i.e. 
# HACK: special cases that map several hostnames down to a single sponsor org.
# A key matches the hostname itself or any subdomain of it.
SPONSOR_HOST_ALIASES = {
  'opensource.google' => 'google.com',
  'opensource.google.com' => 'google.com',
  'techatbloomberg.com' => 'bloomberg.com',
  'opensource.twosigma.com' => 'twosigma.com',
  'opensource.salesforce.com' => 'salesforce.com'
}.freeze

# Return a normalized domain name for mapping to a single sponsor org.
#
# The href is trimmed and lowercased, a leading "www." is dropped from the
# hostname, and SPONSOR_HOST_ALIASES is applied to the hostname only (never
# to the path or query). A non-default port is kept as "host:port".
# TODO: consider removing ^cloud. from: google baidu tencent
# TODO: consider removing ^aws. from amazon ^azure. from microsoft
# TODO: consider removing ^group. from mercedes-benz
# TODO: consider removing ^en. from various urls
#
# @param href [String] sponsor link as scraped
# @return [String] a good enough normalized hostname; input that has no
#   hostname or is not a parseable URI is returned trimmed and lowercased,
#   so it stays obvious to whoever reads the output
def normalize_href(href)
  text = href.to_s.strip.downcase
  begin
    uri = URI(text)
  rescue URI::InvalidURIError
    return text
  end
  return text unless uri.host

  host = uri.host.sub(/\Awww\./, '')
  SPONSOR_HOST_ALIASES.each do |from, to|
    next unless host == from || host.end_with?(".#{from}")

    host = host.delete_suffix(from) + to
    break
  end
  port = uri.port
  port && port != uri.default_port ? "#{host}:#{port}" : host
end
# Read a CNCF style landscape for a sponsor list.
#
# The landscape category whose name equals orgmap['selector'] is located;
# each of its subcategories is matched by name against orgmap['levels'].
# Subcategories that match no level are collected together under ''.
#
# @param io [String, IO] YAML landscape to parse
# @param orgmap [Hash] sponsor level map of the organization; reads
#   'selector' and 'levels' => { level => { 'name' => ... } }
# @return [Hash] sponsors by approximate map-defined levels, or a single
#   'error' entry when the category (or its subcategories) is not found
def parse_landscape(io, orgmap)
  category = orgmap['selector']
  document = YAML.safe_load(io, aliases: true) || {}
  section = Array(document['landscape']).find { |h| category.eql?(h.fetch('name', '')) }
  groups = section && section['subcategories']
  return { 'error' => "ERROR: parse_landscape(... #{category}) not found" } unless groups

  groups.each_with_object({}) do |group, sponsors|
    level, = orgmap['levels'].find { |_lvl, h| group['name'].eql?(h.fetch('name', '')) }
    # Append rather than reset, so groups sharing a level do not overwrite each other
    bucket = (sponsors[level || ''] ||= [])
    Array(group['items']).each do |sponsor|
      bucket << normalize_href(sponsor.fetch('homepage_url', sponsor['name']))
    end
  end
end

# Cleanup sponsor lists that are IDs not domains.
#
# @param links [Hash] level => [detected sponsor ids]
# @param mapname [String] filename of the JSON id => domain mapping to read
# @return [Hash] level => [domain names]; ids missing from the map are kept as-is
def cleanup_with_map(links, mapname)
  map = JSON.parse(File.read(mapname))
  links.transform_values do |ary|
    ary.map { |itm| map.fetch(itm, itm) }
  end
end

DRUPAL_SPONSOR_CSS = '.org-link a' # Sponsor is kept on a separate page
DRUPAL_SPONSOR_URL = 'https://www.drupal.org'

# Cleanup drupal sponsor list, since each sponsor's link is kept on a separate page.
#
# Fetches DRUPAL_SPONSOR_URL + item for every detected link and reads the
# href of the first DRUPAL_SPONSOR_CSS node on that page.
#
# @param links [Hash] level => [detected sponsor links, relative to DRUPAL_SPONSOR_URL]
# @return [Hash] level => [domain names]; an item is kept unchanged when its
#   page has no sponsor link or cannot be fetched or parsed
def cleanup_drupal(links)
  links.transform_values do |ary|
    ary.map do |itm|
      # Block form closes the connection/tempfile as soon as the body is read
      html = URI.open("#{DRUPAL_SPONSOR_URL}#{itm}", &:read)
      node = Nokogiri::HTML5(html).at_css(DRUPAL_SPONSOR_CSS)
      node ? normalize_href(node['href']) : itm
    rescue StandardError => e
      puts "ERROR: cleanup_drupal(...#{itm}): #{e.message}\n\n#{e.backtrace.join("\n\t")}"
      itm # HACK: leave as-is, will be obvious to reader
    end
  end
end
| counts[lvl] = Hash.new(0) end counts['all'] = Hash.new(0) - # Iterate each sponsor and all levels data sponsors.each do | org, sponsorhash | sponsorhash.each do | level, ary | ary.each do | url | @@ -90,7 +160,6 @@ def report_counts(sponsors) end end end - # Order for convenience counts['all'] = Hash[counts['all'].sort_by { |k, v| -v }] return counts end @@ -98,6 +167,17 @@ def report_counts(sponsors) # ### #### ##### ###### # Main method for command line use if __FILE__ == $PROGRAM_NAME + # TODO: default dir? Command line params? Load each sponsor level by org? + orgmap = JSON.parse(File.read('_sponsors/cncf.json')) + io = File.read('../../../../f/cncf-landscape/landscape.yml') # TODO parse url from the json + orgmap = orgmap['20240101'] # HACK: select current one TODO allow different dates/versions + sponsors = parse_landscape(io, orgmap) + File.open('cncfout.json', "w") do |f| + f.write(JSON.pretty_generate(sponsors)) + end + puts "DEBUG - done testing parse_landscape" + exit 1 + infile = 'sponsor_levels.json' outfile = 'sponsor_utils.json' io = nil @@ -105,7 +185,7 @@ def report_counts(sponsors) maps = JSON.parse(File.read(infile)) maps.each do | org, map | map = map[0] # HACK: just use first map on list; by date for future use historical scans - if false + if true filename = "../../../sponsors-#{org}.html" baseurl = '' io = File.open(filename) @@ -119,12 +199,21 @@ def report_counts(sponsors) end end sponsors[org] = SponsorUtils.scrape_bycss(io, map) + case org + when 'python' + sponsors[org] = cleanup_with_map(sponsors[org], 'python_map.json') + when 'drupal' + sponsors[org] = cleanup_drupal(sponsors[org]) + when 'lf' + sponsors[org] = cleanup_with_map(sponsors[org], 'lf_map.json') + else + # No-op + end end File.open(outfile, "w") do |f| f.write(JSON.pretty_generate(sponsors)) end counts = report_counts(sponsors) - puts JSON.pretty_generate(counts) File.open('sponsor_metacount.json', "w") do |f| f.write(JSON.pretty_generate(counts)) end