diff --git a/.circleci/config.yml b/.circleci/config.yml index e47bba6b..d09b8040 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,6 +38,15 @@ jobs: - run: name: Run tests command: bundle exec rake + test_3_2: + docker: + - image: cimg/ruby:3.2.1 + steps: + - checkout + - ruby/install-deps + - run: + name: Run tests + command: bundle exec rake workflows: version: 2 deploy: @@ -45,4 +54,5 @@ workflows: - test_2_6 - test_2_7 - test_3_0 - - test_3_1 \ No newline at end of file + - test_3_1 + - test_3_2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bb626da..f1af9f05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,15 @@ # MetaInpector Changelog -## [Changes in 5.13.0](https://github.com/metainspector/metainspector/compare/v5.12.1...v5.13.0) +## [Changes in 5.13.0](https://github.com/jaimeiniesta/metainspector/compare/v5.12.1...v5.13.0) * Remove support for #feed that was deprecated in 5.9 * Add support for Ruby 3.1 -## [Changes in 5.12.1](https://github.com/metainspector/metainspector/compare/v5.12.0...v5.12.1) +## [Changes in 5.12.1](https://github.com/jaimeiniesta/metainspector/compare/v5.12.0...v5.12.1) * Update dependencies: rubocop, nokogiri -## [Changes in 5.12.0](https://github.com/metainspector/metainspector/compare/v5.11.2...v5.12.0) +## [Changes in 5.12.0](https://github.com/jaimeiniesta/metainspector/compare/v5.11.2...v5.12.0) * Support Ruby 3.0 diff --git a/Gemfile.lock b/Gemfile.lock index 260b455d..bbd3ef3a 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,14 +1,14 @@ PATH remote: . 
specs: - metainspector (5.14.0) - addressable (~> 2.8) + metainspector (5.15.0) + addressable (~> 2.8.4) faraday (~> 2.5) faraday-cookie_jar (~> 0.0) faraday-encoding (~> 0.0) faraday-follow_redirects (~> 0.3) faraday-gzip (>= 0.1, < 2.0) - faraday-http-cache (~> 2.4) + faraday-http-cache (~> 2.5) faraday-retry (~> 2.0) fastimage (~> 2.2) nesty (~> 1.0) @@ -17,7 +17,7 @@ PATH GEM remote: http://rubygems.org/ specs: - addressable (2.8.1) + addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) ast (2.4.2) awesome_print (1.9.2) @@ -40,10 +40,10 @@ GEM faraday-gzip (1.0.0) faraday (>= 1.0) zlib (~> 2.1) - faraday-http-cache (2.4.1) + faraday-http-cache (2.5.0) faraday (>= 0.8) faraday-net_http (3.0.2) - faraday-retry (2.0.0) + faraday-retry (2.1.0) faraday (~> 2.0) fastimage (2.2.6) hashdiff (1.0.1) @@ -51,22 +51,33 @@ GEM domain_name (~> 0.5) json (2.6.3) method_source (1.0.0) - mini_portile2 (2.8.1) + mini_portile2 (2.8.2) + mustermann (3.0.0) + ruby2_keywords (~> 0.0.1) nesty (1.0.2) - nokogiri (1.14.2) + nio4r (2.5.9) + nokogiri (1.14.4) mini_portile2 (~> 2.8.0) racc (~> 1.4) - parallel (1.22.1) - parser (3.2.1.0) + nokogiri (1.14.4-arm64-darwin) + racc (~> 1.4) + parallel (1.23.0) + parser (3.2.2.1) ast (~> 2.4.1) pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) public_suffix (5.0.1) + puma (6.2.2) + nio4r (~> 2.0) racc (1.6.2) + rack (2.2.7) + rack-protection (3.0.6) + rack rainbow (3.1.1) rake (13.0.6) - regexp_parser (2.7.0) + regexp_parser (2.8.0) + resolv (0.2.2) rexml (3.2.5) rspec (3.12.0) rspec-core (~> 3.12.0) @@ -81,20 +92,26 @@ GEM diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.12.0) rspec-support (3.12.0) - rubocop (1.46.0) + rubocop (1.51.0) json (~> 2.3) parallel (~> 1.10) parser (>= 3.2.0.0) rainbow (>= 2.2.2, < 4.0) regexp_parser (>= 1.8, < 3.0) rexml (>= 3.2.5, < 4.0) - rubocop-ast (>= 1.26.0, < 2.0) + rubocop-ast (>= 1.28.0, < 2.0) ruby-progressbar (~> 1.7) unicode-display_width (>= 2.4.0, < 3.0) - rubocop-ast (1.26.0) + rubocop-ast (1.28.1) 
parser (>= 3.2.1.0) - ruby-progressbar (1.11.0) + ruby-progressbar (1.13.0) ruby2_keywords (0.0.5) + sinatra (3.0.6) + mustermann (~> 3.0) + rack (~> 2.2, >= 2.2.4) + rack-protection (= 3.0.6) + tilt (~> 2.0) + tilt (2.1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.2) @@ -106,15 +123,19 @@ GEM zlib (2.1.1) PLATFORMS - ruby + arm64-darwin-22 + x86_64-linux DEPENDENCIES awesome_print (~> 1.9) metainspector! pry (~> 0.14) + puma (~> 6.2.2) rake (~> 13.0) + resolv (~> 0.2.2) rspec (~> 3.11) rubocop (~> 1.34) + sinatra (~> 3.0.6) webmock (~> 3.17) BUNDLED WITH diff --git a/README.md b/README.md index 8d5b92bf..71b9fc7c 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,10 @@ # MetaInspector -[![Gem Version](https://badge.fury.io/rb/metainspector.svg)](http://badge.fury.io/rb/metainspector) [![CircleCI](https://circleci.com/gh/metainspector/metainspector.svg?style=svg)](https://circleci.com/gh/metainspector/metainspector) [![Code Climate](https://codeclimate.com/github/jaimeiniesta/metainspector/badges/gpa.svg)](https://codeclimate.com/github/jaimeiniesta/metainspector) [![Mentioned in Awesome Ruby](https://awesome.re/mentioned-badge.svg)](https://github.com/markets/awesome-ruby) +[![Gem Version](https://badge.fury.io/rb/metainspector.svg)](http://badge.fury.io/rb/metainspector) [![CircleCI](https://circleci.com/gh/jaimeiniesta/metainspector.svg?style=svg)](https://circleci.com/gh/jaimeiniesta/metainspector) [![Code Climate](https://codeclimate.com/github/jaimeiniesta/metainspector/badges/gpa.svg)](https://codeclimate.com/github/jaimeiniesta/metainspector) [![Mentioned in Awesome Ruby](https://awesome.re/mentioned-badge.svg)](https://github.com/markets/awesome-ruby) MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags... -## See it in action! 
- -You can try MetaInspector using this little demo: [https://github.com/metainspector/metainspectordemo](https://github.com/metainspector/metainspectordemo) - ## Installation Install the gem from RubyGems: @@ -309,6 +305,27 @@ If you want to disallow redirects, you can do it like this: page = MetaInspector.new('facebook.com', :allow_redirections => false) ``` +You can also customize how many redirects you wish to allow: + +```ruby +page = MetaInspector.new('facebook.com', :faraday_options => { redirect: { limit: 5 } }) +``` + +And even customize what to do in between each redirect: + +```ruby +callback = proc do |previous_response, next_request| +  ip_address = Resolv.getaddress(next_request.url.host) + raise 'Invalid address' if IPAddr.new(ip_address).private? +end + +page = MetaInspector.new(url, faraday_options: { redirect: { callback: callback } }) +``` + + +The `faraday_options[:redirect]` hash is passed to the `FollowRedirects` middleware used by `Faraday`, so that we can use all available options. +Check them [here](https://github.com/lostisland/faraday_middleware/blob/main/lib/faraday_middleware/response/follow_redirects.rb#L44). + ### Headers By default, the following headers are set: diff --git a/examples/faraday_redirect_options.rb b/examples/faraday_redirect_options.rb new file mode 100644 index 00000000..8229d085 --- /dev/null +++ b/examples/faraday_redirect_options.rb @@ -0,0 +1,42 @@ +# A MetaInspector example that runs a callback in between redirects. +# The callback raises an exception if the redirection points to a URL that resolves into a private IP address. +# Following such a redirect is one way a known security exploit called server-side request forgery (SSRF) is triggered; the callback is there to block it. +# +# To properly run this example you need a server which redirects to a service like nip.io. +# The easiest way to achieve that is running the examples/redirect_web_server.rb server in one terminal window, +# and calling its address with this example in another terminal window. 
+# +# Usage example: +# In terminal #1: +# ruby redirect_web_server.rb +# +# In terminal #2: +# ruby faraday_redirect_options.rb http://127.0.0.1:4567 + +require 'resolv' +require '../lib/metainspector' +puts "Using MetaInspector #{MetaInspector::VERSION}" + +# Get the starting URL +url = ARGV[0] || (puts "Enter an url"; gets.strip) + +# redirect options to be passed along to Faraday::FollowRedirects::Middleware +redirects_opts = { + limit: 5, + callback: proc do |_old_response, new_response| + ip_address = Resolv.getaddress(new_response.url.host) + raise 'Invalid address' if IPAddr.new(ip_address).private? + end +} + +begin + page = MetaInspector.new(url, faraday_options: { redirect: redirects_opts }) +rescue StandardError => e + puts e.message +else + puts "\nScraping #{page.url} returned these results:" + puts "\nTITLE: #{page.title}" + + puts "\nto_hash..." + puts page.to_hash +end diff --git a/examples/redirect_web_server.rb b/examples/redirect_web_server.rb new file mode 100644 index 00000000..7077e41c --- /dev/null +++ b/examples/redirect_web_server.rb @@ -0,0 +1,5 @@ +require 'sinatra' + +get '/' do + redirect 'http://10.0.0.0.nip.io/' +end diff --git a/lib/meta_inspector/request.rb b/lib/meta_inspector/request.rb index 5b5d62cf..fc0294fb 100644 --- a/lib/meta_inspector/request.rb +++ b/lib/meta_inspector/request.rb @@ -58,6 +58,7 @@ def response def fetch Timeout::timeout(fatal_timeout) do @faraday_options.merge!(:url => url) + follow_redirects_options = @faraday_options.delete(:redirect) || {} session = Faraday.new(@faraday_options) do |faraday| faraday.request :retry, max: @retries @@ -65,7 +66,8 @@ def fetch faraday.request :gzip if @allow_redirections - faraday.use Faraday::FollowRedirects::Middleware, limit: 10 + follow_redirects_options[:limit] ||= 10 + faraday.use Faraday::FollowRedirects::Middleware, **follow_redirects_options faraday.use :cookie_jar end @@ -84,7 +86,9 @@ def fetch req.options.open_timeout = @read_timeout end - @url.url = 
response.env.url.to_s + if @allow_redirections + @url.url = response.env.url.to_s + end response end diff --git a/lib/meta_inspector/version.rb b/lib/meta_inspector/version.rb index 0d16a172..e133cde9 100644 --- a/lib/meta_inspector/version.rb +++ b/lib/meta_inspector/version.rb @@ -1,3 +1,3 @@ module MetaInspector - VERSION = '5.14.0' + VERSION = '5.15.0' end diff --git a/meta_inspector.gemspec b/meta_inspector.gemspec index c435f343..3be87c28 100644 --- a/meta_inspector.gemspec +++ b/meta_inspector.gemspec @@ -5,7 +5,7 @@ Gem::Specification.new do |gem| gem.email = "jaimeiniesta@gmail.com" gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...} gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL} - gem.homepage = "https://github.com/metainspector/metainspector" + gem.homepage = "https://github.com/jaimeiniesta/metainspector" gem.license = "MIT" gem.files = `git ls-files`.split("\n") @@ -20,9 +20,9 @@ Gem::Specification.new do |gem| gem.add_dependency 'faraday-encoding', '~> 0.0' gem.add_dependency 'faraday-follow_redirects', '~> 0.3' gem.add_dependency 'faraday-gzip', '>= 0.1', '< 2.0' - gem.add_dependency 'faraday-http-cache', '~> 2.4' + gem.add_dependency 'faraday-http-cache', '~> 2.5' gem.add_dependency 'faraday-retry', '~> 2.0' - gem.add_dependency 'addressable', '~> 2.8' + gem.add_dependency 'addressable', '~> 2.8.4' gem.add_dependency 'fastimage', '~> 2.2' gem.add_dependency 'nesty', '~> 1.0' @@ -31,5 +31,8 @@ Gem::Specification.new do |gem| gem.add_development_dependency 'awesome_print', '~> 1.9' gem.add_development_dependency 'rake', '~> 13.0' gem.add_development_dependency 'pry', '~> 0.14' + gem.add_development_dependency 'puma', '~> 6.2.2' gem.add_development_dependency 'rubocop', '~> 1.34' + gem.add_development_dependency 'resolv', '~> 0.2.2' + gem.add_development_dependency 'sinatra', '~> 3.0.6' end diff --git a/spec/document_spec.rb 
b/spec/document_spec.rb index ea3569ad..e39064d1 100644 --- a/spec/document_spec.rb +++ b/spec/document_spec.rb @@ -130,11 +130,11 @@ describe 'url normalization' do it 'should normalize by default' do - expect(MetaInspector.new('http://example.com/%EF%BD%9E').url).to eq('http://example.com/~') + expect(MetaInspector.new('http://example.com?name=joe martins', allow_redirections: false).url).to eq('http://example.com/?name=joe%20martins') end it 'should not normalize if the normalize_url option is false' do - expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E') + expect(MetaInspector.new('http://example.com?name=joe martins', normalize_url: false, allow_redirections: false).url).to eq('http://example.com?name=joe martins') end end diff --git a/spec/meta_inspector/links_spec.rb b/spec/meta_inspector/links_spec.rb index 14522947..9b4238da 100644 --- a/spec/meta_inspector/links_spec.rb +++ b/spec/meta_inspector/links_spec.rb @@ -108,7 +108,7 @@ it "should handle links that have an invalid byte sequence" do m = MetaInspector.new('http://example.com/invalid_byte_seq') - expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://element%B3wgarderoby.com/", "http://twitter.com/pagerankalert"]) + expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://twitter.com/pagerankalert"]) end end diff --git a/spec/meta_inspector/redirections_spec.rb b/spec/meta_inspector/redirections_spec.rb index 9b193684..35f006d3 100644 --- a/spec/meta_inspector/redirections_spec.rb +++ b/spec/meta_inspector/redirections_spec.rb @@ -1,4 +1,7 @@ require 'spec_helper' +require 'resolv' + +class PrivateIPAddressError < StandardError; end describe MetaInspector do describe "redirections" do @@ -47,6 +50,25 @@ expect(page.url).to eq("http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1") end end + + context "when there is a callback to be ran between redirects that blocks 
redirections to private IP addresses" do + it "raises an exception" do + stub_request(:get, "https://www.facebook.com/") + .to_return(:status => 302, + :headers => { "Location" => "http://10.0.0.0/" }) + + redirect_options = { + callback: proc do |_previous_response, next_request| + ip_address = Resolv.getaddress(next_request.url.host) + raise PrivateIPAddressError if IPAddr.new(ip_address).private? + end + } + + expect { + MetaInspector.new("https://www.facebook.com/", faraday_options: { redirect: redirect_options }) + }.to raise_error PrivateIPAddressError + end + end end private diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index e18cf07b..0d46735e 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -50,7 +50,8 @@ def fixture_file(filename) stub_request(:get, "http://example.com/author_in_body").to_return(fixture_file("author_in_body.response")) stub_request(:get, "http://example.com/author_in_link").to_return(fixture_file("author_in_link.response")) stub_request(:get, "http://example.com/author_in_twitter").to_return(fixture_file("author_in_twitter.response")) - stub_request(:get, "http://example.com/~").to_return(fixture_file("example.response")) + stub_request(:get, "http://example.com/?name=joe martins").to_return(fixture_file("example.response")) + stub_request(:get, "http://example.com/?name=joe+martins").to_return(fixture_file("example.response")) stub_request(:get, "http://facebook.com/").to_return(fixture_file("facebook.com.response")) stub_request(:get, "http://international.com").to_return(fixture_file("international.response")) stub_request(:get, "http://pagerankalert-shortcut-and-icon.com").to_return(fixture_file("pagerankalert-shortcut-and-icon.com.response"))