edgi-govdata-archiving
diff --git a/‎app/controllers/api/v0/pages_controller.rb
Lines changed: 13 additions & 3 deletions b/‎app/controllers/api/v0/pages_controller.rb
Lines changed: 13 additions & 3 deletions
diff --git a/‎app/controllers/api/v0/urls_controller.rb
Lines changed: 76 additions & 0 deletions b/‎app/controllers/api/v0/urls_controller.rb
Lines changed: 76 additions & 0 deletions
diff --git a/‎app/jobs/import_versions_job.rb
Lines changed: 18 additions & 1 deletion b/‎app/jobs/import_versions_job.rb
Lines changed: 18 additions & 1 deletion
diff --git a/‎app/models/concerns/taggable.rb
Lines changed: 2 additions & 2 deletions b/‎app/models/concerns/taggable.rb
Lines changed: 2 additions & 2 deletions
diff --git a/‎app/models/merged_page.rb
Lines changed: 14 additions & 0 deletions b/‎app/models/merged_page.rb
Lines changed: 14 additions & 0 deletions
diff --git a/‎app/models/page.rb
Lines changed: 148 additions & 11 deletions b/‎app/models/page.rb
Lines changed: 148 additions & 11 deletions
@@ -71,7 +71,16 @@ def index
   end
 
   def show
-    page = Page.find(params[:id])
+    begin
+      page = Page.find(params[:id])
+    rescue ActiveRecord::RecordNotFound
+      merge = MergedPage.find(params[:id])
+      redirect_to(
+        api_v0_page_url(merge.target_uuid),
+        status: :permanent_redirect
+      ) and return
+    end
+
     data = page.as_json(include: [:maintainers, :tags])
     if should_allow_versions
       data['versions'] = page.versions.where(different: true).as_json
@@ -155,11 +164,12 @@ def page_collection
 
     if params[:url]
       query = params[:url]
+      collection = collection.joins(:urls)
       if query.include? '*'
         query = query.gsub('%', '\%').gsub('_', '\_').tr('*', '%')
-        collection = collection.where('url LIKE ?', query)
+        collection = collection.where('page_urls.url LIKE ?', query)
       else
-        collection = collection.where(url: query)
+        collection = collection.where('page_urls.url = ?', query)
       end
     end
 
 
@@ -0,0 +1,76 @@
+class Api::V0::UrlsController < Api::V0::ApiController
+  def index
+    urls = page.urls.order('page_urls.to_time DESC')
+
+    render json: {
+      links: { page: api_v0_page_url(page) },
+      data: urls
+    }
+  end
+
+  def show
+    @page_url ||= page.urls.find(params[:id])
+    render json: {
+      links: {
+        page: api_v0_page_url(page),
+        page_urls: api_v0_page_urls_url(page)
+      },
+      data: @page_url
+    }
+  end
+
+  def create
+    @page_url = page.urls.create!(url_params)
+    show
+  rescue ActiveRecord::RecordNotUnique
+    raise Api::ResourceExistsError, 'This page already has the given URL and timeframe'
+  end
+
+  def update
+    updates = url_params
+    if updates.key?(:url)
+      raise Api::UnprocessableError, 'You cannot change a URL\'s `url`'
+    end
+
+    @page_url ||= page.urls.find(params[:id])
+    @page_url.update(url_params)
+    show
+  end
+
+  def destroy
+    @page_url ||= page.urls.find(params[:id])
+    # You cannot delete the canonical URL.
+    if @page_url.url == page.url
+      raise Api::UnprocessableError, 'You cannot remove the page\'s canonical URL'
+    else
+      @page_url.destroy
+      redirect_to(api_v0_page_urls_url(page))
+    end
+  end
+
+  protected
+
+  def page
+    @page ||= Page.find(params[:page_id])
+  end
+
+  def url_params
+    result = params
+      .require(:page_url)
+      .permit(:url, :from_time, :to_time, :notes)
+
+    result.slice('from_time', 'to_time').each do |key, value|
+      result[key] = parse_time(key, value)
+    end
+
+    result
+  end
+
+  def parse_time(field, time_input)
+    return if time_input.nil?
+
+    Time.parse(time_input)
+  rescue ArgumentError
+    raise Api::UnprocessableError, "`#{field}` was not a valid time or `null`"
+  end
+end
@@ -173,10 +173,13 @@ def page_for_record(record, create: true, row:)
     validate_kind!([String], record, 'page_url')
     validate_kind!([Array, NilClass], record, 'page_maintainers')
     validate_kind!([Array, NilClass], record, 'page_tags')
+    validate_present!(record, 'capture_time')
+    validate_kind!([String], record, 'capture_time')
 
     url = record['page_url']
 
-    existing_page = Page.find_by_url(url)
+    capture_time = Time.parse(record['capture_time'])
+    existing_page = Page.find_by_url(url, at_time: capture_time)
     page = if existing_page
              log(object: existing_page, operation: :found, row: row)
              existing_page
@@ -191,6 +194,20 @@ def page_for_record(record, create: true, row:)
     (record['page_maintainers'] || []).each {|name| page.add_maintainer(name)}
     (record['page_tags'] || []).each {|name| page.add_tag(name)}
 
+    # If the page was not an *exact* URL match, add the URL to the page.
+    # (`Page.find_by_url`, used above, will match by `url_key`, too.)
+    page.urls.find_or_create_by(url: url)
+    # TODO: Add URLs from redirects automatically. The main blocker for
+    # this at the moment is the following situation:
+    #
+    #   Two pages, A=https://example.com/about
+    #              B=https://example.com/about/locations
+    #   Page B is removed, but instead of returning a 404 or 403 status
+    #     code, it starts redirecting to Page A.
+    #
+    # This is unfortunately not uncommon, so we need some heuristics to
+    # account for it, e.g. URL does not already belong to another page.
+
     page
   end
 
 
@@ -2,8 +2,8 @@ module Taggable
   extend ActiveSupport::Concern
 
   included do
-    has_many :taggings, as: :taggable, foreign_key: 'taggable_uuid'
-    has_many :tags, through: :taggings
+    has_many :taggings, as: :taggable, foreign_key: 'taggable_uuid', dependent: :delete_all
+    has_many :tags, through: :taggings, dependent: nil
   end
 
   def add_tag(tag)
 
@@ -0,0 +1,14 @@
+# MergedPage keeps track of pages that were merged into others so we can
+# support old links by redirecting to the page they were merged into.
+# - The primary key is the ID of the page that was merged and removed
+# - `target_uuid` is the ID of the page it was merged into
+# - `audit_data` is any useful JSON data about the page (usually a frozen
+#   copy of its attributes).
+class MergedPage < ApplicationRecord
+  include UuidPrimaryKey
+
+  belongs_to :target,
+             class_name: 'Page',
+             foreign_key: :target_uuid,
+             required: true
+end
@@ -15,7 +15,11 @@ class Page < ApplicationRecord
   has_many :versions,
            -> { order(capture_time: :desc) },
            foreign_key: 'page_uuid',
-           inverse_of: :page
+           inverse_of: :page,
+           # You must explicitly dissociate or move versions before destroying.
+           # It's OK for a version to be orphaned from all pages, but we want
+           # to make sure that's an intentional action and not accidental.
+           dependent: :restrict_with_error
   has_one :earliest,
           (lambda do
             # DISTINCT ON requires the first ORDER to be the distinct column(s)
@@ -42,9 +46,18 @@ class Page < ApplicationRecord
           class_name: 'Version'
   # This needs a funky name because `changes` is a an activerecord method
   has_many :tracked_changes, through: :versions
+  has_many :urls,
+           class_name: 'PageUrl',
+           foreign_key: 'page_uuid',
+           inverse_of: :page,
+           dependent: :delete_all
+  has_many :current_urls,
+           -> { current },
+           class_name: 'PageUrl',
+           foreign_key: 'page_uuid'
 
-  has_many :maintainerships, foreign_key: :page_uuid
-  has_many :maintainers, through: :maintainerships
+  has_many :maintainerships, foreign_key: :page_uuid, dependent: :delete_all
+  has_many :maintainers, through: :maintainerships, dependent: nil
 
   scope(:needing_status_update, lambda do
     # NOTE: pages.status can be NULL, so use DISTINCT FROM instead of <>/!= to compare.
@@ -56,14 +69,30 @@ class Page < ApplicationRecord
   before_create :ensure_url_key
   after_create :ensure_domain_and_news_tags
   before_save :normalize_url
+  after_save :ensure_page_urls
   validate :url_must_have_domain
   validates :status,
             allow_nil: true,
             inclusion: { in: 100...600, message: 'is not between 100 and 599' }
 
-  def self.find_by_url(raw_url)
+  def self.find_by_url(raw_url, at_time: nil)
     url = normalize_url(raw_url)
-    Page.find_by(url: url) || Page.find_by(url_key: create_url_key(url))
+
+    current = PageUrl.eager_load(:page).current(at_time)
+    found = current.find_by(url: url)
+    return found.page if found
+
+    key = PageUrl.create_url_key(url)
+    found = current.find_by(url_key: key)
+    return found.page if found
+
+    with_pages = PageUrl.eager_load(:page).order(to_time: :desc)
+    found = with_pages.find_by(url: url) ||
+            with_pages.find_by(url_key: key)
+    return found.page if found
+
+    # TODO: remove this fallback when data is migrated over to Page.urls.
+    Page.find_by(url: url) || Page.find_by(url_key: key)
   end
 
   def self.normalize_url(url)
@@ -76,10 +105,6 @@ def self.normalize_url(url)
     end
   end
 
-  def self.create_url_key(url)
-    Surt.surt(url)
-  end
-
   def add_maintainer(maintainer)
     unless maintainer.is_a?(Maintainer)
       maintainer = Maintainer.find_or_create_by(name: maintainer)
@@ -132,7 +157,7 @@ def as_json(options = {})
   end
 
   def update_url_key
-    update(url_key: Page.create_url_key(url))
+    update(url_key: PageUrl.create_url_key(url))
   end
 
   def ensure_domain_and_news_tags
@@ -141,20 +166,89 @@ def ensure_domain_and_news_tags
     self.add_tag('news') if news?
   end
 
+  # Keep page creation relatively simple by automatically creating a PageUrl
+  # for the page's current URL when creating a page. (Page#url is the current
+  # canonical Url of the page, the true list of URLs associated with the page
+  # should always be the list of PageUrls in Page#urls).
+  def ensure_page_urls
+    urls.find_or_create_by!(url: url) if saved_change_to_attribute?('url')
+  end
+
   def update_status
     new_status = calculate_status
     self.update(status: new_status) unless new_status.zero?
     self.status
   end
 
+  # Merges one or more other pages into this page and removes them.
+  #
+  # Everything happens inside a single transaction. For each page given:
+  # - its versions are moved to this page,
+  # - its tags, maintainers, and URLs are copied to this page,
+  # - a MergedPage record is created under the old page's ID, pointing here,
+  # - existing MergedPage records that pointed at the old page are
+  #   re-pointed at this page, and
+  # - the old page is destroyed.
+  # Afterwards this page's title and its versions' `different` flags are
+  # recalculated, starting from the earliest capture time among the versions
+  # that were moved.
+  #
+  # `create!` and `destroy!` raise on failure, which aborts the transaction.
+  #
+  # NOTE(review): nothing here guards against `other_pages` containing this
+  # page itself -- confirm callers never do that.
+  #
+  # @param other_pages [Array<Page>] the pages to merge into this one.
+  def merge(*other_pages)
+    Page.transaction do
+      # Earliest capture time of any version moved during this merge.
+      first_version_time = nil
+      other_pages.each do |other|
+        # Snapshot the other page's state before anything is moved off of it.
+        audit_data = other.create_audit_record
+
+        # Move versions from other page.
+        # (Iterates a materialized array rather than the live association,
+        # presumably because each `<<` changes which page a version is on.)
+        other.versions.to_a.each do |version|
+          self.versions << version
+          if first_version_time.nil? || first_version_time > version.capture_time
+            first_version_time = version.capture_time
+          end
+        end
+        # The above doesn't update the source page's `versions`, so reload.
+        other.versions.reload
+
+        # Copy other attributes from other page.
+        other.tags.each {|tag| add_tag(tag)}
+        other.maintainers.each {|maintainer| add_maintainer(maintainer)}
+        other.urls.each do |page_url|
+          # TODO: it would be slightly nicer to collapse/merge PageUrls with
+          # overlapping or intersecting time ranges here.
+          # NOTE: we have to be careful not to trip the DB's uniqueness
+          # constraints here or we hose the transaction.
+          # The block below only runs when a new record is being created, so
+          # `just_created` tells us whether this page already had the URL.
+          just_created = false
+          merged_url = urls.find_or_create_by(
+            url: page_url.url,
+            from_time: page_url.from_time,
+            to_time: page_url.to_time
+          ) do |new_url|
+            new_url.notes = page_url.notes
+            just_created = true
+          end
+
+          # This page already had the same URL and timeframe: keep both sets
+          # of notes by joining whichever are present with a space.
+          unless just_created
+            new_notes = [merged_url.notes, page_url.notes].compact.join(' ')
+            merged_url.update(notes: new_notes)
+          end
+        end
+
+        # Keep a record so we can redirect requests for the merged page.
+        # Delete the actual page record rather than keep it around so we don't
+        # have to worry about messy partial indexes and querying around URLs.
+        MergedPage.create!(uuid: other.uuid, target: self, audit_data: audit_data)
+        # If the page we're removing was previously a merge target, update
+        # its references.
+        MergedPage.where(target_uuid: other.uuid).update_all(target_uuid: self.uuid)
+        # And finally drop the merged page.
+        # (`versions` is declared `dependent: :restrict_with_error`, so this
+        # presumably relies on the versions having been moved off above.)
+        other.destroy!
+
+        audit_json = Oj.dump(audit_data, mode: :rails)
+        Rails.logger.info("Merged page #{other.uuid} into #{uuid}. Old data: #{audit_json}")
+      end
+
+      # Recalculate denormalized attributes
+      update_page_title(first_version_time)
+      update_versions_different(first_version_time)
+      # TODO: it might be neat to clean up overlapping URL timeframes
+    end
+  end
+
   protected
 
   def news?
     url.include?('/news') || url.include?('/blog') || url.include?('/press')
   end
 
   def ensure_url_key
-    self.url_key ||= Page.create_url_key(url)
+    self.url_key ||= PageUrl.create_url_key(url)
   end
 
   def normalize_url
@@ -211,4 +305,47 @@ def calculate_status(relative_to: nil)
     success_rate = 1 - (error_time.to_f / total_time)
     success_rate < STATUS_SUCCESS_THRESHOLD ? latest_error : 200
   end
+
+  def update_page_title(from_time = nil)
+    candidates = versions.reorder(capture_time: :desc)
+    candidates = candidates.where('capture_time >= ?', from_time) if from_time
+    candidates.each do |version|
+      break if version.sync_page_title
+    end
+  end
+
+  # TODO: figure out whether there's a reasonable way to merge this logic with
+  # `Version#update_different_attribute`.
+  def update_versions_different(from_time)
+    previous_hash = nil
+    candidates = versions
+      .where('capture_time >= ?', from_time)
+      .reorder(capture_time: :asc)
+
+    candidates.each do |version|
+      if previous_hash.nil?
+        previous_hash = version.previous(different: false).try(:version_hash)
+      end
+
+      version.update(different: version.version_hash != previous_hash)
+      previous_hash = version.version_hash
+    end
+  end
+
+  # Creates a hash representing the current state of a page. Used for logging
+  # and other audit related purposes when deleting/merging pages.
+  def create_audit_record
+    # URLs are entirely unique to the page, so have to be recorded
+    # completely rather than referenced by ID.
+    urls_data = urls.collect do |page_url|
+      page_url.attributes.slice('url', 'from_time', 'to_time', 'notes')
+    end
+
+    attributes.merge({
+      'tags' => tags.collect(&:name),
+      'maintainers' => maintainers.collect(&:name),
+      'versions' => versions.collect(&:uuid),
+      'urls' => urls_data
+    })
+  end
 end