@@ -15,7 +15,11 @@ class Page < ApplicationRecord
15
15
has_many :versions ,
16
16
-> { order ( capture_time : :desc ) } ,
17
17
foreign_key : 'page_uuid' ,
18
- inverse_of : :page
18
+ inverse_of : :page ,
19
+ # You must explicitly dissociate or move versions before destroying.
20
+ # It's OK for a version to be orphaned from all pages, but we want
21
+ # to make sure that's an intentional action and not accidental.
22
+ dependent : :restrict_with_error
19
23
has_one :earliest ,
20
24
( lambda do
21
25
# DISTINCT ON requires the first ORDER to be the distinct column(s)
@@ -42,9 +46,18 @@ class Page < ApplicationRecord
42
46
class_name : 'Version'
43
47
# This needs a funky name because `changes` is an ActiveRecord method
44
48
has_many :tracked_changes , through : :versions
49
+ has_many :urls ,
50
+ class_name : 'PageUrl' ,
51
+ foreign_key : 'page_uuid' ,
52
+ inverse_of : :page ,
53
+ dependent : :delete_all
54
+ has_many :current_urls ,
55
+ -> { current } ,
56
+ class_name : 'PageUrl' ,
57
+ foreign_key : 'page_uuid'
45
58
46
- has_many :maintainerships , foreign_key : :page_uuid
47
- has_many :maintainers , through : :maintainerships
59
+ has_many :maintainerships , foreign_key : :page_uuid , dependent : :delete_all
60
+ has_many :maintainers , through : :maintainerships , dependent : nil
48
61
49
62
scope ( :needing_status_update , lambda do
50
63
# NOTE: pages.status can be NULL, so use DISTINCT FROM instead of <>/!= to compare.
@@ -56,14 +69,30 @@ class Page < ApplicationRecord
56
69
before_create :ensure_url_key
57
70
after_create :ensure_domain_and_news_tags
58
71
before_save :normalize_url
72
+ after_save :ensure_page_urls
59
73
validate :url_must_have_domain
60
74
validates :status ,
61
75
allow_nil : true ,
62
76
inclusion : { in : 100 ...600 , message : 'is not between 100 and 599' }
63
77
64
# Finds the page associated with a URL, or returns nil if none matches.
#
# Lookups are tried in this order, stopping at the first hit:
#   1. A PageUrl in the `current(at_time)` scope with exactly this URL.
#   2. A PageUrl in the `current(at_time)` scope with a matching URL key.
#   3. Any PageUrl (ordered by `to_time` descending) matching the URL, then
#      the URL key.
#   4. The `url`/`url_key` columns on Page itself.
#
# Note that `at_time` only affects steps 1 and 2.
# NOTE(review): `PageUrl.current` presumably selects URLs in effect at
# `at_time` (or now, when nil) -- confirm against PageUrl.
def self.find_by_url(raw_url, at_time: nil)
  url = normalize_url(raw_url)

  active_urls = PageUrl.eager_load(:page).current(at_time)
  exact_match = active_urls.find_by(url: url)
  return exact_match.page if exact_match

  # The key is only computed once an exact match has failed.
  key = PageUrl.create_url_key(url)
  key_match = active_urls.find_by(url_key: key)
  return key_match.page if key_match

  all_urls = PageUrl.eager_load(:page).order(to_time: :desc)
  any_match = all_urls.find_by(url: url) || all_urls.find_by(url_key: key)
  return any_match.page if any_match

  # TODO: remove this fallback when data is migrated over to Page.urls.
  Page.find_by(url: url) || Page.find_by(url_key: key)
end
68
97
69
98
def self . normalize_url ( url )
@@ -76,10 +105,6 @@ def self.normalize_url(url)
76
105
end
77
106
end
78
107
79
- def self . create_url_key ( url )
80
- Surt . surt ( url )
81
- end
82
-
83
108
def add_maintainer ( maintainer )
84
109
unless maintainer . is_a? ( Maintainer )
85
110
maintainer = Maintainer . find_or_create_by ( name : maintainer )
@@ -132,7 +157,7 @@ def as_json(options = {})
132
157
end
133
158
134
159
# Recomputes this page's URL key from its current `url` (via
# `PageUrl.create_url_key`) and persists it with `update`.
def update_url_key
  fresh_key = PageUrl.create_url_key(url)
  update(url_key: fresh_key)
end
137
162
138
163
def ensure_domain_and_news_tags
@@ -141,20 +166,89 @@ def ensure_domain_and_news_tags
141
166
self . add_tag ( 'news' ) if news?
142
167
end
143
168
169
# Makes sure a PageUrl exists for the page's canonical URL whenever that URL
# was changed by the save that just completed. This keeps page creation
# simple: callers only set Page#url, and the matching entry in Page#urls is
# created automatically. (Page#url is the current canonical URL; the true
# list of URLs associated with the page is always Page#urls.)
def ensure_page_urls
  return unless saved_change_to_attribute?('url')

  urls.find_or_create_by!(url: url)
end
176
+
144
177
# Recalculates the page's status with `calculate_status` and saves it, unless
# the calculated value is zero, in which case the stored status is left alone.
# NOTE(review): zero presumably means "no status could be determined" --
# confirm against `calculate_status`.
#
# Returns the page's status after any update.
def update_status
  calculated = calculate_status
  update(status: calculated) unless calculated.zero?
  status
end
149
182
183
# Merges one or more other pages into this page.
#
# Everything happens inside a single transaction. For each page in
# `other_pages`:
#   - an audit record of the page is captured,
#   - its versions are moved onto this page,
#   - its tags, maintainers, and URLs are copied onto this page,
#   - a MergedPage record is created that points the old UUID at this page,
#     and any MergedPage records that targeted the old page are re-pointed,
#   - the old page is destroyed and the audit record is logged.
# Afterwards the page title and the versions' `different` flags are
# recalculated, starting from the earliest version that was moved.
#
# Raises ArgumentError if this page itself is included in `other_pages`:
# the final step of a merge destroys the merged page, so merging a page into
# itself could only fail part-way or delete the page.
def merge(*other_pages)
  raise ArgumentError, 'Cannot merge a page into itself' if other_pages.include?(self)

  Page.transaction do
    first_version_time = nil
    other_pages.each do |other|
      audit_data = other.create_audit_record

      # Move versions from other page.
      other.versions.to_a.each do |version|
        versions << version
        if first_version_time.nil? || first_version_time > version.capture_time
          first_version_time = version.capture_time
        end
      end
      # The above doesn't update the source page's `versions`, so reload.
      other.versions.reload

      # Copy other attributes from other page.
      other.tags.each { |tag| add_tag(tag) }
      other.maintainers.each { |maintainer| add_maintainer(maintainer) }
      other.urls.each do |page_url|
        # TODO: it would be slightly nicer to collapse/merge PageUrls with
        # overlapping or intersecting time ranges here.
        # NOTE: we have to be careful not to trip the DB's uniqueness
        # constraints here or we hose the transaction.
        just_created = false
        merged_url = urls.find_or_create_by(
          url: page_url.url,
          from_time: page_url.from_time,
          to_time: page_url.to_time
        ) do |new_url|
          new_url.notes = page_url.notes
          just_created = true
        end

        # A newly created PageUrl already carries the other page's notes.
        next if just_created
        # Only touch an existing PageUrl when the other page actually has
        # something new to add; otherwise we would turn nil notes into an
        # empty string or duplicate identical notes.
        next if page_url.notes.blank? || merged_url.notes == page_url.notes

        new_notes = [merged_url.notes.presence, page_url.notes].compact.join(' ')
        merged_url.update(notes: new_notes)
      end

      # Keep a record so we can redirect requests for the merged page.
      # Delete the actual page record rather than keep it around so we don't
      # have to worry about messy partial indexes and querying around URLs.
      MergedPage.create!(uuid: other.uuid, target: self, audit_data: audit_data)
      # If the page we're removing was previously a merge target, update
      # its references.
      MergedPage.where(target_uuid: other.uuid).update_all(target_uuid: uuid)
      # And finally drop the merged page.
      other.destroy!

      audit_json = Oj.dump(audit_data, mode: :rails)
      Rails.logger.info("Merged page #{other.uuid} into #{uuid}. Old data: #{audit_json}")
    end

    # Recalculate denormalized attributes
    update_page_title(first_version_time)
    update_versions_different(first_version_time)
    # TODO: it might be neat to clean up overlapping URL timeframes
  end
end
243
+
150
244
protected
151
245
152
246
# True when the page's URL contains any of the path fragments that mark
# news-like content: '/news', '/blog', or '/press'.
def news?
  ['/news', '/blog', '/press'].any? { |fragment| url.include?(fragment) }
end
155
249
156
250
# Fills in `url_key` from the page's `url` (via `PageUrl.create_url_key`)
# when no key has been set yet. An existing key is never overwritten.
def ensure_url_key
  self.url_key ||= PageUrl.create_url_key(url)
end
159
253
160
254
def normalize_url
@@ -211,4 +305,47 @@ def calculate_status(relative_to: nil)
211
305
success_rate = 1 - ( error_time . to_f / total_time )
212
306
success_rate < STATUS_SUCCESS_THRESHOLD ? latest_error : 200
213
307
end
308
+
309
# Re-syncs the page title from its versions. Versions are visited newest
# first (optionally limited to those captured at or after `from_time`), and
# the walk stops at the first version whose `sync_page_title` call returns a
# truthy value.
def update_page_title(from_time = nil)
  newest_first = versions.reorder(capture_time: :desc)
  newest_first = newest_first.where('capture_time >= ?', from_time) if from_time

  newest_first.each do |version|
    break if version.sync_page_title
  end
end
316
+
317
# Recomputes the `different` flag for every version captured at or after
# `from_time`, walking from oldest to newest. A version is marked different
# when its `version_hash` does not equal the hash of the version before it.
# While no preceding hash is known (i.e. at the start of the walk), it is
# looked up with `version.previous(different: false)`.
#
# TODO: figure out whether there's a reasonable way to merge this logic with
# `Version#update_different_attribute`.
def update_versions_different(from_time)
  prior_hash = nil

  versions
    .where('capture_time >= ?', from_time)
    .reorder(capture_time: :asc)
    .each do |version|
      prior_hash = version.previous(different: false).try(:version_hash) if prior_hash.nil?

      version.update(different: version.version_hash != prior_hash)
      prior_hash = version.version_hash
    end
end
334
+
335
# Builds a hash snapshot of the page's current state: its own attributes plus
# the names of its tags and maintainers, the UUIDs of its versions, and the
# full details of its URLs. Used for logging and other audit-related purposes
# when deleting/merging pages.
def create_audit_record
  # URLs belong exclusively to this page, so they are recorded in full
  # rather than referenced by ID.
  url_fields = %w[url from_time to_time notes]
  urls_data = urls.map { |page_url| page_url.attributes.slice(*url_fields) }

  attributes.merge(
    'tags' => tags.map(&:name),
    'maintainers' => maintainers.map(&:name),
    'versions' => versions.map(&:uuid),
    'urls' => urls_data
  )
end
214
351
end
0 commit comments