Skip to content

Commit

Permalink
Refactor Zipfulldata worker (#541)
Browse files Browse the repository at this point in the history
* Breaks up Zipfulldata worker into service classes
* Fixes N+1 queries for phenotype and picture phenotype CSVs
  * Moves phenotype CSV generation into the database for performance
* Fixes unintentional deletion of unrelated files
* Reduces the time it takes to assemble the zip file from about 10 hours to about 5 hours; the remaining bottleneck is zipping the genotype files
  • Loading branch information
tsujigiri authored Jan 4, 2023
1 parent c40e123 commit 828d84f
Show file tree
Hide file tree
Showing 50 changed files with 2,703 additions and 2,212 deletions.
3 changes: 2 additions & 1 deletion app/models/achievement.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Achievement < ActiveRecord::Base

class Achievement < ApplicationRecord
include PgSearchCommon
has_many :user_achievements
pg_search_common_scope against: :award
Expand Down
16 changes: 16 additions & 0 deletions app/models/application_record.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

# Abstract base class that the application's models inherit from.
class ApplicationRecord < ActiveRecord::Base
  self.abstract_class = true

  # Wraps +sql+ in a PostgreSQL `COPY (...) TO STDOUT` statement that emits
  # CSV with a header row and `;` as the delimiter, and exposes the output
  # lazily.
  #
  # Nothing is sent to the database until the returned enumerator is
  # iterated. +sql+ is interpolated into the statement verbatim, so it must
  # be trusted SQL.
  #
  # @param sql [#to_s] the query whose result should be exported
  # @return [Enumerator] yields each value returned by `get_copy_data`
  #   (presumably one CSV line per value - confirm against the pg gem docs)
  def self.copy_csv(sql)
    Enumerator.new do |yielder|
      pg = ActiveRecord::Base.connection.raw_connection
      pg.copy_data("COPY (#{sql}) TO STDOUT WITH CSV HEADER DELIMITER ';'") do
        # Keep pulling until the connection reports there is nothing left.
        chunk = pg.get_copy_data
        while chunk
          yielder << chunk
          chunk = pg.get_copy_data
        end
      end
    end
  end
end
3 changes: 2 additions & 1 deletion app/models/genome_gov_paper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class GenomeGovPaper < ActiveRecord::Base

class GenomeGovPaper < ApplicationRecord
include PgSearchCommon

has_many :snp_references, as: :paper
Expand Down
5 changes: 3 additions & 2 deletions app/models/genotype.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# frozen_string_literal: true

require 'fileutils'

class Genotype < ActiveRecord::Base
class Genotype < ApplicationRecord
belongs_to :user
has_many :user_snps, dependent: :delete_all
validates_presence_of :user
Expand All @@ -20,7 +21,7 @@ def is_image?
end

def fs_filename
"#{user.id}.#{filetype}.#{id}"
"#{user_id}.#{filetype}.#{id}"
end

Paperclip.interpolates :fs_filename do |attachment, style|
Expand Down
3 changes: 2 additions & 1 deletion app/models/homepage.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Homepage < ActiveRecord::Base

class Homepage < ApplicationRecord
belongs_to :user
after_save :destroy_if_blank

Expand Down
3 changes: 2 additions & 1 deletion app/models/mendeley_paper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class MendeleyPaper < ActiveRecord::Base

class MendeleyPaper < ApplicationRecord
include PgSearchCommon

has_many :snp_references, as: :paper
Expand Down
3 changes: 2 additions & 1 deletion app/models/message.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Message < ActiveRecord::Base

class Message < ApplicationRecord
attr_encrypted :body, key: ENV.fetch('USER_DATA_SECRET_KEY')
attr_encrypted :subject, key: ENV.fetch('USER_DATA_SECRET_KEY')

Expand Down
2 changes: 1 addition & 1 deletion app/models/open_humans_profile.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

class OpenHumansProfile < ActiveRecord::Base
class OpenHumansProfile < ApplicationRecord
belongs_to :user
end
3 changes: 2 additions & 1 deletion app/models/pgp_annotation.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PgpAnnotation < ActiveRecord::Base

class PgpAnnotation < ApplicationRecord
include PgSearchCommon

belongs_to :snp
Expand Down
3 changes: 2 additions & 1 deletion app/models/phenotype.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Phenotype < ActiveRecord::Base

class Phenotype < ApplicationRecord
include PgSearchCommon

has_many :user_phenotypes, dependent: :destroy
Expand Down
3 changes: 2 additions & 1 deletion app/models/phenotype_comment.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PhenotypeComment < ActiveRecord::Base

class PhenotypeComment < ApplicationRecord
include PgSearchCommon

belongs_to :phenotype
Expand Down
3 changes: 2 additions & 1 deletion app/models/phenotype_set.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PhenotypeSet < ActiveRecord::Base

class PhenotypeSet < ApplicationRecord
include PgSearchCommon

has_and_belongs_to_many :phenotypes
Expand Down
3 changes: 2 additions & 1 deletion app/models/picture_phenotype.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PicturePhenotype < ActiveRecord::Base

class PicturePhenotype < ApplicationRecord
include PgSearchCommon

has_many :user_picture_phenotypes, dependent: :destroy
Expand Down
3 changes: 2 additions & 1 deletion app/models/picture_phenotype_comment.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PicturePhenotypeComment < ActiveRecord::Base

class PicturePhenotypeComment < ApplicationRecord
include PgSearchCommon

belongs_to :picture_phenotype
Expand Down
3 changes: 2 additions & 1 deletion app/models/plos_paper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PlosPaper < ActiveRecord::Base

class PlosPaper < ApplicationRecord
include PgSearchCommon

has_many :snp_references, as: :paper
Expand Down
3 changes: 2 additions & 1 deletion app/models/search_result.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# frozen_string_literal: true
class SearchResult < ActiveRecord::Base

class SearchResult < ApplicationRecord
end
3 changes: 2 additions & 1 deletion app/models/snp.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Snp < ActiveRecord::Base

class Snp < ApplicationRecord
include PgSearchCommon

has_many :user_snps, foreign_key: :snp_name, primary_key: :name
Expand Down
3 changes: 2 additions & 1 deletion app/models/snp_comment.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpComment < ActiveRecord::Base

class SnpComment < ApplicationRecord
include PgSearchCommon

belongs_to :snp
Expand Down
3 changes: 2 additions & 1 deletion app/models/snp_reference.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpReference < ActiveRecord::Base

class SnpReference < ApplicationRecord
self.primary_keys = :snp_id, :paper_id, :paper_type
belongs_to :snp
belongs_to :paper, polymorphic: true
Expand Down
3 changes: 2 additions & 1 deletion app/models/snpedia_paper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpediaPaper < ActiveRecord::Base

class SnpediaPaper < ApplicationRecord
include PgSearchCommon

has_many :snp_references, as: :paper
Expand Down
3 changes: 2 additions & 1 deletion app/models/user.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class User < ActiveRecord::Base

class User < ApplicationRecord
include PgSearchCommon

has_attached_file :avatar,
Expand Down
3 changes: 2 additions & 1 deletion app/models/user_achievement.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserAchievement < ActiveRecord::Base

class UserAchievement < ApplicationRecord
belongs_to :achievement
belongs_to :user
end
3 changes: 2 additions & 1 deletion app/models/user_phenotype.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserPhenotype < ActiveRecord::Base

class UserPhenotype < ApplicationRecord
include PgSearchCommon

belongs_to :phenotype
Expand Down
3 changes: 2 additions & 1 deletion app/models/user_picture_phenotype.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserPicturePhenotype < ActiveRecord::Base

class UserPicturePhenotype < ApplicationRecord
include PgSearchCommon

belongs_to :picture_phenotype
Expand Down
1 change: 1 addition & 0 deletions app/models/user_session.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# frozen_string_literal: true

class UserSession < Authlogic::Session::Base
after_persisting :raven_set_user_context
after_destroy :raven_clear_user_context
Expand Down
3 changes: 2 additions & 1 deletion app/models/user_snp.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserSnp < ActiveRecord::Base

class UserSnp < ApplicationRecord
self.primary_keys = [:genotype_id, :snp_name]
belongs_to :snp, foreign_key: :snp_name, primary_key: :name, counter_cache: true
has_one :user, through: :genotype
Expand Down
114 changes: 114 additions & 0 deletions app/services/data_zipper_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# frozen_string_literal: true

require 'zip'
require_relative 'data_zipper_service/generate_user_phenotype_csv'
require_relative 'data_zipper_service/zip_user_picture_phenotypes'

# Assembles the complete data dump as a single zip archive: a phenotype CSV,
# the picture phenotypes, a readme and the genotype files. The archive is
# built under Rails' tmp directory, copied to the output directory, exposed
# through an `opensnp_datadump.current.zip` symlink, and older dumps in the
# output directory are removed afterwards.
class DataZipperService
  CSV_OPTIONS = { col_sep: ';' }.freeze
  PUBLIC_PATH = '/data/zip/opensnp_datadump.current.zip'
  DEFAULT_OUTPUT_DIR = Rails.root.join('public', 'data', 'zip').freeze

  attr_reader :time, :time_str, :zip_public_path, :zip_tmp_path, :tmp_dir,
              :link_path, :output_dir, :logger

  # @param output_dir [#join] directory that receives the finished zip and
  #   the "current" symlink; must respond to `join` the way the default
  #   `Rails.root.join(...)` value does
  # @param logger [#info, #error] receives progress and error messages
  def initialize(output_dir: DEFAULT_OUTPUT_DIR, logger: Logger.new(STDOUT))
    @output_dir = output_dir
    @time = Time.now.utc
    # Minute resolution: two runs started within the same minute share the
    # same temp dir and zip names.
    @time_str = time.strftime('%Y%m%d%H%M')
    @tmp_dir = Rails.root.join('tmp', "opensnp_datadump.#{time_str}")
    zip_file_name = "opensnp_datadump.#{time_str}.zip"
    @zip_public_path = @output_dir.join(zip_file_name)
    @zip_tmp_path = Rails.root.join('tmp', zip_file_name)
    @link_path = @output_dir.join('opensnp_datadump.current.zip')
    @logger = logger
  end

  # Builds the zip, publishes it and removes older dumps.
  #
  # Only one dump is created per minute: when the temp dir for the current
  # minute already exists, nothing is done.
  #
  # @return [false] when the temp dir already exists
  # @return [Object] otherwise whatever `delete_old_zips` returns (truthy)
  # @raise any error raised while zipping, copying or linking; the temp dir
  #   created by this run is removed before the error propagates
  def call
    # The temp dir is claimed *outside* the begin/ensure below so that the
    # cleanup can only ever remove a directory this run created itself.
    return false unless create_tmp_dir

    begin
      logger.info("Creating zipfile: #{zip_tmp_path}")
      Zip::File.open(zip_tmp_path, Zip::File::CREATE) do |zipfile|
        zip_user_phenotypes(zipfile)
        zip_user_picture_phenotypes(zipfile)
        zip_readme(zipfile)
        zip_genotype_files(zipfile)
      end

      # move from local storage to network storage
      # NOTE(review): if anything above or the copy below raises, the file at
      # zip_tmp_path is left behind - confirm whether that is intended.
      logger.info("Copying #{zip_tmp_path} to #{zip_public_path}")
      FileUtils.cp(zip_tmp_path, zip_public_path)
      logger.info("Deleting #{zip_tmp_path}")
      FileUtils.rm(zip_tmp_path)
      logger.info("Creating symlink #{link_path} to #{zip_public_path}")
      FileUtils.ln_sf(zip_public_path, link_path)

      # everything went OK, now delete old zips
      delete_old_zips
    ensure
      logger.info("Deleting #{tmp_dir}")
      FileUtils.rm_rf(tmp_dir)
    end
  end

  # @return [String] the public path of the "current" dump
  def self.public_path
    PUBLIC_PATH
  end

  private

  # Atomically claims the temp dir for this run. Creating the directory and
  # treating "already exists" as the failure case avoids the window between a
  # separate existence check and the mkdir.
  #
  # @return [Boolean] true when the directory was created by this call, false
  #   when it already existed
  def create_tmp_dir
    logger.info("Creating temp dir: #{tmp_dir}")
    Dir.mkdir(tmp_dir)
    true
  rescue Errno::EEXIST
    logger.error("Directory #{tmp_dir} already exists. Exiting...")
    false
  end

  # Create a CSV with a row for each genotype, with user data and phenotypes as
  # columns. The rows come from GenerateUserPhenotypeCsv and are written to
  # the archive entry as they are produced.
  def zip_user_phenotypes(zipfile)
    logger.info('Zipping user phenotypes')
    zipfile.get_output_stream("phenotypes_#{time_str}.csv") do |f|
      GenerateUserPhenotypeCsv.new.call.each do |row|
        f.write(row)
      end
    end
  end

  # Delegates to ZipUserPicturePhenotypes, which is expected to add the
  # picture phenotypes plus a CSV describing which file name belongs to which
  # user's phenotype.
  def zip_user_picture_phenotypes(zipfile)
    logger.info('Zipping user picture phenotypes')
    ZipUserPicturePhenotypes.new(zipfile, tmp_dir, time_str).call
  end

  # Adds a readme.txt containing the time of the dump and record counts, so
  # users can compare it with the page status and see how old the data is.
  def zip_readme(zipfile)
    logger.info('Zipping readme')
    zipfile.get_output_stream('readme.txt') do |f|
      f.write(
        I18n.t(
          'zipfulldata.readme',
          time: time.ctime,
          phenotype_count: Phenotype.count,
          genotype_count: Genotype.count,
          picture_count: PicturePhenotype.count
        )
      )
    end
  end

  # Delegates adding the genotype files to ZipGenotypeFiles.
  def zip_genotype_files(zipfile)
    logger.info('Zipping genotype files')
    ZipGenotypeFiles.new(zipfile).call
  end

  # Removes every `opensnp_datadump.*.zip` in the output directory except the
  # zip produced by this run and the "current" symlink.
  def delete_old_zips
    forbidden_files = [link_path, zip_public_path].map(&:to_s)
    Dir[output_dir.join('opensnp_datadump.*.zip')].each do |f|
      next if forbidden_files.include?(f)

      logger.info("Deleting #{f}")
      File.delete(f)
    end
  end
end
Loading

0 comments on commit 828d84f

Please sign in to comment.