# Resolve the factory class used to build the object for one CSV row.
#
# @param [Class, #to_s, nil] model a factory class (returned unchanged), or the
#   stringable first (Xxx) portion of an "XxxFactory" constant
# @return [Class] the model class to be used
# @raise [SystemExit] (status 1) when no model was given at all
def factory_class(model)
  return model if model.is_a?(Class)

  # nil (no global model and no 'type' column) does not respond to #empty?,
  # so test for it explicitly; otherwise the intended error message is never
  # reached and the caller sees a NoMethodError instead.
  if model.nil? || (model.respond_to?(:empty?) && model.empty?)
    warn 'ERROR: No model was specified'
    exit(1) # rubocop:disable Rails/Exit
  end

  return Factory.for(model.to_s) if model.respond_to?(:to_s)
  raise "Unrecognized model type: #{model.class}"
end
- # @param [Hash String>] attributes - # @option attributes [String] :type overrides model for a single object - # @note remaining attributes are passed to factory constructor - def create_fedora_objects(attributes) - factory_class(attributes.delete(:type) || @model).new(attributes, @files_directory).run - end - end -end diff --git a/lib/importer/csv_parser.rb b/lib/importer/csv_parser.rb deleted file mode 100644 index b992d3ab5..000000000 --- a/lib/importer/csv_parser.rb +++ /dev/null @@ -1,154 +0,0 @@ -# frozen_string_literal: true - -module Importer - # rubocop:disable Metrics/ClassLength - class CSVParser - include Enumerable - - def initialize(file_name) - @file_name = file_name - end - - # @yieldparam attributes [Hash] the attributes from one row of the file - def each(&_block) - headers = nil - CSV.foreach(@file_name) do |row| - if headers - # we already have headers, so this is not the first row. - yield attributes(headers, row) - else - # Grab headers from first row - headers = validate_headers(row) - end - end - end - - private - - # Match headers like "lc_subject_type" - def type_header_pattern - /\A.*_type\Z/ - end - - def validate_headers(row) - row.compact! - difference = (row - valid_headers) - - # Allow headers with the pattern *_type to specify the - # record type for a local authority. - # e.g. For an author, author_type might be 'Person'. - difference.delete_if { |h| h.match(type_header_pattern) } - - raise "Invalid headers: #{difference.join(', ')}" if difference.present? - - validate_header_pairs(row) - row - end - - # If you have a header like lc_subject_type, the next - # header must be the corresponding field (e.g. 
lc_subject) - def validate_header_pairs(row) - errors = [] - row.each_with_index do |header, i| - next if header == 'resource_type' - next unless header.match(type_header_pattern) - next_header = row[i + 1] - field_name = header.gsub('_type', '') - errors << "Invalid headers: '#{header}' column must be immediately followed by '#{field_name}' column." if next_header != field_name - end - raise errors.join(', ') if errors.present? - end - - def valid_headers - GenericWork.attribute_names + %w[id type file] + collection_headers - end - - def collection_headers - %w[collection_id collection_title collection_accession_number] - end - - def attributes(headers, row) - {}.tap do |processed| - headers.each_with_index do |header, index| - extract_field(header, row[index], processed) - end - end - end - - # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity - def extract_field(header, val, processed) - return unless val - case header - when 'type', 'id' - # type and id are singular - processed[header.to_sym] = val - when /^(created|issued|date_copyrighted|date_valid)_(.*)$/ - key = "#{Regexp.last_match(1)}_attributes".to_sym - # TODO: this only handles one date of each type - processed[key] ||= [{}] - update_date(processed[key].first, Regexp.last_match(2), val) - when 'resource_type' - extract_multi_value_field(header, val, processed) - when type_header_pattern - update_typed_field(header, val, processed) - when /^contributor$/ - update_contributor(header, val, processed) - when /^collection_(.*)$/ - processed[:collection] ||= {} - update_collection(processed[:collection], Regexp.last_match(1), val) - else - last_entry = Array(processed[header.to_sym]).last - if last_entry.is_a?(Hash) && !last_entry[:name] - update_typed_field(header, val, processed) - else - extract_multi_value_field(header, val, processed) - end - end - end - # rubocop:enable Metrics/MethodLength, Metrics/CyclomaticComplexity - - # Faking a typed field for now. 
# Decide whether a cell value should be stored as an RDF::URI instead of a
# plain string literal.
#
# @param [String] str the (already stripped) cell value
# @return [Boolean] true when the value starts with http:// or https://
def looks_like_uri?(str)
  # \A anchors at the start of the whole string. ^ would also match at the
  # start of any later line, so a multi-line note containing a URL on its
  # second line would wrongly be treated as a URI.
  str.match?(%r{\Ahttps?://})
end
# Apply the imported attributes to the collection that was already located
# and persist it inside the :save callbacks.
#
# @raise [RuntimeError] when no existing collection has been loaded into #object
def update
  raise "Collection doesn't exist" unless object

  # Assign the transformed attributes (update_attributes). Calling `update`
  # here would re-enter this method and recurse until the stack overflows.
  object.attributes = update_attributes
  run_callbacks(:save) do
    object.save!
  end
  log_updated(object)
end
# Update the existing object through the work actor stack, wrapped in the
# :save callbacks, and log the result.
#
# @raise [RuntimeError] when no existing object has been loaded into #object
def update
  raise "Object doesn't exist" unless object

  run_callbacks(:save) do
    # Hand the actor the transformed attributes (update_attributes). Passing
    # `update` here would call this method again and never terminate.
    work_actor.update(environment(update_attributes))
  end
  log_updated(object)
end
- raise "Missing identifier: Unable to search for existing object without " \ - "either fedora ID or #{system_identifier_field}" - end - - def find_by_id - klass.find(attributes[:id]) if klass.exists?(attributes[:id]) - end - - def search_by_identifier - query = { "#{system_identifier_field}_ssim" => - attributes[system_identifier_field] } - klass.where(query).first - end - - def collection_type - @collection_type ||= Hyrax::CollectionType.find_or_create_default_collection_type - end - - # An ActiveFedora bug when there are many habtm <-> has_many associations means they won't all get saved. - # https://github.com/projecthydra/active_fedora/issues/874 - # 2+ years later, still open! - def create - attrs = create_attributes - @object = klass.new - run_callbacks :save do - run_callbacks :create do - klass == Collection ? create_collection(attrs) : work_actor.create(environment(attrs)) - end - end - log_created(object) - end - - def log_created(obj) - msg = "Created #{klass.model_name.human} #{obj.id}" - Rails.logger.info("#{msg} (#{Array(attributes[system_identifier_field]).first})") - end - - def log_updated(obj) - msg = "Updated #{klass.model_name.human} #{obj.id}" - Rails.logger.info("#{msg} (#{Array(attributes[system_identifier_field]).first})") - end - - private - - # @param [Hash] attrs the attributes to put in the environment - # @return [Hyrax::Actors::Environment] - def environment(attrs) - Hyrax::Actors::Environment.new(@object, Ability.new(User.batch_user), attrs) - end - - def work_actor - Hyrax::CurationConcern.actor - end - - def create_collection(attrs) - @object.attributes = attrs - @object.apply_depositor_metadata(User.batch_user) - @object.save! - end - - # Override if we need to map the attributes from the parser in - # a way that is compatible with how the factory needs them. 
module Importer
  module Factory
    # Transform the attributes from the parser into basic string literals.
    # An alternative processor may choose to find or create linked entities
    class StringLiteralProcessor
      # @param [Hash] attributes the input from the parser
      # @return [Hash] a duplicate with the structured data converted to literals
      # @example:
      #   process(contributor: [{ name: ["Muybridge"], type: "corporate" }])
      #   # => { contributor: ["Muybridge"] }
      def self.process(attributes)
        attributes.merge(contributors(attributes))
      end

      # @param [Hash] attributes input data
      # @return [Hash] transformed contributor data; empty when there is no
      #   :contributor key (or its value is nil/false)
      def self.contributors(attributes)
        value = attributes[:contributor]
        return {} unless value
        { contributor: value.map { |c| contributor_literal(c) } }
      end

      # Reduce one contributor entry to a literal.
      #
      # Structured entries ({ name: [...] }) have their name parts joined.
      # Anything that is not a Hash (for example a value a parser has already
      # resolved to a URI or a string) is returned untouched rather than
      # failing on the `[:name]` lookup.
      #
      # @param [Hash, Object] contributor
      # @return [String, Object]
      def self.contributor_literal(contributor)
        return contributor unless contributor.is_a?(Hash)
        contributor[:name].join(' — ')
      end
      private_class_method :contributor_literal
    end
  end
end
- super.except(:collection).merge(member_of_collection_attributes: [id: collection.id]) - end - - private - - def collection - CollectionFactory.new(attributes.fetch(:collection)).find_or_create - end - end - end -end diff --git a/lib/importer/log_subscriber.rb b/lib/importer/log_subscriber.rb deleted file mode 100644 index 215fe1889..000000000 --- a/lib/importer/log_subscriber.rb +++ /dev/null @@ -1,39 +0,0 @@ -# frozen_string_literal: true - -module Importer - class LogSubscriber < ActiveSupport::LogSubscriber - def initialize - super - @odd = false - end - - def import(event) - return unless logger.debug? - - payload = event.payload - - name = "#{payload[:name]} (#{event.duration.round(1)}ms)" - id = payload[:id] || '[no id]' - klass = payload[:klass] - - if odd? - name = color(name, CYAN, true) - id = color(id, nil, true) - else - name = color(name, MAGENTA, true) - end - - debug " #{name} #{klass} #{id}" - end - - def odd? - @odd = !@odd - end - - def logger - ActiveFedora::Base.logger - end - end -end - -Importer::LogSubscriber.attach_to :importer diff --git a/lib/importer/mods_importer.rb b/lib/importer/mods_importer.rb deleted file mode 100644 index 1e61b010b..000000000 --- a/lib/importer/mods_importer.rb +++ /dev/null @@ -1,46 +0,0 @@ -# frozen_string_literal: true - -require 'stanford' -module Importer - class ModsImporter - class_attribute :parser_class - self.parser_class = Stanford::Importer::ModsParser - - def initialize(files_directory, metadata_directory = nil) - @files_directory = files_directory - @metadata_directory = metadata_directory - end - - # @return [Fixnum] the count of imports - def import_all - count = 0 - Dir.glob("#{@metadata_directory}/**/*").each do |filename| - next if File.directory?(filename) - import(filename) - count += 1 - end - count - end - - def import(file) - Rails.logger.info "Importing: #{file}" - parser = parser_class.new(file) - create_fedora_objects(parser.model, parser.attributes) - end - - # Select a factory to 
# Whether the MODS record describes a collection, i.e. at least one
# typeOfResource node carries collection="yes".
#
# @return [Boolean]
def collection?
  # Look the attribute up leniently: with several typeOfResource nodes only
  # some may carry a "collection" attribute, and a strict fetch would raise
  # KeyError on the first node without it.
  mods.typeOfResource.attributes.any? do |node_attributes|
    node_attributes['collection']&.value == 'yes'
  end
end
- def files - {} - end - - def series_name - mods.xpath("//mods:relatedItem[@type='series']", NAMESPACES) - .titleInfo.title.map(&:text) - end - - def collection_attributes - common_attributes - end - - def common_attributes - description - .merge(dates) - .merge(locations) - .merge(rights) - .merge(identifiers) - .merge(relations) - end - - # rubocop:disable Metrics/MethodLength, Metrics/AbcSize - def description - { - title: untyped_title, - alternative: alt_title, - description: mods_description, - subject:, - extent: mods.physical_description.extent.map { |node| strip_whitespace(node.text) }, - language:, - digital_origin: mods.physical_description.digitalOrigin.map(&:text), - publisher: mods.origin_info.publisher.map(&:text), - form_of_work: mods.genre.valueURI.map { |uri| RDF::URI.new(uri) }, - resource_type:, - citation:, - notes_attributes: notes, - record_origin:, - description_standard: mods.record_info.descriptionStandard.map(&:text) - } - end - # rubocop:enable Metrics/MethodLength, Metrics/AbcSize - - def language - mods.language.languageTerm.map do |term| - uris = term.valueURI.map { |uri| RDF::URI.new(uri) } - uris.presence || term.text - end - end - - def resource_type - uris = mods.xpath('//mods:mods/mods:typeOfResource/@valueURI', NAMESPACES).map { |uri| RDF::URI.new(uri.value) } - return uris if uris.present? 
- Array.wrap(mods.typeOfResource.text) - end - - def rights - query = '/mods:mods/mods:accessCondition[@type="use and reproduction"]' - { - restrictions: mods.xpath(query, NAMESPACES).map { |node| strip_whitespace(node.text) } - } - end - - def locations - { - location: mods.subject.geographic.valueURI.map { |uri| RDF::URI.new(uri) }, - sub_location:, - institution: institutional_location, - place_of_publication: mods.origin_info.place.placeTerm.map(&:text) - }.merge(coordinates) - end - - def sub_location - query = './mods:copyInformation/mods:subLocation' - mods.location.holdingSimple.xpath(query, NAMESPACES).map(&:text) - end - - def institutional_location - uris = mods.location.physicalLocation.valueURI.map { |uri| RDF::URI.new(uri) } - return uris if uris.present? - Array.wrap(mods.location.physicalLocation.text) - end - - def dates - { - issued_attributes: build_date(mods.origin_info.dateIssued), - created_attributes: build_date(mods.origin_info.dateCreated), - date_other_attributes: build_date(mods.origin_info.dateOther), - date_copyrighted_attributes: build_date(mods.origin_info.copyrightDate), - date_valid_attributes: build_date(mods.origin_info.dateValid) - } - end - - def identifiers - { accession_number: mods.identifier.map(&:text) } - end - - def record_origin - ro = mods.record_info.recordOrigin.map { |node| prepend_timestamp(strip_whitespace(node.text)) } - ro << prepend_timestamp(origin_text) - end - - # returns a hash with :latitude and :longitude - def coordinates - coords = mods.subject.cartographics.coordinates.map(&:text) - # a hash where any value defaults to an empty array - result = Hash.new { |h, k| h[k] = [] } - coords.each_with_object(result) do |coord, h| - (latitude, longitude) = coord.split(/,\s*/) - h[:latitude] << latitude - h[:longitude] << longitude - end - end - - def mods_description - mods.abstract.map { |e| strip_whitespace(e.text) } - end - - def relations - name_nodes = mods.xpath('//mods:mods/mods:name', NAMESPACES) - # 
# Build the contributor entry for a single MODS name node.
#
# @param node a name node exposing #attributes (attribute name => attribute
#   object) and #namePart
# @return [Hash, RDF::URI] an RDF::URI when the node has a valueURI attribute,
#   otherwise a hash of the form { name: [String, ...], type: String or nil }
def build_relation(node)
  uri = node.attributes['valueURI']
  if uri.blank?
    { name: node.namePart.map(&:text),
      # A name node may have no type attribute; record nil instead of
      # failing with NoMethodError on nil.value.
      type: node.attributes['type']&.value }
  else
    RDF::URI.new(uri)
  end
end
# Collect the titles of every titleInfo element that has a type attribute.
# All of them are returned as alternative titles; a warning is logged for each
# title whose type is something other than "alternative".
#
# @return [Array<String>] the title texts, in document order
def alt_title
  Array(mods.xpath('//mods:titleInfo[@type]', NAMESPACES)).flat_map do |node|
    type = node.attributes['type'].text
    alternative = 'alternative'

    node.title.map do |title|
      value = title.text
      warn_title_transform(type, value) unless type == alternative
      value
    end
  end
end

# Log that a typed title is being stored under the alternative-title field.
# (The method used to be spelled `warn_title_tranform`, which did not match
# the call in #alt_title and raised NoMethodError for any non-alternative type.)
#
# @param [String] type the titleInfo type attribute
# @param [String] value the title text
def warn_title_transform(type, value)
  Rails.logger.warn "Transformtion: \"#{type} title\" will be stored as \"alternative title\": #{value}"
end
- text - end - end - end - # rubocop:enable Metrics/ClassLength -end diff --git a/spec/lib/importer/csv_importer_spec.rb b/spec/lib/importer/csv_importer_spec.rb deleted file mode 100644 index 5a7f699d3..000000000 --- a/spec/lib/importer/csv_importer_spec.rb +++ /dev/null @@ -1,46 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::CSVImporter do - let(:image_directory) { 'spec/fixtures/images' } - - context 'when the model is passed' do - let(:csv_file) { "#{fixture_path}/csv/gse_metadata.csv" } - let(:importer) { described_class.new(csv_file, image_directory, fallback_class) } - let(:fallback_class) { Class.new { def initialize(_argx, _argy); end } } - let(:factory) { double(run: true) } - - # NOTE: 2 rows do not specify type, 17 do - it 'creates new works' do - expect(fallback_class).to receive(:new) - .with(any_args).and_return(factory).twice - expect(Importer::Factory::ETDFactory).to receive(:new) - .with(any_args).and_return(factory).exactly(17).times - importer.import_all - end - end - - context 'when the model specified on the row' do - let(:csv_file) { "#{fixture_path}/csv/sample.csv" } - let(:importer) { described_class.new(csv_file, image_directory) } - let(:collection_factory) { double } - let(:image_factory) { double } - - it 'creates new images and collections' do - expect(Importer::Factory::CollectionFactory).to receive(:new) - .with(hash_excluding(:type), image_directory) - .and_return(collection_factory) - expect(collection_factory).to receive(:run) - expect(Importer::Factory::ImageFactory).to receive(:new) - .with(hash_excluding(:type), image_directory) - .and_return(collection_factory) - expect(collection_factory).to receive(:run) - expect(Importer::Factory::ETDFactory).to receive(:new) - .with(hash_excluding(:type), image_directory) - .and_return(image_factory) - expect(image_factory).to receive(:run) - importer.import_all - end - end -end diff --git a/spec/lib/importer/csv_parser_spec.rb 
b/spec/lib/importer/csv_parser_spec.rb deleted file mode 100644 index ced604c87..000000000 --- a/spec/lib/importer/csv_parser_spec.rb +++ /dev/null @@ -1,85 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::CSVParser do - let(:parser) { described_class.new(file) } - let(:attributes) { parser.attributes } - let(:file) { "#{fixture_path}/csv/gse_metadata.csv" } - let(:first_record) { parser.first } - - context 'Importing just images' do - it 'parses a record' do - # Title must be singular - expect(first_record[:title]).to eq ['Work in Progress - A Framework for Building Interactive Learning Modules'] - - expect(first_record[:file]).to eq ['DalmonEtAl2011_Framework_Final.pdf'] - - expect(first_record[:date_created]).to eq ['2011'] - - expect(first_record[:contributor]).to eq [{ name: ["Dalmon, Danilo"] }, - { name: ["Brandao, Leonidas"] }, - { name: ["Brandao, Anarosa"] }, - { name: ["Isotani, Seiji"] }] - expect(first_record.keys).to match_array %i[id type title description - subject resource_type contributor - date_created file] - end - end - - describe 'validating CSV headers' do - subject { parser.send(:validate_headers, headers) } - - context 'with valid headers' do - let(:headers) { %w[id title] } - - it { is_expected.to eq headers } - end - - context 'with invalid headers' do - let(:headers) { ['something bad', 'title'] } - - it 'raises an error' do - expect { subject }.to raise_error 'Invalid headers: something bad' - end - end - - context 'with nil headers' do - let(:headers) { ['title', nil] } - - it { is_expected.to eq headers } - end - - # It doesn't expect a matching column for "resource_type" - context 'with resource_type column' do - let(:headers) { %w[resource_type title] } - - it { is_expected.to eq headers } - end - end - - describe "validate_header_pairs" do - subject { parser.send(:validate_header_pairs, headers) } - - context 'with "*_type" columns' do - let(:headers) { %w[rights_holder rights_holder_type 
rights_holder title note_type note] } - - it { is_expected.to be_nil } - end - - # The CSV parser assumes that the *_type column comes just - # before the column that contains the value for that local - # authority. If the columns aren't in the correct order, - # raise an error. - context 'with columns in the wrong order' do - let(:headers) { %w[note note_type rights_holder_type rights_holder_type rights_holder title] } - - it 'raises an error' do - expect { subject }.to raise_error "Invalid headers: 'note_type' column " \ - "must be immediately followed by 'note' column., Invalid headers: " \ - "'rights_holder_type' column must be immediately followed by " \ - "'rights_holder' column." - end - end - end -end diff --git a/spec/lib/importer/factory/etd_factory_spec.rb b/spec/lib/importer/factory/etd_factory_spec.rb deleted file mode 100644 index 2910697af..000000000 --- a/spec/lib/importer/factory/etd_factory_spec.rb +++ /dev/null @@ -1,36 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::Factory::ETDFactory, :clean do - let(:factory) { described_class.new(attributes) } - let(:files) { [] } - let(:work) { GenericWork } - - context 'when a collection already exists' do - let!(:coll) { create(:collection) } - let(:attributes) do - { - collection: { id: coll.id }, - files:, - identifier: ['123'], - title: ['Test image'], - read_groups: ['public'], - depositor: 'bob', - edit_users: ['bob'] - } - end - let(:actor) { Hyrax::CurationConcern.actor } - - it 'does not create a new collection' do - expect(actor).to receive(:create).with(Hyrax::Actors::Environment) do |k| - expect(k.attributes).to include(member_of_collection_attributes: [id: coll.id]) - end - expect do - factory.run - end.to change(Collection, :count).by(0) - end - end - - include_examples("csv_importer") -end diff --git a/spec/lib/importer/factory/image_factory_spec.rb b/spec/lib/importer/factory/image_factory_spec.rb deleted file mode 100644 index 464776554..000000000 
--- a/spec/lib/importer/factory/image_factory_spec.rb +++ /dev/null @@ -1,36 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::Factory::ImageFactory, :clean do - let(:factory) { described_class.new(attributes) } - let(:files) { [] } - let(:work) { Image } - - context 'when a collection already exists' do - let!(:coll) { create(:collection) } - let(:attributes) do - { - collection: { id: coll.id }, - files:, - identifier: ['123'], - title: ['Test image'], - read_groups: ['public'], - depositor: 'bob', - edit_users: ['bob'] - } - end - let(:actor) { Hyrax::CurationConcern.actor } - - it 'does not create a new collection' do - expect(actor).to receive(:create).with(Hyrax::Actors::Environment) do |k| - expect(k.attributes).to include(member_of_collection_attributes: [id: coll.id]) - end - expect do - factory.run - end.to change(Collection, :count).by(0) - end - end - - include_examples("csv_importer") -end diff --git a/spec/lib/importer/factory/string_literal_processor_spec.rb b/spec/lib/importer/factory/string_literal_processor_spec.rb deleted file mode 100644 index 5bec71bbb..000000000 --- a/spec/lib/importer/factory/string_literal_processor_spec.rb +++ /dev/null @@ -1,38 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::Factory::StringLiteralProcessor do - subject { described_class.process(input) } - - let(:input) do - { title: ["Stanford residences"], - contributor: [{ name:, type: "corporate" }] } - end - - context "with a single name" do - let(:name) { ["Muybridge"] } - - it do - expect(subject).to eq(title: ["Stanford residences"], - contributor: ['Muybridge']) - end - end - - context "with multiple name parts" do - let(:name) { ["Stanford University", "Archives."] } - - it do - expect(subject).to eq(title: ["Stanford residences"], - contributor: ['Stanford University — Archives.']) - end - end - - context "without a contributor" do - let(:input) do - { title: ["Stanford residences"] } 
- end - - it { expect(subject).to eq(title: ["Stanford residences"]) } - end -end diff --git a/spec/lib/importer/mods_importer_spec.rb b/spec/lib/importer/mods_importer_spec.rb deleted file mode 100644 index adb3937d6..000000000 --- a/spec/lib/importer/mods_importer_spec.rb +++ /dev/null @@ -1,79 +0,0 @@ -# frozen_string_literal: true - -require 'importer' -require 'importer/mods_parser' - -RSpec.describe Importer::ModsImporter, :clean do - let(:image_directory) { File.join(fixture_path, 'images') } - let(:importer) { described_class.new(image_directory) } - let(:actor) { double } - - before do - allow(Hyrax::CurationConcern).to receive(:actor).and_return(actor) - end - - describe '#import an image' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'creates a new image and a collection' do - expect(actor).to receive(:create).with(Hyrax::Actors::Environment) do |k| - expect(k.attributes).to include(member_of_collection_attributes: [{ id: 'kx532cb7981' }], - identifier: ['xv169dn4538'], - visibility: 'open') - end - expect do - importer.import(file) - end.to change(Collection, :count).by(1) - - coll = Collection.last - expect(coll.identifier).to eq ['kx532cb7981'] - expect(coll.title).to eq ['Stanford historical photograph collection, 1887-circa 1996'] - expect(coll.visibility).to eq 'open' - end - - context 'when the collection already exists' do - let!(:coll) { create(:collection, id: 'kx532cb7981', title: ['Test Collection']) } - - it 'adds image to existing collection' do - expect(actor).to receive(:create).with(Hyrax::Actors::Environment) do |k| - expect(k.attributes).to include(member_of_collection_attributes: [{ id: coll.id }]) - end - expect do - importer.import(file) - end.to change(Collection, :count).by(0) - end - end - end - - describe '#import a Collection' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'kx532cb7981.mods') } - - it 'creates a collection' do - coll = nil - expect do - coll = 
importer.import(file) - end.to change(Collection, :count).by(1) - - expect(coll.identifier).to eq ['kx532cb7981'] - expect(coll.title).to eq ['Stanford historical photograph collection, 1887-circa 1996 (inclusive)'] - expect(coll.read_groups).to eq ['public'] - - expect(coll.contributor).to eq ['Stanford University — Archives.'] - end - - context 'when the collection already exists' do - let!(:existing) { FactoryBot.create(:collection, id: 'kx532cb7981', title: ['Test Collection']) } - - it 'adds metadata to existing collection', skip: 'importer.import(file) throws an error in ObjectFactory' do - coll = nil - expect do - coll = importer.import(file) - end.to change(Collection, :count).by(0) - - expect(coll.id).to eq existing.id - expect(coll.identifier).to eq ['kx532cb7981'] - expect(coll.title).to eq ["Stanford historical photograph collection, 1887-circa 1996 (inclusive)"] - end - end - end -end diff --git a/spec/lib/importer/mods_parser_spec.rb b/spec/lib/importer/mods_parser_spec.rb deleted file mode 100644 index 46073ed49..000000000 --- a/spec/lib/importer/mods_parser_spec.rb +++ /dev/null @@ -1,238 +0,0 @@ -# frozen_string_literal: true - -require 'importer' - -RSpec.describe Importer::ModsParser do - let(:parser) { described_class.new(file) } - let(:attributes) { parser.attributes } - - describe 'Determine which kind of record it is:' do - describe 'for a collection:' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'kx532cb7981.mods') } - - it 'knows it is a Collection' do - expect(parser.collection?).to eq true - expect(parser.image?).to eq false - expect(parser.model).to eq Collection - end - end - - describe 'for an image:' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'knows it is an Image' do - expect(parser.image?).to eq true - expect(parser.collection?).to eq false - expect(parser.model).to eq "Image" - end - end - end - - describe '#attributes for an Image record' do - let(:ns_decl) { 
"xmlns='#{Mods::MODS_NS}'" } - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'finds metadata for the image' do - expect(attributes[:description]).to eq [] - expect(attributes[:location]).to eq [] - expect(attributes[:form_of_work]).to eq [] - expect(attributes[:extent]).to eq [] - expect(attributes[:accession_number]).to eq ["8735.2", "15097"] - expect(attributes[:sub_location]).to eq [] - expect(attributes[:citation]).to eq [] - acquisition_note = attributes[:notes_attributes].first - expect(acquisition_note[:note_type]).to be nil - expect(acquisition_note[:value]).to start_with '"Left to right' - expect(attributes[:description_standard]).to eq [] - expect(attributes[:series_name]).to eq [] - expect(attributes[:restrictions]).to eq [] - expect(attributes[:institution]).to eq [ - "Stanford University. Libraries. Dept. of Special Collections & University Archives." - ] - end - - context 'with a file that has a general (untyped) note' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports notes' do - expect(attributes[:notes_attributes].first[:value]).to start_with( - "\"Left to right: Anna Maria Lathrop" - ) - end - end - - context 'with a file that has a publisher', skip: "need a record with originInfo" do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports publisher' do - expect(attributes[:publisher]).to eq ['[Cross & Dimmit Pictures]'] - end - end - - context 'with a file that has a photographer', skip: "we're not doing relators beyond contributor" do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports photographer' do - expect(attributes[:photographer]).to eq ['http://id.loc.gov/authorities/names/n97003180'] - end - end - - it 'imports contributor' do - expect(attributes[:contributor]).to eq [{ name: ["Muybridge"], type: "corporate" }] - end - - it 'imports language' do - # I think 
this record should be: - # expect(attributes[:language]).to eq ['http://id.loc.gov/vocabulary/iso639-2/zxx'] - expect(attributes[:language]).to eq ['en'] - end - - it 'imports resource_type' do - expect(attributes[:resource_type]).to eq ['still image'] - end - - it 'imports digital origin', skip: "Need a record with digital origin" do - expect(attributes[:digital_origin]).to eq ['digitized other analog'] - end - - context 'with a file that has coordinates', skip: 'Need metadata with geo data' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports coordinates' do - expect(attributes[:latitude]).to eq ['34.442982'] - expect(attributes[:longitude]).to eq ['-119.657362'] - end - end - - it 'finds metadata for the collection' do - # This only worked because I downloaded the mods from purl - expect(attributes[:collection][:identifier]).to eq ['http://purl.stanford.edu/kx532cb7981'] - expect(attributes[:collection][:title]).to eq ['Stanford historical photograph collection, 1887-circa 1996'] - end - - context 'with a range of dateCreated', skip: "no dates on this record" do - it 'imports created' do - expect(attributes[:created_attributes]).to eq [ - { start: ['1910'], - finish: ['1919'], - label: ['circa 1910s'], - start_qualifier: ['approximate'], - finish_qualifier: ['approximate'] } - ] - end - end - - context 'without date_created' do - let(:parser) { described_class.new(nil) } - let(:xml) do - "1989-12-01" - end - - before { allow(parser).to receive(:mods).and_return(Mods::Record.new.from_str(xml)) } - - it "doesn't return a set of empty date attributes (which would cause an empty TimeSpan to be created)" do - expect(attributes[:created_attributes]).to eq [] - end - end - - context 'with a file that has a range of dateIssued', skip: "no dates on this record" do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports issued' do - expect(attributes[:issued_attributes]).to eq [ - 
{ start: ['1900'], - finish: ['1959'], - label: ['circa 1900s-1950s'], - start_qualifier: ['approximate'], - finish_qualifier: ['approximate'] } - ] - end - end - - context 'with a file that has a single dateIssued', skip: "no dates on this record" do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'imports issued' do - expect(attributes[:issued_attributes]).to eq [ - { start: ['1925'], - finish: [], - label: [], - start_qualifier: [], - finish_qualifier: [] } - ] - end - end - - context 'with date_copyrighted', skip: "no dates on this record" do - let(:parser) { described_class.new(nil) } - let(:xml) do - "1985-12-01" - end - - before { allow(parser).to receive(:mods).and_return(Mods::Record.new.from_str(xml)) } - - it 'imports date_copyrighted' do - expect(attributes[:date_copyrighted_attributes]).to eq [ - { start: ['1985-12-01'], - finish: [], - label: [], - start_qualifier: [], - finish_qualifier: [] } - ] - end - end - - context 'with dateValid' do - let(:parser) { described_class.new(nil) } - let(:xml) do - "1989-12-01" - end - - before { allow(parser).to receive(:mods).and_return(Mods::Record.new.from_str(xml)) } - - it 'imports date_valid' do - expect(attributes[:date_valid_attributes]).to eq [ - { start: ['1989-12-01'], - finish: [], - label: [], - start_qualifier: [], - finish_qualifier: [] } - ] - end - end - - context 'with a file that has an alternative title', skip: "Need a record with alt title" do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'distinguishes between title and alternative title' do - expect(attributes[:title]).to eq ['Stanford residences -- Sacramento -- Muybridge'] - expect(attributes[:alternative]).to eq ['An alternative'] - end - end - - context 'with a file that has placeTerm', skip: 'file has no originInfo' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'druid_xv169dn4538.mods') } - - it 'reads the place' do - 
expect(attributes[:place_of_publication]). to eq ['Santa Barbara, California'] - end - end - end - - describe '#attributes for a Collection record' do - let(:file) { File.join(fixture_path, 'mods', 'shpc', 'kx532cb7981.mods') } - - it 'finds the metadata' do - expect(attributes[:title]).to eq ['Stanford historical photograph collection, 1887-circa 1996 (inclusive)'] - expect(attributes[:creator]).to be_nil - expect(attributes[:contributor]).to eq [{ name: ['Stanford University', 'Archives.'], type: 'corporate' }] - expect(attributes[:description].first).to start_with 'The Stanford historical photograph collection' - expect(attributes[:extent]).to eq ['40 linear feet'] - expect(attributes[:language]).to eq ['eng'] - expect(attributes[:resource_type]).to eq ['still image'] - expect(attributes[:institution]).to eq [ - 'Dept. of Special Collections & University Archives Stanford Univeristy Libraries' - ] - end - end -end