MITLibraries · JPrevost · Jul 30, 2025 · Jul 23, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/app/graphql/types/counts_type.rb b/app/graphql/types/counts_type.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Types
+  class CountsType < Types::BaseObject
+    description 'Features extracted from the input term that are counts. Useful for machine learning.'
+
+    field :apa_volume_issue, Integer, null: true, description: 'Count of apa volume number pattern in the input'
+    field :brackets, Integer, null: true, description: 'Count of brackets in the input'
+    field :characters, Integer, null: true, description: 'Count of characters in the input'
+    field :colons, Integer, null: true, description: 'Count of colons in the input'
+    field :commas, Integer, null: true, description: 'Count of commas in the input'
+    field :lastnames, Integer, null: true, description: 'Count of lastnames in the input. Not recommended for use in production.'
+    field :no, Integer, null: true, description: 'Count of `no` in the input'
+    field :pages, Integer, null: true, description: 'Count of `pages` in the input'
+    field :periods, Integer, null: true, description: 'Count of Periods in the input'
+    field :pp, Integer, null: true, description: 'Count of `pp` in the input'
+    field :quotes, Integer, null: true, description: 'Count of &quot in the input'
+    field :semicolons, Integer, null: true, description: 'Count of Semicolons in the input'
+    field :vol, Integer, null: true, description: 'Count of `vol` in the input'
+    field :words, Integer, null: true, description: 'Count of Words in the input'
+    field :year_parens, Integer, null: true, description: 'Count of (year) in the input'
+  end
+end
diff --git a/app/graphql/types/features_type.rb b/app/graphql/types/features_type.rb
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+module Types
+  class FeaturesType < Types::BaseObject
+    description 'Features extracted from the input term. Useful for machine learning.'
+
+    field :barcode, String, null: true
+    field :counts, CountsType, null: true
+    field :doi, String, null: true
+    field :isbn, String, null: true
+    field :issn, String, null: true
+    field :journal, String, null: true
+    field :ml_citation, Boolean, null: true
+    field :pmid, String, null: true
+  end
+end
diff --git a/app/graphql/types/search_event_type.rb b/app/graphql/types/search_event_type.rb
@@ -5,12 +5,17 @@ class SearchEventType < Types::BaseObject
     field :categories, [Types::CategoriesType], description: 'The list of categories linked to term provided in this search'
     field :created_at, GraphQL::Types::ISO8601DateTime, null: false
     field :detectors, Types::DetectorsType
+    field :features, FeaturesType, null: true
     field :id, ID, null: false
     field :phrase, String
     field :source, String
     field :term_id, Integer
     field :updated_at, GraphQL::Types::ISO8601DateTime, null: false
 
+    def features
+      @object.term.features
+    end
+
     def phrase
       @object.term.phrase
     end

diff --git a/app/graphql/types/term_type.rb b/app/graphql/types/term_type.rb
@@ -5,6 +5,7 @@ class TermType < Types::BaseObject
     field :categories, [Types::CategoriesType], description: 'The list of categories linked to this term'
     field :created_at, GraphQL::Types::ISO8601DateTime, null: false
     field :detectors, Types::DetectorsType
+    field :features, FeaturesType, null: true
     field :id, ID, null: false
     field :occurence_count, Integer
     field :phrase, String, null: false

diff --git a/app/models/detector/citation.rb b/app/models/detector/citation.rb
@@ -85,19 +85,22 @@ def detections
 
     # The record method first runs all of the parsers by running the initialize method. If the resulting score is higher
     # than the REQUIRED_SCORE value, then a Detection is registered.
+    #
     # @param term [Term]
-    # @return nil
+    #
+    # @return [Hash] a hash of features extracted from the Term
     def self.record(term)
       cit = Detector::Citation.new(term.phrase)
-      return unless cit.detection?
 
-      Detection.find_or_create_by(
-        term:,
-        detector: Detector.where(name: 'Citation').first,
-        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
-      )
+      if cit.detection?
+        Detection.find_or_create_by(
+          term:,
+          detector: Detector.where(name: 'Citation').first,
+          detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
+        )
+      end
 
-      nil
+      cit.features
     end
 
     private

diff --git a/app/models/detector/journal.rb b/app/models/detector/journal.rb
@@ -45,18 +45,19 @@ def self.partial_term_match(phrase)
     # @note This does not care whether multiple matching journals are detected. If _any_ match is found, a Detection
     #       record is created. The uniqueness constraint on Detection records would make multiple detections irrelevant.
     #
-    # @return nil
+    # @return [Set of Journal] A set of ActiveRecord Journal records.
     def self.record(term)
       result = full_term_match(term.phrase)
-      return unless result.any?
 
-      Detection.find_or_create_by(
-        term:,
-        detector: Detector.where(name: 'Journal').first,
-        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
-      )
+      if result.any?
+        Detection.find_or_create_by(
+          term:,
+          detector: Detector.where(name: 'Journal').first,
+          detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
+        )
+      end
 
-      nil
+      result
     end
   end
 end
diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb
@@ -38,20 +38,22 @@ def self.expected_env?
     # If a positive result is received, a Detection is registered.
     #
     # @param term [Term]
-    # @return nil
+    # @return boolean
     def self.record(term)
       result = Detector::MlCitation.new(term.phrase)
-      return unless result.detection?
-
-      # Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
-      # in a future step.
-      Detection.find_or_create_by(
-        term:,
-        detector: Detector.where(name: 'MlCitation').first,
-        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
-      )
 
-      nil
+      if result.detection?
+
+        # Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
+        # in a future step.
+        Detection.find_or_create_by(
+          term:,
+          detector: Detector.where(name: 'MlCitation').first,
+          detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
+        )
+      end
+
+      result.detections
     end
 
     # lambda_path reads and returns the value of one environment variable.

diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb
@@ -32,7 +32,7 @@ def initialize(phrase)
     #       a separate Detection record (although a single check finding multiple matches would still only result in one
     #       Detection for that check).
     #
-    # @return nil
+    # @return [Hash] a hash of standard identifiers extracted from the Term
     def self.record(term)
       si = Detector::StandardIdentifiers.new(term.phrase)
 
@@ -44,7 +44,7 @@ def self.record(term)
         )
       end
 
-      nil
+      si.detections
     end
 
     private

diff --git a/app/models/term.rb b/app/models/term.rb
@@ -48,12 +48,16 @@ def cluster
   # The record_detections method is the one-stop method to call every Detector's record method that is defined within
   # the application.
   #
+  # @note This method is called by the calculate_categorizations method, so it is not necessary to call it separately.
+  # It is also called by the features method to ensure that the latest detections are always available if
+  # calculate_categorizations is not called first.
+  #
   # @return nil
   def record_detections
-    Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
-    Detector::Citation.record(self)
-    Detector::StandardIdentifiers.record(self)
-    Detector::Journal.record(self)
+    @ml_citation = Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
+    @citation_summary = Detector::Citation.record(self)
+    @std_identifiers = Detector::StandardIdentifiers.record(self)
+    @journal = Detector::Journal.record(self)
     Detector::Lcsh.record(self)
     @suggested_resource_category = Detector::SuggestedResource.record(self)
     @suggested_pattern_category = Detector::SuggestedResourcePattern.record(self)
@@ -91,6 +95,45 @@ def calculate_categorizations
     end
   end
 
+  # Extracted features from various detectors are returned as a hash. This method is used to provide a summary of
+  # the Term's features, which are used for machine learning and other purposes.
+  #
+  # @return [Hash] a hash of features extracted from the Term
+  #
+  # @note This method will call record_detections if the @citation_summary is not already populated.
+  #       This is to ensure that the features are always up-to-date with the latest detections.
+  def features
+    record_detections if @citation_summary.blank?
+    {
+      counts: {
+        apa_volume_issue: @citation_summary[:apa_volume_issue] || 0,
+        vol: @citation_summary[:vol] || 0,
+        no: @citation_summary[:no] || 0,
+        pages: @citation_summary[:pages] || 0,
+        pp: @citation_summary[:pp] || 0,
+        year_parens: @citation_summary[:year_parens] || 0,
+        brackets: @citation_summary[:brackets] || 0,
+        lastnames: @citation_summary[:lastnames] || 0,
+
+        characters: @citation_summary[:characters] || 0,
+        colons: @citation_summary[:colons] || 0,
+        commas: @citation_summary[:commas] || 0,
+        quotes: @citation_summary[:quotes] || 0,
+        periods: @citation_summary[:periods] || 0,
+        semicolons: @citation_summary[:semicolons] || 0,
+        words: @citation_summary[:words] || 0
+      },
+
+      barcode: @std_identifiers[:barcode],
+      doi: @std_identifiers[:doi],
+      isbn: @std_identifiers[:isbn],
+      issn: @std_identifiers[:issn],
+      pmid: @std_identifiers[:pmid],
+      journal: @journal&.first&.name,
+      ml_citation: @ml_citation
+    }
+  end
+
   private
 
   # register_fingerprint method gets called before a Term record is saved, ensuring that Terms should always have a

diff --git a/config/environments/development.rb b/config/environments/development.rb
@@ -78,7 +78,7 @@
 
   # Local logging overrides
   config.logger = Logger.new(STDOUT)
-  config.log_level = :debug
+  config.log_level = :info
 
   # Apply autocorrection by RuboCop to files generated by `bin/rails generate`.
   # config.generators.apply_rubocop_autocorrect_after_generate!

diff --git a/test/fixtures/terms.yml b/test/fixtures/terms.yml
@@ -91,3 +91,6 @@ suggested_resource_with_category:
   phrase: 'categories are cool'
   fingerprint: suggested_resource_with_category
   suggested_resource: suggested_resource_with_category
+
+citation_with_all_the_things:
+  phrase: 'Lastname, F. M., & Lastname, F. M. (1999). Title of the work. Nature Medicine, 9(1), pages 1-10, pp. 1-10. 10.1016/j.physio.2010.12.004 vol. 9, no. 1 [12] :; &quot;thing&quot; 9781319145446 1075-8623 PMID: 38908367 39080678901234'
diff --git a/test/models/detector/bulk_checker_test.rb b/test/models/detector/bulk_checker_test.rb
@@ -7,7 +7,7 @@ class CitationTest < ActiveSupport::TestCase
     test 'citation_bulk_checker' do
       bulk = Detector::Citation.check_all_matches(output: true)
 
-      assert_equal(1, bulk.count)
+      assert_equal(2, bulk.count)
     end
 
     test 'journal_bulk_checker' do
@@ -25,7 +25,7 @@ class CitationTest < ActiveSupport::TestCase
     test 'standard_identifier_bulk_checker' do
       bulk = Detector::StandardIdentifiers.check_all_matches(output: true)
 
-      assert_equal(6, bulk.count)
+      assert_equal(7, bulk.count)
     end
 
     test 'suggested_resources_bulk_checker' do

diff --git a/test/models/detector/citation_test.rb b/test/models/detector/citation_test.rb
@@ -271,6 +271,62 @@ class CitationTest < ActiveSupport::TestCase
       end
     end
 
+    test 'record method returns features when detection is false' do
+      t = terms('hi')
+
+      # Confirm phrase is not a detection
+      assert_not Detector::Citation.new(t.phrase).detection?
+
+      # Confirm record method returns features
+      result = Detector::Citation.record(t)
+
+      # Confirm features are in the expected format and contain expected values
+      assert_kind_of Hash, result
+      assert_equal 11, result[:characters]
+      assert_equal 0, result[:colons]
+      assert_equal 0, result[:commas]
+      assert_equal 0, result[:periods]
+      assert_equal 0, result[:semicolons]
+      assert_equal 2, result[:words]
+      assert_equal 0, result[:apa_volume_issue]
+      assert_equal 0, result[:no]
+      assert_equal 0, result[:pages]
+      assert_equal 0, result[:pp]
+      assert_equal 0, result[:vol]
+      assert_equal 0, result[:year_parens]
+      assert_equal 0, result[:brackets]
+      assert_equal 0, result[:lastnames]
+      assert_equal 0, result[:quotes]
+    end
+
+    test 'record method returns features when detection is true' do
+      t = terms('citation')
+
+      # Confirm phrase is a detection
+      assert_predicate Detector::Citation.new(t.phrase), :detection?
+
+      # Confirm record method returns features
+      result = Detector::Citation.record(t)
+
+      # Confirm features are in the expected format and contain expected values
+      assert_kind_of Hash, result
+      assert_equal 265, result[:characters]
+      assert_equal 3, result[:colons]
+      assert_equal 7, result[:commas]
+      assert_equal 11, result[:periods]
+      assert_equal 2, result[:semicolons]
+      assert_equal 33, result[:words]
+      assert_equal 0, result[:apa_volume_issue]
+      assert_equal 1, result[:no]
+      assert_equal 0, result[:pages]
+      assert_equal 0, result[:pp]
+      assert_equal 1, result[:vol]
+      assert_equal 0, result[:year_parens]
+      assert_equal 2, result[:brackets]
+      assert_equal 4, result[:lastnames]
+      assert_equal 1, result[:quotes]
+    end
+
     test 'detections returns nil when score is lower than configured' do
       result = Detector::Citation.new('nothing here')