MITLibraries · matt-bernhardt · Jun 12, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
diff --git a/README.md b/README.md
@@ -73,6 +73,10 @@ changes, this is the signal which indicates that terms need to be re-evaluated.
 
 ### Optional
 
+`DETECTOR_LAMBDA_CHALLENGE_SECRET`: The secret phrase required by the external citation detector to process any request. If not present, the citation detector will not be consulted.
+`DETECTOR_LAMBDA_PATH`: The path specified by the external citation detector for prediction requests. If not present, the citation detector will not be consulted.
+`DETECTOR_LAMBDA_URL`: The address of the external citation detector. If not present, the citation detector will not be consulted.
+
 `LIBKEY_KEY`: LibKey API key. Required if `LIBKEY_DOI` or `LIBKEY_PMID` are set.
 `LIBKEY_ID`: LibKey Library ID. Required if `LIBKEY_DOI` or `LIBKEY_PMID` are set.
 `LIBKEY_DOI`: If set, use LibKey for DOI metadata lookups. If not set, Unpaywall is used.

diff --git a/app/models/detector/citation.rb b/app/models/detector/citation.rb
@@ -10,7 +10,7 @@ class Detector
   # hallmarks of being a citation.
   # Phrases whose score is higher than the REQUIRED_SCORE value can be registered as a Detection.
   class Citation
-    attr_reader :score, :subpatterns, :summary
+    attr_reader :features, :score, :subpatterns, :summary
 
     # shared singleton methods
     extend Detector::BulkChecker
@@ -67,10 +67,13 @@ def detection?
     # @return Nothing intentional. Data is written to Hashes `@subpatterns`, `@summary`,
     #   and `@score` during processing.
     def initialize(phrase)
+      @features = {}
       @subpatterns = {}
       @summary = {}
       pattern_checker(phrase)
       summarize(phrase)
+      extract_features
+      @subpatterns.delete_if { |_, v| v == [] }
       @score = calculate_score
     end
 
@@ -135,13 +138,25 @@ def commas(phrase)
       phrase.count(',')
     end
 
+    # This converts the already-built @subpatterns and @summary instance variables into the @features instance variable,
+    # which has a format suitable for sending to our prediction algorithm.
+    def extract_features
+      # Need to create a separate instance variable, so use .deep_dup
+      @features = @subpatterns.deep_dup
+      # Convert the @subpatterns structure of {no: [], pages: ['194-204']} (a hash of matched substrings, with some
+      # empty) into {no: 0, pages: 1} (a hash of integers, some zero)
+      @features = @features.transform_values(&:length)
+      # Now join the re-shaped hash with the @summary variable, so everything is in one place.
+      @features = @features.merge(summary)
+    end
+
     # This builds one of the two main components of the Citation detector - the subpattern report. It uses each of the
     # regular expressions in the CITATION_PATTERNS constant, extracting all matches using the scan method.
     #
     # @return hash
     def pattern_checker(phrase)
       CITATION_PATTERNS.each_pair do |type, pattern|
-        @subpatterns[type.to_sym] = scan(pattern, phrase) if scan(pattern, phrase).present?
+        @subpatterns[type.to_sym] = scan(pattern, phrase)
       end
     end
 

diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb
@@ -0,0 +1,152 @@
+# frozen_string_literal: true
+
+class Detector
+  class MlCitation
+    attr_reader :detections
+
+    # For now the initialize method just needs to consult the external lambda.
+    #
+    #   @param phrase String. Often a `Term.phrase`.
+    #   @return Nothing intentional. A Boolean is written to `@detections` on success; otherwise it stays nil.
+    def initialize(phrase)
+      return unless self.class.expected_env?
+
+      response = fetch(phrase)
+      @detections = response unless response == 'Error'
+    end
+
+    def detection?
+      @detections == true
+    end
+
+    # expected_env? confirms that all three required environment variables are defined. It is provided for the Term
+    # model to check prior to calling because this is still an optional extension to TACOS. If this method returns
+    # false, the Term model skips this detector and relies only on the initial citation detector.
+    #
+    # @return Boolean
+    def self.expected_env?
+      Rails.logger.error('No lambda URL defined') if lambda_url.nil?
+
+      Rails.logger.error('No lambda path defined') if lambda_path.nil?
+
+      Rails.logger.error('No lambda secret defined') if lambda_secret.nil?
+
+      [lambda_url, lambda_path, lambda_secret].all?(&:present?)
+    end
+
+    # The record method runs a supplied term through the detector via its initialize method, which consults the lambda.
+    # If a positive result is received, a Detection is registered.
+    #
+    # @param term [Term]
+    # @return nil
+    def self.record(term)
+      result = Detector::MlCitation.new(term.phrase)
+      return unless result.detection?
+
+      # Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
+      # in a future step.
+      Detection.find_or_create_by(
+        term:,
+        detector: Detector.where(name: 'MlCitation').first,
+        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
+      )
+
+      nil
+    end
+
+    # lambda_path reads and returns the value of one environment variable.
+    #
+    # @note This is a public class method because the entire class ends up getting called in both class and instance
+    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
+    #   changing how the class calls itself via the fetch method.
+    #
+    # @see Detector::MlCitation.expected_env?
+    # @see Detector::MlCitation.fetch
+    # @return String or nil
+    def self.lambda_path
+      ENV.fetch('DETECTOR_LAMBDA_PATH', nil)
+    end
+
+    # lambda_secret reads and returns the value of one environment variable.
+    #
+    # @note This is a public class method because the entire class ends up getting called in both class and instance
+    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
+    #   changing how the class calls itself via the fetch method.
+    #
+    # @see Detector::MlCitation.expected_env?
+    # @see Detector::MlCitation.fetch
+    # @return String or nil
+    def self.lambda_secret
+      ENV.fetch('DETECTOR_LAMBDA_CHALLENGE_SECRET', nil)
+    end
+
+    # lambda_url reads and returns the value of one environment variable.
+    #
+    # @note This is a public class method because the entire class ends up getting called in both class and instance
+    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
+    #   changing how the class calls itself via the fetch method.
+    #
+    # @see Detector::MlCitation.expected_env?
+    # @see Detector::MlCitation.fetch
+    # @return String or nil
+    def self.lambda_url
+      ENV.fetch('DETECTOR_LAMBDA_URL', nil)
+    end
+
+    private
+
+    # define_lambda builds the Faraday connection object used to reach the detector lambda.
+    #
+    # @return Faraday connection
+    def define_lambda
+      Faraday.new(
+        url: self.class.lambda_url,
+        params: {}
+      )
+    end
+
+    # define_payload defines the Hash that will be sent to the lambda.
+    #
+    # @return Hash
+    def define_payload(phrase)
+      {
+        action: 'predict',
+        features: extract_features(phrase),
+        challenge_secret: self.class.lambda_secret
+      }
+    end
+
+    # extract_features passes the search phrase through the citation detector, and massages the resulting features object
+    # to correspond with what the lambda expects.
+    #
+    # @return Hash
+    def extract_features(phrase)
+      features = Detector::Citation.new(phrase).features
+      features[:apa] = features.delete :apa_volume_issue
+      features[:year] = features.delete :year_parens
+      features.delete :characters
+      features
+    end
+
+    # Fetch handles the communication with the detector lambda: defining the connection, building the payload, and any
+    # error handling with the response.
+    #
+    # @return Boolean or 'Error'
+    def fetch(phrase)
+      lambda = define_lambda
+      payload = define_payload(phrase)
+
+      response = lambda.post(self.class.lambda_path, payload.to_json)
+
+      if response.status == 200
+        JSON.parse(response.body)['response'] == 'true'
+      else
+        Rails.logger.error(response.body)
+        Sentry.set_extras({ body: response.body })
+        Sentry.capture_message('Non-200 response received from detector lambda')
+
+        'Error'
+      end
+    end
+  end
+end
diff --git a/app/models/term.rb b/app/models/term.rb
@@ -50,6 +50,7 @@ def cluster
   #
   # @return nil
   def record_detections
+    Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
     Detector::Citation.record(self)
     Detector::StandardIdentifiers.record(self)
     Detector::Journal.record(self)

diff --git a/db/seeds.rb b/db/seeds.rb
@@ -41,6 +41,7 @@
 Detector.find_or_create_by(name: 'Journal')
 Detector.find_or_create_by(name: 'SuggestedResource')
 Detector.find_or_create_by(name: 'Citation')
+Detector.find_or_create_by(name: 'MlCitation')
 Detector.find_or_create_by(name: 'Barcode')
 Detector.find_or_create_by(name: 'SuggestedResourcePattern')
 
@@ -75,6 +76,11 @@
   category: Category.find_by(name: 'Informational'),
   confidence: 0.7
 )
+DetectorCategory.find_or_create_by(
+  detector: Detector.find_by(name: 'MlCitation'),
+  category: Category.find_by(name: 'Transactional'),
+  confidence: 0.95
+)
 DetectorCategory.find_or_create_by(
   detector: Detector.find_by(name: 'PMID'),
   category: Category.find_by(name: 'Transactional'),

diff --git a/test/fixtures/detector_categories.yml b/test/fixtures/detector_categories.yml
@@ -48,3 +48,8 @@ eight:
   detector: barcode
   category: transactional
   confidence: 0.95
+
+nine:
+  detector: mlcitation
+  category: transactional
+  confidence: 0.95
diff --git a/test/fixtures/detectors.yml b/test/fixtures/detectors.yml
@@ -13,6 +13,9 @@ barcode:
 citation:
   name: 'Citation'
 
+mlcitation:
+  name: 'MlCitation'
+
 doi:
   name: 'DOI'
 

diff --git a/test/models/detector/citation_test.rb b/test/models/detector/citation_test.rb
@@ -4,10 +4,12 @@
 
 class Detector
   class CitationTest < ActiveSupport::TestCase
-    test 'detector::citation exposes three instance variables' do
+    test 'detector::citation exposes four instance variables' do
       t = terms('citation')
       result = Detector::Citation.new(t.phrase)
 
+      assert_predicate result.features, :present?
+
       assert_predicate result.score, :present?
 
       assert_predicate result.summary, :present?
@@ -196,6 +198,29 @@ class CitationTest < ActiveSupport::TestCase
       assert_operator 0, :<, result.score
     end
 
+    test 'features instance method is a hash of integers' do
+      result = Detector::Citation.new('simple search phrase')
+
+      assert_instance_of(Hash, result.features)
+
+      assert(result.features.all? { |_, v| v.integer? })
+    end
+
+    test 'features instance method includes all elements of citation detector regardless of search string' do
+      result_simple = Detector::Citation.new('simple')
+      result_complex = Detector::Citation.new('Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.')
+
+      assert_equal result_simple.features.length, result_complex.features.length
+    end
+
+    test 'features instance method should include all elements of citation patterns and summary thresholds' do
+      patterns = Detector::Citation.const_get :CITATION_PATTERNS
+      summary = Detector::Citation.const_get :SUMMARY_THRESHOLDS
+      result = Detector::Citation.new('simple')
+
+      assert_equal (patterns.length + summary.length), result.features.length
+    end
+
     test 'detection? convenience method returns true for obvious citations' do
       result = Detector::Citation.new(terms('citation').phrase)