MIT-LCP · tompollard · Jun 7, 2025 · Jun 7, 2025
diff --git a/physionet-django/search/test_views.py b/physionet-django/search/test_views.py
@@ -1,6 +1,10 @@
 from django.test import TestCase
 from django.utils.html import escape
 from django.urls import reverse
+from django.utils import timezone
+from django.db.models import Q
+from project.models import PublishedProject
+from .views import get_content_postgres_full_text_search, get_content_normal_search
 
 
 class TestProjectSearch(TestCase):
@@ -114,3 +118,182 @@ def assert_no_link(self, response, url):
         """
         link = '<a href="{}"'.format(escape(url))
         self.assertNotIn(link.encode(), response.content)
+
+
+class TestProjectSearchEngine(TestCase):  # Exercises both search functions imported from .views
+    def setUp(self):
+        # Three latest-version "Project" fixtures published now, 1 day ago and 2 days ago, two topics each
+        self.project1 = PublishedProject.objects.create(
+            title="Machine Learning ECG Analysis",
+            abstract="Deep learning approach for ECG signal processing",
+            resource_type="Project",
+            is_latest_version=True,
+            publish_datetime=timezone.now()
+        )
+        self.project1.topics.create(description="Machine Learning")
+        self.project1.topics.create(description="ECG")
+
+        self.project2 = PublishedProject.objects.create(
+            title="ECG Database",
+            abstract="Collection of ECG recordings",
+            resource_type="Project",
+            is_latest_version=True,
+            publish_datetime=timezone.now() - timezone.timedelta(days=1)  # NOTE(review): timedelta comes via django.utils.timezone, not datetime - confirm
+        )
+        self.project2.topics.create(description="ECG")
+        self.project2.topics.create(description="Database")
+
+        self.project3 = PublishedProject.objects.create(
+            title="Deep Learning Tutorial",
+            abstract="Introduction to deep learning concepts",
+            resource_type="Project",
+            is_latest_version=True,
+            publish_datetime=timezone.now() - timezone.timedelta(days=2)
+        )
+        self.project3.topics.create(description="Deep Learning")
+        self.project3.topics.create(description="Tutorial")
+
+    def test_exact_phrase_matching(self):
+        """Full-text search for 'machine learning' ordered by relevance returns project1 first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="machine learning"
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_partial_word_matching(self):
+        """Full-text search for 'learn' finds project1 and project3, whose text contains 'learning'"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="learn"
+        )
+        self.assertIn(self.project1.id, results.values_list('id', flat=True))
+        self.assertIn(self.project3.id, results.values_list('id', flat=True))
+
+    def test_multi_word_search(self):
+        """Full-text search for 'deep ecg' returns project1, the only fixture containing both words, first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="deep ecg"
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_relevance_scoring(self):
+        """Full-text search for 'ecg' ordered by relevance is expected to return project2 first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="ecg"
+        )
+        # NOTE(review): project1 and project2 both have "ECG" in title, abstract and topics; project2 is expected first
+        self.assertEqual(results.first().id, self.project2.id)
+
+    def test_normal_search_exact_matches(self):
+        """Normal search for 'machine learning' ordered by relevance returns project1 first"""
+        results = get_content_normal_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="machine learning"
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_normal_search_partial_matches(self):
+        """Normal search for 'learn' finds project1 and project3, whose text contains 'learning'"""
+        results = get_content_normal_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="learn"
+        )
+        self.assertIn(self.project1.id, results.values_list('id', flat=True))
+        self.assertIn(self.project3.id, results.values_list('id', flat=True))
+
+    def test_normal_search_relevance_scoring(self):
+        """Normal search for 'ecg' ordered by relevance is expected to return project2 first"""
+        results = get_content_normal_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="ecg"
+        )
+        # NOTE(review): project1 also has "ECG" in its title, so title placement alone does not separate the two
+        self.assertEqual(results.first().id, self.project2.id)
+
+    def test_search_term_normalization(self):
+        """Leading, trailing and repeated spaces plus mixed case in the term still return project1 first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="  Machine  Learning  "
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_empty_search_terms(self):
+        """An empty search term returns all three fixture projects"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term=""
+        )
+        self.assertEqual(results.count(), 3)
+
+    def test_special_characters(self):
+        """The hyphenated term 'machine-learning' still finds project1"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="machine-learning"
+        )
+        self.assertIn(self.project1.id, results.values_list('id', flat=True))
+
+    def test_case_insensitivity(self):
+        """An upper-case term 'MACHINE LEARNING' returns project1 first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="MACHINE LEARNING"
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_combined_search_strategies(self):
+        """Full-text search for 'deep learning ecg' ordered by relevance returns project1 first"""
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="relevance",
+            direction="desc",
+            search_term="deep learning ecg"
+        )
+        # project1 is the only fixture whose text contains all three words
+        self.assertEqual(results.first().id, self.project1.id)
+
+    def test_sorting_options(self):
+        """Ordering by publish_datetime (desc) and by title (asc) for the term 'learning'"""
+        # Newest first: project1 (published now) is expected ahead of project3 (2 days ago)
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="publish_datetime",
+            direction="desc",
+            search_term="learning"
+        )
+        self.assertEqual(results.first().id, self.project1.id)
+
+        # Title ascending: "Deep Learning Tutorial" is expected ahead of "Machine Learning ECG Analysis"
+        results = get_content_postgres_full_text_search(
+            resource_type=["Project"],
+            orderby="title",
+            direction="asc",
+            search_term="learning"
+        )
+        self.assertEqual(results.first().id, self.project3.id)
diff --git a/physionet-django/search/views.py b/physionet-django/search/views.py
@@ -4,7 +4,7 @@
 from functools import reduce
 
 from django.conf import settings
-from django.db.models import Case, Count, IntegerField, Q, Sum, Value, When
+from django.db.models import Case, Count, IntegerField, Q, Sum, Value, When, F, FloatField
 from django.http import Http404
 from django.shortcuts import redirect, render, reverse
 from django.templatetags.static import static
@@ -66,34 +66,85 @@ def get_content_postgres_full_text_search(resource_type, orderby, direction, sea
         SearchQuery,
         SearchRank,
         SearchVector,
+        TrigramSimilarity,
     )
 
-    # Split search term by whitespace or punctuation
-    if search_term:
-        search_terms = re.split(r'\s*[\;\,\s]\s*', re.escape(search_term))
-        search_queries = [SearchQuery(term) for term in search_terms]
+    # Split the search term on whitespace, ';' or ',' and drop empty pieces, so a
+    # term made only of separators is handled like an empty search instead of
+    # reaching reduce() with an empty list.
+    search_terms = [term.lower() for term in re.split(r'[;,\s]+', search_term or '') if term]
+    if search_terms:
+        search_queries = []
+        for term in search_terms:
+            # Stemmed whole-word match; the 'plain' search type is the default.
+            term_query = SearchQuery(term, config='english')
+            # Prefix match needs raw tsquery syntax: the 'plain' search type
+            # ignores ':*'. Only purely alphanumeric terms are sent to the raw
+            # parser, so user input cannot inject tsquery operators.
+            if re.fullmatch(r'[0-9a-z]+', term):
+                term_query = term_query | SearchQuery(
+                    term + ':*', config='english', search_type='raw')
+            search_queries.append(term_query)
+
         search_query = reduce(operator.and_, search_queries)
         query = Q(resource_type__in=resource_type) & Q(search=search_query)
     else:
-        search_query = SearchQuery('')
+        search_query = SearchQuery('', config='english')
         query = Q(resource_type__in=resource_type)
 
-    vector = (SearchVector('title', weight='A') + SearchVector('abstract', weight='B')
-              + SearchVector('topics__description', weight='C'))
+    # Configure search vectors with improved weights and word stemming
+    # Title gets highest weight (A), followed by topics (B), then abstract (C)
+    vector = (SearchVector('title', weight='A', config='english') +
+              SearchVector('topics__description', weight='B', config='english') +
+              SearchVector('abstract', weight='C', config='english'))
 
     # Filter projects by latest version and annotate relevance field
     published_projects = PublishedProject.objects.annotate(search=vector).filter(query, is_latest_version=True)
 
+    # Trigram similarity is annotated further down, on the queryset that is
+    # actually returned: annotations added to this inner queryset would be lost,
+    # because only its ids are carried into the id__in subquery.
+
     # get distinct projects with subquery and also include relevance from published_projects
     published_projects = PublishedProject.objects.filter(id__in=published_projects.values('id')).annotate(
-        relevance=SearchRank(vector, search_query)).distinct()
+        relevance=SearchRank(vector, search_query, weights=[0.1, 0.2, 0.4, 1.0])).distinct()
+
+    # Trigram similarity per field, combined as title x3 + topics x2 + abstract x1 (NULL counts as 0)
+    if search_terms:
+        published_projects = published_projects.annotate(
+            title_similarity=TrigramSimilarity('title', search_term),
+            topic_similarity=TrigramSimilarity('topics__description', search_term),
+            abstract_similarity=TrigramSimilarity('abstract', search_term),
+        ).annotate(
+            similarity=Case(
+                When(title_similarity__isnull=False, then=F('title_similarity') * 3),
+                default=Value(0),
+                output_field=FloatField(),
+            ) + Case(
+                When(topic_similarity__isnull=False, then=F('topic_similarity') * 2),
+                default=Value(0),
+                output_field=FloatField(),
+            ) + Case(
+                When(abstract_similarity__isnull=False, then=F('abstract_similarity')),
+                default=Value(0),
+                output_field=FloatField(),
+            )
+        )
 
     # Sorting
     direction = '-' if direction == 'desc' else ''
     order_string = '{}{}'.format(direction, orderby)
 
     if orderby == 'relevance':
-        published_projects = published_projects.order_by('-relevance', '-publish_datetime')
+        if search_terms:
+            # Blend the full-text rank (80%) with the combined trigram
+            # similarity (20%); ties fall back to the newest project first.
+            published_projects = published_projects.order_by(
+                (F('relevance') * 0.8 + F('similarity') * 0.2).desc(),
+                '-publish_datetime'
+            )
+        else:
+            published_projects = published_projects.order_by('-publish_datetime')
     else:
         published_projects = published_projects.order_by(order_string, '-relevance')
 
@@ -110,40 +161,59 @@ def get_content_normal_search(resource_type, orderby, direction, search_term):
     if len(search_term) == 0:
         query = Q(resource_type__in=resource_type)
     else:
-        search_term = re.split(r'\s*[\;\,\s]\s*', re.escape(search_term))
-        query = reduce(operator.or_, (Q(topics__description__iregex=r'{0}{1}{0}'.format(wb,
-            item)) for item in search_term))
-        query = query | reduce(operator.or_, (Q(abstract__iregex=r'{0}{1}{0}'.format(wb,
-            item)) for item in search_term))
-        query = query | reduce(operator.or_, (Q(title__iregex=r'{0}{1}{0}'.format(wb,
-            item)) for item in search_term))
-        query = query & Q(resource_type__in=resource_type)
+        # Clean and normalize search terms
+        search_terms = [term.strip().lower() for term in re.split(r'\s*[\;\,\s]\s*', search_term)]
+        search_terms = [term for term in search_terms if term]  # Remove empty terms
+
+        # Every term must occur, case-insensitively, in the title, the abstract
+        # or a topic description. The substring test already accepts every
+        # whole-word occurrence of the literal term, so filtering needs no
+        # separate word-boundary regex; whole-word matches only affect the
+        # relevance score computed below.
+        query = Q(resource_type__in=resource_type)
+        for term in search_terms:
+            query = query & (Q(title__icontains=term) |
+                             Q(abstract__icontains=term) |
+                             Q(topics__description__icontains=term))
+
     published_projects = (PublishedProject.objects
         .filter(query, is_latest_version=True)
         .annotate(relevance=Count('core_project_id'))
         .annotate(has_keys=Value(0, IntegerField()))
     )
 
-    # Relevance
-    for t in search_term:
-        published_projects = published_projects.annotate(
-            has_keys=Case(
-                When(title__iregex=r"{0}{1}{0}".format(wb, t), then=Value(3)),
-                default=Value(0),
-                output_field=IntegerField(),
-            )
-            + Case(
-                When(topics__description__iregex=r"{0}{1}{0}".format(wb, t), then=Value(2)),
-                default=Value(0),
-                output_field=IntegerField(),
-            )
-            + Case(
-                When(abstract__iregex=r"{0}{1}{0}".format(wb, t), then=Value(1)),
-                default=Value(0),
-                output_field=IntegerField(),
-            )
-        ).annotate(has_keys=Sum("has_keys"))
-
+    # Relevance scoring: within the Case below, the first matching rule wins.
+    # NOTE(review): has_keys is re-annotated on every pass of the loop, so it
+    # looks like only the last search term contributes to the score - confirm.
+    if search_term:
+        for term in search_terms:
+            # Escape the term so the regex matches it literally: it is user
+            # input and may contain characters such as '(' or '*'.
+            word = r"{0}{1}{0}".format(wb, re.escape(term))
+            published_projects = published_projects.annotate(
+                has_keys=Case(
+                    # Exact phrase matches in title
+                    When(title__iexact=term, then=Value(5)),
+                    # Word boundary matches in title
+                    When(title__iregex=word, then=Value(4)),
+                    # Partial matches in title
+                    When(title__icontains=term, then=Value(3)),
+                    # Exact phrase matches in topics
+                    When(topics__description__iexact=term, then=Value(3)),
+                    # Word boundary matches in topics
+                    When(topics__description__iregex=word, then=Value(2)),
+                    # Partial matches in topics
+                    When(topics__description__icontains=term, then=Value(1)),
+                    # Exact phrase matches in abstract
+                    When(abstract__iexact=term, then=Value(2)),
+                    # Word boundary matches in abstract
+                    When(abstract__iregex=word, then=Value(1)),
+                    # Partial matches in abstract
+                    When(abstract__icontains=term, then=Value(0.5)),
+                    default=Value(0),
+                    output_field=FloatField(),
+                )
+            ).annotate(has_keys=Sum("has_keys"))
 
     # Sorting
     direction = '-' if direction == 'desc' else ''