cc-archive · aldenstpage · Apr 26, 2019 · Apr 24, 2019 · Apr 24, 2019 · Apr 24, 2019
diff --git a/cccatalog-api/cccatalog/api/controllers/search_controller.py b/cccatalog-api/cccatalog/api/controllers/search_controller.py
@@ -71,35 +71,27 @@ def search(search_params, index, page_size, ip, page=1) -> Response:
     # individual field-level queries specified.
     if 'q' in search_params.data:
         s = s.query(
-            'constant_score',
-            filter=Q(
-                'query_string',
-                query=search_params.data['q'],
-                fields=['tags.name', 'title'],
-            )
+            'query_string',
+            query=search_params.data['q'],
+            fields=['tags.name', 'title'],
         )
     else:
         if 'creator' in search_params.data:
             creator = search_params.data['creator']
             s = s.query(
-                'constant_score',
-                filter=Q('query_string', query=creator, default_field='creator')
+                'query_string', query=creator, default_field='creator'
             )
         if 'title' in search_params.data:
             title = search_params.data['title']
             s = s.query(
-                'constant_score',
-                filter=Q('query_string', query=title, default_field='title')
+                'query_string', query=title, default_field='title'
             )
         if 'tags' in search_params.data:
             tags = search_params.data['tags']
             s = s.query(
-                'constant_score',
-                filter=Q(
-                    'query_string',
-                    default_field='tags.name',
-                    query=tags
-                )
+                'query_string',
+                default_field='tags.name',
+                query=tags
             )
 
     s.extra(track_scores=True)

diff --git a/cccatalog-api/test/search_qa_test.py b/cccatalog-api/test/search_qa_test.py
@@ -1,6 +1,7 @@
 import requests
 import pprint
 import json
+import pytest
 from enum import Enum
 from .api_live_integration_test import API_URL
 
@@ -15,6 +16,7 @@ class QAScores(Enum):
     NOT_RELEVANT = 3
 
 
+@pytest.mark.skip(reason="This test is nondeterministic")
 def test_phrase_relevance():
     res = requests.get(
         "{}/image/search?q=home office&filter_dead=false&qa=true"

diff --git a/ingestion_server/ingestion_server/cleanup.py b/ingestion_server/ingestion_server/cleanup.py
@@ -83,7 +83,8 @@ def _cleanup_tags(tags):
                         'tags': _cleanup_tags,
                         'url': _cleanup_url,
                         'creator_url': _cleanup_url,
-                        'foreign_landing_url': _cleanup_url
+                        'foreign_landing_url': _cleanup_url,
+                        'thumbnail': _cleanup_url
                     }
                 }
             }

diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py
@@ -1,6 +1,14 @@
 from elasticsearch_dsl import Date, Text, Integer, Nested, Keyword, DocType
 
 
+"""
+Provides an ORM-like experience for accessing data in Elasticsearch.
+
+Note the actual schema for Elasticsearch is defined in es_mapping.py; any
+low-level changes to the index must be represented there as well.
+"""
+
+
 class SyncableDocType(DocType):
     """
     Represents tables in the source-of-truth that will be replicated to

diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py
@@ -0,0 +1,125 @@
+def create_mapping(table_name):
+    """
+    Return the Elasticsearch mapping for a given table in the database.
+
+    :param table_name: The name of the table in the upstream database.
+    :return:
+    """
+    mapping = {
+        'image': {
+           "mappings": {
+               "doc": {
+                   "properties": {
+                      "license_version": {
+                         "type": "text",
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         }
+                      },
+                      "view_count": {
+                         "type": "long"
+                      },
+                      "provider": {
+                         "type": "text",
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         }
+                      },
+                      "source": {
+                         "fields": {
+                            "keyword": {
+                               "ignore_above": 256,
+                               "type": "keyword"
+                            }
+                         },
+                         "type": "text"
+                      },
+                      "license": {
+                         "fields": {
+                            "keyword": {
+                               "ignore_above": 256,
+                               "type": "keyword"
+                            }
+                         },
+                         "type": "text"
+                      },
+                      "url": {
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         },
+                         "type": "text"
+                      },
+                      "tags": {
+                         "properties": {
+                            "accuracy": {
+                               "type": "float"
+                            },
+                            "name": {
+                               "type": "text",
+                               "fields": {
+                                  "keyword": {
+                                     "type": "keyword",
+                                     "ignore_above": 256
+                                  }
+                               }
+                            }
+                         }
+                      },
+                      "foreign_landing_url": {
+                         "fields": {
+                            "keyword": {
+                               "ignore_above": 256,
+                               "type": "keyword"
+                            }
+                         },
+                         "type": "text"
+                      },
+                      "id": {
+                         "type": "long"
+                      },
+                      "identifier": {
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         },
+                         "type": "text"
+                      },
+                      "title": {
+                         "type": "text",
+                         "similarity": "boolean",
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         }
+                      },
+                      "creator": {
+                         "type": "text",
+                         "fields": {
+                            "keyword": {
+                               "type": "keyword",
+                               "ignore_above": 256
+                            }
+                         }
+                      },
+                      "created_on": {
+                         "type": "date"
+                      }
+                   }
+               }
+           }
+        }
+    }
+    return mapping[table_name]
diff --git a/ingestion_server/ingestion_server/indexer.py b/ingestion_server/ingestion_server/indexer.py
@@ -16,8 +16,9 @@
 from elasticsearch_dsl import connections, Search
 from psycopg2.sql import SQL, Identifier
 from ingestion_server.qa import create_search_qa_index
-
-from ingestion_server.elasticsearch_models import database_table_to_elasticsearch_model
+from ingestion_server.elasticsearch_models import \
+    database_table_to_elasticsearch_model
+from ingestion_server.es_mapping import create_mapping
 
 """
 A utility for indexing data to Elasticsearch. For each table to
@@ -203,6 +204,10 @@ def _index_table(self, table, dest_idx=None):
             query = SQL('SELECT * FROM {}'
                         ' WHERE id BETWEEN {} AND {} ORDER BY id'
                         .format(table, last_added_es_id, last_added_pg_id))
+            self.es.indices.create(
+                index=dest_idx,
+                body=create_mapping(table)
+            )
             self._replicate(table, dest_idx, query)
 
     def _replicate(self, table, dest_index, query):