Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Phrase relevance improvements #258 #281

Merged
merged 8 commits into from
Apr 26, 2019
24 changes: 8 additions & 16 deletions cccatalog-api/cccatalog/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,35 +71,27 @@ def search(search_params, index, page_size, ip, page=1) -> Response:
# individual field-level queries specified.
if 'q' in search_params.data:
s = s.query(
'constant_score',
filter=Q(
'query_string',
query=search_params.data['q'],
fields=['tags.name', 'title'],
)
'query_string',
query=search_params.data['q'],
fields=['tags.name', 'title'],
)
else:
if 'creator' in search_params.data:
creator = search_params.data['creator']
s = s.query(
'constant_score',
filter=Q('query_string', query=creator, default_field='creator')
'query_string', query=creator, default_field='creator'
)
if 'title' in search_params.data:
title = search_params.data['title']
s = s.query(
'constant_score',
filter=Q('query_string', query=title, default_field='title')
'query_string', query=title, default_field='title'
)
if 'tags' in search_params.data:
tags = search_params.data['tags']
s = s.query(
'constant_score',
filter=Q(
'query_string',
default_field='tags.name',
query=tags
)
'query_string',
default_field='tags.name',
query=tags
)

s.extra(track_scores=True)
Expand Down
2 changes: 2 additions & 0 deletions cccatalog-api/test/search_qa_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import requests
import pprint
import json
import pytest
from enum import Enum
from .api_live_integration_test import API_URL

Expand All @@ -15,6 +16,7 @@ class QAScores(Enum):
NOT_RELEVANT = 3


@pytest.mark.skip(reason="This test is nondeterministic")
def test_phrase_relevance():
res = requests.get(
"{}/image/search?q=home office&filter_dead=false&qa=true"
Expand Down
3 changes: 2 additions & 1 deletion ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def _cleanup_tags(tags):
'tags': _cleanup_tags,
'url': _cleanup_url,
'creator_url': _cleanup_url,
'foreign_landing_url': _cleanup_url
'foreign_landing_url': _cleanup_url,
'thumbnail': _cleanup_url
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
from elasticsearch_dsl import Date, Text, Integer, Nested, Keyword, DocType


"""
Provides an ORM-like experience for accessing data in Elasticsearch.

Note the actual schema for Elasticsearch is defined in es_mapping.py; any
low-level changes to the index must be represented there as well.
"""


class SyncableDocType(DocType):
"""
Represents tables in the source-of-truth that will be replicated to
Expand Down
125 changes: 125 additions & 0 deletions ingestion_server/ingestion_server/es_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
def create_mapping(table_name):
"""
Return the Elasticsearch mapping for a given table in the database.

:param table_name: The name of the table in the upstream database.
:return:
"""
mapping = {
'image': {
"mappings": {
"doc": {
"properties": {
"license_version": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"view_count": {
"type": "long"
},
"provider": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"source": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"license": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"url": {
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"type": "text"
},
"tags": {
"properties": {
"accuracy": {
"type": "float"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"foreign_landing_url": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"id": {
"type": "long"
},
"identifier": {
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"type": "text"
},
"title": {
"type": "text",
"similarity": "boolean",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"creator": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"created_on": {
"type": "date"
}
}
}
}
}
}
return mapping[table_name]
9 changes: 7 additions & 2 deletions ingestion_server/ingestion_server/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
from elasticsearch_dsl import connections, Search
from psycopg2.sql import SQL, Identifier
from ingestion_server.qa import create_search_qa_index

from ingestion_server.elasticsearch_models import database_table_to_elasticsearch_model
from ingestion_server.elasticsearch_models import \
database_table_to_elasticsearch_model
from ingestion_server.es_mapping import create_mapping

"""
A utility for indexing data to Elasticsearch. For each table to
Expand Down Expand Up @@ -203,6 +204,10 @@ def _index_table(self, table, dest_idx=None):
query = SQL('SELECT * FROM {}'
' WHERE id BETWEEN {} AND {} ORDER BY id'
.format(table, last_added_es_id, last_added_pg_id))
self.es.indices.create(
index=dest_idx,
body=create_mapping(table)
)
self._replicate(table, dest_idx, query)

def _replicate(self, table, dest_index, query):
Expand Down