Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPCN-67] Feature: Add advanced search to Wire #372

Merged
merged 4 commits into from
May 19, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions features/web_api/wire_advanced_search.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
Feature: Wire Advanced Search
Background: Push content
Given "items"
"""
[{
"_id": "weather-today-sydney", "type": "text", "version": 1, "versioncreated": "#DATE#",
"headline": "Sydney Weather Today Current",
"slugline": "current-weather-today",
"body_html": "<p><h1>Sydney Weather Report for Today</h1></p>"
}, {
"_id": "weather-today-prague", "type": "text", "version": 1, "versioncreated": "#DATE-1#",
"headline": "Prague Weather Today",
"slugline": "current-weather-today",
"body_html": "<p><h1>Prague Weather Report for Today</h1></p>"
}, {
"_id": "weather-today-belgrade", "type": "text", "version": 1, "versioncreated": "#DATE-2#",
"headline": "Belgrade Weather Today",
"slugline": "current-weather-today",
"body_html": "<p><h1>Belgrade Weather Report for Today</h1></p>"
}, {
"_id": "sports-results-today-1", "type": "text", "version": 1, "versioncreated": "#DATE-3#",
"headline": "Sports Results for Today",
"slugline": "sports-results",
"body_html": "<p><h1>Sports Results for Today 1</h1></p>"
}]
"""

@auth @admin
Scenario: Search for all keywords
When we get "/wire/search?advanced_search={"all":["Weather","Sydney"]}"
MarkLark86 marked this conversation as resolved.
Show resolved Hide resolved
Then we get the following order
"""
["weather-today-sydney"]
"""
When we get "/wire/search?advanced_search={"all":["Weather","Wellington"]}"
Then we get the following order
"""
[]
"""
When we get "/wire/search?advanced_search={"all":["Weather","Sydney"],"fields":["headline"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""
When we get "/wire/search?advanced_search={"all":["Weather","Sydney"],"fields":["slugline"]}"
Then we get the following order
"""
[]
"""
When we get "/wire/search?advanced_search={"all":["Weather","Today"],"fields":["slugline"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
When we get "/wire/search?advanced_search={"all":["Weather","Sydney","Report"],"fields":["body_html"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""

@auth @admin
Scenario: Search for any keywords
When we get "/wire/search?advanced_search={"any":["Sydney","Prague","Belgrade","Wellington"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
When we get "/wire/search?advanced_search={"any":["Wellington","Canberra"]}"
Then we get the following order
"""
[]
"""
When we get "/wire/search?advanced_search={"any":["Sydney","Prague","Belgrade","Wellington"],"fields":["headline"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
When we get "/wire/search?advanced_search={"any":["Sydney","Prague","Belgrade","Wellington"],"fields":["slugline"]}"
Then we get the following order
"""
[]
"""
When we get "/wire/search?advanced_search={"any":["Sydney","Prague","Belgrade","Wellington"],"fields":["body_html"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""

@auth @admin
Scenario: Search for excluded keywords
When we get "/wire/search?advanced_search={"exclude":["Belgrade","Wellington","Canberra"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "sports-results-today-1"]
"""
When we get "/wire/search?advanced_search={"exclude":["Weather"]}"
Then we get the following order
"""
["sports-results-today-1"]
"""
When we get "/wire/search?advanced_search={"exclude":["Weather"],"fields":["headline"]}"
Then we get the following order
"""
["sports-results-today-1"]
"""

@auth @admin @skip
Scenario: Search multiple fields
MarkLark86 marked this conversation as resolved.
Show resolved Hide resolved
# This scenario is skipped because it currently fails due to a technical limitation of `multi_match` queries
# across fields with different analyzers
# see: https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-multi-match-query.html#cross-field-analysis
When we get "/wire/search?advanced_search={"all":["Sydney","Current","Report"],"fields":["headline","body_html"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""
54 changes: 53 additions & 1 deletion newsroom/search.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import List, Optional, Union, Dict, Any
from typing import List, Optional, Union, Dict, Any, TypedDict
from copy import deepcopy

from flask import current_app as app, json, abort
Expand Down Expand Up @@ -66,6 +66,13 @@ def get_filter_query(
return {"terms": {aggregation_field: val}}


class AdvancedSearchParams(TypedDict, total=False):
    """Shape of the JSON object carried by the ``advanced_search`` request argument.

    Every key is optional (``total=False``): requests routinely send only a
    subset, e.g. ``{"all": ["Weather", "Sydney"]}``, and consumers read the
    values with ``.get()``.
    """

    # Keywords that must all be present in the searched fields
    all: List[str]
    # Keywords of which at least one must be present
    any: List[str]
    # Keywords that must not be present
    exclude: List[str]
    # Item fields to search in; when empty the service's default fields are used
    fields: List[str]


class SearchQuery(object):
"""Class for storing the search parameters for validation and query generation"""

Expand All @@ -78,6 +85,12 @@ def __init__(self):
self.navigation_ids = []
self.products = []
self.requested_products = []
self.advanced: AdvancedSearchParams = {
"all": [],
"any": [],
"exclude": [],
"fields": [],
}

self.args = {}
self.lookup = {}
Expand All @@ -98,6 +111,7 @@ class BaseSearchService(Service):
default_sort = [{"versioncreated": "desc"}]
default_page_size = 25
_matched_ids = [] # array of IDs matched on the request, used when searching all versions
default_advanced_search_fields = []

def get(self, req, lookup):
search = SearchQuery()
Expand Down Expand Up @@ -243,6 +257,7 @@ def apply_filters(self, search):
self.apply_time_limit_filter(search)
self.apply_products_filter(search)
self.apply_request_filter(search)
self.apply_request_advanced_search(search)
self.apply_embargoed_filters(search)

if len(search.query["bool"].get("should", [])):
Expand Down Expand Up @@ -639,6 +654,43 @@ def apply_request_filter(self, search):
if search.args.get("created_from") or search.args.get("created_to"):
search.source["post_filter"]["bool"]["filter"].append(self.versioncreated_range(search.args))

def apply_request_advanced_search(self, search: SearchQuery):
    """Apply the ``advanced_search`` request argument to the search query.

    The argument is a JSON object with optional ``all``, ``any``, ``exclude``
    and ``fields`` keys. ``all`` and ``any`` keywords are appended to the
    query's ``bool.filter`` clauses, ``exclude`` keywords to ``bool.must_not``.

    Nothing is applied when the argument is missing/empty, or when neither the
    request nor ``self.default_advanced_search_fields`` names a field to search.

    :param search: the search being built; ``search.advanced`` is replaced by
        the parsed request payload and ``search.query`` is extended in place.
    :raises HTTPException: 400 (via ``abort``) if the argument is not valid
        JSON or does not decode to a JSON object.
    """
    raw_params = search.args.get("advanced_search")
    if not raw_params:
        return

    try:
        params = json.loads(raw_params)
    except ValueError:
        # ``JSONDecodeError`` subclasses ``ValueError``; malformed client input
        # is a bad request, not a server error.
        abort(400, "Invalid advanced_search parameter")

    if not isinstance(params, dict):
        # Valid JSON, but not an object (e.g. a list or a number)
        abort(400, "Invalid advanced_search parameter")

    search.advanced = params
    fields = params.get("fields") or self.default_advanced_search_fields
    if not fields:
        return

    def gen_match_query(keywords: List[str], operator: str, multi_match_type):
        """Build the Elasticsearch clause matching ``keywords`` against ``fields``."""
        query_string = " ".join(keywords)

        if len(fields) == 1:
            # A single field only needs a plain ``match`` query
            return {
                "match": {
                    fields[0]: {
                        "query": query_string,
                        "operator": operator,
                        "lenient": True,
                    },
                },
            }

        return {
            "multi_match": {
                "query": query_string,
                "type": multi_match_type,
                "fields": fields,
                "operator": operator,
            },
        }

    bool_query = search.query["bool"]
    if params.get("all"):
        bool_query["filter"].append(gen_match_query(params["all"], "AND", "cross_fields"))
    if params.get("any"):
        bool_query["filter"].append(gen_match_query(params["any"], "OR", "best_fields"))
    if params.get("exclude"):
        bool_query["must_not"].append(gen_match_query(params["exclude"], "OR", "best_fields"))

def apply_embargoed_filters(self, search):
"""Generate filters for embargoed params"""

Expand Down
4 changes: 4 additions & 0 deletions newsroom/tests/web_api/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def before_all(context):


def before_scenario(context, scenario):
if "skip" in scenario.tags:
scenario.skip("Marked with @skip")
return

for key in list(agenda_aggs.keys()):
agenda_aggs.pop(key)
agenda_aggs.update(orig_agenda_aggs)
Expand Down
1 change: 1 addition & 0 deletions newsroom/wire/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def items_query(ignore_latest=False):

class WireSearchService(BaseSearchService):
section = "wire"
default_advanced_search_fields = ["headline", "slugline", "body_html"]

def get_bookmarks_count(self, user_id):
req = ParsedRequest()
Expand Down