Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPCN-67] Feature: Add advanced search to Wire #372

Merged
merged 4 commits into from
May 19, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions features/web_api/wire_advanced_search.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
Feature: Wire Advanced Search
Background: Push content
# Seeds four text items: three weather stories (Sydney, Prague, Belgrade) that
# share the slugline "current-weather-today", and one sports story.
# NOTE(review): the #DATE#, #DATE-1#, ... placeholders presumably resolve to
# progressively older timestamps, which would explain the result order asserted
# in the scenarios below -- confirm against the step definitions.
Given "items"
"""
[{
"_id": "weather-today-sydney", "type": "text", "version": 1, "versioncreated": "#DATE#",
"headline": "Sydney Weather Today Current",
"slugline": "current-weather-today",
"body_html": "<p><h1>Sydney Weather Report for Today</h1></p>"
}, {
"_id": "weather-today-prague", "type": "text", "version": 1, "versioncreated": "#DATE-1#",
"headline": "Prague Weather Today",
"slugline": "current-weather-today",
"body_html": "<p><h1>Prague Weather Report for Today</h1></p>"
}, {
"_id": "weather-today-belgrade", "type": "text", "version": 1, "versioncreated": "#DATE-2#",
"headline": "Belgrade Weather Today",
"slugline": "current-weather-today",
"body_html": "<p><h1>Belgrade Weather Report for Today</h1></p>"
}, {
"_id": "sports-results-today-1", "type": "text", "version": 1, "versioncreated": "#DATE-3#",
"headline": "Sports Results for Today",
"slugline": "sports-results",
"body_html": "<p><h1>Sports Results for Today 1</h1></p>"
}]
"""

@auth @admin
Scenario: Search for all keywords
# "all" requires every keyword to match. When "fields" is omitted, the
# service's default_advanced_search_fields apply (headline, slugline and
# body_html for WireSearchService).
When we get "/wire/search?advanced_search={"all":"Weather Sydney"}"
Then we get the following order
"""
["weather-today-sydney"]
"""
# No seeded item mentions Wellington, so requiring it matches nothing.
When we get "/wire/search?advanced_search={"all":"Weather Wellington"}"
Then we get the following order
"""
[]
"""
# Restricted to the headline: only the Sydney headline has both words.
When we get "/wire/search?advanced_search={"all":"Weather Sydney","fields":["headline"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""
# No slugline contains "Sydney".
When we get "/wire/search?advanced_search={"all":"Weather Sydney","fields":["slugline"]}"
Then we get the following order
"""
[]
"""
# The three weather items share the slugline "current-weather-today".
When we get "/wire/search?advanced_search={"all":"Weather Today","fields":["slugline"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
# Only the Sydney body contains all three words.
When we get "/wire/search?advanced_search={"all":"Weather Sydney Report","fields":["body_html"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""

@auth @admin
Scenario: Search for any keywords
# "any" matches items containing at least one of the keywords.
# Each weather item names one of the three cities; Wellington matches nothing.
When we get "/wire/search?advanced_search={"any":"Sydney Prague Belgrade Wellington"}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
# Neither keyword appears in any seeded item.
When we get "/wire/search?advanced_search={"any":"Wellington Canberra"}"
Then we get the following order
"""
[]
"""
# City names appear in the headlines ...
When we get "/wire/search?advanced_search={"any":"Sydney Prague Belgrade Wellington","fields":["headline"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""
# ... but not in any slugline ...
When we get "/wire/search?advanced_search={"any":"Sydney Prague Belgrade Wellington","fields":["slugline"]}"
Then we get the following order
"""
[]
"""
# ... and again in the bodies.
When we get "/wire/search?advanced_search={"any":"Sydney Prague Belgrade Wellington","fields":["body_html"]}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "weather-today-belgrade"]
"""

@auth @admin
Scenario: Search for excluded keywords
# "exclude" drops items that contain any of the keywords.
# Only the Belgrade item mentions one of these words.
When we get "/wire/search?advanced_search={"exclude":"Belgrade Wellington Canberra"}"
Then we get the following order
"""
["weather-today-sydney", "weather-today-prague", "sports-results-today-1"]
"""
# Every weather item mentions "Weather"; the sports item does not.
When we get "/wire/search?advanced_search={"exclude":"Weather"}"
Then we get the following order
"""
["sports-results-today-1"]
"""
# Checking the headline alone: the sports headline is the only one without "Weather".
When we get "/wire/search?advanced_search={"exclude":"Weather","fields":["headline"]}"
Then we get the following order
"""
["sports-results-today-1"]
"""

@auth @admin
Scenario: Search multiple fields
MarkLark86 marked this conversation as resolved.
Show resolved Hide resolved
# This scenario currently fails, due to a technical limitation with `multi_match` queries
# across fields with different analyzers
# see: https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-multi-match-query.html#cross-field-analysis
# NOTE(review): newsroom/wire/items.py in this same change gives `headline` and
# `slugline` the `body_html` schema, which looks aimed at this limitation.
# Confirm whether the scenario still fails; if it does, tag it `@skip` so that
# before_scenario skips it, otherwise drop the comment above.
# The keywords are split across fields: "Current" is only in the Sydney
# headline and "Report" is only in the bodies.
When we get "/wire/search?advanced_search={"all":"Sydney Current Report","fields":["headline","body_html"]}"
Then we get the following order
"""
["weather-today-sydney"]
"""
2 changes: 1 addition & 1 deletion newsroom/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ELASTIC_PREFIX = "CONTENTAPI_ELASTICSEARCH"

SCHEMA_VERSIONS = {
"wire": 1,
"wire": 2,
"agenda": 3,
}

Expand Down
3 changes: 1 addition & 2 deletions newsroom/news_api/items/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import superdesk
from content_api.items import ItemsResource as ContentApiItemsResource
from content_api.items import ItemsService as ContentApiItemsService
from newsroom.wire.items import ItemsResource as ContentApiItemsResource, ItemsService as ContentApiItemsService


class ItemsResource(ContentApiItemsResource):
Expand Down
2 changes: 1 addition & 1 deletion newsroom/news_api/news/search_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from superdesk import get_resource_service
from superdesk.utc import utcnow, local_to_utc
from superdesk.errors import SuperdeskApiError
from content_api.items.resource import ItemsResource
from newsroom.wire.items import ItemsResource
from content_api.errors import BadParameterValueError, UnexpectedParameterError
from newsroom.auth import get_company

Expand Down
54 changes: 53 additions & 1 deletion newsroom/search.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import List, Optional, Union, Dict, Any
from typing import List, Optional, Union, Dict, Any, TypedDict
from copy import deepcopy

from flask import current_app as app, json, abort
Expand Down Expand Up @@ -66,6 +66,13 @@ def get_filter_query(
return {"terms": {aggregation_field: val}}


class AdvancedSearchParams(TypedDict):
    """Decoded value of the ``advanced_search`` request argument."""

    # Space separated keywords that must all match
    all: str
    # Space separated keywords of which at least one must match
    any: str
    # Space separated keywords that must not match
    exclude: str
    # Names of the item fields to search, e.g. ["headline", "body_html"].
    # This is a list, not a string: requests send a JSON array and the query
    # builder indexes it (``fields[0]``) and checks ``len(fields) == 1``.
    # NOTE(review): SearchQuery.__init__ still seeds this key with "" -- it
    # should become [] to agree with this annotation.
    fields: List[str]
MarkLark86 marked this conversation as resolved.
Show resolved Hide resolved


class SearchQuery(object):
"""Class for storing the search parameters for validation and query generation"""

Expand All @@ -78,6 +85,12 @@ def __init__(self):
self.navigation_ids = []
self.products = []
self.requested_products = []
self.advanced: AdvancedSearchParams = {
"all": "",
"any": "",
"exclude": "",
"fields": "",
}

self.args = {}
self.lookup = {}
Expand All @@ -98,6 +111,7 @@ class BaseSearchService(Service):
default_sort = [{"versioncreated": "desc"}]
default_page_size = 25
_matched_ids = [] # array of IDs matched on the request, used when searching all versions
default_advanced_search_fields = []

def get(self, req, lookup):
search = SearchQuery()
Expand Down Expand Up @@ -243,6 +257,7 @@ def apply_filters(self, search):
self.apply_time_limit_filter(search)
self.apply_products_filter(search)
self.apply_request_filter(search)
self.apply_request_advanced_search(search)
self.apply_embargoed_filters(search)

if len(search.query["bool"].get("should", [])):
Expand Down Expand Up @@ -639,6 +654,43 @@ def apply_request_filter(self, search):
if search.args.get("created_from") or search.args.get("created_to"):
search.source["post_filter"]["bool"]["filter"].append(self.versioncreated_range(search.args))

def apply_request_advanced_search(self, search: SearchQuery):
    """Add the ``advanced_search`` request argument to the search query.

    The argument is a JSON object with optional ``all``, ``any`` and
    ``exclude`` keyword strings and an optional ``fields`` list. When
    ``fields`` is missing or empty, ``self.default_advanced_search_fields``
    is used; if that is empty as well, the query is left untouched.

    * ``all``     -> appended to ``filter``, operator ``AND``
    * ``any``     -> appended to ``filter``, operator ``OR``
    * ``exclude`` -> appended to ``must_not``, operator ``OR``

    The decoded parameters are stored on ``search.advanced``.

    Aborts with HTTP 400 when the argument is not valid JSON or does not
    decode to a JSON object, rather than failing with an unhandled exception.
    """
    raw_params = search.args.get("advanced_search")
    if not raw_params:
        return

    try:
        params = json.loads(raw_params)
    except (TypeError, ValueError):
        # JSON decoding errors are ``ValueError`` subclasses. This is bad
        # client input, so report it as such instead of a server error.
        abort(400, "Invalid advanced_search parameter: not valid JSON")

    if not isinstance(params, dict):
        # Values such as ``[]`` or ``"text"`` decode fine but are not usable
        abort(400, "Invalid advanced_search parameter: expected a JSON object")

    search.advanced = params
    fields = params.get("fields") or self.default_advanced_search_fields
    if not fields:
        return

    def gen_match_query(keywords: str, operator: str, multi_match_type):
        """Build the clause matching ``keywords`` against ``fields``."""
        if len(fields) == 1:
            # A single field uses a plain ``match``; ``multi_match_type``
            # only applies when several fields are searched together.
            return {
                "match": {
                    fields[0]: {
                        "query": keywords,
                        "operator": operator,
                        "lenient": True,
                    },
                },
            }

        return {
            "multi_match": {
                "query": keywords,
                "type": multi_match_type,
                "fields": fields,
                "operator": operator,
            },
        }

    bool_query = search.query["bool"]
    if params.get("all"):
        bool_query["filter"].append(gen_match_query(params["all"], "AND", "cross_fields"))
    if params.get("any"):
        bool_query["filter"].append(gen_match_query(params["any"], "OR", "best_fields"))
    if params.get("exclude"):
        bool_query["must_not"].append(gen_match_query(params["exclude"], "OR", "best_fields"))

def apply_embargoed_filters(self, search):
"""Generate filters for embargoed params"""

Expand Down
4 changes: 4 additions & 0 deletions newsroom/tests/web_api/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def before_all(context):


def before_scenario(context, scenario):
if "skip" in scenario.tags:
scenario.skip("Marked with @skip")
return

for key in list(agenda_aggs.keys()):
agenda_aggs.pop(key)
agenda_aggs.update(orig_agenda_aggs)
Expand Down
3 changes: 1 addition & 2 deletions newsroom/web/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,7 @@
CORE_APPS = [
"superdesk.notification",
"superdesk.data_updates",
"content_api.items",
"content_api.items_versions",
"newsroom.wire.items",
"content_api.search",
"content_api.auth",
"content_api.publish",
Expand Down
23 changes: 23 additions & 0 deletions newsroom/wire/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from copy import deepcopy

import superdesk
from content_api.items.resource import ItemsResource as BaseItemsResource
from content_api.items.service import ItemsService

from content_api.items_versions.resource import ItemsVersionsResource as BaseItemsVersionsResource
from content_api.items_versions.service import ItemsVersionsService


class ItemsResource(BaseItemsResource):
    """Items resource whose ``slugline`` and ``headline`` are defined like ``body_html``.

    NOTE(review): presumably done so the fields used by advanced search are
    analysed the same way -- confirm against the content API schema.
    """

    schema = deepcopy(BaseItemsResource.schema)
    # Give each field its own copy of the ``body_html`` definition. Binding a
    # single dict to all three keys would let an in-place change to one
    # field's definition silently alter the other two.
    schema["slugline"] = deepcopy(schema["body_html"])
    schema["headline"] = deepcopy(schema["body_html"])


class ItemsVersionsResource(BaseItemsVersionsResource):
    """Item versions resource whose ``slugline`` and ``headline`` are defined like ``body_html``.

    NOTE(review): presumably done so the fields used by advanced search are
    analysed the same way -- confirm against the content API schema.
    """

    schema = deepcopy(BaseItemsVersionsResource.schema)
    # Give each field its own copy of the ``body_html`` definition. Binding a
    # single dict to all three keys would let an in-place change to one
    # field's definition silently alter the other two.
    schema["slugline"] = deepcopy(schema["body_html"])
    schema["headline"] = deepcopy(schema["body_html"])


def init_app(app):
    """Register the ``items`` and ``items_versions`` resources on ``app``."""
    registrations = (
        ("items", ItemsResource, ItemsService),
        ("items_versions", ItemsVersionsResource, ItemsVersionsService),
    )
    for resource_name, resource_class, service_class in registrations:
        superdesk.register_resource(resource_name, resource_class, service_class, _app=app)
1 change: 1 addition & 0 deletions newsroom/wire/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def items_query(ignore_latest=False):

class WireSearchService(BaseSearchService):
section = "wire"
default_advanced_search_fields = ["headline", "slugline", "body_html"]

def get_bookmarks_count(self, user_id):
req = ParsedRequest()
Expand Down