Skip to content

Commit

Permalink
Hackernews search in EVADB (#1362)
Browse files Browse the repository at this point in the history
This PR supports searching Hackernews stories in EVADB:

- We treat Hackernews as a data source
- Algolia search API for hackernews is used for searching
- Get requests are used
- We support searching within: `story`, `comment`, `URL`, `poll`,
or a combination of them
- Syntax:
```
params = {
        "query": "EVADB",
        "tags": "(story,poll)",
    }
query = f"""CREATE DATABASE hackernews_data
    WITH ENGINE = "hackernews",
    PARAMETERS = {params};"""
```

---------

Co-authored-by: Kaushik Ravichandran <kravicha3@ada-01.cc.gatech.edu>
  • Loading branch information
kaushikravichandran and Kaushik Ravichandran authored Nov 20, 2023
1 parent f6f3dda commit aab2446
Show file tree
Hide file tree
Showing 8 changed files with 296 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ parts:
- file: source/reference/databases/clickhouse
- file: source/reference/databases/github
- file: source/reference/databases/snowflake
- file: source/reference/databases/hackernews

- file: source/reference/vector_databases/index
title: Vector Databases
Expand Down
44 changes: 44 additions & 0 deletions docs/source/reference/databases/hackernews.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
Hackernews
==========

The connection to Hackernews is based on the `Algolia Hackernews <https://hn.algolia.com/api>`_ API.

Dependency
----------

* requests


Parameters
----------

Required:

* ``query`` is the search query for getting the results.

Optional:

* ``tags`` is the tag used for filtering the query results. Check `available tags <https://hn.algolia.com/api#:~:text=filter%20on%20a%20specific%20tag.%20Available%20tags%3A>`_ to see a list of available filter tags.

Create Connection
-----------------

.. code-block:: text

   CREATE DATABASE hackernews_data WITH ENGINE = 'hackernews', PARAMETERS = {
        "query": "EVADB",
        "tags": "story"
   };

Supported Tables
----------------

* ``search_results``: Lists the search query results. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/hackernews/table_column_info.py>`_ for all the available columns in the table.

.. code-block:: sql

   SELECT * FROM hackernews_data.search_results LIMIT 3;

.. note::

   Looking for another table from Hackernews? Please raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
15 changes: 15 additions & 0 deletions evadb/third_party/databases/hackernews/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""hackernews search integration"""
152 changes: 152 additions & 0 deletions evadb/third_party/databases/hackernews/hackernews_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

import pandas as pd
import requests

from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS
from evadb.third_party.databases.types import (
DBHandler,
DBHandlerResponse,
DBHandlerStatus,
)


class HackernewsSearchHandler(DBHandler):
    """Expose Hacker News search results as a read-only data source.

    Searches are served by the Algolia Hacker News Search API over plain GET
    requests. The handler offers a single table, ``search_results``, whose
    columns are listed in ``HACKERNEWS_COLUMNS``.
    """

    # Seconds to wait on any HTTP call so a stalled network cannot hang a query.
    _REQUEST_TIMEOUT = 10

    def connection(self):
        """
        Probe for internet connectivity.
        Returns:
            bool: True if the probe URL answered with HTTP 200, False if it
            answered with another status or the request failed.
        """
        try:
            response = requests.get(
                "https://www.google.com/", timeout=self._REQUEST_TIMEOUT
            )
        except requests.RequestException:
            # A network failure means "not connected"; callers expect a bool.
            return False
        return response.status_code == 200

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` is the search text and ``tags`` the optional tag filter.
        """
        super().__init__(name)
        self.query = kwargs.get("query", "")
        self.tags = kwargs.get("tags", "")

    @property
    def supported_table(self):
        """
        Describe the tables served by this handler.
        Returns:
            dict: maps each table name to its ``columns`` and to a fresh
            ``generator`` that yields one dict per search hit. No request is
            sent until the generator is iterated.
        """

        def _hackernews_topics_generator():
            # Let requests build the query string so that spaces, commas and
            # parentheses in the query or tag filter are URL-encoded.
            params = {
                "query": self.query,
                "tags": self.tags or "story",  # search stories by default
            }
            response = requests.get(
                "http://hn.algolia.com/api/v1/search",
                params=params,
                timeout=self._REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                raise Exception("Could not reach website.")
            dict_result = json.loads(response.content)
            # The matching items live under "hits"; the top-level object only
            # carries paging metadata next to it.
            for row in dict_result.get("hits", []):
                # Not every hit type carries every column (e.g. comments have
                # no "title"), so missing fields become None.
                yield {
                    property_name: row.get(property_name)
                    for property_name, _ in HACKERNEWS_COLUMNS
                }

        mapping = {
            "search_results": {
                "columns": HACKERNEWS_COLUMNS,
                "generator": _hackernews_topics_generator(),
            },
        }
        return mapping

    def connect(self):
        """
        Set up the connection required by the handler.
        Returns:
            DBHandlerStatus
        """
        # Every search is an independent HTTP request; nothing to set up.
        return DBHandlerStatus(status=True)

    def disconnect(self):
        """
        Close any existing connections.
        """
        pass

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection():
            return DBHandlerStatus(status=True)
        return DBHandlerStatus(status=False, error="Not connected to the internet.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table.keys()), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")
        try:
            columns_df = pd.DataFrame(
                self.supported_table[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse
        """
        # The probe must be *called*; a bare bound method is always truthy.
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )

            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))
23 changes: 23 additions & 0 deletions evadb/third_party/databases/hackernews/table_column_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Schema of the ``search_results`` table: each entry is a
# [column name, Python type] pair, in the order the columns are exposed.
HACKERNEWS_COLUMNS = [
    ["title", str],
    ["url", str],
    ["author", str],
    ["points", int],
    ["story_text", str],
    ["num_comments", int],
]
2 changes: 2 additions & 0 deletions evadb/third_party/databases/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs):
return mod.SnowFlakeDbHandler(engine, **kwargs)
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
elif engine == "hackernews":
return mod.HackernewsSearchHandler(engine, **kwargs)
elif engine == "slack":
return mod.SlackHandler(engine, **kwargs)
else:
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def read(path, encoding="utf-8"):

xgboost_libs = ["flaml[automl]"]

hackernews_libs = ["requests"]

forecasting_libs = [
"statsforecast", # MODEL TRAIN AND FINE TUNING
"neuralforecast", # MODEL TRAIN AND FINE TUNING
Expand Down Expand Up @@ -176,6 +178,7 @@ def read(path, encoding="utf-8"):
"sklearn": sklearn_libs,
"xgboost": xgboost_libs,
"forecasting": forecasting_libs,
"hackernews": hackernews_libs,
# everything except ray, qdrant, ludwig and postgres. The first three fail on python 3.11.
"dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs + imagegen_libs + xgboost_libs
}
Expand Down
56 changes: 56 additions & 0 deletions test/integration_tests/long/test_hackernews_datasource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test.util import get_evadb_for_testing

import pytest

from evadb.server.command_handler import execute_query_fetch_all
from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS


@pytest.mark.notparallel
class HackernewsDataSourceTest(unittest.TestCase):
    """Integration test for the ``hackernews`` data source (needs network access)."""

    def setUp(self):
        self.evadb = get_evadb_for_testing()
        # reset the catalog manager before running each test
        self.evadb.catalog().reset()

    def tearDown(self):
        execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS hackernews_data;")

    @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message")
    def test_should_run_select_query_in_hackernews(self):
        """Create a Hackernews database and read a limited slice of search hits."""
        # Create database.
        params = {
            "query": "EVADB",
            "tags": "story",
        }
        query = f"""CREATE DATABASE hackernews_data
WITH ENGINE = "hackernews",
PARAMETERS = {params};"""
        execute_query_fetch_all(self.evadb, query)

        query = "SELECT * FROM hackernews_data.search_results LIMIT 5;"
        batch = execute_query_fetch_all(self.evadb, query)
        # The row count must agree with the LIMIT clause of the query above.
        self.assertEqual(len(batch), 5)
        expected_column = [
            "search_results.{}".format(col) for col, _ in HACKERNEWS_COLUMNS
        ]
        self.assertEqual(batch.columns, expected_column)


if __name__ == "__main__":
    unittest.main()

0 comments on commit aab2446

Please sign in to comment.