Skip to content

Commit

Permalink
Merge pull request #2148 from mabel-dev/#2146
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer authored Dec 24, 2024
2 parents 0829fea + 3d9e65a commit 901cccf
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 69 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 913
__build__ = 915

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
63 changes: 8 additions & 55 deletions opteryx/managers/expression/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,71 +180,24 @@ def _inner_filter_operations(arr, operator, value):
return list_ops.cython_allop_neq(arr[0], value)

if operator == "AnyOpILike":
patterns = value[0]
from opteryx.utils.sql import regex_match_any

combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p)
combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE)

out = numpy.zeros(arr.size, dtype=bool)
for i, row in enumerate(arr):
if row is None:
out[i] = None
continue
if row.size == 0:
continue
out[i] = any(combined_regex.search(elem) for elem in row)

return out
return regex_match_any(arr, value[0], flags=re.IGNORECASE)

if operator == "AnyOpLike":
patterns = value[0]
from opteryx.utils.sql import regex_match_any

combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p)
combined_regex = re.compile(combined_regex_pattern)
return regex_match_any(arr, value[0])

out = numpy.zeros(arr.size, dtype=bool)
for i, row in enumerate(arr):
if row is None:
out[i] = None
continue
if row.size == 0:
continue
out[i] = any(combined_regex.search(elem) for elem in row)

return out
if operator == "AnyOpNotLike":
patterns = value[0]

combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p)
combined_regex = re.compile(combined_regex_pattern)
from opteryx.utils.sql import regex_match_any

out = numpy.zeros(arr.size, dtype=bool)
for i, row in enumerate(arr):
if row is None:
out[i] = None
continue
if row.size == 0:
continue
out[i] = any(combined_regex.search(elem) for elem in row)

return numpy.invert(out)
return regex_match_any(arr, value[0], invert=True)

if operator == "AnyOpNotILike":
patterns = value[0]

combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p)
combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE)

out = numpy.zeros(arr.size, dtype=bool)
for i, row in enumerate(arr):
if row is None:
out[i] = None
continue
if row.size == 0:
continue
out[i] = any(combined_regex.search(elem) for elem in row)
from opteryx.utils.sql import regex_match_any

return numpy.invert(out)
return regex_match_any(arr, value[0], flags=re.IGNORECASE, invert=True)

if operator == "AtQuestion":
import simdjson
Expand Down
83 changes: 83 additions & 0 deletions opteryx/utils/sql.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
from typing import List

import numpy

ESCAPE_SPECIAL_CHARS = re.compile(r"([.^$*+?{}[\]|()\\])")


Expand Down Expand Up @@ -114,3 +116,84 @@ def split_sql_statements(sql: str) -> List[str]:
statements.append("".join(buffer).strip())

return [s for s in statements if s != ""]


def regex_match_any(
    arr: numpy.ndarray,
    patterns: List[str],
    flags: int = 0,
    invert: bool = False,
) -> numpy.ndarray:
    """
    Evaluate whether each row in `arr` matches ANY of the given LIKE patterns.

    Each truthy pattern is converted with `sql_like_to_regex`, the results are
    joined with `|` and compiled once; rows are then tested with
    `Pattern.search`.

    Parameters:
        arr: numpy.ndarray
            1D array of rows. Each element can be:
              - None
              - A single string/bytes
              - A list/tuple/array of strings/bytes
            The mode (single value vs. collection) is decided from the first
            non-None element, so all non-None elements are assumed to share
            the same structure.
        patterns: List[str]
            A list of SQL LIKE patterns. Falsy entries (None, "") are ignored.
            If no usable pattern remains, nothing matches.
        flags: int, optional
            Flags to pass to `re.compile()`, e.g. re.IGNORECASE for ILIKE.
            Defaults to 0 (no flags); the literal is used instead of
            `re.NOFLAG` because that name only exists on Python 3.11+.
        invert: bool, optional
            When True the match result is negated (NOT LIKE ANY). None rows
            stay None regardless of this setting.

    Returns:
        numpy.ndarray:
            A 1D object array with True, False, or None, indicating whether
            each row did (or did not) match the patterns.
    """
    # 1) Combine the LIKE patterns into a single compiled regex.
    combined_pattern_str = r"|".join(sql_like_to_regex(p) for p in patterns if p)
    if not combined_pattern_str:
        # An empty negative lookahead fails at every position, so it can never
        # match. (`(?!x)` would be wrong here: it succeeds wherever the next
        # character is not "x", including at end-of-string, i.e. always.)
        combined_pattern_str = r"(?!)"

    search = re.compile(combined_pattern_str, flags=flags).search

    # 2) Create the output array (dtype=object so we can store None/bool).
    out = numpy.empty(arr.size, dtype=object)

    # 3) Determine if the array holds single strings or collections of strings.
    first_non_none = next((x for x in arr if x is not None), None)

    # If the entire array is None (or empty), just return all None.
    if first_non_none is None:
        out[:] = None
        return out

    single_string_mode = isinstance(first_non_none, (str, bytes))

    # 4) Main loop
    for i, row in enumerate(arr):
        if row is None:
            out[i] = None
            continue

        if single_string_mode:
            is_match = search(row) is not None
        else:
            # `any` over an empty row is already False, so no explicit size
            # check is needed; this also lets plain lists/tuples (which have
            # no `.size`) work. None elements inside a row cannot match and
            # are skipped rather than raising TypeError.
            is_match = any(elem is not None and search(elem) is not None for elem in row)

        out[i] = (not is_match) if invert else is_match

    return out
53 changes: 42 additions & 11 deletions tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -1857,29 +1857,60 @@
("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 357, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 357, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 334, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 334, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 37, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 320, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 297, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ()", 0, 2, SqlError),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, SqlError),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 323, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 300, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 0, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 3, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 354, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 331, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 34, 2, None),
("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None),

("SELECT name, missions FROM $astronauts WHERE name LIKE ANY '%armstrong%'", 0, 2, None),
("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY '%armstrong%'", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%arms%')", 0, 2, None),
("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%arms%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%Arms%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 'mission')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%Armstrong%', 'mission')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY '%armstrong%'", 357, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY '%armstrong%'", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%armstrong%')", 357, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%armstrong%')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%Armstrong%')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', 'mission')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%Armstrong%', 'mission')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', '%Aldrin%', '%Collins%')", 4, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', '%Aldrin%', '%Collins%')", 353, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ()", 0, 2, SqlError),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ()", 0, 2, SqlError),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', null)", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', null)", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%aRmstrong%')", 0, 2, None),
("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%aRmstrong%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('Neil A. Armstrong')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('Neil A. Armstrong')", 356, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%__Armstrong%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Arm__rong%')", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 123)", 1, 2, None),
("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None),

# ****************************************************************************************

# These are queries which have been found to return the wrong result or not run correctly
Expand Down
5 changes: 3 additions & 2 deletions tests/storage/test_cache_memcached.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import os
import sys
import pytest

os.environ["OPTERYX_DEBUG"] = "1"

Expand Down Expand Up @@ -62,6 +61,7 @@ def test_memcached_cache():
assert stats.get("remote_cache_hits", 0) >= stats["blobs_read"], str(stats)
assert stats.get("cache_misses", 0) == 0, stats

@skip_if(is_arm() or is_windows() or is_mac())
def test_memcache_stand_alone():
os.environ["OPTERYX_DEBUG"] = "1"
from opteryx.managers.cache import MemcachedCache
Expand Down Expand Up @@ -110,6 +110,7 @@ def threaded_cache_operations(cache: MemcachedCache, payloads: list):
for thread in threads:
thread.join()

@skip_if(is_arm() or is_windows() or is_mac())
def test_memcache_threaded():
os.environ["OPTERYX_DEBUG"] = "1"

Expand All @@ -128,7 +129,7 @@ def test_memcache_threaded():
if result:
assert result == load, f"Post-thread check failed: {result} != {load}"


@skip_if(is_arm() or is_windows() or is_mac())
def test_skip_on_error():
from opteryx.managers.cache import MemcachedCache
cache = MemcachedCache()
Expand Down

0 comments on commit 901cccf

Please sign in to comment.