Fix: SqlCatalog list_namespaces() should return only sub-namespaces#1629
Conversation
62b9efc to
9e55cf6
Compare
|
@kevinjqliu while working on this I also noticed that |
kevinjqliu
left a comment
There was a problem hiding this comment.
added a comment about using %
the test lgtm to follows the behavior as java implementation
https://github.com/apache/iceberg/blob/41b458b7022c7b0cd78eeca9102392db7889d3c9/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java#L761-L764
pyiceberg/catalog/sql.py
Outdated
| ) | ||
| with Session(self.engine) as session: | ||
| return [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] | ||
| namespaces = [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] |
There was a problem hiding this comment.
instead of all this logic, can we just filter out the results?
There was a problem hiding this comment.
Sorry, I don't think I understand what you mean here so my answer can be unrelated.
If it's about filtering in the SQL query, I don't think it's possible to easily cover some edge cases. The Java implementation handles the filtering afterward as well.
There was a problem hiding this comment.
oops sorry for the ambiguous statement. i meant to filter the results inline.
Something like this, filters the level and fuzzy match together
with Session(self.engine) as session:
namespace_tuple = Catalog.identifier_to_tuple(namespace)
sub_namespaces_level_length = len(namespace_tuple) + 1 if namespace else 1
namespaces = [
ns[:sub_namespaces_level_length] # truncate to the required level
for ns in {Catalog.identifier_to_tuple(ns) for ns in session.execute(stmt).scalars()}
if len(ns) >= sub_namespaces_level_length # only get sub namespaces/children
and ns[: len(namespace_tuple)] == namespace_tuple # exclude fuzzy matches when `namespace` contains `%` or `_`
]
return namespaces
There was a problem hiding this comment.
Thanks for the clarification!
I didn't know that ns[:0] == () would actually work 😄
| ) | ||
| @pytest.mark.parametrize("namespace_list", [lazy_fixture("database_list"), lazy_fixture("hierarchical_namespace_list")]) | ||
| def test_list_namespaces(catalog: SqlCatalog, namespace_list: List[str]) -> None: | ||
| def test_list_namespaces(catalog: SqlCatalog) -> None: |
There was a problem hiding this comment.
thanks for adding this test!
tests/catalog/test_sql.py
Outdated
| namespace = Catalog.namespace_from(table_identifier) | ||
| catalog.create_namespace(namespace) | ||
| assert namespace in catalog.list_namespaces() | ||
| assert namespace[:1] in catalog.list_namespaces() |
There was a problem hiding this comment.
why is this excluding the first result?
There was a problem hiding this comment.
I'm taking only the first level of the namespace.
Calling list_namespaces() without parameters now correctly returns only the top level namespaces.
So if namespace is a multi-level namespace, for example "db.ns1", only "db" is returned by list_namespaces()
There was a problem hiding this comment.
i see, thanks for the explanation! i think the assert here is testing that the newly created namespace exists.
so perhaps something like this matches its behavior more
assert namespace in catalog.list_namespaces(namespace[:-1])
1532ab4 to
8321d64
Compare
pyiceberg/catalog/sql.py
Outdated
| if namespace: | ||
| namespace_tuple = Catalog.identifier_to_tuple(namespace) | ||
| # exclude fuzzy matches when `namespace` contains `%` or `_` | ||
| namespaces = [ns for ns in namespaces if ns[: len(namespace_tuple)] == namespace_tuple] |
There was a problem hiding this comment.
Performance nit: Should we move the len out of the loop?
tests/catalog/test_sql.py
Outdated
| expected_list: list[tuple[str, ...]] = [("db",), ("db2",), ("db%",)] | ||
| for ns in expected_list: | ||
| assert ns in ns_list |
There was a problem hiding this comment.
Why not check the full list? This way we make sure that they are equal, and that the ns_list doesn't contain additional elements:
| expected_list: list[tuple[str, ...]] = [("db",), ("db2",), ("db%",)] | |
| for ns in expected_list: | |
| assert ns in ns_list | |
| assert ns_list == [("db",), ("db2",), ("db%",)] |
There was a problem hiding this comment.
Good idea, thanks!
I'm using sorted(ns_list) to make the test more resilient.
Or do you think we should force a specific order in list_namespaces()?
There was a problem hiding this comment.
Another note, the first assert ns_list == expected_list does not work as expected because the catalog contains other namespaces created in other tests.
ns_list = catalog.list_namespaces()
> assert sorted(ns_list) == [("db",), ("db%",), ("db2",)]
E AssertionError: assert [('db',), ('d...t_new',), ...] == [('db',), ('db%',), ('db2',)]
E Left contains 109 more items, first extra item: ('my_iceberg_database-alcotyqwtpiqunaddobf',)
There was a problem hiding this comment.
this worked for me
assert sorted(catalog.list_namespaces()) == [("db",), ("db%",), ("db2",)]
assert sorted(catalog.list_namespaces("db")) == [("db", "ns1"), ("db", "ns2")]
assert catalog.list_namespaces("db.ns1") == [("db", "ns1", "ns2")]
assert catalog.list_namespaces("db.ns1.ns2") == []
There was a problem hiding this comment.
the first assertion doesn't work if you run all the tests in test_sql.py because there are other top-level namespaces created by other tests
There was a problem hiding this comment.
huh, maybe something like this then
assert all(namespace in catalog.list_namespaces() for namespace in [("db",), ("db%",), ("db2",)])
kevinjqliu
left a comment
There was a problem hiding this comment.
Thanks for working on this fix @alessandro-nori! This is such an interesting behavior. I had to take some time to go through the java implementation and tests thoroughly
I've added some comments on refactoring and testing. I think we might want to fix #1630 as part of this PR as well. WDYT?
pyiceberg/catalog/sql.py
Outdated
| ) | ||
| with Session(self.engine) as session: | ||
| return [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] | ||
| namespaces = [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] |
There was a problem hiding this comment.
oops sorry for the ambiguous statement. i meant to filter the results inline.
Something like this, filters the level and fuzzy match together
with Session(self.engine) as session:
namespace_tuple = Catalog.identifier_to_tuple(namespace)
sub_namespaces_level_length = len(namespace_tuple) + 1 if namespace else 1
namespaces = [
ns[:sub_namespaces_level_length] # truncate to the required level
for ns in {Catalog.identifier_to_tuple(ns) for ns in session.execute(stmt).scalars()}
if len(ns) >= sub_namespaces_level_length # only get sub namespaces/children
and ns[: len(namespace_tuple)] == namespace_tuple # exclude fuzzy matches when `namespace` contains `%` or `_`
]
return namespaces
tests/catalog/test_sql.py
Outdated
| expected_list: list[tuple[str, ...]] = [("db",), ("db2",), ("db%",)] | ||
| for ns in expected_list: | ||
| assert ns in ns_list |
There was a problem hiding this comment.
this worked for me
assert sorted(catalog.list_namespaces()) == [("db",), ("db%",), ("db2",)]
assert sorted(catalog.list_namespaces("db")) == [("db", "ns1"), ("db", "ns2")]
assert catalog.list_namespaces("db.ns1") == [("db", "ns1", "ns2")]
assert catalog.list_namespaces("db.ns1.ns2") == []
tests/catalog/test_sql.py
Outdated
| namespace = Catalog.namespace_from(table_identifier) | ||
| catalog.create_namespace(namespace) | ||
| assert namespace in catalog.list_namespaces() | ||
| assert namespace[:1] in catalog.list_namespaces() |
There was a problem hiding this comment.
i see, thanks for the explanation! i think the assert here is testing that the newly created namespace exists.
so perhaps something like this matches its behavior more
assert namespace in catalog.list_namespaces(namespace[:-1])
|
Thanks for your review @kevinjqliu and @Fokko . If you prefer having both issues in the same PR for review let me know, it works either way for me. |
|
now that #1630 is merged, could you rebase off the latest main |
# Conflicts: # tests/catalog/test_sql.py
1ccfc22 to
e32fed3
Compare
e32fed3 to
83a0c54
Compare
|
Thanks for splitting this @alessandro-nori 🙏 |
kevinjqliu
left a comment
There was a problem hiding this comment.
LGTM! Thanks for fixing this
Fokko
left a comment
There was a problem hiding this comment.
Oof, missed this one. Looks good, thanks @alessandro-nori
Resolves: #1627