Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 03f33c1

Browse files
author
Sergey Vasilyev
committed
Retrieve collations for selected databases (SQL Server & Snowflake)
1 parent b73fc95 commit 03f33c1

File tree

4 files changed

+91
-4
lines changed

4 files changed

+91
-4
lines changed

data_diff/abcs/database_types.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import decimal
22
from abc import ABC, abstractmethod
3-
from typing import List, Optional, Tuple, Type, TypeVar, Union
3+
from typing import Collection, List, Optional, Tuple, Type, TypeVar, Union
44
from datetime import datetime
55

66
import attrs
@@ -15,6 +15,91 @@
1515
N = TypeVar("N")
1616

1717

18+
@attrs.frozen(kw_only=True, eq=False, order=False, unsafe_hash=True)
class Collation:
    """
    A pre-parsed or pre-known record about db collation, per column.

    The "greater" collation should be used as a target collation for textual PKs
    on both sides of the diff — by converting the "lesser" collation to self.

    Databases flagged with ``absorbs_damage`` (e.g. Snowflake) easily absorb the
    performance losses of such a conversion, so their collations always rank as
    the "lesser" side against collations of databases without that flag.
    Other databases need to negotiate which side absorbs the performance impact.

    The ordering is partial: two collations that differ but have no preference
    between them (e.g. two different non-ordinal languages) are incomparable,
    i.e. all of ``==``, ``<``, ``>``, ``<=``, ``>=`` are false for such a pair.
    Comparisons against non-``Collation`` objects return ``NotImplemented``.
    """

    # NOTE(review): `unsafe_hash=True` asks attrs to derive `__hash__` from the fields,
    # while `__eq__` below is looser (e.g. any two ordinal collations are equal).
    # Equal instances may therefore hash differently — confirm before relying on
    # collations (or objects containing them) as dict keys or set members.

    # A boost for special databases that are known to absorb the performance damage well.
    absorbs_damage: bool = False

    # Ordinal sorting by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
    ordinal: Optional[bool] = None

    # Lowercase first (aAbBcC or abcABC). Otherwise, uppercase first (AaBbCc or ABCabc).
    lower_first: Optional[bool] = None

    # 2-letter lower-case locale and upper-case country codes, e.g. en_US. Ignored for ordinals.
    language: Optional[str] = None
    country: Optional[str] = None

    # There are also space-, punctuation-, width-, kana-(in)sensitivity, so on.
    # Ignore everything not related to xdb alignment. Only case- & accent-sensitivity are common.
    case_sensitive: Optional[bool] = None
    accent_sensitive: Optional[bool] = None

    # Purely informational, for debugging:
    _source: Union[None, str, Collection[str]] = None

    def __eq__(self, other: object) -> bool:
        """Tell whether both collations sort text the same way, as far as it is known."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self.ordinal and other.ordinal:
            # TODO: does it depend on language? what does Albanic_BIN mean in MS SQL?
            return True
        return (
            self.language == other.language
            # An unknown country on either side is treated as compatible with any country.
            and (self.country is None or other.country is None or self.country == other.country)
            and self.case_sensitive == other.case_sensitive
            and self.accent_sensitive == other.accent_sensitive
            and self.lower_first == other.lower_first
        )

    def __ne__(self, other: object) -> bool:
        """Negate ``__eq__``, passing ``NotImplemented`` through for foreign types."""
        if not isinstance(other, Collation):
            return NotImplemented
        return not self.__eq__(other)

    def __gt__(self, other: object) -> bool:
        """Tell whether this collation is the preferred target over ``other``."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return False
        # The side that absorbs the damage does the converting, so it is never the target.
        if self.absorbs_damage and not other.absorbs_damage:
            return False
        if other.absorbs_damage and not self.absorbs_damage:
            return True  # this one is preferred if it cannot absorb damage as its counterpart can
        # With equal damage absorption, an ordinal collation wins over a non-ordinal one.
        if self.ordinal and not other.ordinal:
            return True
        if other.ordinal and not self.ordinal:
            return False
        # TODO: try to align the languages & countries?
        return False

    def __ge__(self, other: object) -> bool:
        """Tell whether this collation is equal to or preferred over ``other``."""
        if not isinstance(other, Collation):
            return NotImplemented
        return self == other or self.__gt__(other)

    def __lt__(self, other: object) -> bool:
        """Tell whether ``other`` is the preferred target over this collation."""
        if not isinstance(other, Collation):
            return NotImplemented
        # Reflect `__gt__` instead of negating it: the order is partial, so "not greater
        # and not equal" does not imply "lesser" — it made `a < b` and `b < a` both true
        # for incomparable pairs.
        return other.__gt__(self)

    def __le__(self, other: object) -> bool:
        """Tell whether this collation is equal to ``other`` or ``other`` is preferred."""
        if not isinstance(other, Collation):
            return NotImplemented
        # Same reasoning as in `__lt__`: incomparable pairs must not satisfy `<=` both ways.
        return self == other or other.__gt__(self)
101+
102+
18103
@attrs.define(frozen=True, kw_only=True)
19104
class ColType:
20105
# Arbitrary metadata added and fetched at runtime.
@@ -112,6 +197,7 @@ def python_type(self) -> type:
112197
@attrs.define(frozen=True)
113198
class StringType(ColType):
114199
python_type = str
200+
collation: Optional[Collation] = attrs.field(default=None, kw_only=True)
115201

116202

117203
@attrs.define(frozen=True)

data_diff/databases/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1130,7 +1130,7 @@ def _refine_coltypes(
11301130
)
11311131
else:
11321132
assert col_name in col_dict
1133-
col_dict[col_name] = String_VaryingAlphanum()
1133+
col_dict[col_name] = String_VaryingAlphanum(collation=col_dict[col_name].collation)
11341134

11351135
return col_dict
11361136

data_diff/databases/mssql.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def select_table_schema(self, path: DbPath) -> str:
201201
info_schema_path.insert(0, self.dialect.quote(database))
202202

203203
return (
204-
"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale "
204+
"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, collation_name "
205205
f"FROM {'.'.join(info_schema_path)} "
206206
f"WHERE table_name = '{name}' AND table_schema = '{schema}'"
207207
)

data_diff/databases/snowflake.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,8 @@ def select_table_schema(self, path: DbPath) -> str:
164164
info_schema_path.insert(0, database)
165165

166166
return (
167-
"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale "
167+
"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale"
168+
" , coalesce(collation_name, 'utf8') "
168169
f"FROM {'.'.join(info_schema_path)} "
169170
f"WHERE table_name = '{name}' AND table_schema = '{schema}'"
170171
)

0 commit comments

Comments
 (0)