Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

增加系统通用字符串脱敏规则-三段式通用脱敏规则、脱敏字段支持* #2698

Merged
merged 27 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion sql/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ class Meta:
(4, "邮箱"),
(5, "金额"),
(6, "其他"),
(100, "三段式通用脱敏规则"),
)


Expand All @@ -611,7 +612,11 @@ class DataMaskingColumns(models.Model):
"""

column_id = models.AutoField("字段id", primary_key=True)
rule_type = models.IntegerField("规则类型", choices=rule_type_choices)
rule_type = models.IntegerField(
"规则类型",
choices=rule_type_choices,
help_text="三段式通用脱敏规则:根据字段长度自动分成三份,中间段脱敏。",
)
active = models.BooleanField(
"激活状态", choices=((False, "未激活"), (True, "激活"))
)
Expand Down
43 changes: 42 additions & 1 deletion sql/utils/data_masking.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding:utf-8 -*-
import logging
import math

import sqlparse
from django.forms import model_to_dict
Expand Down Expand Up @@ -44,6 +45,18 @@ def data_masking(instance, db_name, sql, sql_result):
for column in hit_columns:
index, rule_type = column["index"], column["rule_type"]
masking_rule = masking_rules.get(rule_type)
# 如果是默认的三段式通用脱敏规则,数据库没有查询结果,则创建一个对象。
if not masking_rule and rule_type == 100:
masking_rule_obj, created = DataMaskingRules.objects.get_or_create(
rule_type=100,
rule_regex="^([\\s\\S]{0,}?)([\\s\\S]{0,}?)([\\s\\S]{0,}?)$",
hide_group=2,
rule_desc="三段式通用脱敏规则:内部实现,正则暂不支持修改,隐藏组支持修改。",
)
if created:
masking_rule = model_to_dict(masking_rule_obj)
masking_rules[rule_type] = masking_rule # 更新字典
masking_rule = masking_rules.get(rule_type)
if not masking_rule:
continue
for idx, item in enumerate(rows):
Expand Down Expand Up @@ -106,6 +119,11 @@ def analyze_query_tree(select_list, instance):
masking_column = masking_columns.get(
f"{instance}-{table_schema}-{table}-{field}"
)

# 未找到。看看通用的规则是否存在。
if not masking_column:
masking_column = masking_columns.get(f"{instance}-*-*-{field}")

if masking_column:
hit_columns.append(
{
Expand All @@ -124,15 +142,38 @@ def analyze_query_tree(select_list, instance):
def regex(masking_rule, value):
"""利用正则表达式脱敏数据"""
rule_regex = masking_rule["rule_regex"]

rule_type = masking_rule["rule_type"]
# 系统通用规则正则表达式。 这是动态的。
if rule_type == 100 and isinstance(value, str):
value_average = math.floor(len(value) / 3)
value_remainder = len(value) % 3
value_average_1 = str(value_average)
value_average_2 = str(value_average + (1 if value_remainder > 0 else 0))
value_average_3 = str(value_average + (1 if value_remainder > 1 else 0))
# value_len_str=str(value_len if value_len >= 1 else 1)
rule_regex = (
"^([\\s\\S]{"
+ value_average_1
+ ",}?)([\\s\\S]{"
+ value_average_2
+ ",}?)([\\s\\S]{"
+ value_average_3
+ ",}?)$"
)

hide_group = masking_rule["hide_group"]
# 正则匹配必须分组,隐藏的组会使用****代替
try:
p = re.compile(rule_regex, re.I)
m = p.search(str(value))
masking_str = ""
if m is None:
return value
for i in range(m.lastindex):
if i == hide_group - 1:
group = "****"
# 长度不对外隐藏,还原长度。
group = "*" * len(m.group(i + 1))
else:
group = m.group(i + 1)
masking_str = masking_str + group
Expand Down
83 changes: 83 additions & 0 deletions sql/utils/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,7 @@ def setUp(self):
db_name="db_name",
syntax_type=1,
)
# 单元测试创建脱敏规则
DataMaskingRules.objects.create(
rule_type=1, rule_regex="(.{3})(.*)(.{4})", hide_group=2
)
Expand All @@ -908,6 +909,15 @@ def setUp(self):
table_name="users",
column_name="phone",
)
# rule_type=100的规则不需要加,会自动创建。只需要加脱敏字段
DataMaskingColumns.objects.create(
rule_type=100,
active=True,
instance=self.ins,
table_schema="*",
table_name="*",
column_name="mobile",
)

def tearDown(self):
User.objects.all().delete()
Expand Down Expand Up @@ -1096,6 +1106,79 @@ def test_data_masking_hit_rules_column_and_star(self, _inception):
]
self.assertEqual(r.rows, mask_result_rows)

@patch("sql.utils.data_masking.GoInceptionEngine")
def test_data_masking_hit_default_rules_column_and_star(self, _inception):
    """Mask a query that mixes explicit columns and ``*`` using default rule 100.

    Masking columns exercised by this test:
    1. Column ``mobile`` with schema ``*`` and table ``*`` (rule type 100).
    2. Column ``phone`` in schema ``archer_test``, table ``users``.
    """

    def column_meta(index, field, table, schema, alias):
        # One entry of the mocked query_data_masking() result.
        return {
            "index": index,
            "field": field,
            "type": "varchar(80)",
            "table": table,
            "schema": schema,
            "alias": alias,
        }

    _inception.return_value.query_data_masking.return_value = [
        column_meta(0, "phone", "users", "archer_test", "p"),
        column_meta(1, "id", "users", "archer_test", "id"),
        column_meta(2, "mobile", "users_not_config", "archer_test_not_config", "m"),
    ]
    sql = """select phone,id,mobile,* from users;"""

    # Each case: (raw value stored in both phone and mobile, id,
    #             expected masked phone, expected masked mobile).
    # phone goes through rule 1, id must stay untouched, mobile goes
    # through the three-segment rule 100.
    cases = (
        ("1", "7954597708277300617", "1", "*"),
        ("12", "7954597708277300618", "12", "*2"),
        ("123", "7954597708277300621", "123", "1*3"),
        ("1234", "7954597708277300622", "1234", "1**4"),
        ("12345", "7954597708277300623", "12345", "1**45"),
        ("123456", "7955140019084306231", "123456", "12**56"),
        ("1234567", "7955140019084306241", "1234567", "12***67"),
        ("12345678", "7955140019084306242", "123*5678", "12***678"),
        ("123456789", "7955140019084306243", "123**6789", "123***789"),
        ("123456789a", "7955140019084306244", "123***789a", "123****89a"),
        ("123456789ab", "7955140019084306245", "123****89ab", "123****89ab"),
        ("123456789abc", "7955140019084306246", "123*****9abc", "1234****9abc"),
        ("123456789abcd", "7955140019084306247", "123******abcd", "1234*****abcd"),
        ("123456789abcde", "7955140019084306248", "123*******bcde", "1234*****abcde"),
    )
    raw_rows = tuple((raw, row_id, raw) for raw, row_id, _, _ in cases)
    expected_rows = [[phone, row_id, mobile] for _, row_id, phone, mobile in cases]

    result_set = ReviewSet(
        column_list=["phone", "id", "mobile"], rows=raw_rows, full_sql=sql
    )
    masked = data_masking(self.ins, "archery", sql, result_set)

    self.assertEqual(masked.rows, expected_rows)

@patch("sql.utils.data_masking.GoInceptionEngine")
def test_data_masking_hit_rules_column_and_star_and_column(self, _inception):
"""[column_a,a.*,column_b]"""
Expand Down
Loading