-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpatterns.py
More file actions
209 lines (178 loc) · 9.25 KB
/
patterns.py
File metadata and controls
209 lines (178 loc) · 9.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import json
import os
import re
def get_patterns():
    """Load the regex pattern definitions from the JSON config file.

    The file is looked up at ``<parent of this module's directory>/config/patterns.json``
    and the value stored under its top-level ``"patterns"`` key is returned
    as-is.  The patterns are NOT compiled here; the built-in fallback is a
    mapping of category name to a list of raw regex strings.

    Returns:
        The ``"patterns"`` entry of the config file, or the built-in
        fallback mapping when the file is missing, is not valid UTF-8 JSON,
        has no ``"patterns"`` key, or its root is not a JSON object.
    """
    config_path = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'config', 'patterns.json'
    )
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        return config['patterns']
    except (FileNotFoundError, ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # TypeError covers a JSON root that is not an object (list, null, ...).
        # Fallback patterns if config file is missing or invalid
        return {
            "passwords": [
                r"(?i)(password|pass|pwd|passwd)\s*[:=]\s*['\"]?([^\s'\"]+)",
                r"(?i)(secret|token|key)\s*[:=]\s*['\"]?([^\s'\"]+)"
            ],
            "credit_cards": [
                r"\b(?:\d{4}[- ]?){3}\d{4}\b",
                r"\b\d{4}[ -]?\d{6}[ -]?\d{5}\b"
            ],
            "api_keys": [
                r"(?i)(api[-_]?key)\s*[:=]\s*['\"]?([a-zA-Z0-9]{32,})",
                r"(?i)(access[-_]?token)\s*[:=]\s*['\"]?([a-zA-Z0-9]{32,})"
            ],
            "ssn": [
                r"\b\d{3}-\d{2}-\d{4}\b",
                r"\b\d{3}\s\d{2}\s\d{4}\b"
            ],
            "emails": [
                r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"
            ],
            "phone_numbers": [
                r"\b\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})\b"
            ],
            "ip_addresses": [
                r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"
            ]
        }
def compile_patterns(patterns_dict):
    """Turn a mapping of regex strings into a mapping of compiled regexes.

    Args:
        patterns_dict: mapping of category name to an iterable of regex
            pattern strings.

    Returns:
        dict: one entry per input category, holding the compiled patterns in
        their original order.  A pattern that ``re`` rejects is reported
        with a printed warning and left out; its category is still present.
    """
    result = {}
    for name, raw_patterns in patterns_dict.items():
        bucket = []
        result[name] = bucket
        for raw in raw_patterns:
            try:
                regex = re.compile(raw)
            except re.error as err:
                # Skip the bad pattern but keep processing the rest.
                print(f"Warning: Invalid regex pattern '{raw}': {err}")
            else:
                bucket.append(regex)
    return result
def is_likely_false_positive(match_text, category, context=""):
"""Filter out common false positives"""
match_lower = match_text.lower()
context_lower = context.lower() if context else ""
if category == 'credentials':
# Filter out obvious false positives while keeping real credentials
# Windows error codes and constants - these are very common false positives
if re.search(r'(?i)(user|pass|password|auth|login)\s*=\s*0x[0-9A-F]+', match_text):
return True
# Check for hex values that look like Windows error codes
if re.search(r'0x[0-9A-F]{8}', match_text) or re.search(r'0xC00D[0-9A-F]{4}', match_text):
return True
# Constants/defines pattern (ALL CAPS with underscores)
if re.search(r'^[A-Z][A-Z_0-9]*\s*=', match_text):
return True
# Empty or placeholder values
if re.search(r'(?i)(password|user|login|auth)\s*[:=]\s*[\'"]?\s*[\'"]?$', match_text):
return True
# Incomplete matches (ending with newlines or colons, but NOT complete quoted strings)
if match_text.endswith(('\n', '\\n', ':')):
return True
# Filter incomplete quotes (single quote or double quote at the end without proper opening)
# But don't filter complete quoted strings like "password = 'secret'"
if (match_text.endswith("'") and not re.search(r"['\"][^'\"]*'$", match_text)) or \
(match_text.endswith('"') and not re.search(r'[\'"][^\'"]*"$', match_text)):
return True
# Function parameter definitions (type hints, docstrings, comments)
if re.search(r':\s*str[,\)]', match_text) or 'username: str' in match_text or 'password: str' in match_text:
return True
# Documentation and example patterns
if any(term in match_lower for term in [
'your_username', 'your_password', 'example', 'domain\\your_username',
'username="value"', 'password="value"'
]):
return True
# Variable assignments - filter out code where variables are assigned to other variables or function calls
# BUT keep hardcoded string values in quotes
# Don't filter if it's a quoted string (these are likely real credentials)
if re.search(r'(?i)(user|username|login|password|pass|passwd)\s*[:=]\s*[\'"][^\'"]+[\'"]', match_text):
# This is a quoted string, likely a real credential - don't filter it
return False
# Filter variable assignments: var = var (but not quoted strings)
variable_assignment_patterns = [
# Direct variable assignments: var = var (no quotes)
r'(?i)(user|username|login)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*$',
r'(?i)(pass|password|passwd|pwd)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*$',
# Function parameters: func(username=var, password=var)
r'(?i)(username|user|login)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*[,\)]',
r'(?i)(password|pass|passwd|pwd)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*[,\)]',
# Object attribute assignments: self.__username = username
r'(?i)(username|user|login)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*\s*$',
r'(?i)(password|pass|passwd|pwd)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*\s*$',
# Function calls assigned to variables: password = str(password)
r'(?i)(password|pass|passwd|pwd)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\(',
r'(?i)(username|user|login)\s*=\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\(',
]
for pattern in variable_assignment_patterns:
if re.search(pattern, match_text):
return True
# Specific common code patterns (but not hardcoded strings)
common_code_patterns = [
'username=self.__username', 'password=self.__password',
'username=args.username', 'password=args.password',
'user=user', 'password=password', 'pwd = password',
'username = username', 'password = password',
'username=username', 'password=password',
'user = username', 'userName = user',
'login = tds_login', 'login = tds_prelogin',
'username = principal', 'password = mqtt_string',
'password = str(password', 'user = user.decode(',
'password = password.decode(', 'username = principal('
]
if any(pattern in match_lower for pattern in common_code_patterns):
return True
# Filter out constructor/method calls that aren't hardcoded credentials
if re.search(r'(?i)(user|username|login|password|pass|passwd)\s*=\s*[A-Z][a-zA-Z0-9_]*\s*\(', match_text):
return True
# Filter generic test values and placeholders
if any(pattern in match_lower for pattern in [
'user = \'test\'', 'pass = \'test\'', 'password = none',
'username = none', 'password: stringbuilder',
'user: username', 'login:', 'password:', 'user:',
'username:', 'passwd:'
]):
return True
# Filter result files and reports
if context and any(indicator in context_lower for indicator in [
'[credentials]', '[credit_cards]', 'match:', 'context:',
'detailed findings', 'report saved to', 'scan completed',
'file:', 'findings by category', 'sensitive data'
]):
return True
# Test files and test data
if context and any(indicator in context_lower for indicator in [
'test_', 'sample_', 'tests/', 'def test', 'class test',
'unittest', 'testcase', 'sk_test_'
]):
return True
# Filter very short matches
if len(match_text) < 6:
return True
elif category == 'credit_cards':
# For credit cards, filter out UUIDs and GUIDs which commonly trigger false positives
clean_number = re.sub(r'[\s-]', '', match_text)
# Filter UUIDs/GUIDs (they have specific patterns that aren't credit cards)
if re.match(r'^[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}', match_text):
return True
# Filter binary data patterns (all same digits, alternating patterns)
if re.match(r'^[01]+$', clean_number) and len(clean_number) >= 12:
return True
# Filter hex patterns that clearly aren't credit cards
if re.match(r'^[0-9A-Fa-f]+$', clean_number) and len(clean_number) % 2 == 0 and len(clean_number) >= 16:
# This looks like hex data, not a credit card
return True
# Filter test credit card numbers
test_numbers = [
'4111111111111111', '4000000000000002', '5555555555554444',
'1234567890123456', '0000000000000000', '1111111111111111',
'1234567890123456'
]
if clean_number in test_numbers:
return True
# Filter obvious test patterns
if any(pattern in clean_number for pattern in ['1234', '0000', '1111']):
# But allow numbers that look more realistic
if not (clean_number.startswith('4') and len(clean_number) == 16):
return True
return False