Skip to content

Commit 89ebb68

Browse files
committed
Enhanced classifier: RANDOM_STRING, UUID, NUMERIC, IP_ADDRESS
1 parent bbba556 commit 89ebb68

File tree

3 files changed

+74
-4
lines changed

3 files changed

+74
-4
lines changed

stringlifier/api.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def __call__(self, string_or_list, return_tokens=False):
141141
else:
142142
return new_strings
143143

144-
def _extract_tokens(self, string, pred):
144+
def _extract_tokens_2class(self, string, pred):
145145
mask = ''
146146
for p in pred:
147147
mask += self.encodings._label_list[p]
@@ -168,3 +168,52 @@ def _extract_tokens(self, string, pred):
168168
tokens.append((c_tok, start, stop))
169169
new_str += '<RANDOM_STRING>'
170170
return new_str, tokens
171+
172+
def _extract_tokens(self, string, pred):
    """Replace labelled spans of *string* with type placeholders.

    Each entry of *pred* is an index into ``self.encodings._label_list`` and
    labels the character at the same position in *string*. Consecutive
    characters sharing a label form one segment. Segments labelled ``'H'``,
    ``'N'``, ``'I'`` or ``'U'`` are replaced by ``<RANDOM_STRING>``,
    ``<NUMERIC>``, ``<IP_ADDR>`` or ``<UUID>`` respectively; every other
    segment (label ``'C'`` in particular) is copied through unchanged.

    Args:
        string: The text to rewrite.
        pred: Per-character label indices, at least ``len(string)`` long.

    Returns:
        A ``(new_str, tokens)`` tuple. ``new_str`` is the rewritten text and
        ``tokens`` is a list of ``(text, start, stop, placeholder)`` tuples,
        one per replaced segment, where ``string[start:stop] == text``.

    Raises:
        IndexError: If *pred* is shorter than a non-empty *string*, or holds
            an index outside ``self.encodings._label_list``.
    """
    placeholders = {
        'H': '<RANDOM_STRING>',
        'N': '<NUMERIC>',
        'I': '<IP_ADDR>',
        'U': '<UUID>',
    }

    # Nothing to label; also avoids indexing into an empty mask below.
    if not string:
        return '', []

    label_list = self.encodings._label_list
    mask = [label_list[p] for p in pred]

    pieces = []
    tokens = []

    def flush(label, start, stop):
        # Emit one finished segment: either verbatim text or a placeholder.
        text = string[start:stop]
        placeholder = placeholders.get(label)
        if placeholder is None:
            pieces.append(text)
        else:
            tokens.append((text, start, stop, placeholder))
            pieces.append(placeholder)

    start = 0
    for ii in range(1, len(string)):
        # A label change at ii closes the segment [start, ii).
        if mask[ii] != mask[ii - 1]:
            flush(mask[ii - 1], start, ii)
            start = ii

    # The trailing segment ends at len(string) so that `stop` is exclusive,
    # exactly like the segments closed inside the loop.
    flush(mask[start], start, len(string))

    return ''.join(pieces), tokens

stringlifier/modules/stringc2.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,10 +239,13 @@ def _start_train(params):
239239
if params.resume:
240240
encodings = Encodings('{0}.encodings'.format(params.output_base))
241241
else:
242+
sys.stdout.write('Generating new random data...')
243+
sys.stdout.flush()
242244
trainset = _generate_dataset(int(eval_at * 4 * params.batch_size))
243-
devset = _generate_dataset(int(eval_at / 10 * params.batch_size))
245+
sys.stdout.write('done\n')
244246
encodings = Encodings()
245247
encodings.update_encodings(trainset)
248+
246249
print('chars={0}, types={1}'.format(len(encodings._char2int), len(encodings._label2int)))
247250
print(encodings._label2int)
248251

stringlifier/modules/training.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,26 @@ def _generate_word(known_words):
2222
import datetime
2323
import base64
2424
generated = []
25-
ii = random.randint(0, 3)
25+
ii = random.randint(0, 4)
2626
mask = 'H'
2727
if ii == 0:
2828
generated.append(str(uuid.uuid4()))
29+
mask = 'U'
2930
elif ii == 1:
3031
generated.append(str(uuid.uuid4().hex))
32+
mask = 'H'
3133
elif ii == 2:
32-
generated.append(str(datetime.datetime.now().timestamp()))
34+
c = random.randint(0, 3)
35+
if c == 0:
36+
generated.append(str(datetime.datetime.now().timestamp()))
37+
elif c == 1:
38+
generated.append(str(random.randint(0, 1000000)))
39+
elif c == 2:
40+
generated.append(str(random.randint(0, 999)) + '.' + str(random.randint(0, 999)))
41+
else:
42+
generated.append(str(random.randint(0, 999)) + '.' + str(random.randint(0, 9999)) + '.' +
43+
str(random.randint(0, 9999)))
44+
mask = 'N'
3345
elif ii == 3:
3446
N = random.randint(5, 20)
3547
import string
@@ -40,6 +52,12 @@ def _generate_word(known_words):
4052
base64_bytes = base64.b64encode(message_bytes)
4153
base64_message = base64_bytes.decode('ascii')
4254
generated.append(base64_message)
55+
elif ii == 4:
56+
toks = []
57+
for _ in range(4):
58+
toks.append(str(random.randint(0, 255)))
59+
generated.append('.'.join(toks))
60+
mask = 'I'
4361
return str(generated[0]), mask[0]
4462

4563

0 commit comments

Comments
 (0)