Skip to content

Commit

Permalink
update nltk.bigrams & remove nltk dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
tzemicheal committed Jul 1, 2024
1 parent 42bd6d9 commit 676d25a
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
1 change: 0 additions & 1 deletion log-sequence-ad/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
--find-links https://download.pytorch.org/whl/cu118

gensim==3.8.0
nltk==3.8
torch==2.0.1
34 changes: 33 additions & 1 deletion log-sequence-ad/training-tuning-inference/datatools.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,48 @@

import random
import warnings
from collections import deque
from itertools import islice

import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec
from nltk.util import bigrams

warnings.filterwarnings("ignore")


def ngrams(sequence, n, **kwargs):
    """Yield successive n-grams from ``sequence`` as tuples.

    This method is based on the ``nltk.util.ngrams`` implementation and
    honours the same optional padding keywords.

    Parameters
    ----------
    sequence : iterable
        Source items (e.g. a list of strings). It is iterated once, lazily.
    n : int
        Size of each n-gram. Set ``n=2`` for bigrams.
    **kwargs
        Optional padding controls; any other keyword is ignored.

        pad_left : bool, default False
            Prepend ``n - 1`` copies of ``left_pad_symbol``.
        pad_right : bool, default False
            Append ``n - 1`` copies of ``right_pad_symbol``.
        left_pad_symbol : object, default None
            Symbol used for left padding.
        right_pad_symbol : object, default None
            Symbol used for right padding.

    Yields
    ------
    tuple
        Each window of ``n`` consecutive items, in order. Nothing is
        yielded when the (padded) sequence holds fewer than ``n`` items.
    """
    pad_width = n - 1
    if kwargs.get("pad_left", False):
        left_pad = (kwargs.get("left_pad_symbol"),) * pad_width
    else:
        left_pad = ()
    if kwargs.get("pad_right", False):
        right_pad = (kwargs.get("right_pad_symbol"),) * pad_width
    else:
        right_pad = ()

    def padded():
        # Lazily surround the source with the requested padding so that
        # one-shot iterators are still consumed only once.
        yield from left_pad
        yield from sequence
        yield from right_pad

    # sliding_window('ABCDEFG', 4) --> ABCD BCDE CDEF DEFG
    # https://docs.python.org/3/library/itertools.html?highlight=sliding_window#itertools-recipes
    it = padded()
    window = deque(islice(it, n), maxlen=n)
    if len(window) == n:
        yield tuple(window)
    for x in it:
        # maxlen=n makes append() drop the oldest item automatically.
        window.append(x)
        yield tuple(window)


def bigrams(sequence, **kwargs):
    """Yield consecutive pairs of items from ``sequence`` as 2-tuples.

    Thin wrapper that delegates to ``ngrams`` with ``n=2``; keyword
    arguments are forwarded to ``ngrams`` unchanged.
    """
    for pair in ngrams(sequence, 2, **kwargs):
        yield pair


def preprocess(df, window_size=100, step_size=20):
'''Preprocessing structured log dataset
Expand Down

0 comments on commit 676d25a

Please sign in to comment.