tweet cleanup (with potential performance boost) #22

Open · wants to merge 16 commits into base: main
2 changes: 1 addition & 1 deletion .flake8
@@ -1,4 +1,4 @@
[flake8]
exclude = venv
ignore = W503 #line break occurred before binary operation
ignore = W503,W605 # W503: line break before binary operator, W605: invalid escape sequence
Owner (author) commented:
otherwise we get complaints about our regexes in replacement_patterns

max-line-length = 100
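For context on the new W605 entry: pycodestyle reports W605 for invalid escape sequences in ordinary (non-raw) string literals, and the replacement strings in src/features/tweet_cleaner.py, such as "\g<1> will", contain \g, which is not a valid Python escape. A minimal illustration, not part of the PR; using raw strings for the replacements would be the lint-clean alternative to ignoring the code repo-wide:

```python
import re

# "\g" in a non-raw string is not a valid Python escape, so flake8 reports W605
# (at runtime it still works, because unknown escapes are left untouched).
print(re.sub(r"(\w+)n\'t", "\g<1> not", "isn't"))  # -> "is not", flagged as W605

# The raw-string replacement behaves identically and lints cleanly:
print(re.sub(r"(\w+)n't", r"\g<1> not", "isn't"))  # -> "is not"
```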
7 changes: 2 additions & 5 deletions config/model/model_config.yaml
@@ -1,7 +1,4 @@
lr: 6e-6
eps: 1e-8
# model: 'bert'
# pretrained-model: 'bert-large-uncased'
model: 'distilbert'
model: 'bert'
#model: 'distilbert'
pretrained-model: 'distilbert-base-uncased'
num_labels: 2
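As a side note, not part of the diff: the new values leave `model: 'bert'` pointing at a DistilBERT checkpoint via `pretrained-model`. A rough sketch of a sanity check at config-load time, assuming the file is read with OmegaConf the way hydra composes it elsewhere in the repo; the standalone path and the check itself are illustrative only:

```python
from omegaconf import OmegaConf

# Hypothetical standalone load; in the project, hydra composes this file into cfg.model.
model_cfg = OmegaConf.load("config/model/model_config.yaml")

# Warn when the architecture name and the checkpoint name disagree,
# e.g. model: 'bert' combined with a distilbert-* checkpoint.
if not model_cfg["pretrained-model"].startswith(model_cfg["model"]):
    print(
        f"Warning: model={model_cfg['model']!r} does not match "
        f"pretrained-model={model_cfg['pretrained-model']!r}"
    )
```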
11 changes: 5 additions & 6 deletions config/train/train_config.yaml
@@ -1,6 +1,5 @@
optimizer: Adam
lr: 0.001
batch_size: 8
scheduler:
name: ExponentialLR
gamma: 0.1
optimizer: AdamW
lr: 6e-6
eps: 1e-8
batch_size: 16
epochs: 5
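These values feed `configure_optimizers` in src/models/model.py further down. A minimal sketch of what the resulting optimizer construction amounts to, assuming the file is loaded directly with OmegaConf rather than through hydra; the placeholder module is illustrative, and the `float()` casts are defensive because some YAML loaders read scientific notation such as `6e-6` as a string:

```python
import torch
from omegaconf import OmegaConf

train_cfg = OmegaConf.load("config/train/train_config.yaml")

module = torch.nn.Linear(768, 2)  # placeholder standing in for the classifier

optimizer = torch.optim.AdamW(
    module.parameters(),
    lr=float(train_cfg["lr"]),    # 6e-6
    eps=float(train_cfg["eps"]),  # 1e-8
    betas=(0.9, 0.999),           # matches the values hard-coded in model.py
)
```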
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,3 +10,4 @@ torch==1.10.1
transformers==4.15.0
google-cloud-secret-manager==2.5.0
wandb==0.12.9
nltk==3.6.7
9 changes: 9 additions & 0 deletions src/features/build_features.py
@@ -5,12 +5,14 @@
from pathlib import Path

import hydra
import nltk
import numpy as np
import pandas as pd # type: ignore
import torch
from dotenv import find_dotenv, load_dotenv
from omegaconf import DictConfig
from transformers import AutoTokenizer
from tweet_cleaner import clean_tweet_list

# See kaggle notebook:
# https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert
@@ -46,6 +48,13 @@ def main(cfg: DictConfig) -> None:
list(data.target[split_eval:]),
)

# %% Clean
nltk.download("wordnet")
nltk.download("omw-1.4")
tweet_train = clean_tweet_list(tweet_train)
tweet_test = clean_tweet_list(tweet_test)
tweet_eval = clean_tweet_list(tweet_eval)

# %% Encode
tokenizer = AutoTokenizer.from_pretrained(cfg.model["pretrained-model"])

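The two `nltk.download` calls fetch the WordNet and Open Multilingual WordNet data that `WordNetLemmatizer` needs in tweet_cleaner.py. As a possible refinement, not in the PR, the download can be skipped when the corpora are already on disk; a small sketch using nltk's resource lookup, with the helper name being hypothetical:

```python
import nltk


def ensure_wordnet_data(packages=("wordnet", "omw-1.4")) -> None:
    # Hypothetical helper: download each corpus only if nltk cannot find it locally.
    for package in packages:
        try:
            nltk.data.find(f"corpora/{package}")
        except LookupError:
            nltk.download(package, quiet=True)
```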
305 changes: 305 additions & 0 deletions src/features/tweet_cleaner.py
@@ -0,0 +1,305 @@
import re

import nltk

replacement_patterns = [
(r"won\'t", "will not"),
(r"can\'t", "cannot"),
(r"i\'m", "i am"),
(r"ain\'t", "is not"),
(r"(\w+)\'ll", "\g<1> will"),
(r"(\w+)n\'t", "\g<1> not"),
(r"(\w+)\'ve", "\g<1> have"),
(r"(\w+)\'s", "\g<1> is"),
(r"(\w+)\'re", "\g<1> are"),
(r"(\w+)\'d", "\g<1> would"),
]
abbreviations = {
"$": " dollar ",
"€": " euro ",
"4ao": "for adults only",
"a.m": "before midday",
"a3": "anytime anywhere anyplace",
"aamof": "as a matter of fact",
"acct": "account",
"adih": "another day in hell",
"afaic": "as far as i am concerned",
"afaict": "as far as i can tell",
"afaik": "as far as i know",
"afair": "as far as i remember",
"afk": "away from keyboard",
"app": "application",
"approx": "approximately",
"apps": "applications",
"asap": "as soon as possible",
"asl": "age, sex, location",
"atk": "at the keyboard",
"ave.": "avenue",
"aymm": "are you my mother",
"ayor": "at your own risk",
"b&b": "bed and breakfast",
"b+b": "bed and breakfast",
"b.c": "before christ",
"b2b": "business to business",
"b2c": "business to customer",
"b4": "before",
"b4n": "bye for now",
"b@u": "back at you",
"bae": "before anyone else",
"bak": "back at keyboard",
"bbbg": "bye bye be good",
"bbc": "british broadcasting corporation",
"bbias": "be back in a second",
"bbl": "be back later",
"bbs": "be back soon",
"be4": "before",
"bfn": "bye for now",
"blvd": "boulevard",
"bout": "about",
"brb": "be right back",
"bros": "brothers",
"brt": "be right there",
"bsaaw": "big smile and a wink",
"btw": "by the way",
"bwl": "bursting with laughter",
"c/o": "care of",
"cet": "central european time",
"cf": "compare",
"cia": "central intelligence agency",
"csl": "can not stop laughing",
"cu": "see you",
"cul8r": "see you later",
"cv": "curriculum vitae",
"cwot": "complete waste of time",
"cya": "see you",
"cyt": "see you tomorrow",
"dae": "does anyone else",
"dbmib": "do not bother me i am busy",
"diy": "do it yourself",
"dm": "direct message",
"dwh": "during work hours",
"e123": "easy as one two three",
"eet": "eastern european time",
"eg": "example",
"embm": "early morning business meeting",
"encl": "enclosed",
"encl.": "enclosed",
"etc": "and so on",
"faq": "frequently asked questions",
"fawc": "for anyone who cares",
"fb": "facebook",
"fc": "fingers crossed",
"fig": "figure",
"fimh": "forever in my heart",
"ft.": "feet",
"ft": "featuring",
"ftl": "for the loss",
"ftw": "for the win",
"fwiw": "for what it is worth",
"fyi": "for your information",
"g9": "genius",
"gahoy": "get a hold of yourself",
"gal": "get a life",
"gcse": "general certificate of secondary education",
"gfn": "gone for now",
"gg": "good game",
"gl": "good luck",
"glhf": "good luck have fun",
"gmt": "greenwich mean time",
"gmta": "great minds think alike",
"gn": "good night",
"g.o.a.t": "greatest of all time",
"goat": "greatest of all time",
"goi": "get over it",
"gps": "global positioning system",
"gr8": "great",
"gratz": "congratulations",
"gyal": "girl",
"h&c": "hot and cold",
"hp": "horsepower",
"hr": "hour",
"hrh": "his royal highness",
"ht": "height",
"ibrb": "i will be right back",
"ic": "i see",
"icq": "i seek you",
"icymi": "in case you missed it",
"idc": "i do not care",
"idgadf": "i do not give a damn fuck",
"idgaf": "i do not give a fuck",
"idk": "i do not know",
"ie": "that is",
"i.e": "that is",
"ifyp": "i feel your pain",
"IG": "instagram",
"iirc": "if i remember correctly",
"ilu": "i love you",
"ily": "i love you",
"imho": "in my humble opinion",
"imo": "in my opinion",
"imu": "i miss you",
"iow": "in other words",
"irl": "in real life",
"j4f": "just for fun",
"jic": "just in case",
"jk": "just kidding",
"jsyk": "just so you know",
"l8r": "later",
"lb": "pound",
"lbs": "pounds",
"ldr": "long distance relationship",
"lmao": "laugh my ass off",
"lmfao": "laugh my fucking ass off",
"lol": "laughing out loud",
"ltd": "limited",
"ltns": "long time no see",
"m8": "mate",
"mf": "motherfucker",
"mfs": "motherfuckers",
"mfw": "my face when",
"mofo": "motherfucker",
"mph": "miles per hour",
"mr": "mister",
"mrw": "my reaction when",
"ms": "miss",
"mte": "my thoughts exactly",
"nagi": "not a good idea",
"nbc": "national broadcasting company",
"nbd": "not big deal",
"nfs": "not for sale",
"ngl": "not going to lie",
"nhs": "national health service",
"nrn": "no reply necessary",
"nsfl": "not safe for life",
"nsfw": "not safe for work",
"nth": "nice to have",
"nvr": "never",
"nyc": "new york city",
"oc": "original content",
"og": "original",
"ohp": "overhead projector",
"oic": "oh i see",
"omdb": "over my dead body",
"omg": "oh my god",
"omw": "on my way",
"p.a": "per annum",
"p.m": "after midday",
"pm": "prime minister",
"poc": "people of color",
"pov": "point of view",
"pp": "pages",
"ppl": "people",
"prw": "parents are watching",
"ps": "postscript",
"pt": "point",
"ptb": "please text back",
"pto": "please turn over",
"qpsa": "what happens", # "que pasa",
"ratchet": "rude",
"rbtl": "read between the lines",
"rlrt": "real life retweet",
"rofl": "rolling on the floor laughing",
"roflol": "rolling on the floor laughing out loud",
"rotflmao": "rolling on the floor laughing my ass off",
"rt": "retweet",
"ruok": "are you ok",
"sfw": "safe for work",
"sk8": "skate",
"smh": "shake my head",
"sq": "square",
"srsly": "seriously",
"ssdd": "same stuff different day",
"tbh": "to be honest",
"tbs": "tablespooful",
"tbsp": "tablespooful",
"tfw": "that feeling when",
"thks": "thank you",
"tho": "though",
"thx": "thank you",
"tia": "thanks in advance",
"til": "today i learned",
"tl;dr": "too long i did not read",
"tldr": "too long i did not read",
"tmb": "tweet me back",
"tntl": "trying not to laugh",
"ttyl": "talk to you later",
"u": "you",
"u2": "you too",
"u4e": "yours for ever",
"utc": "coordinated universal time",
"w/": "with",
"w/o": "without",
"w8": "wait",
"wassup": "what is up",
"wb": "welcome back",
"wtf": "what the fuck",
"wtg": "way to go",
"wtpa": "where the party at",
"wuf": "where are you from",
"wuzup": "what is up",
"wywh": "wish you were here",
"yd": "yard",
"ygtr": "you got that right",
"ynk": "you never know",
"zzz": "sleeping bored and tired",
}


class RegexpReplacer(object):
"""Apply each (regex, replacement) pair in replacement_patterns to a text."""

def __init__(self, patterns=replacement_patterns):
# Compile the patterns once so repeated replace() calls stay cheap.
self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

def replace(self, text):
s = text
# Apply the substitutions in order; later patterns see earlier replacements.
for (pattern, repl) in self.patterns:
s = re.sub(pattern, repl, s)
return s


def convert_abbrev(word):
# Expand a known abbreviation; return the word unchanged otherwise.
return abbreviations[word.lower()] if word.lower() in abbreviations else word


def clean_tweet(text: str):
# remove urls
# text = df.apply(lambda x: re.sub(r'http\S+', '', x))
text = re.sub(r"http\S+", "", text)

# replace contractions
replacer = RegexpReplacer()
text = replacer.replace(text)

# split words on - and \
text = re.sub(r"\\", " ", text)
text = re.sub(r"-", " ", text)
# replace negations with antonyms

# nltk.download('punkt')
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(text)

# Replace abbreviations
tokens = [convert_abbrev(word) for word in tokens]

# todo: spelling correction
# replacer = SpellingReplacer()
# tokens = [replacer.replace(t) for t in tokens]

# lemmatize/stemming
wnl = nltk.WordNetLemmatizer()
tokens = [wnl.lemmatize(t) for t in tokens]

# todo: stemming conflicts with our tokenizer (Bert)
# porter = nltk.PorterStemmer()
# tokens = [porter.stem(t) for t in tokens]
# filter insignificant words (using fastai)
# swap word phrases

text = " ".join(tokens)
return text


def clean_tweet_list(tweet_list: list[str]) -> list[str]:
# Apply clean_tweet to every tweet in the list.
return list(map(clean_tweet, tweet_list))
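A quick usage sketch of the new module, not part of the diff; the exact output depends on the installed WordNet data, so the expected strings below are approximate:

```python
from tweet_cleaner import clean_tweet, clean_tweet_list

print(clean_tweet("OMG, gr8 news... can't wait!! http://t.co/abc123"))
# roughly: "oh my god great news cannot wait"

print(clean_tweet_list(["b4 the storm hits", "omw to nyc"]))
# roughly: ['before the storm hit', 'on my way to new york city']
```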
12 changes: 8 additions & 4 deletions src/models/model.py
@@ -21,12 +21,13 @@ def __init__(self, config: DictConfig):
output_attentions=False,
output_hidden_states=False,
)
else: # default model is distilbert
elif (
self.config.model["model"] == "distilbert-base-uncased"
): # default model is distilbert
print("Using DistilBert")
self.model = DistilBertForSequenceClassification.from_pretrained(
self.config.model["pretrained-model"],
num_labels=self.config.model["num_labels"],
output_attentions=False,
output_hidden_states=False,
)

def forward(self, inputs):
@@ -69,7 +70,10 @@ def configure_optimizers( # noqa: C901
) -> tuple[list[torch.optim.Optimizer], list[object]]:
if self.config.train["optimizer"] == "AdamW":
optimizer = torch.optim.AdamW(
self.parameters(), lr=self.config.train["lr"]
self.parameters(),
lr=self.config.train["lr"],
eps=self.config.train["eps"],
betas=(0.9, 0.999),
) # type: torch.optim.Optimizer
elif self.config.train["optimizer"] == "Adam":
optimizer = torch.optim.Adam(self.parameters(), lr=self.config.train["lr"])