
Commit e1e83cf

Use black to reformat the project (#474)
* black
* some format updates
* fix mypy ignore
* black format
* ignore test output files
* ignore source line

1 parent: 7e94fee · commit: e1e83cf

236 files changed: +9854 −6731 lines


.github/workflows/main.yml

+4 −2

@@ -16,7 +16,6 @@ jobs:
         python-version: [ 3.6, 3.7 ]
         torch-version: [ 1.5.0, 1.6.0 ]
         tensorflow-version: [ 1.15.0 ]
-
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
@@ -35,8 +34,11 @@ jobs:
         run: |
           python -m pip install --progress-bar off --upgrade pip
           pip install --progress-bar off Django django-guardian
-          pip install --progress-bar off pylint==2.6.0 flake8==3.8.2 mypy==0.790 pytest==5.1.3
+          pip install --progress-bar off pylint==2.6.0 flake8==3.8.2 mypy==0.790 pytest==5.1.3 black==20.8b1
           pip install --progress-bar off coverage codecov
+      - name: Format check with Black
+        run: |
+          black --line-length 80 --check forte/
       - name: Obtain Stave Database Examples
         run: |
           git clone https://github.com/asyml/stave.git
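The new step gates CI on formatting. To reproduce it locally before pushing, a minimal sketch reusing the pinned version and flags from the workflow above:

    pip install black==20.8b1
    black --line-length 80 --check forte/

With `--check`, Black only reports which files it would reformat and exits non-zero if any exist; dropping the flag rewrites those files in place.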

.pylintrc

+1 −1

@@ -301,7 +301,7 @@ logging-modules=logging
 [FORMAT]
 
 # Maximum number of characters on a single line.
-max-line-length=80
+max-line-length=100
 
 # Regexp for a line that is allowed to be longer than the limit.
 # This regex matches URLs and link anchors.
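Raising pylint's limit to 100 while Black wraps at 80 is presumably deliberate: Black leaves some lines longer than its target (notably long string literals, which it does not split by default), so a looser pylint ceiling keeps the two tools from flagging each other's output. Note the same Black setting could also live in pyproject.toml; a hypothetical equivalent (this commit passes the flag on the command line instead):

    [tool.black]
    line-length = 80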

README.md

+1 −0

@@ -9,6 +9,7 @@
 [![Documentation Status](https://readthedocs.org/projects/asyml-forte/badge/?version=latest)](https://asyml-forte.readthedocs.io/en/latest/?badge=latest)
 [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/asyml/forte/blob/master/LICENSE)
 [![Chat](http://img.shields.io/badge/gitter.im-asyml/forte-blue.svg)](https://gitter.im/asyml/community)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
 **Forte** is a toolkit for building Natural Language Processing pipelines,
 featuring cross-task interaction, adaptable data-model interfaces and composable

examples/chatbot/chatbot_example.py

+40 −25

@@ -16,8 +16,7 @@
 from termcolor import colored
 import torch
 
-from forte.nltk import (
-    NLTKSentenceSegmenter, NLTKWordTokenizer, NLTKPOSTagger)
+from forte.nltk import NLTKSentenceSegmenter, NLTKWordTokenizer, NLTKPOSTagger
 from forte.common.configuration import Config
 from forte.data.multi_pack import MultiPack
 from forte.data.readers import MultiPackTerminalReader
@@ -36,30 +35,38 @@ def setup(config: Config) -> Pipeline:
     resource = Resources()
     query_pipeline = Pipeline[MultiPack](resource=resource)
     query_pipeline.set_reader(
-        reader=MultiPackTerminalReader(), config=config.reader)
+        reader=MultiPackTerminalReader(), config=config.reader
+    )
     query_pipeline.add(
-        component=MicrosoftBingTranslator(), config=config.translator)
+        component=MicrosoftBingTranslator(), config=config.translator
+    )
     query_pipeline.add(
-        component=BertBasedQueryCreator(), config=config.query_creator)
-    query_pipeline.add(
-        component=SearchProcessor(), config=config.searcher)
+        component=BertBasedQueryCreator(), config=config.query_creator
+    )
+    query_pipeline.add(component=SearchProcessor(), config=config.searcher)
 
-    top_response_pack_name = config.indexer.response_pack_name + '_0'
+    top_response_pack_name = config.indexer.response_pack_name + "_0"
 
     query_pipeline.add(
         component=NLTKSentenceSegmenter(),
-        selector=NameMatchSelector(select_name=top_response_pack_name))
+        selector=NameMatchSelector(select_name=top_response_pack_name),
+    )
     query_pipeline.add(
         component=NLTKWordTokenizer(),
-        selector=NameMatchSelector(select_name=top_response_pack_name))
+        selector=NameMatchSelector(select_name=top_response_pack_name),
+    )
     query_pipeline.add(
         component=NLTKPOSTagger(),
-        selector=NameMatchSelector(select_name=top_response_pack_name))
+        selector=NameMatchSelector(select_name=top_response_pack_name),
+    )
     query_pipeline.add(
-        component=SRLPredictor(), config=config.SRL,
-        selector=NameMatchSelector(select_name=top_response_pack_name))
+        component=SRLPredictor(),
+        config=config.SRL,
+        selector=NameMatchSelector(select_name=top_response_pack_name),
+    )
     query_pipeline.add(
-        component=MicrosoftBingTranslator(), config=config.back_translator)
+        component=MicrosoftBingTranslator(), config=config.back_translator
+    )
 
     query_pipeline.initialize()
 
@@ -87,28 +94,36 @@ def main(config: Config):
         resource.update(bot_utterance=[response_pack])
 
         english_pack = m_pack.get_pack("pack")
-        print(colored("English Translation of the query: ", "green"),
-              english_pack.text, "\n")
+        print(
+            colored("English Translation of the query: ", "green"),
+            english_pack.text,
+            "\n",
+        )
 
         # Just take the first pack.
-        pack = m_pack.get_pack(config.indexer.response_pack_name_prefix + '_0')
+        pack = m_pack.get_pack(config.indexer.response_pack_name_prefix + "_0")
         print(colored("Retrieved Document", "green"), pack.text, "\n")
-        print(colored("German Translation", "green"),
-              m_pack.get_pack("response").text, "\n")
+        print(
+            colored("German Translation", "green"),
+            m_pack.get_pack("response").text,
+            "\n",
+        )
         for sentence in pack.get(Sentence):
             sent_text = sentence.text
-            print(colored("Sentence:", 'red'), sent_text, "\n")
+            print(colored("Sentence:", "red"), sent_text, "\n")
 
-            print(colored("Semantic role labels:", 'red'))
+            print(colored("Semantic role labels:", "red"))
             for link in pack.get(PredicateLink, sentence):
                 parent = link.get_parent()
                 child = link.get_child()
-                print(f" - \"{child.text}\" is role "
-                      f"{link.arg_type} of "
-                      f"predicate \"{parent.text}\"")
+                print(
+                    f' - "{child.text}" is role '
+                    f"{link.arg_type} of "
+                    f'predicate "{parent.text}"'
+                )
             print()
 
-        input(colored("Press ENTER to continue...\n", 'green'))
+        input(colored("Press ENTER to continue...\n", "green"))
 
 
 if __name__ == "__main__":
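Beyond re-wrapping calls, the hunks above show Black's quote normalization: it prefers double quotes, but flips a string's outer quotes to single when that removes backslash escapes, which is why the f-strings lose their `\"` sequences. A minimal sketch (hypothetical strings):

    print('hello')        # Black rewrites to: print("hello")
    print("say \"hi\"")   # Black rewrites to: print('say "hi"')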

examples/chatbot/config_data.py

+7 −7

@@ -24,7 +24,7 @@
     "sent_b_seq_len": ["int64", "stacked_tensor"],
     "sent_b_segment_ids": ["int64", "stacked_tensor", max_seq_length],
     "sentence_b": ["str", "stacked_tensor"],
-    "label_ids": ["int64", "stacked_tensor"]
+    "label_ids": ["int64", "stacked_tensor"],
 }
 
 train_hparam = {
@@ -33,10 +33,10 @@
     "dataset": {
         "data_name": "data",
         "feature_types": feature_types,
-        "files": "{}/train.pkl".format(pickle_data_dir)
+        "files": "{}/train.pkl".format(pickle_data_dir),
     },
     "shuffle": True,
-    "shuffle_buffer_size": 100
+    "shuffle_buffer_size": 100,
 }
 
 eval_hparam = {
@@ -45,9 +45,9 @@
     "dataset": {
         "data_name": "data",
         "feature_types": feature_types,
-        "files": "{}/eval.pkl".format(pickle_data_dir)
+        "files": "{}/eval.pkl".format(pickle_data_dir),
     },
-    "shuffle": False
+    "shuffle": False,
 }
 
 test_hparam = {
@@ -56,7 +56,7 @@
     "dataset": {
         "data_name": "data",
         "feature_types": feature_types,
-        "files": "{}/test.pkl".format(pickle_data_dir)
+        "files": "{}/test.pkl".format(pickle_data_dir),
     },
-    "shuffle": False
+    "shuffle": False,
 }
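The trailing commas added throughout this file are functional in this Black release, not just cosmetic: a so-called magic trailing comma tells Black to keep a collection exploded one element per line even when it would fit within the line length. A minimal sketch (hypothetical values):

    xs = [1, 2, 3]   # no trailing comma: Black collapses to one line
    xs = [
        1,
        2,
        3,
    ]                # magic trailing comma: Black keeps it exploded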

examples/chatbot/create_index.py

+29 −16

@@ -30,8 +30,9 @@
 logging.basicConfig(level=logging.INFO)
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--config_data", default="config_data",
-                    help="File to read the config from")
+parser.add_argument(
+    "--config_data", default="config_data", help="File to read the config from"
+)
 args = parser.parse_args()
 
 config = yaml.safe_load(open("config.yml", "r"))
@@ -40,11 +41,11 @@
 
 
 class Indexer:
-
     def __init__(self, model_path, torch_device=None):
 
         self.bert = tx.modules.BERTEncoder(
-            pretrained_model_name=None, hparams={"pretrained_model_name": None})
+            pretrained_model_name=None, hparams={"pretrained_model_name": None}
+        )
         self.device = torch_device
         self.bert.to(device=self.device)
 
@@ -54,10 +55,16 @@ def __init__(self, model_path, torch_device=None):
         self.bert.load_state_dict(state_dict["bert"])
 
         self.tokenizer = tx.data.BERTTokenizer(
-            pretrained_model_name="bert-base-uncased")
+            pretrained_model_name="bert-base-uncased"
+        )
 
-        self.index = EmbeddingBasedIndexer(config={
-            "index_type": "GpuIndexFlatIP", "dim": 768, "device": "gpu0"})
+        self.index = EmbeddingBasedIndexer(
+            config={
+                "index_type": "GpuIndexFlatIP",
+                "dim": 768,
+                "device": "gpu0",
+            }
+        )
 
     @torch.no_grad()
     def create_index(self):
@@ -67,9 +74,9 @@ def create_index(self):
             "dataset": {
                 "data_name": "data",
                 "feature_types": config_data.feature_types,
-                "files": ["data/train.pkl", "data/eval.pkl", "data/test.pkl"]
+                "files": ["data/train.pkl", "data/eval.pkl", "data/test.pkl"],
             },
-            "shuffle": False
+            "shuffle": False,
         }
 
         dataset = tx.data.RecordData(hparams=hparams, device=self.device)
@@ -79,23 +86,29 @@ def create_index(self):
         for idx, batch in enumerate(data_iterator):
             ids = range(start, start + len(batch))
             text = batch["sentence_b"]
-            output, _ = self.bert(inputs=batch["sent_b_input_ids"],
-                                  sequence_length=batch["sent_b_seq_len"],
-                                  segment_ids=batch["sent_b_segment_ids"])
+            output, _ = self.bert(
+                inputs=batch["sent_b_input_ids"],
+                sequence_length=batch["sent_b_seq_len"],
+                segment_ids=batch["sent_b_segment_ids"],
+            )
             cls_tokens = output[:, 0, :]  # CLS token is first token
             self.index.add(vectors=cls_tokens, meta_data=dict(zip(ids, text)))
 
             start += len(batch)
 
             if (idx + 1) % 50 == 0:
-                logging.info("Completed %s batches of size %s", idx + 1,
-                             config.indexer.batch_size)
+                logging.info(
+                    "Completed %s batches of size %s",
+                    idx + 1,
+                    config.indexer.batch_size,
+                )
 
         self.index.save(path=config.indexer.model_dir)
 
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    indexer = Indexer(model_path="model/chatbot_model.ckpt",
-                      torch_device=device)
+    indexer = Indexer(
+        model_path="model/chatbot_model.ckpt", torch_device=device
+    )
     indexer.create_index()

examples/chatbot/data_utils.py

+14 −10

@@ -27,8 +27,8 @@ def split_train_eval_test(file_name):
     conversation = None
 
     for line in text_file:
-        pair = line.rstrip('\n').split('\t')
-        num, question = pair[0].split(' ', 1)
+        pair = line.rstrip("\n").split("\t")
+        num, question = pair[0].split(" ", 1)
         answer = pair[1]
 
         if num == "1":
@@ -49,8 +49,8 @@ def split_train_eval_test(file_name):
     num_test = 500
 
     train_data = text_data[0:num_train]
-    val_data = text_data[num_train:num_train + num_val]
-    test_data = text_data[num_train + num_val:num_train + num_val + num_test]
+    val_data = text_data[num_train : num_train + num_val]
+    test_data = text_data[num_train + num_val : num_train + num_val + num_test]
 
     return train_data, val_data, test_data
 
@@ -88,14 +88,17 @@ def _create_conv_with_history(conv, num_qa):
 
     new_text_data = []
     for i, _ in enumerate(conv):
-        history = conv[max(i - num_qa, 0):i]
+        history = conv[max(i - num_qa, 0) : i]
         current_qa = conv[i]
 
         if history:
-            qa_with_history = [sentence for qa in history for sentence in
-                               qa] + current_qa
-            qa_with_history = [' '.join(qa_with_history[:-1]),
-                               qa_with_history[-1]]
+            qa_with_history = [
+                sentence for qa in history for sentence in qa
+            ] + current_qa
+            qa_with_history = [
+                " ".join(qa_with_history[:-1]),
+                qa_with_history[-1],
+            ]
         else:
             qa_with_history = current_qa
 
@@ -121,7 +124,8 @@ def create_dataset_with_history(conversations, num_line=2):
 
     for conversation in conversations:
         conversation_with_history = _create_conv_with_history(
-            conversation, num_line)
+            conversation, num_line
+        )
         proc_text_data.extend(conversation_with_history)
 
     return proc_text_data
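The spaced slice colons above are Black applying PEP 8's rule: in a slice the colon acts like a binary operator, so when either bound is a complex expression it gets a space on both sides, while simple names or literals stay unspaced. A minimal sketch (hypothetical names):

    data[a:b]                 # simple bounds: no spaces
    data[start : start + n]   # complex bound: spaced colon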
