Twitter example (#469)

jzpang · jingzhi.pang@petuum.com · web-flow · commit 911c1e9eea31 · 2021-06-07T22:07:03.000-04:00
* move elastic processors * remove 3rd party processors * change nltk imports * remove indexers and change imports * Revert "remove indexers and change imports" This reverts commit ab07230. * add nltk back * change the nltk import back to forte * add forte-wrapper dependency * fix pylint * add elastic index processor back * add elastic index processor back * add elastic index processor back * add gpt2 back * add gpt2 back * fix pylint * removed allennlp * add a invalid config test * add a twitter sentiment analysis example * add readme * clean up code * fix merge conflict * add end newline * update config * update statistics * update readme Co-authored-by: jingzhi.pang@petuum.com <jingzhi.pang@petuum.com>
diff --git a/examples/twitter_sentiment_analysis/README.md b/examples/twitter_sentiment_analysis/README.md
@@ -0,0 +1,33 @@
+# Twitter Sentiment Analysis
+
+This example shows how to use `Forte` to perform sentiment
+analysis on the user's retrieved tweets, based on [Tweepy](https://docs.tweepy.org/en/latest/index.html), [Twitter API](https://developer.twitter.com/en/products/twitter-api) and 
+[Vader (Valence Aware Dictionary and Sentiment Reasoner)](https://github.com/cjhutto/vaderSentiment).
+ 
+
+> **Note**: To run this example, you need to have a Twitter account and apply for Developer Access, 
+then create an application. It will generate the API credentials that you will need to use to access Twitter from Python.
+You should put the credentials in `api_credential.yml` first to make the pipeline work. 
+You could refer to 
+https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api
+ for more information.
+
+
+## How to run the pipeline
+
+First, you need to create a virtual environment, then in command line:
+
+`cd twitter_sentiment_analysis`
+
+`pip install -r requirements.txt`
+
+
+We can run the pipeline by running
+
+`python pipeline.py`
+
+Then you can input your search query in the terminal to get the tweets and sentiment scores.
+
+You can also refer to Twitter's official documentation 
+https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
+for customized queries.
diff --git a/examples/twitter_sentiment_analysis/api_credential.yml b/examples/twitter_sentiment_analysis/api_credential.yml
@@ -0,0 +1,4 @@
+consumer_key: ""
+consumer_secret: ""
+access_token: ""
+access_token_secret: ""
diff --git a/examples/twitter_sentiment_analysis/config.yml b/examples/twitter_sentiment_analysis/config.yml
@@ -0,0 +1,15 @@
+boxer:
+  pack_name: "query"
+
+twitter_search:
+  num_tweets_returned: 5
+  lang: "en"
+  date_since: "2020-01-01"
+  result_type: 'recent'
+  query_pack_name: "query"
+  response_pack_name_prefix: "passage"
+  credential_file: "api_credential.yml"
+
+vader_sentiment:
+  entry_type: 'ft.onto.base_ontology.Document'
+  attribute_name: 'sentiment'
diff --git a/examples/twitter_sentiment_analysis/pipeline.py b/examples/twitter_sentiment_analysis/pipeline.py
@@ -0,0 +1,85 @@
+# Copyright 2019 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+from forte.common.configuration import Config
+from forte.data.caster import MultiPackBoxer
+from forte.data.readers import TerminalReader
+from forte.data.multi_pack import MultiPack
+
+from forte.pipeline import Pipeline
+from forte_wrapper.vader import VaderSentimentProcessor
+from forte_wrapper.twitter import TweetSearchProcessor
+from forte.data.selector import RegexNameMatchSelector
+
+
+if __name__ == "__main__":
+    # Load config file
+    config_file = os.path.join(os.path.dirname(__file__), 'config.yml')
+    config = yaml.safe_load(open(config_file, "r"))
+    config = Config(config, default_hparams=None)
+
+    # Build pipeline and add the reader, which will read query from terminal.
+    nlp: Pipeline = Pipeline()
+    nlp.set_reader(reader=TerminalReader())
+
+    # Start to work on multi-packs in the rest of the pipeline, so we use a
+    # boxer to change this.
+    nlp.add(MultiPackBoxer(), config=config.boxer)
+
+    # Search tweets.
+    nlp.add(TweetSearchProcessor(), config=config.twitter_search)
+
+    # Conduct sentiment analysis.
+    pattern = rf"{config.twitter_search.response_pack_name_prefix}_\d"
+    selector_hit = RegexNameMatchSelector(select_name=pattern)
+    nlp.add(component=VaderSentimentProcessor(),
+            selector=selector_hit, config=config.vader_sentiment)
+
+    nlp.initialize()
+
+    # process dataset
+    m_pack: MultiPack
+    for m_pack in nlp.process_dataset():
+        print('The number of datapacks(including query) is', len(m_pack.packs))
+
+        tweets, pos_sentiment, neg_sentiment, neutral_sentiment = 0, 0, 0, 0
+
+        for name, pack in m_pack.iter_packs():
+            # Do not process the query datapack
+            if name == config.twitter_search.query_pack_name:
+                continue
+
+            tweets += 1
+            for doc in pack.get(config.vader_sentiment.entry_type):
+                print('Tweet: ', doc.text)
+                print('Sentiment Compound Score: ',
+                      doc.sentiment['compound'])
+
+                compound_score = doc.sentiment['compound']
+                if compound_score >= 0.05:
+                    pos_sentiment += 1
+                elif compound_score <= -0.05:
+                    neg_sentiment += 1
+                else:
+                    neutral_sentiment += 1
+
+        print('The number of tweets retrieved: ', tweets)
+        print('The proportion of positive sentiment: ', pos_sentiment / tweets)
+        print('The proportion of negative sentiment: ', neg_sentiment / tweets)
+        print('The proportion of neutral sentiment: ',
+              neutral_sentiment / tweets)
+
+    print('Done')
diff --git a/examples/twitter_sentiment_analysis/requirements.txt b/examples/twitter_sentiment_analysis/requirements.txt
@@ -0,0 +1,2 @@
+torch>=1.5.0
+git+https://git@github.com/asyml/forte-wrappers#egg=forte-wrappers[nltk,varder,twitter]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+torch>=1.5.0`
	`2`	`+git+https://git@github.com/asyml/forte-wrappers#egg=forte-wrappers[nltk,varder,twitter]`