From 3cad0db0f5b8e1a21a15e49b44b4be41751df01b Mon Sep 17 00:00:00 2001 From: Xuebin Wei Date: Thu, 19 Oct 2023 10:09:42 -0400 Subject: [PATCH] Delete MongoDB/Collect Tweets into MongoDB with Twitter API v2.ipynb --- ...ets into MongoDB with Twitter API v2.ipynb | 355 ------------------ 1 file changed, 355 deletions(-) delete mode 100644 MongoDB/Collect Tweets into MongoDB with Twitter API v2.ipynb diff --git a/MongoDB/Collect Tweets into MongoDB with Twitter API v2.ipynb b/MongoDB/Collect Tweets into MongoDB with Twitter API v2.ipynb deleted file mode 100644 index b9236c9..0000000 --- a/MongoDB/Collect Tweets into MongoDB with Twitter API v2.ipynb +++ /dev/null @@ -1,355 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "dff27a23", - "metadata": {}, - "source": [ - "# Collect Tweets into MongoDB with Twitter API v2" - ] - }, - { - "cell_type": "markdown", - "id": "256545f3", - "metadata": {}, - "source": [ - "## Install Python libraries" - ] - }, - { - "cell_type": "markdown", - "id": "c66b2cfd", - "metadata": {}, - "source": [ - "We need the [pymongo](https://pypi.org/project/pymongo/) to manage the MongoDB database, and [tweepy](https://www.tweepy.org/) to call the Twitter APIs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "971248e0", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "!pip install pymongo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d535d4e5", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install tweepy" - ] - }, - { - "cell_type": "markdown", - "id": "4322984b", - "metadata": {}, - "source": [ - "## Import Python libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17de3efe", - "metadata": {}, - "outputs": [], - "source": [ - "import pymongo\n", - "from pymongo import MongoClient\n", - "import json\n", - "from pprint import pprint\n", - "import tweepy\n", - "import configparser" - ] - }, - { - "cell_type": "markdown", - "id": "03015593", - "metadata": {}, - "source": [ - "## Load the authorization info" - ] - }, - { - "cell_type": "markdown", - "id": "dfe8399d", - "metadata": {}, - "source": [ - "Save the database connection info and API key in a config.ini file and use the configparse to load the authorization info.\n", - "\n", - "The config.ini file shoud look like:\n", - "``` \n", - "[mytwitter]\n", - "bearer_token = \n", - "\n", - "[mymongo]\n", - "connection = \n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3898362", - "metadata": {}, - "outputs": [], - "source": [ - "config = configparser.ConfigParser(interpolation=None)\n", - "config.read('config.ini')\n", - "\n", - "BEARER_TOKEN = config['mytwitter']['bearer_token']\n", - "\n", - "mongod_connect = config['mymongo']['connection']" - ] - }, - { - "cell_type": "markdown", - "id": "51d3af7a", - "metadata": {}, - "source": [ - "## Connect to the MongoDB cluster" - ] - }, - { - "cell_type": "markdown", - "id": "4a85d395", - "metadata": {}, - "source": [ - "We will create a database named 'demo' and a collection named 'tweet_collection' in your MongoDB database." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1c6c435", - "metadata": {}, - "outputs": [], - "source": [ - "client = MongoClient(mongod_connect)\n", - "db = client.demo # use or create a database named demo\n", - "tweet_collection = db.tweet_collection #use or create a collection named tweet_collection\n", - "tweet_collection.create_index([(\"tweet.id\", pymongo.ASCENDING)],unique = True) # make sure the collected tweets are unique" - ] - }, - { - "cell_type": "markdown", - "id": "5b69b57a", - "metadata": {}, - "source": [ - "## Use the API to collect tweets" - ] - }, - { - "cell_type": "markdown", - "id": "983e5dc8", - "metadata": { - "scrolled": false - }, - "source": [ - "### Define the query" - ] - }, - { - "cell_type": "markdown", - "id": "d4d02a4b", - "metadata": {}, - "source": [ - "For more about Twitter API 2.0 query operators, please check [Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed886853", - "metadata": {}, - "outputs": [], - "source": [ - "query = 'covid' #query tweets about covid" - ] - }, - { - "cell_type": "markdown", - "id": "6956faeb", - "metadata": {}, - "source": [ - "### Insert the data into mognodb" - ] - }, - { - "cell_type": "markdown", - "id": "5e16dedf", - "metadata": {}, - "source": [ - "You can set a different max_result, but the max tweets we can collect is 100." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a079c5cb", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "\n", - "client = tweepy.Client(BEARER_TOKEN)\n", - "\n", - "tweets = client.search_recent_tweets(query=query, max_results=100,\n", - " expansions=['author_id'], \n", - " tweet_fields = ['created_at','entities','lang','public_metrics','geo'],\n", - " user_fields = ['id', 'location','name', 'public_metrics','username'])\n", - "\n", - "next_token = tweets.meta['next_token']\n", - "for user, tweet in zip(tweets.includes['users'],tweets.data):\n", - " tweet_json = {}\n", - " tweet_json['tweet']= tweet.data\n", - " tweet_json['user'] = user.data\n", - " try:\n", - " tweet_collection.insert_one(tweet_json)\n", - " print(tweet_json['tweet']['created_at'])\n", - " except:\n", - " pass\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "4f91bca9", - "metadata": {}, - "source": [ - "Continue fetching early tweets with the same query. YOU WILL REACH YOUR RATE LIMIT VERY FAST" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "527a740a", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(0):\n", - " tweets = client.search_recent_tweets(query=query, max_results=10,\n", - " expansions=['author_id'], \n", - " tweet_fields = ['created_at','entities','lang','public_metrics','geo'],\n", - " user_fields = ['id', 'location','name', 'public_metrics','username'],\n", - " next_token=next_token)\n", - " next_token = tweets.meta['next_token']\n", - " for user, tweet in zip(tweets.includes['users'],tweets.data):\n", - " tweet_json = {}\n", - " tweet_json['tweet']= tweet.data\n", - " tweet_json['user'] = user.data\n", - " try:\n", - " tweet_collection.insert_one(tweet_json)\n", - " print(tweet_json['tweet']['created_at'])\n", - " except:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "id": "b25d73c0", - "metadata": {}, - "source": [ - "## View the collected tweets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20159977", - "metadata": {}, - "outputs": [], - "source": [ - "print('Number of collected tweets:',tweet_collection.estimated_document_count())# number of tweets collected" - ] - }, - { - "cell_type": "markdown", - "id": "9bc2c4c6", - "metadata": {}, - "source": [ - "Create a text index and print the Tweets containing specific keywords." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4529e93", - "metadata": {}, - "outputs": [], - "source": [ - "tweet_collection.create_index([(\"tweet.text\", pymongo.TEXT)], name='text_index', default_language='english') # create a text index" - ] - }, - { - "cell_type": "markdown", - "id": "a18af4e8", - "metadata": {}, - "source": [ - "Create a cursor to query tweets with the created index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "925b36da", - "metadata": {}, - "outputs": [], - "source": [ - "tweet_cursor = tweet_collection.find({\"$text\": {\"$search\": \"covid\"}}) # return tweets that contain covid" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15be6740", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "for tweet in tweet_cursor:\n", - " print('---')\n", - " print (tweet['tweet']['text'])\n", - " print (tweet['user']['name'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea730f0d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}