
Commit

Added per-user limit option for crawler.
soldag committed Dec 13, 2016
1 parent 52ad86d commit dfa8f4b
Showing 3 changed files with 10 additions and 7 deletions.
cli.py: 4 changes (3 additions & 1 deletion)
@@ -52,6 +52,8 @@ def analyze(data_source_type, classifier_type, experiments_count,
 # Crawl arguments
 parser.add_argument("--user-limit", type=int, default=100,
                     help="The maximum number of accounts to crawl.")
+parser.add_argument("--limit", type=int, default=0,
+                    help="The maximum number of status updates per user to crawl.")
 
 # Train arguments
 parser.add_argument("--classifier-type", "-c", default=None,
@@ -63,7 +65,7 @@ def analyze(data_source_type, classifier_type, experiments_count,
 
 if args.action == 'crawl':
     crawl_status_updates(args.data_source_type, args.dataset_path,
-                         user_limit=args.user_limit)
+                         user_limit=args.user_limit, limit=args.limit)
 elif args.action == 'analyze':
     analyze(args.data_source_type, args.classifier_type,
             args.experiments_count, args.dataset_path, args.twitter_user)
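
The new --limit flag complements the existing --user-limit: the former caps how many status updates are fetched per account, the latter how many accounts are crawled at all. A minimal, self-contained sketch of how the new option parses (only --user-limit and --limit are taken from the diff; everything else here is illustrative):

import argparse

# Standalone sketch of the two crawl limits wired up in cli.py.
# --limit defaults to 0, which the downstream code effectively treats as
# "no per-user cap" (see the provider change below).
parser = argparse.ArgumentParser()
parser.add_argument("--user-limit", type=int, default=100,
                    help="The maximum number of accounts to crawl.")
parser.add_argument("--limit", type=int, default=0,
                    help="The maximum number of status updates per user to crawl.")

args = parser.parse_args(["--user-limit", "50", "--limit", "200"])
print(args.user_limit, args.limit)  # -> 50 200
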
core/data_provider/twitter_provider.py: 9 changes (5 additions & 4 deletions)
@@ -10,17 +10,18 @@ class TwitterProvider:
     def __init__(self):
         self.client = self._get_twitter_client()
 
-    def get_status_updates(self, user_id=None, dataset_path=None):
+    def get_status_updates(self, user_id=None,
+                           dataset_path=None, tweet_limit=0):
         if user_id:
-            return self._get_api_status_updates(user_id)
+            return self._get_api_status_updates(user_id, tweet_limit)
         if dataset_path:
             return self._get_dataset_status_updates(dataset_path)
 
         raise ValueError('Either user_id or dataset_path has to be provided')
 
-    def _get_api_status_updates(self, user_id):
+    def _get_api_status_updates(self, user_id, limit):
         client = self._get_twitter_client()
-        tweets = tweepy.Cursor(client.user_timeline, id=user_id).items()
+        tweets = tweepy.Cursor(client.user_timeline, id=user_id).items(limit)
         status_updates = [self._parse_tweet(tweet) for tweet in tweets]
 
         return status_updates
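
The new tweet_limit parameter flows straight into tweepy's Cursor.items(). A small sketch of the resulting behaviour, assuming tweepy 3.x semantics (current at the time of this commit), where a limit of 0 is falsy and the cursor simply pages through the whole timeline; fetch_timeline and its arguments are illustrative, not part of the repository:

import tweepy

def fetch_timeline(api, user_id, tweet_limit=0):
    # Mirrors _get_api_status_updates: under tweepy 3.x, items(0) behaves like
    # items() and keeps paging, so the default preserves the previous
    # "fetch everything" behaviour, while any positive value caps the result.
    cursor = tweepy.Cursor(api.user_timeline, id=user_id)
    return [tweet.text for tweet in cursor.items(tweet_limit)]
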
crawler/twitter_crawler.py: 4 changes (2 additions & 2 deletions)
@@ -4,7 +4,7 @@
 from core.data_provider.twitter_provider import TwitterProvider
 
 
-def crawl_status_updates(output_path, user_limit=100):
+def crawl_status_updates(output_path, user_limit=100, limit=0):
     provider = TwitterProvider()
     user_ids = list(_get_most_popular_users(user_limit))
 
@@ -16,7 +16,7 @@ def crawl_status_updates(output_path, user_limit=100):
         print("Crawling user @%s (%i/%i)..." % (user_id, i+1, user_limit))
 
         # Retrieve tweets
-        tweets = provider.get_status_updates(user_id)
+        tweets = provider.get_status_updates(user_id, tweet_limit=limit)
         if tweets:
             # Write header to csv, if not already done
             if not csv_writer.fieldnames:
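
Taken together, the crawler can now bound both dimensions of a crawl: the number of accounts and the number of status updates per account. A hypothetical direct call (the output path is a placeholder; the keyword names match the new signature):

from crawler.twitter_crawler import crawl_status_updates

# Crawl the 50 most popular accounts, fetching at most 200 status updates each.
# Passing limit=0 (the default) keeps the old unbounded per-user behaviour.
crawl_status_updates("data/tweets.csv", user_limit=50, limit=200)
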
