Skip to content

Commit

Permalink
Merge branch 'master' into incremental_learning
Browse files Browse the repository at this point in the history
* master:
  Add per-user limit option for crawler.
  Add meta data features
  • Loading branch information
soldag committed Dec 13, 2016
2 parents 3907980 + dfa8f4b commit 16194c9
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 11 deletions.
4 changes: 3 additions & 1 deletion cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def analyze(data_source_type, classifier_type, experiments_count,
# Crawl arguments
parser.add_argument("--user-limit", type=int, default=100,
help="The maximum number of accounts to crawl.")
parser.add_argument("--limit", type=int, default=0,
help="The maximum number of status updates per user to crawl.")

# Train arguments
parser.add_argument("--classifier-type", "-c", default=None,
Expand All @@ -62,7 +64,7 @@ def analyze(data_source_type, classifier_type, experiments_count,

if args.action == 'crawl':
crawl_status_updates(args.data_source_type, args.dataset_path,
user_limit=args.user_limit)
user_limit=args.user_limit, limit=args.limit)
elif args.action == 'analyze':
analyze(args.data_source_type, args.classifier_type,
args.experiments_count, args.dataset_path, args.twitter_user)
Expand Down
4 changes: 3 additions & 1 deletion core/data_provider/status_update.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from datetime import datetime

class StatusUpdate:
def __init__(self, id, author, content, date_time=None, language=None, country=None,
latitude=None, longitude=None, number_of_shares=None, number_of_likes=None):
Expand Down Expand Up @@ -71,7 +73,7 @@ def from_dict(cls, obj):
return StatusUpdate(id=obj["id"],
author=obj["author"],
content=obj["content"],
date_time=obj["date_time"],
date_time=datetime.strptime(obj["date_time"], "%Y-%m-%d %H:%M:%S"),
language=obj["language"],
country=obj["country"],
latitude=obj["latitude"],
Expand Down
9 changes: 5 additions & 4 deletions core/data_provider/twitter_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,18 @@ class TwitterProvider:
def __init__(self):
self.client = self._get_twitter_client()

def get_status_updates(self, user_id=None,
                       dataset_path=None, tweet_limit=0):
    """Return status updates for a Twitter user or from a stored dataset.

    Exactly one source should be given: either ``user_id`` (tweets are
    fetched live from the Twitter API, at most ``tweet_limit`` of them;
    0 means no limit) or ``dataset_path`` (status updates are read from a
    previously crawled dataset file).

    Raises:
        ValueError: if neither ``user_id`` nor ``dataset_path`` is given.
    """
    if user_id:
        return self._get_api_status_updates(user_id, tweet_limit)
    if dataset_path:
        return self._get_dataset_status_updates(dataset_path)

    raise ValueError('Either user_id or dataset_path has to be provided')

def _get_api_status_updates(self, user_id, limit):
    """Fetch up to ``limit`` tweets of ``user_id`` from the Twitter API.

    ``limit`` is forwarded to ``tweepy.Cursor.items``; a value of 0 means
    no limit (the cursor iterates the whole timeline).
    """
    client = self._get_twitter_client()
    tweets = tweepy.Cursor(client.user_timeline, id=user_id).items(limit)
    status_updates = [self._parse_tweet(tweet) for tweet in tweets]

    return status_updates
Expand Down
5 changes: 3 additions & 2 deletions core/feature_extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from .writing_style import WritingStyleFeatures

from .meta_data import MetaDataFeatures

def extract_features(resource):
    """Build the full feature vector for a status update.

    Concatenates the writing-style features (derived from the text
    content) with the metadata features (share/like counts, posting
    time, location) into one flat list.
    """
    writing_style_features = WritingStyleFeatures(resource.content).get_features()
    meta_data_features = MetaDataFeatures(resource).get_features()

    return writing_style_features + meta_data_features
14 changes: 14 additions & 0 deletions core/feature_extraction/meta_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class MetaDataFeatures:
    """Extracts numeric metadata features from a single status update."""

    def __init__(self, statusUpdate):
        # Status update whose metadata is turned into a feature vector.
        self.statusUpdate = statusUpdate

    def get_features(self):
        """Return the metadata feature vector as a flat list of numbers.

        Missing values (``None`` counts or coordinates) are mapped to 0 so
        the vector always contains plain numbers; previously only the
        coordinates were guarded, so a ``None`` share/like count leaked
        into the features.
        """
        status_update = self.statusUpdate
        return [status_update.number_of_shares or 0,
                status_update.number_of_likes or 0,
                status_update.date_time.hour,
                status_update.date_time.minute,
                # Monday == 0 ... Sunday == 6
                status_update.date_time.date().weekday(),
                status_update.latitude or 0,
                status_update.longitude or 0]
4 changes: 2 additions & 2 deletions crawler/twitter_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from core.data_provider.twitter_provider import TwitterProvider


def crawl_status_updates(output_path, user_limit=100):
def crawl_status_updates(output_path, user_limit=100, limit=0):
provider = TwitterProvider()
user_ids = list(_get_most_popular_users(user_limit))

Expand All @@ -16,7 +16,7 @@ def crawl_status_updates(output_path, user_limit=100):
print("Crawling user @%s (%i/%i)..." % (user_id, i+1, user_limit))

# Retrieve tweets
tweets = provider.get_status_updates(user_id)
tweets = provider.get_status_updates(user_id, tweet_limit=limit)
if tweets:
# Write header to csv, if not already done
if not csv_writer.fieldnames:
Expand Down
2 changes: 1 addition & 1 deletion templates/check.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ <h1 class="text-center">Check if your account was compromised</h1>
<form method="post">
<div class="col-md-12 text-center input--url">
<label for="account_url" class="col-md-12">Account URL:</label>
<input type="url" name="account_url" id="account_url" value="https://twitter.com/hpi_de" size="50" class="transparent col-md-12 text-center" />
<input type="url" name="account_url" id="account_url" value="https://twitter.com/sebastian_kliem" size="50" class="transparent col-md-12 text-center" />
</div>
<div class="col-md-12 text-center">
<button type="submit" value="Submit" class="btn btn-success">Submit</button>
Expand Down

0 comments on commit 16194c9

Please sign in to comment.