From 52ad86dc7ec5852f30bd469d7629635d41425580 Mon Sep 17 00:00:00 2001
From: Henriette Dinger
Date: Tue, 13 Dec 2016 11:44:10 +0100
Subject: [PATCH 1/2] Add meta data features

---
 core/data_provider/status_update.py  |  4 +++-
 core/feature_extraction/__init__.py  |  5 +++--
 core/feature_extraction/meta_data.py | 14 ++++++++++++++
 templates/check.html                 |  2 +-
 4 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 core/feature_extraction/meta_data.py

diff --git a/core/data_provider/status_update.py b/core/data_provider/status_update.py
index b079aa7..877da22 100644
--- a/core/data_provider/status_update.py
+++ b/core/data_provider/status_update.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 class StatusUpdate:
     def __init__(self, id, author, content, date_time=None, language=None,
                  country=None, latitude=None, longitude=None, number_of_shares=None, number_of_likes=None):
@@ -71,7 +73,7 @@ def from_dict(cls, obj):
         return StatusUpdate(id=obj["id"],
                             author=obj["author"],
                             content=obj["content"],
-                            date_time=obj["date_time"],
+                            date_time=datetime.strptime(obj["date_time"], "%Y-%m-%d %H:%M:%S"),
                             language=obj["language"],
                             country=obj["country"],
                             latitude=obj["latitude"],
diff --git a/core/feature_extraction/__init__.py b/core/feature_extraction/__init__.py
index 5920e87..7afde36 100644
--- a/core/feature_extraction/__init__.py
+++ b/core/feature_extraction/__init__.py
@@ -1,7 +1,8 @@
 from .writing_style import WritingStyleFeatures
-
+from .meta_data import MetaDataFeatures
 
 def extract_features(resource):
     writing_style_features = WritingStyleFeatures(resource.content).get_features()
+    meta_data_features = MetaDataFeatures(resource).get_features()
 
-    return writing_style_features
+    return writing_style_features + meta_data_features
diff --git a/core/feature_extraction/meta_data.py b/core/feature_extraction/meta_data.py
new file mode 100644
index 0000000..cecd473
--- /dev/null
+++ b/core/feature_extraction/meta_data.py
@@ -0,0 +1,14 @@
+class MetaDataFeatures:
+
+    def __init__(self, statusUpdate):
+        self.statusUpdate = statusUpdate
+
+    def get_features(self):
+        features = [ self.statusUpdate.number_of_shares,
+                     self.statusUpdate.number_of_likes,
+                     self.statusUpdate.date_time.hour,
+                     self.statusUpdate.date_time.minute,
+                     self.statusUpdate.date_time.date().weekday(),
+                     self.statusUpdate.latitude or 0,
+                     self.statusUpdate.longitude or 0 ]
+        return features
diff --git a/templates/check.html b/templates/check.html
index c7be926..a7f8d2a 100644
--- a/templates/check.html
+++ b/templates/check.html
@@ -8,7 +8,7 @@

Check if your account was compromised

-
+
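
(The markup of the changed template line was lost in transit; only the removed/added markers remain above.)

Not part of the patch, but a minimal usage sketch of the new feature pipeline. The sample StatusUpdate values are invented, and WritingStyleFeatures is assumed to return a plain Python list, so the metadata features land at the end of the combined vector:

    from datetime import datetime

    from core.data_provider.status_update import StatusUpdate
    from core.feature_extraction import extract_features

    update = StatusUpdate(id=1, author="alice", content="Hello world!",
                          date_time=datetime(2016, 12, 13, 11, 44),
                          number_of_shares=3, number_of_likes=7)

    # The last seven entries are the new metadata features:
    # [shares, likes, hour, minute, weekday, latitude or 0, longitude or 0]
    print(extract_features(update)[-7:])  # [3, 7, 11, 44, 1, 0, 0]

One observable design choice: latitude and longitude are defaulted with "or 0", but number_of_shares and number_of_likes are not, so a status update without those counts would put None into the feature vector.
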
From dfa8f4be2eb33ce6183be23c25ebb673ca4105a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?So=CC=88ren=20Oldag?=
Date: Tue, 13 Dec 2016 12:13:57 +0100
Subject: [PATCH 2/2] Added per user limit option for crawler.

---
 cli.py                                 | 4 +++-
 core/data_provider/twitter_provider.py | 9 +++++----
 crawler/twitter_crawler.py             | 4 ++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/cli.py b/cli.py
index a449259..333afb4 100644
--- a/cli.py
+++ b/cli.py
@@ -52,6 +52,8 @@ def analyze(data_source_type, classifier_type, experiments_count,
     # Crawl arguments
     parser.add_argument("--user-limit", type=int, default=100,
                         help="The maximum number of accounts to crawl.")
+    parser.add_argument("--limit", type=int, default=0,
+                        help="The maximum number of status updates per user to crawl.")
 
     # Train arguments
     parser.add_argument("--classifier-type", "-c", default=None,
@@ -63,7 +65,7 @@
 
     if args.action == 'crawl':
         crawl_status_updates(args.data_source_type, args.dataset_path,
-                             user_limit=args.user_limit)
+                             user_limit=args.user_limit, limit=args.limit)
     elif args.action == 'analyze':
         analyze(args.data_source_type, args.classifier_type,
                 args.experiments_count, args.dataset_path, args.twitter_user)
diff --git a/core/data_provider/twitter_provider.py b/core/data_provider/twitter_provider.py
index 01c58f1..da0a1a1 100644
--- a/core/data_provider/twitter_provider.py
+++ b/core/data_provider/twitter_provider.py
@@ -10,17 +10,18 @@ class TwitterProvider:
     def __init__(self):
         self.client = self._get_twitter_client()
 
-    def get_status_updates(self, user_id=None, dataset_path=None):
+    def get_status_updates(self, user_id=None,
+                           dataset_path=None, tweet_limit=0):
         if user_id:
-            return self._get_api_status_updates(user_id)
+            return self._get_api_status_updates(user_id, tweet_limit)
 
         if dataset_path:
             return self._get_dataset_status_updates(dataset_path)
 
         raise ValueError('Either user_id or dataset_path has to be provided')
 
-    def _get_api_status_updates(self, user_id):
+    def _get_api_status_updates(self, user_id, limit):
         client = self._get_twitter_client()
-        tweets = tweepy.Cursor(client.user_timeline, id=user_id).items()
+        tweets = tweepy.Cursor(client.user_timeline, id=user_id).items(limit)
         status_updates = [self._parse_tweet(tweet) for tweet in tweets]
         return status_updates
diff --git a/crawler/twitter_crawler.py b/crawler/twitter_crawler.py
index 65f6251..c85feee 100644
--- a/crawler/twitter_crawler.py
+++ b/crawler/twitter_crawler.py
@@ -4,7 +4,7 @@
 from core.data_provider.twitter_provider import TwitterProvider
 
 
-def crawl_status_updates(output_path, user_limit=100):
+def crawl_status_updates(output_path, user_limit=100, limit=0):
     provider = TwitterProvider()
     user_ids = list(_get_most_popular_users(user_limit))
 
@@ -16,7 +16,7 @@
         print("Crawling user @%s (%i/%i)..." % (user_id, i+1, user_limit))
 
         # Retrieve tweets
-        tweets = provider.get_status_updates(user_id)
+        tweets = provider.get_status_updates(user_id, tweet_limit=limit)
         if tweets:
             # Write header to csv, if not already done
             if not csv_writer.fieldnames:
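
A sketch of the limit semantics, under the assumption of tweepy 3.x, where Cursor.items(0) applies no cap because the iterator only stops early when its limit is truthy; that is what makes default=0 preserve the old crawl-everything behaviour. fetch_timeline is a hypothetical helper, not part of the patch:

    import tweepy

    def fetch_timeline(client, user_id, tweet_limit=0):
        # tweet_limit=0 -> iterate the whole timeline (assumed tweepy 3.x
        # behaviour); any positive value caps the tweets fetched per user.
        cursor = tweepy.Cursor(client.user_timeline, id=user_id)
        return [tweet for tweet in cursor.items(tweet_limit)]

Through the CLI this surfaces as the new --limit flag, e.g. passing "--user-limit 50 --limit 200" to the crawl action to fetch at most 200 status updates from each of the 50 most popular accounts (the exact positional arguments depend on parser setup not shown in this diff).
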