Skip to content

Commit bb50136

Browse files
committed
add Feed models, refactor the feed fetcher, add ElasticSearch dep
1 parent 83fb48a commit bb50136

File tree

6 files changed

+364
-33
lines changed

6 files changed

+364
-33
lines changed

pyhackers/common/dbfield.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
import base64
import logging
import pickle
import uuid
import zlib
from datetime import datetime as dt, timedelta

from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.types import TypeDecorator, CHAR, String, Integer, Text
8+
9+
10+
class GUID(TypeDecorator):
    """
    Platform-independent GUID type.

    Uses Postgresql's UUID type, otherwise uses
    CHAR(32), storing as stringified hex values.
    """
    impl = CHAR

    def load_dialect_impl(self, dialect):
        # Native UUID column on PostgreSQL, portable CHAR(32) elsewhere.
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        """Coerce a uuid.UUID (or UUID string) into the storage format."""
        if value is None:
            return value
        elif dialect.name == 'postgresql':
            return str(value)
        else:
            if not isinstance(value, uuid.UUID):
                # Accept any string uuid.UUID can parse; .int is the
                # 128-bit integer value, padded to 32 hex digits.
                return "%.32x" % uuid.UUID(value).int
            else:
                # Explicit .int instead of relying on implicit UUID->int
                # coercion in %-formatting (Python 2 leniency only).
                return "%.32x" % value.int

    def process_result_value(self, value, dialect):
        """Turn the stored hex string back into a uuid.UUID."""
        if value is None:
            return value
        else:
            return uuid.UUID(value)
40+
41+
42+
class Choice(TypeDecorator):
    """
    Store a display value under its short database key.

    ``choices`` maps db-key -> display-value; binding reverse-looks-up
    the key for a display value, loading maps the key back.
    """
    impl = String

    def __init__(self, choices=None, **kw):
        # Avoid a shared mutable default; copy whatever mapping we get.
        if choices is None:
            choices = {}
        self.choices = dict(choices)
        super(Choice, self).__init__(**kw)

    def process_bind_param(self, value, dialect):
        # Reverse lookup: db key whose display value matches.  Raises
        # IndexError when `value` is not a known choice (unchanged).
        # .items() instead of py2-only .iteritems() works on 2 and 3.
        return [k for k, v in self.choices.items() if v == value][0]

    def process_result_value(self, value, dialect):
        # Stored key -> display value; KeyError on unknown keys.
        return self.choices[value]
56+
57+
58+
class EpochType(TypeDecorator):
    """Persist a date as an integer count of days since the Unix epoch."""
    impl = Integer

    # 1970-01-01 as a datetime.date instance.
    epoch = dt(1970, 1, 1).date()

    def process_bind_param(self, value, dialect):
        # date -> whole days elapsed since the epoch.
        delta = value - self.epoch
        return delta.days

    def process_result_value(self, value, dialect):
        # stored day count -> date.
        return self.epoch + timedelta(days=value)
68+
69+
70+
class GzippedDictField(TypeDecorator):
    """
    Text column holding a base64-encoded, zlib-compressed pickle.

    Slightly different from a JSONField in the sense that the default
    value is a dictionary.
    """
    impl = Text

    def process_result_value(self, value, dialect):
        """Decode the stored blob back into an object; fall back to {}."""
        # NOTE(review): basestring is Python 2 only -- confirm runtime.
        if isinstance(value, basestring) and value:
            try:
                # zlib.decompress replaces the py2-only 'zlib' str codec;
                # identical output, portable API.
                value = pickle.loads(zlib.decompress(base64.b64decode(value)))
            except Exception as exc:
                logging.exception(exc)
                return {}
        elif not value:
            return {}
        return value

    def process_bind_param(self, value, dialect):
        """Encode `value` as base64(zlib(pickle)); None stays None."""
        if value is None:
            return
        return base64.b64encode(zlib.compress(pickle.dumps(value)))

pyhackers/common/http_utils.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import logging
2+
import requests
3+
from requests import Timeout
4+
from urlparse import urlparse
5+
6+
7+
# Content-type constants used to classify HTTP responses.
html_ctype = 'text/html'
xml_ctype = 'text/xml'
xml_ctype2 = 'application/xml'

rss_ctype = 'application/rss+xml'
atom_ctype = 'application/atom+xml'
rdf_ctype = 'application/rdf+xml'

# Any of these content types is treated as a syndication feed.
feed_types = [xml_ctype, rss_ctype, atom_ctype, rdf_ctype, xml_ctype2]
16+
17+
18+
class HttpFetcher():
    """Thin wrapper around a keep-alive `requests` session."""

    def __init__(self):
        self.session = requests.session()
        # NOTE(review): session.config was removed in requests 1.0; this
        # assumes the pre-1.0 API -- confirm the pinned requests version.
        self.session.config['keep_alive'] = True

    # @profile
    def download(self, url):
        """GET `url` and return the decoded response body."""
        r = self.session.get(url)

        return r.text

    def download_json(self, url):
        """GET `url` and return r.json (a property in requests < 1.0)."""
        r = self.session.get(url)

        return r.json

    def head(self, url, extended=True):
        """
        Issue a HEAD request, following redirects.

        :param extended: when True return an HttpResult; otherwise only
            the final (post-redirect) URL.
        :return: HttpResult or URL string; None on timeout/any error.
        """
        try:
            r = self.session.head(url, allow_redirects=True)
            # Debug prints replaced with logging so output is controllable.
            logging.debug("%s %s", r.url, r.headers)

            if extended:
                return HttpResult(r.url, r.headers, r.status_code, True)
            else:
                return r.url
        except Timeout:
            # Original passed the exception as a %-arg with no placeholder,
            # which breaks the log formatter; use exc_info instead.
            logging.error("HEAD request timed out: %s", url, exc_info=True)
            return None
        except Exception:
            logging.error("Finding Actual URL General Exception:", exc_info=True)
            return None
50+
51+
52+
class HttpResult:
    """Lightweight value object describing a completed HTTP request."""

    def __init__(self, url='', headers=None, status=None, success=False):
        self.url = url
        self.headers = headers
        self.status_code = status
        self.success = success
        self.__content_type = None
        self.__set_content_type()

    def __set_content_type(self):
        """
        Parse http headers to find out content type
        :return: Nothing
        """
        if self.headers is None:
            return

        content_type = self.headers.get("content-type", None)

        if content_type is None:
            return

        # Always strip parameters: "text/html; charset=utf-8; ..." ->
        # "text/html".  The previous code only handled exactly one ";"
        # parameter and stored the raw header otherwise.
        self.__content_type = content_type.split(";")[0].strip()

    @property
    def content_type(self):
        """
        Return the content-type from header info
        :return: String content-type from header if exists otherwise None
        """
        return self.__content_type

    @property
    def is_html(self):
        """
        Check if the content-type is text/html
        :return: True/False
        """
        return self.__content_type == html_ctype

    @property
    def is_rss(self):
        """
        Check if the HttpResult is a Feed type ( text/xml, application/rss+xml, application/atom+xml )
        :return: True/False
        """
        return self.__content_type in feed_types

    def to_dict(self):
        """Plain-dict view of this result (for serialization/logging)."""
        return {
            "url": self.url,
            "headers": self.headers,
            "status_code": self.status_code,
            "success": self.success,
            "content_type": self.content_type,
            "is_html": self.is_html,
            "is_rss": self.is_rss
        }

    # NOTE(review): __reduce__ belongs to the pickle protocol and must
    # return a tuple/string; returning a dict breaks pickling.  Kept as
    # an alias of to_dict for any caller invoking it directly.
    __reduce__ = to_dict
115+
116+
117+
# Module-level shared fetcher so callers reuse one keep-alive session.
http_fetcher = HttpFetcher()
118+
119+
120+
def is_same_domain(url1, url2):
    """
    Returns true if the two urls should be treated as if they're from the same
    domain (trusted).
    """
    netloc_a = urlparse(url1).netloc
    netloc_b = urlparse(url2).netloc
    return netloc_a == netloc_b

pyhackers/common/indexer.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import logging
2+
from pyes import *
3+
from pyes.exceptions import ElasticSearchException
4+
from pyhackers.sentry import sentry_client
5+
6+
# Shared ElasticSearch connection (pyes); endpoint is hard-coded.
conn = ES('http://es.pythonhackers.com')
7+
8+
9+
def index_data(data, index='sweet', doc='post', id=None):
    """
    Index a document into ElasticSearch.

    :param data: document body to index
    :param index: target index name
    :param doc: document type
    :param id: explicit document id; ES assigns one when None
    :return: (document_id, raw_response) on success; False when an
        ElasticSearchException was raised (reported to Sentry).
    """
    # Lazy %-args avoid building the message when the level is disabled;
    # logging.warning instead of the deprecated logging.warn alias.
    logging.warning("Indexing data %s", id if id is not None else "")
    try:
        res = conn.index(data, index, doc, id=id)
    except ElasticSearchException:
        sentry_client.captureException()
        return False

    # Prefer the id the server reports over the one we were given;
    # a distinct local avoids re-purposing the builtin-shadowing `id`.
    doc_id = None
    if res is not None:
        doc_id = res.get("_id", None)

    return doc_id, res

pyhackers/model/feed.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from sqlalchemy.dialects import postgresql
2+
from sqlalchemy.orm import relationship
3+
from sqlalchemy import Boolean, Column, Integer, String, DateTime, ForeignKey, Text
4+
from pyhackers.common.stringutils import safe_str
5+
from pyhackers.common.dbfield import Choice
6+
from pyhackers.db import DB as db
7+
8+
9+
# (db_key, display_value) pairs consumed by the Choice column type.
FeedStatuses = (
    ('done', 'DONE'),
    ('error', 'ERROR'),
    ('scheduled', 'SCHEDULED'),
    ('moved', 'MOVED'),  # feed URL moved/redirected
    ('nf', "NF"),  # NOTE(review): presumably "not found" -- confirm
    ('?', '?'),
)
17+
18+
19+
class Feed(db.Model):
    """A syndication feed (RSS/Atom) source tracked by the fetcher."""

    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(length=100))
    slug = Column(Text())

    description = Column(Text())
    href = Column(Text)
    link = Column(Text)
    rss = Column(Text)
    rss_hash = Column(Text, index=True)  # indexed for de-duplication lookups
    lang = Column(String(length=3))
    etag = Column(Text)  # HTTP ETag from the last fetch
    updated = Column(DateTime())
    published = Column(DateTime())
    version = Column(Text)
    author = Column(Text)

    status_code = Column(Integer())  # last HTTP status code seen
    status = Column(Choice(FeedStatuses))

    last_post = Column(DateTime())
    last_check = Column(DateTime())
    next_check = Column(DateTime())
    active = Column(Boolean())
    top = Column(Boolean())
    news = Column(Boolean())

    logo = Column(Text())

    # Post is defined later in this module: the string form lets
    # SQLAlchemy resolve it lazily, whereas relationship(Post) raises
    # NameError at class-definition time.
    posts = relationship("Post")
51+
52+
53+
class FeedHistory(db.Model):
    """Per-fetch history record for a feed."""

    __tablename__ = 'feed_history'

    id = Column(Integer, primary_key=True)
    timestamp = Column(DateTime())  # when the fetch happened
    http_status_code = Column(Integer())  # HTTP Status Code [20x,30x,40x,50x]
    post_count = Column(Integer())  # number of posts seen in this fetch
    etag = Column(Text)  # ETag returned by the server, if any
    # NOTE(review): plain Integer without ForeignKey('feed.id') -- confirm intended
    feed_id = Column(Integer())
62+
63+
64+
class Post(db.Model):
    """A single entry fetched from a Feed."""

    __tablename__ = "post"

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(Text())
    author = Column(Text())
    href = Column(Text())
    content_html = Column(Text())
    original_link = Column(Text())
    title_hash = Column(Text, index=True)
    link_hash = Column(Text, index=True)
    post_id = Column(Text())  # most of the time websites publish a URL
    post_id_hash = Column(Text, index=True)
    media = Column(postgresql.ARRAY(String))
    lang = Column(Text)
    tags = Column(postgresql.ARRAY(String))
    published_at = Column(DateTime)
    feed_id = Column(Integer, ForeignKey('feed.id'))

    stats_fb = Column(Integer)
    stats_tw = Column(Integer)

    fetched = Column(Boolean(), default=False)
    indexed = Column(Boolean(), default=False)
    trending = Column(Boolean(), default=False)
    hot = Column(Boolean(), default=False)

    def __repr__(self):
        fields = (safe_str(self.title), self.author, self.href)
        return "<Post: %s (by %s) %s>" % fields

    # The original_* accessors fall back to the stored column value
    # until a transient override has been assigned.

    @property
    def original_title(self):
        return getattr(self, '_original_title', self.title)

    @original_title.setter
    def original_title(self, value):
        self._original_title = value

    @property
    def original_author(self):
        return getattr(self, '_original_author', self.author)

    @original_author.setter
    def original_author(self, value):
        self._original_author = value

0 commit comments

Comments
 (0)