Skip to content

Commit bb50136

Browse files
committed
add Feed models, refactor the feed fetcher, add ElasticSearch dep
1 parent 83fb48a commit bb50136

File tree

6 files changed

+364
-33
lines changed

6 files changed

+364
-33
lines changed

pyhackers/common/dbfield.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
import base64
import logging
import pickle
import uuid
import zlib
from datetime import datetime as dt, timedelta

from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.types import TypeDecorator, CHAR, String, Integer, Text
8+
9+
10+
class GUID(TypeDecorator):
    """
    Platform-independent GUID type.

    Uses Postgresql's UUID type, otherwise uses
    CHAR(32), storing as stringified hex values.
    """
    impl = CHAR

    def load_dialect_impl(self, dialect):
        # Native UUID column on PostgreSQL, portable CHAR(32) elsewhere.
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        """Coerce a uuid.UUID (or UUID string) into the storage format."""
        if value is None:
            return value
        elif dialect.name == 'postgresql':
            return str(value)
        else:
            if not isinstance(value, uuid.UUID):
                # Accept any string uuid.UUID can parse; .int is the
                # 128-bit integer value, padded to 32 hex digits.
                return "%.32x" % uuid.UUID(value).int
            else:
                # Explicit .int instead of relying on implicit UUID->int
                # coercion in %-formatting (Python 2 leniency only).
                return "%.32x" % value.int

    def process_result_value(self, value, dialect):
        """Turn the stored hex string back into a uuid.UUID."""
        if value is None:
            return value
        else:
            return uuid.UUID(value)
40+
41+
42+
class Choice(TypeDecorator):
    """
    Store a display value under its short database key.

    ``choices`` maps db-key -> display-value; binding reverse-looks-up
    the key for a display value, loading maps the key back.
    """
    impl = String

    def __init__(self, choices=None, **kw):
        # Avoid a shared mutable default; copy whatever mapping we get.
        if choices is None:
            choices = {}
        self.choices = dict(choices)
        super(Choice, self).__init__(**kw)

    def process_bind_param(self, value, dialect):
        # Reverse lookup: db key whose display value matches.  Raises
        # IndexError when `value` is not a known choice (unchanged).
        # .items() instead of py2-only .iteritems() works on 2 and 3.
        return [k for k, v in self.choices.items() if v == value][0]

    def process_result_value(self, value, dialect):
        # Stored key -> display value; KeyError on unknown keys.
        return self.choices[value]
56+
57+
58+
class EpochType(TypeDecorator):
    """Persist a date as an integer count of days since the Unix epoch."""
    impl = Integer

    # 1970-01-01 as a datetime.date instance.
    epoch = dt(1970, 1, 1).date()

    def process_bind_param(self, value, dialect):
        # date -> whole days elapsed since the epoch.
        delta = value - self.epoch
        return delta.days

    def process_result_value(self, value, dialect):
        # stored day count -> date.
        return self.epoch + timedelta(days=value)
68+
69+
70+
class GzippedDictField(TypeDecorator):
    """
    Text column holding a base64-encoded, zlib-compressed pickle.

    Slightly different from a JSONField in the sense that the default
    value is a dictionary.
    """
    impl = Text

    def process_result_value(self, value, dialect):
        """Decode the stored blob back into an object; fall back to {}."""
        # NOTE(review): basestring is Python 2 only -- confirm runtime.
        if isinstance(value, basestring) and value:
            try:
                # zlib.decompress replaces the py2-only 'zlib' str codec;
                # identical output, portable API.
                value = pickle.loads(zlib.decompress(base64.b64decode(value)))
            except Exception as exc:
                logging.exception(exc)
                return {}
        elif not value:
            return {}
        return value

    def process_bind_param(self, value, dialect):
        """Encode `value` as base64(zlib(pickle)); None stays None."""
        if value is None:
            return
        return base64.b64encode(zlib.compress(pickle.dumps(value)))

pyhackers/common/http_utils.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import logging
2+
import requests
3+
from requests import Timeout
4+
from urlparse import urlparse
5+
6+
7+
# Content-type constants used to classify HTTP responses.
html_ctype = 'text/html'
xml_ctype = 'text/xml'
xml_ctype2 = 'application/xml'

rss_ctype = 'application/rss+xml'
atom_ctype = 'application/atom+xml'
rdf_ctype = 'application/rdf+xml'

# Any of these content types is treated as a syndication feed.
feed_types = [xml_ctype, rss_ctype, atom_ctype, rdf_ctype, xml_ctype2]
16+
17+
18+
class HttpFetcher():
    """Thin wrapper around a keep-alive `requests` session."""

    def __init__(self):
        self.session = requests.session()
        # NOTE(review): session.config was removed in requests 1.0; this
        # assumes the pre-1.0 API -- confirm the pinned requests version.
        self.session.config['keep_alive'] = True

    # @profile
    def download(self, url):
        """GET `url` and return the decoded response body."""
        r = self.session.get(url)

        return r.text

    def download_json(self, url):
        """GET `url` and return r.json (a property in requests < 1.0)."""
        r = self.session.get(url)

        return r.json

    def head(self, url, extended=True):
        """
        Issue a HEAD request, following redirects.

        :param extended: when True return an HttpResult; otherwise only
            the final (post-redirect) URL.
        :return: HttpResult or URL string; None on timeout/any error.
        """
        try:
            r = self.session.head(url, allow_redirects=True)
            # Debug prints replaced with logging so output is controllable.
            logging.debug("%s %s", r.url, r.headers)

            if extended:
                return HttpResult(r.url, r.headers, r.status_code, True)
            else:
                return r.url
        except Timeout:
            # Original passed the exception as a %-arg with no placeholder,
            # which breaks the log formatter; use exc_info instead.
            logging.error("HEAD request timed out: %s", url, exc_info=True)
            return None
        except Exception:
            logging.error("Finding Actual URL General Exception:", exc_info=True)
            return None
50+
51+
52+
class HttpResult:
    """Lightweight value object describing a completed HTTP request."""

    def __init__(self, url='', headers=None, status=None, success=False):
        self.url = url
        self.headers = headers
        self.status_code = status
        self.success = success
        self.__content_type = None
        self.__set_content_type()

    def __set_content_type(self):
        """
        Parse http headers to find out content type
        :return: Nothing
        """
        if self.headers is None:
            return

        content_type = self.headers.get("content-type", None)

        if content_type is None:
            return

        # Always strip parameters: "text/html; charset=utf-8; ..." ->
        # "text/html".  The previous code only handled exactly one ";"
        # parameter and stored the raw header otherwise.
        self.__content_type = content_type.split(";")[0].strip()

    @property
    def content_type(self):
        """
        Return the content-type from header info
        :return: String content-type from header if exists otherwise None
        """
        return self.__content_type

    @property
    def is_html(self):
        """
        Check if the content-type is text/html
        :return: True/False
        """
        return self.__content_type == html_ctype

    @property
    def is_rss(self):
        """
        Check if the HttpResult is a Feed type ( text/xml, application/rss+xml, application/atom+xml )
        :return: True/False
        """
        return self.__content_type in feed_types

    def to_dict(self):
        """Plain-dict view of this result (for serialization/logging)."""
        return {
            "url": self.url,
            "headers": self.headers,
            "status_code": self.status_code,
            "success": self.success,
            "content_type": self.content_type,
            "is_html": self.is_html,
            "is_rss": self.is_rss
        }

    # NOTE(review): __reduce__ belongs to the pickle protocol and must
    # return a tuple/string; returning a dict breaks pickling.  Kept as
    # an alias of to_dict for any caller invoking it directly.
    __reduce__ = to_dict
115+
116+
117+
# Module-level shared fetcher so callers reuse one keep-alive session.
http_fetcher = HttpFetcher()
118+
119+
120+
def is_same_domain(url1, url2):
    """
    Returns true if the two urls should be treated as if they're from the same
    domain (trusted).
    """
    netloc_a = urlparse(url1).netloc
    netloc_b = urlparse(url2).netloc
    return netloc_a == netloc_b

pyhackers/common/indexer.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import logging
2+
from pyes import *
3+
from pyes.exceptions import ElasticSearchException
4+
from pyhackers.sentry import sentry_client
5+
6+
# Shared ElasticSearch connection (pyes); endpoint is hard-coded.
conn = ES('http://es.pythonhackers.com')
7+
8+
9+
def index_data(data, index='sweet', doc='post', id=None):
    """
    Index a document into ElasticSearch.

    :param data: document body to index
    :param index: target index name
    :param doc: document type
    :param id: explicit document id; ES assigns one when None
    :return: (document_id, raw_response) on success; False when an
        ElasticSearchException was raised (reported to Sentry).
    """
    # Lazy %-args avoid building the message when the level is disabled;
    # logging.warning instead of the deprecated logging.warn alias.
    logging.warning("Indexing data %s", id if id is not None else "")
    try:
        res = conn.index(data, index, doc, id=id)
    except ElasticSearchException:
        sentry_client.captureException()
        return False

    # Prefer the id the server reports over the one we were given;
    # a distinct local avoids re-purposing the builtin-shadowing `id`.
    doc_id = None
    if res is not None:
        doc_id = res.get("_id", None)

    return doc_id, res

pyhackers/model/feed.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from sqlalchemy.dialects import postgresql
2+
from sqlalchemy.orm import relationship
3+
from sqlalchemy import Boolean, Column, Integer, String, DateTime, ForeignKey, Text
4+
from pyhackers.common.stringutils import safe_str
5+
from pyhackers.common.dbfield import Choice
6+
from pyhackers.db import DB as db
7+
8+
9+
# (db_key, display_value) pairs consumed by the Choice column type.
FeedStatuses = (
    ('done', 'DONE'),
    ('error', 'ERROR'),
    ('scheduled', 'SCHEDULED'),
    ('moved', 'MOVED'),  # feed URL moved/redirected
    ('nf', "NF"),  # NOTE(review): presumably "not found" -- confirm
    ('?', '?'),
)
17+
18+
19+
class Feed(db.Model):
    """A syndication feed (RSS/Atom) source tracked by the fetcher."""

    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(length=100))
    slug = Column(Text())

    description = Column(Text())
    href = Column(Text)
    link = Column(Text)
    rss = Column(Text)
    rss_hash = Column(Text, index=True)  # indexed for de-duplication lookups
    lang = Column(String(length=3))
    etag = Column(Text)  # HTTP ETag from the last fetch
    updated = Column(DateTime())
    published = Column(DateTime())
    version = Column(Text)
    author = Column(Text)

    status_code = Column(Integer())  # last HTTP status code seen
    status = Column(Choice(FeedStatuses))

    last_post = Column(DateTime())
    last_check = Column(DateTime())
    next_check = Column(DateTime())
    active = Column(Boolean())
    top = Column(Boolean())
    news = Column(Boolean())

    logo = Column(Text())

    # Post is defined later in this module: the string form lets
    # SQLAlchemy resolve it lazily, whereas relationship(Post) raises
    # NameError at class-definition time.
    posts = relationship("Post")
51+
52+
53+
class FeedHistory(db.Model):
    """Per-fetch history record for a feed."""

    __tablename__ = 'feed_history'

    id = Column(Integer, primary_key=True)
    timestamp = Column(DateTime())  # when the fetch happened
    http_status_code = Column(Integer())  # HTTP Status Code [20x,30x,40x,50x]
    post_count = Column(Integer())  # number of posts seen in this fetch
    etag = Column(Text)  # ETag returned by the server, if any
    # NOTE(review): plain Integer without ForeignKey('feed.id') -- confirm intended
    feed_id = Column(Integer())
62+
63+
64+
class Post(db.Model):
    """A single entry fetched from a Feed."""

    __tablename__ = "post"

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(Text())
    author = Column(Text())
    href = Column(Text())
    content_html = Column(Text())
    original_link = Column(Text())
    title_hash = Column(Text, index=True)
    link_hash = Column(Text, index=True)
    post_id = Column(Text())  # most of the time websites publish a URL
    post_id_hash = Column(Text, index=True)
    media = Column(postgresql.ARRAY(String))
    lang = Column(Text)
    tags = Column(postgresql.ARRAY(String))
    published_at = Column(DateTime)
    feed_id = Column(Integer, ForeignKey('feed.id'))

    stats_fb = Column(Integer)
    stats_tw = Column(Integer)

    fetched = Column(Boolean(), default=False)
    indexed = Column(Boolean(), default=False)
    trending = Column(Boolean(), default=False)
    hot = Column(Boolean(), default=False)

    def __repr__(self):
        fields = (safe_str(self.title), self.author, self.href)
        return "<Post: %s (by %s) %s>" % fields

    # The original_* accessors fall back to the stored column value
    # until a transient override has been assigned.

    @property
    def original_title(self):
        return getattr(self, '_original_title', self.title)

    @original_title.setter
    def original_title(self, value):
        self._original_title = value

    @property
    def original_author(self):
        return getattr(self, '_original_author', self.author)

    @original_author.setter
    def original_author(self, value):
        self._original_author = value

0 commit comments

Comments
 (0)