Skip to content

Commit 83fb48a

Browse files
committed
Article download working. Exclude unnecessary BeautifulSoup parsing.
1 parent 610baf1 commit 83fb48a

File tree

3 files changed

+14
-9
lines changed

3 files changed

+14
-9
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ lib64
2424
venv
2525
.idea
2626
.tweets
27+
logs
2728
# Installer logs
2829
pip-log.txt
2930

pyhackers/common/stringutils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# coding=utf-8
22
import htmlentitydefs
3-
import anyjson
3+
import json
44
import unicodedata as ud
55

66

@@ -37,7 +37,7 @@ def safe_filename(filename):
3737

3838

3939
def safe_obj_str(my_obj):
40-
return anyjson.serialize(my_obj)
40+
return json.dumps(my_obj)
4141

4242

4343
def max_length_field(instance, name, length):

pyhackers/worker/article.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ def extract_rss_results(feed, url=''):
400400
elif isinstance(content, basestring):
401401
content_html = content
402402
else:
403-
error_reporter.captureMessage("Content has weird setup")
403+
logging.warn("Content has weird setup")
404404
content_html = ''
405405

406406
if hasattr(entry, "description") and not len(content_html):
@@ -413,23 +413,27 @@ def extract_rss_results(feed, url=''):
413413
elif isinstance(content_html, basestring):
414414
pass
415415
else:
416-
print "What the fuck is this type? %s " % type(content_html)
416+
logging.warn("What the fuck is this type? %s " % type(content_html))
417+
417418

418419
bsoup = BeautifulSoup(content_html)
420+
419421
html_text = content_html
420422
if bsoup is not None and len(bsoup.contents) > 0:
421-
html_text = "".join([c.__unicode__() for c in bsoup.contents])
423+
html_text = "".join([unicode(c) for c in bsoup.contents])
422424

423425
feed_entry['content'] = html_text
424426

425427
for attr in entry_attributes:
426428
if hasattr(entry, attr):
427429
val = entry[attr]
428430
if val is not None and isinstance(val, basestring):
429-
bsoup2 = BeautifulSoup(val)
430-
val = "".join([c.__unicode__() for c in bsoup2.contents]) if bsoup2 is not None and len(
431-
bsoup2.contents) > 0 else val
432-
val = val
431+
logging.warn(u"{} => {}".format(attr, val))
432+
433+
#bsoup2 = BeautifulSoup(val)
434+
#val = "".join([unicode(c) for c in bsoup2.contents]) if bsoup2 is not None and len(
435+
# bsoup2.contents) > 0 else val
436+
#val = val
433437

434438
feed_entry[attr] = val
435439

0 commit comments

Comments
 (0)