Whole: enable JARR to retrieve and parse content
The article itself won't be changed; only its vector will be improved.
The cluster will display the fetched content.
jaesivsm committed May 19, 2020
1 parent a921175 commit 6fe6078
Showing 18 changed files with 255 additions and 81 deletions.
5 changes: 3 additions & 2 deletions Pipfile
@@ -33,12 +33,13 @@ opml = "==0.5"
 prometheus-client = "==0.7.1"
 prometheus-distributed-client = "==1.2.1"
 "psycopg2" = "==2.8.5"
-python-dateutil = "==2.6.1"
+python-dateutil = ">=2.6.1"
 rauth = "==0.7.3"
 redis = "==2.10.6"
-requests = "==2.21.0"
+requests = ">=2.21.0"
 SQLAlchemy = "==1.3.3"
 the-conf = "==0.0.15"
+trafilatura = "==0.4.1"
 flask-cors = "==3.0.8"
 
 [requires]
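The trafilatura dependency added above is the library that does the actual retrieval and parsing of article content. For reference, a minimal sketch of how it can fetch a page and extract its main text (the fetch_full_content() wrapper and the URL are hypothetical; only fetch_url() and extract() come from trafilatura's own API):

# Minimal sketch of fetching and extracting an article's main content with
# trafilatura; the fetch_full_content() wrapper and URL are illustrative only.
import trafilatura

def fetch_full_content(url):
    downloaded = trafilatura.fetch_url(url)  # raw HTML, or None on failure
    if downloaded is None:
        return None
    # extract() strips boilerplate and returns the main text, or None
    return trafilatura.extract(downloaded)

content = fetch_full_content("https://example.com/some-article")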
85 changes: 60 additions & 25 deletions Pipfile.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions jarr/api/cluster.py
@@ -26,6 +26,10 @@
     'date': fields.DateTime()})
 content_model = cluster_ns.model('ComplexContent', {
     'type': fields.String(required=True),
+    'content': fields.String(),
+    'comments': fields.String(),
+    'link': fields.String(),
+    'tags': fields.List(fields.String),
     'alt': fields.String(),
     'src': fields.String(),
     'player': fields.String(),
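The four new fields let the cluster endpoint expose fetched content alongside the existing media attributes. A hypothetical payload for such an entry (field names come from the flask-restx model above; the values and the concrete 'type' string are invented for illustration):

# Hypothetical ComplexContent entry for a cluster whose article content was
# fetched; only the field names are taken from the model above.
fetched_content = {
    "type": "fetched",  # assumed type label, not confirmed by this diff
    "link": "https://example.com/full-article",
    "content": "<p>Full article body retrieved and parsed from the source site.</p>",
    "comments": "https://example.com/full-article#comments",
    "tags": ["python", "rss"],
}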
1 change: 1 addition & 0 deletions jarr/api/feed.py
@@ -51,6 +51,7 @@
     store_missing=False)
 # clustering options
 set_clustering_options("feed", model, parser)
+set_model_n_parser(model, parser, 'truncated_content', bool, nullable=False)
 set_model_n_parser(model, parser, 'feed_type', FeedType, nullable=False)
 set_model_n_parser(model, parser, 'category_id', int, nullable=False)
 set_model_n_parser(model, parser, 'site_link', str, nullable=False)
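The new truncated_content flag marks feeds whose entries only carry an excerpt, so JARR knows their content should be fetched in full. A hedged example of turning it on through the API (the /feed/<id> route and the bearer-token auth are assumptions; only the parameter name comes from the parser above):

# Assumed usage: flag feed 42 as serving truncated content so its articles
# get fetched and parsed in full. Route and auth scheme are assumptions.
import requests

resp = requests.put(
    "https://jarr.example.com/feed/42",
    json={"truncated_content": True},
    headers={"Authorization": "Bearer <token>"},
)
resp.raise_for_status()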
19 changes: 4 additions & 15 deletions jarr/controllers/article.py
@@ -1,15 +1,13 @@
 import logging
-import re
 from datetime import timedelta
 from hashlib import sha1
 
-from bs4 import BeautifulSoup
-from sqlalchemy import cast, func
-from sqlalchemy.dialects.postgresql import TSVECTOR
+from sqlalchemy import func
 from werkzeug.exceptions import Forbidden, Unauthorized
 
 from jarr.bootstrap import session
 from jarr.controllers import CategoryController, FeedController
+from jarr.lib.clustering_af.postgres_casting import to_vector
 from jarr.lib.utils import utc_now
 from jarr.models import Article, User
 
@@ -46,10 +44,6 @@
 class ArticleController(AbstractController):
     _db_cls = Article
 
-    @staticmethod
-    def lang_to_postgreq(lang):
-        return LANG_TO_PSQL_MAPPING.get(lang[:2].lower(), 'simple')
-
     def challenge(self, ids):
         """Will return each id that wasn't found in the database."""
         for id_ in ids:
@@ -89,13 +83,8 @@ def create(self, **attrs):
                 feed.user_id == attrs['user_id'] or self.user_id is None):
             raise Forbidden("no right on feed %r" % feed.id)
         attrs['user_id'], attrs['category_id'] = feed.user_id, feed.category_id
-        vector = (attrs.get('title', ' ')
-                  + ' ' + ' '.join(attrs.get('tags', []))
-                  + ' ' + attrs.get('content', ' '))
-        vector = BeautifulSoup(vector, 'html.parser').text
-        vector = re.sub(r'\W', ' ', vector).strip()
-        if vector:
-            attrs['vector'] = cast(vector, TSVECTOR)
+        attrs['vector'] = to_vector(attrs.get('title'), attrs.get('tags'),
+                                    attrs.get('content'))
         attrs['link_hash'] = sha1(attrs['link'].encode('utf8')).digest()
         article = super().create(**attrs)
         return article
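The inline vector construction deleted above is presumably what moved into jarr.lib.clustering_af.postgres_casting.to_vector. A sketch of that helper reconstructed from the deleted lines (the real implementation may differ, for instance in how it handles the PostgreSQL text-search language):

# Sketch of to_vector() reconstructed from the inline code removed above;
# the actual helper in jarr.lib.clustering_af.postgres_casting may differ.
import re
from bs4 import BeautifulSoup
from sqlalchemy import cast
from sqlalchemy.dialects.postgresql import TSVECTOR

def to_vector(title, tags, content):
    raw = ' '.join([title or ' ', ' '.join(tags or []), content or ' '])
    text = BeautifulSoup(raw, 'html.parser').text   # strip HTML markup
    text = re.sub(r'\W', ' ', text).strip()         # keep word characters only
    if text:
        return cast(text, TSVECTOR)
    return None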
