Whole: enable JARR to retrieve and parse content
The article itself won't be changed; only its vector will be improved.
The cluster will display the fetched content.
jaesivsm committed May 19, 2020
1 parent a921175 commit 6fe6078
Showing 18 changed files with 255 additions and 81 deletions.
5 changes: 3 additions & 2 deletions Pipfile
@@ -33,12 +33,13 @@ opml = "==0.5"
 prometheus-client = "==0.7.1"
 prometheus-distributed-client = "==1.2.1"
 "psycopg2" = "==2.8.5"
-python-dateutil = "==2.6.1"
+python-dateutil = ">=2.6.1"
 rauth = "==0.7.3"
 redis = "==2.10.6"
-requests = "==2.21.0"
+requests = ">=2.21.0"
 SQLAlchemy = "==1.3.3"
 the-conf = "==0.0.15"
+trafilatura = "==0.4.1"
 flask-cors = "==3.0.8"
 
 [requires]
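The trafilatura dependency added above is the library that does the actual retrieval and parsing of article content. For reference, a minimal sketch of how it can fetch a page and extract its main text (the fetch_full_content() wrapper and the URL are hypothetical; only fetch_url() and extract() come from trafilatura's own API):

# Minimal sketch of fetching and extracting an article's main content with
# trafilatura; the fetch_full_content() wrapper and URL are illustrative only.
import trafilatura

def fetch_full_content(url):
    downloaded = trafilatura.fetch_url(url)  # raw HTML, or None on failure
    if downloaded is None:
        return None
    # extract() strips boilerplate and returns the main text, or None
    return trafilatura.extract(downloaded)

content = fetch_full_content("https://example.com/some-article")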
85 changes: 60 additions & 25 deletions Pipfile.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions jarr/api/cluster.py
@@ -26,6 +26,10 @@
     'date': fields.DateTime()})
 content_model = cluster_ns.model('ComplexContent', {
     'type': fields.String(required=True),
+    'content': fields.String(),
+    'comments': fields.String(),
+    'link': fields.String(),
+    'tags': fields.List(fields.String),
     'alt': fields.String(),
     'src': fields.String(),
     'player': fields.String(),
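The four new fields let the cluster endpoint expose fetched content alongside the existing media attributes. A hypothetical payload for such an entry (field names come from the flask-restx model above; the values and the concrete 'type' string are invented for illustration):

# Hypothetical ComplexContent entry for a cluster whose article content was
# fetched; only the field names are taken from the model above.
fetched_content = {
    "type": "fetched",  # assumed type label, not confirmed by this diff
    "link": "https://example.com/full-article",
    "content": "<p>Full article body retrieved and parsed from the source site.</p>",
    "comments": "https://example.com/full-article#comments",
    "tags": ["python", "rss"],
}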
1 change: 1 addition & 0 deletions jarr/api/feed.py
@@ -51,6 +51,7 @@
     store_missing=False)
 # clustering options
 set_clustering_options("feed", model, parser)
+set_model_n_parser(model, parser, 'truncated_content', bool, nullable=False)
 set_model_n_parser(model, parser, 'feed_type', FeedType, nullable=False)
 set_model_n_parser(model, parser, 'category_id', int, nullable=False)
 set_model_n_parser(model, parser, 'site_link', str, nullable=False)
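The new truncated_content flag marks feeds whose entries only carry an excerpt, so JARR knows their content should be fetched in full. A hedged example of turning it on through the API (the /feed/<id> route and the bearer-token auth are assumptions; only the parameter name comes from the parser above):

# Assumed usage: flag feed 42 as serving truncated content so its articles
# get fetched and parsed in full. Route and auth scheme are assumptions.
import requests

resp = requests.put(
    "https://jarr.example.com/feed/42",
    json={"truncated_content": True},
    headers={"Authorization": "Bearer <token>"},
)
resp.raise_for_status()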
19 changes: 4 additions & 15 deletions jarr/controllers/article.py
@@ -1,15 +1,13 @@
 import logging
-import re
 from datetime import timedelta
 from hashlib import sha1
 
-from bs4 import BeautifulSoup
-from sqlalchemy import cast, func
-from sqlalchemy.dialects.postgresql import TSVECTOR
+from sqlalchemy import func
 from werkzeug.exceptions import Forbidden, Unauthorized
 
 from jarr.bootstrap import session
 from jarr.controllers import CategoryController, FeedController
+from jarr.lib.clustering_af.postgres_casting import to_vector
 from jarr.lib.utils import utc_now
 from jarr.models import Article, User
 
@@ -46,10 +44,6 @@
 class ArticleController(AbstractController):
     _db_cls = Article
 
-    @staticmethod
-    def lang_to_postgreq(lang):
-        return LANG_TO_PSQL_MAPPING.get(lang[:2].lower(), 'simple')
-
     def challenge(self, ids):
         """Will return each id that wasn't found in the database."""
         for id_ in ids:
@@ -89,13 +83,8 @@ def create(self, **attrs):
                 feed.user_id == attrs['user_id'] or self.user_id is None):
             raise Forbidden("no right on feed %r" % feed.id)
         attrs['user_id'], attrs['category_id'] = feed.user_id, feed.category_id
-        vector = (attrs.get('title', ' ')
-                  + ' ' + ' '.join(attrs.get('tags', []))
-                  + ' ' + attrs.get('content', ' '))
-        vector = BeautifulSoup(vector, 'html.parser').text
-        vector = re.sub(r'\W', ' ', vector).strip()
-        if vector:
-            attrs['vector'] = cast(vector, TSVECTOR)
+        attrs['vector'] = to_vector(attrs.get('title'), attrs.get('tags'),
+                                    attrs.get('content'))
         attrs['link_hash'] = sha1(attrs['link'].encode('utf8')).digest()
         article = super().create(**attrs)
         return article
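The inline vector construction deleted above is presumably what moved into jarr.lib.clustering_af.postgres_casting.to_vector. A sketch of that helper reconstructed from the deleted lines (the real implementation may differ, for instance in how it handles the PostgreSQL text-search language):

# Sketch of to_vector() reconstructed from the inline code removed above;
# the actual helper in jarr.lib.clustering_af.postgres_casting may differ.
import re
from bs4 import BeautifulSoup
from sqlalchemy import cast
from sqlalchemy.dialects.postgresql import TSVECTOR

def to_vector(title, tags, content):
    raw = ' '.join([title or ' ', ' '.join(tags or []), content or ' '])
    text = BeautifulSoup(raw, 'html.parser').text   # strip HTML markup
    text = re.sub(r'\W', ' ', text).strip()         # keep word characters only
    if text:
        return cast(text, TSVECTOR)
    return None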
