Commit: SSRF mitigation

jaesivsm committed Nov 29, 2023
1 parent 307bbfb commit 0951916
Showing 8 changed files with 441 additions and 198 deletions.
1 change: 1 addition & 0 deletions Pipfile
@@ -49,6 +49,7 @@ the-conf = "==0.*"
 flask-cors = "==3.*"
 psycopg2-binary = "==2.*"
 werkzeug = "==2.1.2" # fixing for bug with last flask version
+advocate = "1.*"
 
 [requires]
 python_version = "3.10"
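
The functional change in this file is the advocate pin. Advocate is a wrapper around requests that resolves a URL's destination and refuses to connect to loopback, link-local, and private address ranges, which is the SSRF mitigation named in the commit title. A minimal sketch of the behavior this dependency enables, with illustrative URLs that are not part of the commit:

# Sketch: what advocate adds over plain requests (URLs illustrative).
import advocate
from advocate.exceptions import UnacceptableAddressException

try:
    # Same call shape as requests.get(), but the address the URL
    # resolves to is validated against private/internal ranges
    # before any connection is opened.
    response = advocate.get("https://example.com/feed.xml", timeout=5)
    print(response.status_code)
except UnacceptableAddressException:
    # Raised for SSRF-style targets such as http://169.254.169.254/
    # or http://127.0.0.1:6379/ smuggled in as a feed or icon URL.
    print("blocked: URL resolves to a private or internal address")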
222 changes: 189 additions & 33 deletions Pipfile.lock

Large diffs are not rendered by default.

236 changes: 155 additions & 81 deletions jarr/controllers/article_clusterizer.py
@@ -10,18 +10,44 @@
 from jarr.metrics import ARTICLE_CREATION, TFIDF_SCORE, WORKER_BATCH
 from jarr.models import Article, Cluster, Feed
 from jarr.signals import event
-from jarr.utils import get_tfidf_pref
 from sqlalchemy import and_, or_
 
 logger = logging.getLogger(__name__)
 NO_CLUSTER_TYPE = {ArticleType.image, ArticleType.video, ArticleType.embedded}
-WAKABLE_REASONS = {ReadReason.marked, ReadReason.mass_marked,
-                   ReadReason.filtered}
+WAKABLE_REASONS = {
+    ReadReason.marked,
+    ReadReason.mass_marked,
+    ReadReason.filtered,
+}
 cluster_event = partial(event.send, module=__name__)
 
 
-class Clusterizer:
+def get_tfidf_pref(feed, pref_name):
+    """Tool to figure out the clustering settings for a feed.
+
+    For a given feed and a given attribute name, return a boolean.
+    If the attribute is set to false on the feed's user, false is
+    returned; if it is set to false on the feed's category, false is
+    also returned. Otherwise the value from the feed configuration is
+    returned. Defaults are set in the configuration.
+    """
+    objs = feed.user, feed.category, feed
+    for obj in objs:
+        if obj is None:
+            continue
+        if not obj.cluster_conf or pref_name not in obj.cluster_conf:
+            continue
+        if not obj.cluster_conf[pref_name] and obj is not feed:
+            continue
+        return obj.cluster_conf.get(pref_name)
+    return getattr(conf.clustering.tfidf, pref_name)
+
+
+def _true_or_unset(attr):
+    return or_(attr.__eq__(True), attr.__eq__(None))
+
+
+class Clusterizer:
     def __init__(self, user_id=None):
         self.user_id = user_id
         self.corpus = []
@@ -36,16 +62,19 @@ def get_config(self, obj, attr):
         config value.
         In any case, computed value is cached in Clusterizer instance.
         """
+
         def cache(val):
             self._config_cache[obj.__class__.__name__][attr][obj.id] = val
             return val
+
         if obj.id in self._config_cache[obj.__class__.__name__]:
             return self._config_cache[obj.__class__.__name__][obj.id]
         if obj.__class__.__name__ == "Article":
             return cache(self.get_config(obj.feed, attr))
         if obj.__class__.__name__ == "Cluster":
-            return cache(all(self.get_config(article, attr)
-                             for article in obj.articles))
+            return cache(
+                all(self.get_config(article, attr) for article in obj.articles)
+            )
         val = getattr(obj, attr)
         if val is not None:
             logger.debug("%r.%s is %r", obj, attr, val)
@@ -66,11 +95,16 @@ def get_neighbors(self, article):
         yet, it'll be pulled out of the database.
         """
         if not self.corpus_initialized:
-            filters = {"__and__": [{'vector__ne': None}, {'vector__ne': ''}],
-                       "article_type": None}
+            filters = {
+                "__and__": [{"vector__ne": None}, {"vector__ne": ""}],
+                "article_type": None,
+            }
             self.corpus_initialized = True
-            self.corpus = list(self._get_query_for_clustering(
-                article, filters=filters, filter_tfidf=True))
+            self.corpus = list(
+                self._get_query_for_clustering(
+                    article, filters=filters, filter_tfidf=True
+                )
+            )
         tfidf_conf = conf.clustering.tfidf
         low_bound = article.simple_vector_magnitude / tfidf_conf.size_factor
         high_bound = article.simple_vector_magnitude * tfidf_conf.size_factor
@@ -81,82 +115,109 @@ def get_neighbors(self, article):
 
     def _get_cluster_by_link(self, article):
         for candidate in self._get_query_for_clustering(
-                article, {'link_hash': article.link_hash}):
+            article, {"link_hash": article.link_hash}
+        ):
             article.cluster_reason = ClusterReason.link
-            cluster_event(context='link', result='match', level=logging.INFO)
+            cluster_event(context="link", result="match", level=logging.INFO)
             return candidate.cluster
 
     def _get_cluster_by_similarity(self, article):
         neighbors = list(self.get_neighbors(article))
 
-        min_sample_size = get_tfidf_pref(article.feed, 'min_sample_size')
+        min_sample_size = get_tfidf_pref(article.feed, "min_sample_size")
         if len(neighbors) < min_sample_size:
-            logger.info('only %d docs against %d required, no TFIDF for %r',
-                        len(neighbors), min_sample_size, article)
-            cluster_event(context='tfidf', result='sample size forbird')
+            logger.info(
+                "only %d docs against %d required, no TFIDF for %r",
+                len(neighbors),
+                min_sample_size,
+                article,
+            )
+            cluster_event(context="tfidf", result="sample size forbird")
             return None
-        logger.info('%r TFIDF is gonna work with a corpus of %d documents',
-                    article.feed, len(neighbors))
-        WORKER_BATCH.labels(worker_type='tfidf_batch').observe(len(neighbors))
+        logger.info(
+            "%r TFIDF is gonna work with a corpus of %d documents",
+            article.feed,
+            len(neighbors),
+        )
+        WORKER_BATCH.labels(worker_type="tfidf_batch").observe(len(neighbors))
 
         best_match, score = get_best_match_and_score(article, neighbors)
-        TFIDF_SCORE.labels(
-            feed_type=article.feed.feed_type.value).observe(score)
-        if score > get_tfidf_pref(article.feed, 'min_score'):
+        labeled = TFIDF_SCORE.labels(feed_type=article.feed.feed_type.value)
+        labeled.observe(score)
+        if score > get_tfidf_pref(article.feed, "min_score"):
             article.cluster_reason = ClusterReason.tf_idf
             article.cluster_score = int(score * 1000)
             article.cluster_tfidf_neighbor_size = len(neighbors)
             article.cluster_tfidf_with = best_match.id
-            cluster_event(context='tfidf', result='match', level=logging.INFO)
+            cluster_event(context="tfidf", result="match", level=logging.INFO)
             return best_match.cluster
-        cluster_event(context='tfidf', result='miss')
+        cluster_event(context="tfidf", result="miss")
 
     def _get_query_for_clustering(self, article, filters, filter_tfidf=False):
         time_delta = timedelta(days=conf.clustering.time_delta)
-        date_cond = {'date__lt': article.date + time_delta,
-                     'date__gt': article.date - time_delta}
-        retr_cond = {'retrieved_date__lt': article.retrieved_date + time_delta,
-                     'retrieved_date__gt': article.retrieved_date - time_delta}
-        filters.update({'cluster_id__ne': None,
-                        'user_id': article.user_id,
-                        'id__ne': article.id,
-                        '__or__': [date_cond, retr_cond]})
-        if article.category_id \
-                and not self.get_config(article, 'cluster_same_category'):
-            filters['category_id__ne'] = article.category_id
-        if not self.get_config(article, 'cluster_same_feed'):
-            filters['feed_id__ne'] = article.feed_id
-
-        feed_join = [Feed.id == Article.feed_id,
-                     or_(Feed.cluster_enabled.__eq__(True),
-                         Feed.cluster_enabled.__eq__(None))]
+        date_cond = {
+            "date__lt": article.date + time_delta,
+            "date__gt": article.date - time_delta,
+        }
+        retr_cond = {
+            "retrieved_date__lt": article.retrieved_date + time_delta,
+            "retrieved_date__gt": article.retrieved_date - time_delta,
+        }
+        filters["cluster_id__ne"] = None
+        filters["user_id"] = article.user_id
+        filters["id__ne"] = article.id
+        filters["__or__"] = [date_cond, retr_cond]
+        if article.category_id and not self.get_config(
+            article, "cluster_same_category"
+        ):
+            filters["category_id__ne"] = article.category_id
+        if not self.get_config(article, "cluster_same_feed"):
+            filters["feed_id__ne"] = article.feed_id
+
+        feed_join = [
+            Feed.id == Article.feed_id,
+            _true_or_unset(Feed.cluster_enabled),
+        ]
 
         if filter_tfidf:
-            feed_join.append(or_(Feed.cluster_tfidf_enabled.__eq__(True),
-                                 Feed.cluster_tfidf_enabled.__eq__(None)))
+            feed_join.append(_true_or_unset(Feed.cluster_tfidf_enabled))
 
-        query = ArticleController(article.user_id).read(**filters)\
-            .join(Feed, and_(*feed_join))
+        query = (
+            ArticleController(article.user_id)
+            .read(**filters)
+            .join(Feed, and_(*feed_join))
+        )
 
         # operations involving categories are complicated, handling in software
         for candidate in query:
             if not self.get_config(candidate, "cluster_enabled"):
                 continue
-            if filter_tfidf and \
-                    not self.get_config(candidate, "cluster_tfidf_enabled"):
+            tfidf_enabled = self.get_config(candidate, "cluster_tfidf_enabled")
+            if filter_tfidf and not tfidf_enabled:
                 continue
             yield candidate
 
-    def _create_from_article(self, article,
-                             cluster_read=None, cluster_liked=False):
+    def _create_from_article(
+        self, article, cluster_read=None, cluster_liked=False
+    ):
         cluster = Cluster(user_id=article.user_id)
         article.cluster_reason = ClusterReason.original
-        return self.enrich_cluster(cluster, article,
-                                   cluster_read, cluster_liked,
-                                   force_article_as_main=True)
+        return self.enrich_cluster(
+            cluster,
+            article,
+            cluster_read,
+            cluster_liked,
+            force_article_as_main=True,
+        )
 
-    def enrich_cluster(self, cluster, article,
-                       cluster_read=None, cluster_liked=False,
-                       force_article_as_main=False):
+    def enrich_cluster(
+        self,
+        cluster,
+        article,
+        cluster_read=None,
+        cluster_liked=False,
+        force_article_as_main=False,
+    ):
         "Will add given article to given cluster."
         article.cluster = cluster
         # handling read status
@@ -165,63 +226,76 @@ def enrich_cluster(self, cluster, article,
         elif cluster_read is not None:  # filters indicate a read status
             cluster.read = cluster.read and cluster_read
             cluster.read_reason = ReadReason.filtered
-            logger.debug('marking as read because of filter %r', cluster)
-        elif (cluster.read  # waking up a cluster
-                and cluster.read_reason in WAKABLE_REASONS
-                and self.get_config(article, 'cluster_wake_up')
-                and self.get_config(cluster, 'cluster_wake_up')):
+            logger.debug("marking as read because of filter %r", cluster)
+        elif (
+            cluster.read  # waking up a cluster
+            and cluster.read_reason in WAKABLE_REASONS
+            and self.get_config(article, "cluster_wake_up")
+            and self.get_config(cluster, "cluster_wake_up")
+        ):
             cluster.read = False
-            logger.debug('waking up %r', cluster)
+            logger.debug("waking up %r", cluster)
         # once one article is liked the cluster is liked
         cluster.liked = cluster.liked or cluster_liked
-        if force_article_as_main or cluster.main_date > article.date \
-                or (not article.feed.truncated_content
-                    and all(cluster_article.feed.truncated_content
-                            for cluster_article in cluster.articles)):
+        if (
+            force_article_as_main
+            or cluster.main_date > article.date
+            or (
+                not article.feed.truncated_content
+                and all(
+                    cluster_article.feed.truncated_content
+                    for cluster_article in cluster.articles
+                )
+            )
+        ):
             cluster.main_title = article.title
             cluster.main_date = article.date
             cluster.main_link = article.link
             cluster.main_feed_title = article.feed.title
             cluster.main_article_id = article.id
             cluster.content = article.content_generator.generate_and_merge(
-                cluster.content)
+                cluster.content
+            )
         self.add_to_corpus(article)
         session.add(cluster)
         session.add(article)
         session.commit()
-        read_reason = cluster.read_reason.value if cluster.read_reason else ''
-        ARTICLE_CREATION.labels(read_reason=read_reason,
-                                read='read' if cluster.read else 'unread',
-                                cluster=article.cluster_reason.value).inc()
+        read_reason = cluster.read_reason.value if cluster.read_reason else ""
+        ARTICLE_CREATION.labels(
+            read_reason=read_reason,
+            read="read" if cluster.read else "unread",
+            cluster=article.cluster_reason.value,
+        ).inc()
         return cluster
 
     def main(self, article, filter_result=None):
         """Will add given article to a fitting cluster or create a cluster
         fitting that article."""
         filter_result = filter_result or {}
-        allow_clustering = filter_result.get('clustering', True)
-        filter_read = filter_result.get('read', False)
-        filter_liked = filter_result.get('liked', False)
-        logger.info('%r - processed filter: %r', article, filter_result)
-        cluster_config = self.get_config(article.feed, 'cluster_enabled')
+        allow_clustering = filter_result.get("clustering", True)
+        filter_read = filter_result.get("read", False)
+        filter_liked = filter_result.get("liked", False)
+        logger.info("%r - processed filter: %r", article, filter_result)
+        cluster_config = self.get_config(article.feed, "cluster_enabled")
 
         # fetching article so that vector comparison is made on full content
         ArticleController(article.user_id).enhance(article)
 
         if not allow_clustering:
-            cluster_event(context='clustering', result='filter forbid')
+            cluster_event(context="clustering", result="filter forbid")
         elif not cluster_config:
-            cluster_event(context='clustering', result='config forbid')
+            cluster_event(context="clustering", result="config forbid")
         else:
             cluster = self._get_cluster_by_link(article)
             if not cluster:
-                if not self.get_config(article.feed, 'cluster_tfidf_enabled'):
-                    cluster_event(context='tfidf', result='config forbid')
+                if not self.get_config(article.feed, "cluster_tfidf_enabled"):
+                    cluster_event(context="tfidf", result="config forbid")
                 elif article.article_type in NO_CLUSTER_TYPE:
-                    cluster_event(context='tfidf', result='wrong article type')
+                    cluster_event(context="tfidf", result="wrong article type")
                 else:
                     cluster = self._get_cluster_by_similarity(article)
             if cluster:
-                return self.enrich_cluster(cluster, article,
-                                           filter_read, filter_liked)
+                return self.enrich_cluster(
+                    cluster, article, filter_read, filter_liked
+                )
         return self._create_from_article(article, filter_read, filter_liked)
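
A note on the helper introduced in this file: _true_or_unset factors out the recurring SQLAlchemy condition for nullable boolean flags, where a switch such as Feed.cluster_enabled should pass the join filter when it is explicitly true or was never set. A standalone sketch of the pattern, using a simplified stand-in model rather than jarr's actual Feed:

# Standalone sketch of the _true_or_unset pattern; this Feed model is
# a simplified stand-in for illustration, not jarr's real model.
from sqlalchemy import Boolean, Column, Integer, or_, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Feed(Base):
    __tablename__ = "feed"
    id = Column(Integer, primary_key=True)
    cluster_enabled = Column(Boolean, nullable=True)  # tri-state flag


def _true_or_unset(attr):
    # A plain attr == True filter would drop rows where the flag was
    # never set; this variant also keeps the NULL rows, treating
    # "unset" as "enabled by default".
    return or_(attr.__eq__(True), attr.__eq__(None))


# Compiles to roughly:
#   SELECT ... FROM feed
#   WHERE feed.cluster_enabled = true OR feed.cluster_enabled IS NULL
print(select(Feed).where(_true_or_unset(Feed.cluster_enabled)))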
15 changes: 7 additions & 8 deletions jarr/controllers/icon.py
@@ -1,8 +1,8 @@
 import base64
 
-from jarr.bootstrap import conf, session
+from jarr.bootstrap import session
 from jarr.models import Icon
-from jarr.utils import jarr_get
+from jarr.lib.utils import jarr_get
 
 from .abstract import AbstractController
 
@@ -13,15 +13,14 @@ class IconController(AbstractController):
 
     @staticmethod
     def _build_from_url(attrs):
-        if 'url' in attrs and 'content' not in attrs:
+        if "url" in attrs and "content" not in attrs:
             try:
-                resp = jarr_get(attrs['url'], timeout=conf.crawler.timeout,
-                                user_agent=conf.crawler.user_agent)
+                resp = jarr_get(attrs["url"])
             except Exception:
                 return attrs
-            attrs.update({'url': resp.url,
-                          'mimetype': resp.headers.get('content-type', None),
-                          'content': base64.b64encode(resp.content).decode('utf8')})
+            attrs["url"] = resp.url
+            attrs["mimetype"] = resp.headers.get("content-type", None)
+            attrs["content"] = base64.b64encode(resp.content).decode("utf8")
         return attrs
 
     def create(self, **attrs):
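
The call site above no longer passes a timeout or user agent: those defaults, and, given the commit's theme, presumably the advocate-based address validation as well, now live inside jarr_get, which moved to jarr.lib.utils. A hypothetical sketch of such a centralized wrapper; jarr's actual implementation and signature may differ:

# Hypothetical sketch of a centralized jarr_get built on advocate;
# the real jarr.lib.utils.jarr_get may differ.
import advocate


def jarr_get(url, timeout=10, user_agent="jarr"):
    # A single choke point for outbound HTTP: every caller gets the
    # SSRF address validation, a default timeout, and the crawler
    # user agent without repeating them at each call site.
    return advocate.get(
        url, timeout=timeout, headers={"User-Agent": user_agent}
    )


resp = jarr_get("https://example.com/favicon.ico")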
(Diffs for the remaining 4 changed files are not rendered.)