Skip to content

Commit

Permalink
api: lots of optimizations for speed/efficiency
Browse files Browse the repository at this point in the history
  • Loading branch information
tschaume committed Aug 4, 2021
1 parent 4a3b010 commit f64a3e6
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 194 deletions.
134 changes: 66 additions & 68 deletions mpcontribs-api/mpcontribs/api/contributions/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
from fastnumbers import isfloat
from flask_mongoengine import DynamicDocument
from mongoengine import CASCADE, signals
from mongoengine.queryset import DoesNotExist
from mongoengine.queryset.manager import queryset_manager
from mongoengine.fields import StringField, BooleanField, DictField
from mongoengine.fields import LazyReferenceField, ReferenceField
from mongoengine.fields import DateTimeField, ListField
from marshmallow.utils import get_value, _Missing
from marshmallow.utils import get_value
from boltons.iterutils import remap
from decimal import Decimal
from pint import UnitRegistry
Expand Down Expand Up @@ -110,16 +109,23 @@ def truncate_digits(q):
return get_quantity(s)


def get_min_max(sender, path, project_name):
    """Return (min, max) of the numeric values stored at `path` for one project.

    Queries `sender` (the Contributions document class) for documents whose
    `{path}.value` field is of BSON type "number", sorted by that value, and
    picks the first/last document whose lazy `project` reference has
    pk == `project_name`. Returns (None, None) when the project has no
    numeric values at `path`.
    """
    # https://docs.mongodb.com/manual/core/index-wildcard/
    # NOTE need a query to trigger wildcard IXSCAN
    # NOTE can't filter for `project` when using wildcard index on `data`,
    #      so the project filter happens client-side below
    # NOTE `project` field in wildcardProjection for wildcard index on all fields
    # NOTE reset `only` in custom queryset manager via `exclude`
    field = f"{path}{delimiter}value"
    key = f"{field}__type".replace(delimiter, "__")
    q = {key: "number"}
    exclude = list(sender._fields.keys())
    qs = sender.objects(**q).exclude(*exclude).only(field, "project").order_by(field)
    docs = [doc for doc in qs if doc.project.pk == project_name]

    if not docs:
        return None, None

    # qs is sorted ascending by `field`, so first/last give min/max
    return get_value(docs[0], field), get_value(docs[-1], field)


def get_resource(component):
Expand Down Expand Up @@ -164,16 +170,11 @@ class Contributions(DynamicDocument):
meta = {
"collection": "contributions",
"indexes": [
"project",
"identifier",
"formula",
"is_public",
"last_modified",
"needs_build",
{"fields": [(r"data.$**", 1)]},
"notebook",
]
+ list(COMPONENTS.keys()),
"project", "identifier", "formula", "is_public", "last_modified",
"needs_build", "notebook", {"fields": [(r"data.$**", 1)]},
# can only use wildcardProjection option with wildcard index on all document fields
{"fields": [(r"$**", 1)], "wildcardProjection" : {"project": 1}},
] + list(COMPONENTS.keys()),
}

@queryset_manager
Expand All @@ -191,22 +192,22 @@ def post_init(cls, sender, document, **kwargs):
resource = get_resource(component)
for i, o in enumerate(lst):
digest = get_md5(resource, o, fields)
obj = resource.document.objects(md5=digest).only("id").first()
objs = resource.document.objects(md5=digest)
exclude = list(resource.document._fields.keys())
obj = objs.exclude(*exclude).only("id").first()
if obj:
lst[i] = obj.to_dbref()

@classmethod
def pre_save_post_validation(cls, sender, document, **kwargs):
if kwargs.get("skip"):
return

# set formula field
if hasattr(document, "formula") and not document.formula:
formulae = current_app.config["FORMULAE"]
document.formula = formulae.get(document.identifier, document.identifier)

# project is LazyReferenceField & load columns due to custom queryset manager
project = document.project.fetch().reload("columns")
columns = {col.path: col for col in project.columns}

# run data through Pint Quantities and save as dicts
def make_quantities(path, key, value):
Expand Down Expand Up @@ -236,17 +237,16 @@ def make_quantities(path, key, value):

# ensure that the same units are used across contributions
field = delimiter.join(["data"] + list(path) + [key])
try:
column = project.columns.get(path=field)
if column.unit != str(q.value.units):
qq = q.value.to(column.unit)
q = new_error_units(q, qq)
except DoesNotExist:
pass # column doesn't exist yet (generated in post_save)
except DimensionalityError:
raise ValueError(
f"Can't convert [{q.units}] to [{column.unit}] for {field}!"
)
if field in columns:
column = columns[field]
if column.unit != "NaN" and column.unit != str(q.value.units):
try:
qq = q.value.to(column.unit)
q = new_error_units(q, qq)
except DimensionalityError:
raise ValueError(
f"Can't convert [{q.units}] to [{column.unit}] for {field}!"
)

# significant digits
q = truncate_digits(q)
Expand All @@ -270,9 +270,11 @@ def post_save(cls, sender, document, **kwargs):
if kwargs.get("skip"):
return

from mpcontribs.api.projects.document import Column

# project is LazyReferenceField; account for custom query manager
project = document.project.fetch()
project.reload(*project._fields)
project = document.project.fetch().reload("columns")
columns = {col.path: col for col in project.columns}

# set columns field for project
def update_columns(path, key, value):
Expand All @@ -284,26 +286,18 @@ def update_columns(path, key, value):
not is_quantity and isinstance(value, str) and key not in quantity_keys
)
if is_quantity or is_text:
project.reload("columns")
try:
column = project.columns.get(path=path)
if is_quantity:
v = value["value"]
if isnan(column.max) or v > column.max:
column.max = v
if isnan(column.min) or v < column.min:
column.min = v

except DoesNotExist:
column = {"path": path}
if path not in columns:
columns[path] = Column(path=path)

if is_quantity:
column["unit"] = value["unit"]
column["min"] = column["max"] = value["value"]
columns[path].unit = value["unit"]

project.columns.create(**column)
if is_quantity:
columns[path].min, columns[path].max = get_min_max(
sender, path, project.name
)

project.save().reload("columns")
ncolumns = len(project.columns)
ncolumns = len(columns)
if ncolumns > 50:
raise ValueError("Reached maximum number of columns (50)!")

Expand All @@ -314,12 +308,10 @@ def update_columns(path, key, value):

# add/remove columns for other components
for path in COMPONENTS.keys():
try:
project.columns.get(path=path)
except DoesNotExist:
if getattr(document, path):
project.columns.create(path=path)
project.save().reload("columns")
if path not in columns and getattr(document, path):
columns[path] = Column(path=path)

project.update(columns=columns.values())

@classmethod
def pre_delete(cls, sender, document, **kwargs):
Expand All @@ -339,21 +331,27 @@ def post_delete(cls, sender, document, **kwargs):
return

# reset columns field for project
project = document.project.fetch()
project = document.project.fetch().reload("columns")
columns = {col.path: col for col in project.columns}

for column in list(project.columns):
for path, column in columns.items():
if not isnan(column.min) and not isnan(column.max):
column.min, column.max = get_min_max(sender, column.path)
column.min, column.max = get_min_max(sender, path, project.name)
if isnan(column.min) and isnan(column.max):
# just deleted last contribution with this column
project.update(pull__columns__path=column.path)
columns.pop(path)
else:
# use wildcard index if available -> single field query
field = column.path.replace(delimiter, "__") + "__type"
qs = sender.objects(**{field: "string"}).only(column.path)
# NOTE reset `only` in custom queryset manager via `exclude`
exclude = list(sender._fields.keys())
field = path.replace(delimiter, "__") + "__type"
q = {field: "string"}
qs = sender.objects(**q).exclude(*exclude).only(path, "project")

if qs.count() < 1 or sum(1 for d in qs if d.project.pk == project.name) < 1:
columns.pop(path)

if qs.count() < 1 or qs.filter(project__name=project.name).count() < 1:
project.update(pull__columns__path=column.path)
project.update(columns=columns.values())


signals.post_init.connect(Contributions.post_init, sender=Contributions)
Expand Down
7 changes: 4 additions & 3 deletions mpcontribs-api/mpcontribs/api/contributions/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,10 @@ def value_for_field(self, obj, field):
obj.reload("id", "project", "data")

# obj.project is LazyReference & Projects uses custom queryset manager
project = obj.project.document_type.objects.only(
"title", "references", "description", "authors"
).get(pk=obj.project.pk)
DocType = obj.project.document_type
exclude = list(DocType._fields.keys())
only = ["title", "references", "description", "authors"]
project = DocType.objects.exclude(*exclude).only(*only).with_id(obj.project.pk)
ctx = {
"cid": str(obj.id),
"title": project.title,
Expand Down
4 changes: 3 additions & 1 deletion mpcontribs-api/mpcontribs/api/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,9 @@ def has_read_permission(self, request, qs):
# project is LazyReferenceFields (multiple queries)
module = import_module("mpcontribs.api.projects.document")
Projects = getattr(module, "Projects")
projects = Projects.objects.only("name", "owner", "is_public", "is_approved")
exclude = list(Projects._fields.keys())
only = ["name", "owner", "is_public", "is_approved"]
projects = Projects.objects.exclude(*exclude).only(*only)

# contributions are set private/public independent from projects
# - private contributions in a public project are only accessible to owner/group
Expand Down
Loading

0 comments on commit f64a3e6

Please sign in to comment.