Skip to content

Commit

Permalink
fix component duplicate checks and referencing
Browse files Browse the repository at this point in the history
  • Loading branch information
tschaume committed Apr 28, 2021
1 parent f3d0975 commit 380c9da
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 50 deletions.
13 changes: 6 additions & 7 deletions mpcontribs-api/mpcontribs/api/attachments/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from filetype.types.image import Jpeg, Png, Gif, Tiff

from mpcontribs.api.config import API_CNAME
from mpcontribs.api.contributions.document import get_resource, get_md5, COMPONENTS

MAX_BYTES = 200 * 1024
BUCKET = "mpcontribs-attachments"
Expand All @@ -38,8 +39,7 @@ class Attachments(DynamicDocument):
@classmethod
def post_init(cls, sender, document, **kwargs):
if document.id and document._data.get("content"):
from mpcontribs.api.attachments.views import AttachmentsResource
res = AttachmentsResource()
res = get_resource("attachments")
requested_fields = res.get_requested_fields(params=request.args)

if "content" in requested_fields:
Expand All @@ -56,7 +56,8 @@ def pre_delete(cls, sender, document, **kwargs):

@classmethod
def pre_save_post_validation(cls, sender, document, **kwargs):
from mpcontribs.api.attachments.views import AttachmentsResource
if document.md5:
return # attachment already cross-referenced to existing one

# b64 decode
try:
Expand All @@ -73,10 +74,8 @@ def pre_save_post_validation(cls, sender, document, **kwargs):
)

# md5
resource = AttachmentsResource()
d = resource.serialize(document, fields=["mime", "content"])
s = json.dumps(d, sort_keys=True).encode("utf-8")
document.md5 = md5(s).hexdigest()
resource = get_resource("attachments")
document.md5 = get_md5(resource, document, COMPONENTS["attachments"])

# save to S3 and unset content
s3_client.put_object(
Expand Down
61 changes: 42 additions & 19 deletions mpcontribs-api/mpcontribs/api/contributions/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pint.converters import ScaleConverter
from pint.errors import DimensionalityError
from uncertainties import ufloat_fromstr
from collections import defaultdict

from mpcontribs.api import enter, valid_dict, delimiter

Expand Down Expand Up @@ -50,6 +51,18 @@
}


def format_cell(cell):
    """Return the display form of a single table cell.

    The cell is returned unchanged when it is not a string, when it contains
    more than one space, or when ``get_quantity`` gives a falsy result for
    it. Otherwise the quantity is passed through ``truncate_digits`` and
    converted with ``str``: only its ``value`` when ``std_dev`` is NaN, the
    whole quantity otherwise.
    """
    # Cells can reach this function before field validation has run (it is
    # applied to every cell of a table's data on document init), so a
    # non-string cell must not blow up on ``.count``; hand it back untouched
    # and let validation report it.
    if not isinstance(cell, str):
        return cell

    # NOTE(review): presumably more than one space means free text rather
    # than "<number> <unit>" -- confirm against get_quantity's input format.
    if cell.count(" ") > 1:
        return cell

    q = get_quantity(cell)
    if not q:
        return cell

    q = truncate_digits(q)
    return str(q.value) if isnan(q.std_dev) else str(q)


def new_error_units(measurement, quantity):
if quantity.units == measurement.value.units:
return measurement
Expand Down Expand Up @@ -104,6 +117,19 @@ def get_min_max(sender, path):
return (values[0], values[-1]) if len(values) else (None, None)


def get_resource(component):
    """Create the views resource that belongs to ``component``.

    The class is looked up as ``<Component>Resource`` (``<Component>`` being
    ``component.capitalize()``) in the module
    ``mpcontribs.api.<component>.views`` and instantiated without arguments.
    The import happens at call time rather than at module load.
    """
    resource_name = f"{component.capitalize()}Resource"
    views = import_module(f"mpcontribs.api.{component}.views")
    resource_cls = getattr(views, resource_name)
    return resource_cls()


def get_md5(resource, obj, fields):
    """Return the hex MD5 digest of ``obj`` restricted to ``fields``.

    ``resource.serialize(obj, fields=fields)`` produces the data to hash. It
    is dumped as JSON with sorted keys, so the digest does not depend on key
    order, then encoded as UTF-8 before hashing.
    """
    canonical = json.dumps(
        resource.serialize(obj, fields=fields), sort_keys=True
    )
    hasher = md5()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()


class Contributions(DynamicDocument):
project = LazyReferenceField(
"Projects", required=True, passthrough=True, reverse_delete_rule=CASCADE
Expand Down Expand Up @@ -149,20 +175,12 @@ def post_init(cls, sender, document, **kwargs):
for component, fields in COMPONENTS.items():
lst = document._data.get(component)
if lst and lst[0].id is None: # id is None for incoming POST
dmodule = import_module(f"mpcontribs.api.{component}.document")
klass = component.capitalize()
Docs = getattr(dmodule, klass)
vmodule = import_module(f"mpcontribs.api.{component}.views")
Resource = getattr(vmodule, f"{klass}Resource")
resource = Resource()
resource = get_resource(component)
for i, o in enumerate(lst):
d = resource.serialize(o, fields=fields)
s = json.dumps(d, sort_keys=True).encode("utf-8")
digest = md5(s).hexdigest()
obj = Docs.objects(md5=digest).only("id").first()
digest = get_md5(resource, o, fields)
obj = resource.document.objects(md5=digest).only("id").first()
if obj:
obj.reload()
lst[i] = obj
lst[i] = obj.to_dbref()

@classmethod
def pre_save_post_validation(cls, sender, document, **kwargs):
Expand Down Expand Up @@ -292,19 +310,24 @@ def update_columns(path, key, value):
def pre_delete(cls, sender, document, **kwargs):
args = ["notebook"] + list(COMPONENTS.keys())
document.reload(*args)

# remove reference documents
if document.notebook is not None:
from mpcontribs.api.notebooks.document import Notebooks

Notebooks.objects(id=document.notebook.id).delete()
deleted = defaultdict(list)

for component in COMPONENTS.keys():
# check if other contributions exist before deletion!
for obj in getattr(document, component):
for idx, obj in enumerate(getattr(document, component)):
q = {component: obj.id}
if sender.objects(**q).count() < 2:
obj.delete()
deleted[component].append(idx)

# remove reference documents
if document.notebook is not None:
from mpcontribs.api.notebooks.document import Notebooks

nid = document.notebook.id
nb = Notebooks.objects(id=nid).first()
nb.delete(signal_kwargs=deleted)


@classmethod
def post_delete(cls, sender, document, **kwargs):
Expand Down
10 changes: 8 additions & 2 deletions mpcontribs-api/mpcontribs/api/notebooks/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,18 @@ def post_init(cls, sender, document, **kwargs):

@classmethod
def pre_delete(cls, sender, document, **kwargs):
idx = 0
deleted = kwargs.get("tables", [])

for cell in document.cells:
for output in cell.get("outputs", []):
contents = output.get("data", {}).get("image/png")
if contents:
key = hashlib.sha1(contents.encode("utf-8")).hexdigest()
s3_client.delete_object(Bucket=BUCKET, Key=key)
if idx in deleted:
key = hashlib.sha1(contents.encode("utf-8")).hexdigest()
s3_client.delete_object(Bucket=BUCKET, Key=key)

idx += 1

def transform(self, incoming=True):
if incoming:
Expand Down
30 changes: 8 additions & 22 deletions mpcontribs-api/mpcontribs/api/tables/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,7 @@
from mongoengine import signals, EmbeddedDocument
from mongoengine.fields import StringField, ListField, IntField, EmbeddedDocumentField

from mpcontribs.api.contributions.document import truncate_digits, get_quantity


def format_cell(cell):
    """Return the display form of a single table cell.

    A cell with more than one space, or one for which ``get_quantity`` gives
    a falsy result, is returned unchanged. Any other cell is replaced by the
    string form of its quantity after ``truncate_digits``: only the ``value``
    when ``std_dev`` is NaN, the whole quantity otherwise.
    """
    if cell.count(" ") <= 1:
        quantity = get_quantity(cell)
        if quantity:
            quantity = truncate_digits(quantity)
            if isnan(quantity.std_dev):
                return str(quantity.value)
            return str(quantity)

    # Either too many spaces or nothing get_quantity could use.
    return cell
from mpcontribs.api.contributions.document import format_cell, get_resource, get_md5, COMPONENTS


class Labels(EmbeddedDocument):
Expand All @@ -43,18 +31,16 @@ class Tables(DynamicDocument):
meta = {"collection": "tables", "indexes": ["name", "columns", "md5"]}

@classmethod
def pre_save_post_validation(cls, sender, document, **kwargs):
from mpcontribs.api.tables.views import TablesResource

# significant digits
def post_init(cls, sender, document, **kwargs):
document.data = [[format_cell(cell) for cell in row] for row in document.data]

# md5 and total_data_rows
resource = TablesResource()
d = resource.serialize(document, fields=["index", "columns", "data"])
s = json.dumps(d, sort_keys=True).encode("utf-8")
document.md5 = md5(s).hexdigest()
@classmethod
def pre_save_post_validation(cls, sender, document, **kwargs):
# md5 and total_data_rows (significant-digit formatting is applied in post_init)
resource = get_resource("tables")
document.md5 = get_md5(resource, document, COMPONENTS["tables"])
document.total_data_rows = len(document.data)


signals.post_init.connect(Tables.post_init, sender=Tables)
signals.pre_save_post_validation.connect(Tables.pre_save_post_validation, sender=Tables)

0 comments on commit 380c9da

Please sign in to comment.