Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion documentcloud/addons/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from rest_flex_fields.utils import is_expanded

# DocumentCloud
from documentcloud.addons.choices import Event
from documentcloud.addons.models import (
AddOn,
AddOnEvent,
Expand Down Expand Up @@ -955,7 +956,9 @@ def get_queryset(self):
return queryset

def perform_create(self, serializer):
    """Save the new event for the requesting user, dispatching scheduled ones

    The event is always stored with ``user`` set to the user making the
    request.  If the saved event is an hourly, daily or weekly event, it is
    dispatched immediately after being saved.
    """
    # Save exactly once and keep the instance: calling ``serializer.save``
    # a second time would persist the same event again
    instance = serializer.save(user=self.request.user)
    if instance.event in (Event.hourly, Event.daily, Event.weekly):
        instance.dispatch()

class Filter(django_filters.FilterSet):
addon = django_filters.NumberFilter(
Expand Down
23 changes: 16 additions & 7 deletions documentcloud/documents/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,11 +479,12 @@ def set_page_text(self, page_text_infos):
def _set_page_positions(self, pages, file_names, file_contents):
"""Handle grafting page positions back into the document"""

current_pdf_contents = storage.open(self.doc_path, "rb").read()
current_pdf = pymupdf.open(stream=current_pdf_contents)
current_pdf = pymupdf.open(stream=storage.open(self.doc_path, "rb").read())
start_page = pages[0]["page_number"]
stop_page = pages[-1]["page_number"]
grafted_pdf = self._init_graft_pdf(current_pdf, start_page, stop_page)
grafted_pdf, base_pdf_stream = self._init_graft_pdf(
current_pdf, start_page, stop_page
)
current_pdf.close()

for page in pages:
Expand All @@ -504,7 +505,7 @@ def _set_page_positions(self, pages, file_names, file_contents):

# merge the overlay pages back onto the original document
contents = self._merge_overlay(
current_pdf_contents,
base_pdf_stream,
grafted_pdf,
start_page,
stop_page,
Expand All @@ -513,9 +514,9 @@ def _set_page_positions(self, pages, file_names, file_contents):

return contents

def _merge_overlay(self, current_pdf_contents, grafted_pdf, start_page, stop_page):
def _merge_overlay(self, base_pdf_stream, grafted_pdf, start_page, stop_page):
"""Merge the text only overlay pages back in to the base PDF"""
base_pdf = Pdf.open(BytesIO(current_pdf_contents))
base_pdf = Pdf.open(base_pdf_stream)
overlay_pdf = Pdf.open(BytesIO(grafted_pdf.tobytes()))

for i in range(start_page, stop_page + 1):
Expand All @@ -531,13 +532,21 @@ def _merge_overlay(self, current_pdf_contents, grafted_pdf, start_page, stop_pag
def _init_graft_pdf(self, current_pdf, start_page, stop_page):
    """Initialize a new PDF to graft OCR text onto

    For every page of ``current_pdf`` from ``start_page`` through
    ``stop_page`` (inclusive), a blank page with the same width and height
    is added to a new PDF, and the source page is redacted over its whole
    rectangle with images and line art left alone.

    Note that ``current_pdf`` is modified in place by the redactions.

    Returns a tuple ``(grafted_pdf, buffer)``: the new PDF of blank pages
    and a ``BytesIO`` holding the saved, redacted copy of ``current_pdf``,
    positioned at its start.
    """
    grafted_pdf = pymupdf.open()
    buffer = BytesIO()

    for pdf_page in current_pdf.pages(start_page, stop_page + 1):
        grafted_pdf.new_page(
            width=pdf_page.rect.width,
            height=pdf_page.rect.height,
        )
        # Redact the full page while asking PyMuPDF not to touch images or
        # vector graphics
        pdf_page.add_redact_annot(pdf_page.rect)
        pdf_page.apply_redactions(
            images=pymupdf.PDF_REDACT_IMAGE_NONE,
            graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
        )

    # Only return once every page has been processed; returning from inside
    # the loop would stop after the first page and never save the base PDF
    current_pdf.save(buffer)
    # Rewind so the consumer reads the saved PDF from its first byte
    buffer.seek(0)
    return grafted_pdf, buffer

def solr(self, fields=None, index_text=False):
"""Get a solr document to index the current document
Expand Down
16 changes: 15 additions & 1 deletion documentcloud/documents/processing/info_and_image/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Third Party
import environ
import pdfplumber
import pymupdf
import redis
import requests
from botocore.exceptions import ClientError
Expand Down Expand Up @@ -553,7 +554,20 @@ def graft_ocr_in_pdf(doc_id, slug, access):
redis_pdf_pages = REDIS.hkeys(page_text_pdf_field)
doc_path = path.doc_path(doc_id, slug)

base_pdf = Pdf.open(io.BytesIO(storage.open(doc_path).read()))
base_pdf = pymupdf.open(stream=storage.open(doc_path).read())
buffer = io.BytesIO()
for redis_page_key in redis_pdf_pages:
page_number = int(redis_page_key)
page = base_pdf[page_number]
page.add_redact_annot(page.rect)
page.apply_redactions(
images=pymupdf.PDF_REDACT_IMAGE_NONE,
graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
)
base_pdf.save(buffer)
buffer.seek(0)

base_pdf = Pdf.open(buffer)
for redis_page_key in redis_pdf_pages:
page_number = int(redis_page_key)
overlay_pdf = Pdf.open(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ pebble==4.5.0
redis==3.4.1
requests==2.22.0
sentry-sdk==0.14.0
pymupdf==1.25.3
2 changes: 1 addition & 1 deletion documentcloud/documents/processing/ocr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def ocr_page_textract(doc_id, tmp_files, upload_text_path, access, slug, page_nu
logger.info("[OCR PAGE] textract doc_id %s", doc_id)

text = "\n".join(
item["Text"] for item in response["Blocks"] if item["BlockType"] == "Line"
item["Text"] for item in response["Blocks"] if item["BlockType"] == "LINE"
)

with storage.open(upload_text_path, "w", access=access) as new_text_file:
Expand Down
32 changes: 30 additions & 2 deletions documentcloud/documents/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,8 +801,8 @@ def page_search(self, request, pk=None):
else:
return Response(results.highlighting.get(pk, {}))

@action(detail=False, methods=["get"])
def pending(self, request):
@action(detail=False, methods=["get"], url_path="pending")
def bulk_pending(self, request):
"""Get the progress status on all of the current users pending documents"""
if not self.request.user or not self.request.user.is_authenticated:
return Response([])
Expand All @@ -826,6 +826,34 @@ def pending(self, request):
)
return Response([])

@action(detail=True, methods=["get"], url_path="pending")
def pending(self, request, pk=None):
    """Get the processing progress of a single pending document

    Responds with ``None`` when the requester is not authenticated, is not
    the document's user, or the document is not in the pending status.
    Otherwise the progress service at ``settings.PROGRESS_URL`` is queried
    for this document and its JSON payload is returned as is.  If that
    request fails, the failure is logged and an empty list is returned.
    """
    # ``pk`` must be accepted because this is a detail route: the router
    # passes the lookup value as a keyword argument.  The object itself is
    # resolved through ``get_object``.
    document = self.get_object()

    if not request.user.is_authenticated or document.user != request.user:
        return Response(None)

    if document.status != Status.pending:
        return Response(None)

    try:
        response = httpsub.post(
            settings.PROGRESS_URL,
            json={"doc_ids": [document.id]},
            timeout=settings.PROGRESS_TIMEOUT,
        )
        response.raise_for_status()
        return Response(response.json())
    except RequestException as exc:
        # Best effort: a progress service failure is logged, not raised
        logger.warning(
            "Error getting progress for document %s: %s",
            document.id,
            exc,
            exc_info=sys.exc_info(),
        )
        return Response([])

class Filter(django_filters.FilterSet):
user = ModelMultipleChoiceFilter(model=User, help_text="Filter by users")
organization = ModelMultipleChoiceFilter(
Expand Down