diff --git a/doc/release-notes/10579-avoid-solr-deletes.md b/doc/release-notes/10579-avoid-solr-deletes.md new file mode 100644 index 00000000000..1062a2fb78f --- /dev/null +++ b/doc/release-notes/10579-avoid-solr-deletes.md @@ -0,0 +1,9 @@ +A features flag called "reduce-solr-deletes" has been added to improve how datafiles are indexed. When the flag is enabled, +Dataverse wil avoid pre-emptively deleting existing solr documents for the files prior to sending updated information. This +should improve performance and will allow additional optimizations going forward. + +The /api/admin/index/status and /api/admin/index/clear-orphans calls +(see https://guides.dataverse.org/en/latest/admin/solr-search-index.html#index-and-database-consistency) +will now find and remove (respectively) additional permissions related solr documents that were not being detected before. +Reducing the overall number of documents will improve solr performance and large sites may wish to periodically call the +clear-orphans API. \ No newline at end of file diff --git a/doc/sphinx-guides/source/developers/performance.rst b/doc/sphinx-guides/source/developers/performance.rst index 562fa330d75..6c864bec257 100644 --- a/doc/sphinx-guides/source/developers/performance.rst +++ b/doc/sphinx-guides/source/developers/performance.rst @@ -121,6 +121,7 @@ While in the past Solr performance hasn't been much of a concern, in recent year We are tracking performance problems in `#10469 `_. In a meeting with a Solr expert on 2024-05-10 we were advised to avoid joins as much as possible. (It was acknowledged that many Solr users make use of joins because they have to, like we do, to keep some documents private.) Toward that end we have added two feature flags called ``avoid-expensive-solr-join`` and ``add-publicobject-solr-field`` as explained under :ref:`feature-flags`. It was confirmed experimentally that performing the join on all the public objects (published collections, datasets and files), i.e., the bulk of the content in the search index, was indeed very expensive, especially on a large instance the size of the IQSS prod. archive, especially under indexing load. We confirmed that it was in fact unnecessary and were able to replace it with a boolean field directly in the indexed documents, which is achieved by the two feature flags above. However, as of writing this, this mechanism should still be considered experimental. +Another flag, ``reduce-solr-deletes``, avoids deleting solr documents for files in a dataset prior to sending updates. It also eliminates several causes of orphan permission documents. This is expected to improve indexing performance to some extent and is a step towards avoiding unnecessary updates (i.e. when a doc would not change). Datasets with Large Numbers of Files or Versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 15f67dc4dd6..e12b5d5d6cf 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -3274,6 +3274,9 @@ please find all known feature flags below. Any of these flags can be activated u * - add-publicobject-solr-field - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress. - ``Off`` + * - reduce-solr-deletes + - Avoids deleting and recreating solr documents for dataset files when reindexing. + - ``Off`` * - disable-return-to-author-reason - Removes the reason field in the `Publish/Return To Author` dialog that was added as a required field in v6.2 and makes the reason an optional parameter in the :ref:`return-a-dataset` API call. - ``Off`` diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 41ea6ae39f0..21f925f8981 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse; +import edu.harvard.iq.dataverse.DatasetVersion.VersionState; import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.dataaccess.DataAccess; import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter; @@ -383,7 +384,8 @@ public FileMetadata findMostRecentVersionFileIsIn(DataFile file) { if (fileMetadatas == null || fileMetadatas.isEmpty()) { return null; } else { - return fileMetadatas.get(0); + // This assumes the order of filemetadatas is from first to most recent, which is true as of v6.3 + return fileMetadatas.get(fileMetadatas.size() - 1); } } @@ -759,6 +761,13 @@ public List findAll() { return em.createQuery("select object(o) from DataFile as o order by o.id", DataFile.class).getResultList(); } + public List findVersionStates(Long fileId) { + Query query = em.createQuery( + "select distinct dv.versionState from DatasetVersion dv where dv.id in (select fm.datasetVersion.id from FileMetadata fm where fm.dataFile.id=:fileId)"); + query.setParameter("fileId", fileId); + return query.getResultList(); + } + public DataFile save(DataFile dataFile) { if (dataFile.isMergeable()) { diff --git a/src/main/java/edu/harvard/iq/dataverse/GuestbookResponse.java b/src/main/java/edu/harvard/iq/dataverse/GuestbookResponse.java index 9041ccf887c..1ea7d02791d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/GuestbookResponse.java +++ b/src/main/java/edu/harvard/iq/dataverse/GuestbookResponse.java @@ -17,6 +17,8 @@ import java.util.List; import jakarta.persistence.*; import jakarta.validation.constraints.Size; +import java.util.Collections; +import java.util.Comparator; /** * @@ -178,7 +180,7 @@ public GuestbookResponse(GuestbookResponse source){ this.setSessionId(source.getSessionId()); List customQuestionResponses = new ArrayList<>(); if (!source.getCustomQuestionResponses().isEmpty()){ - for (CustomQuestionResponse customQuestionResponse : source.getCustomQuestionResponses() ){ + for (CustomQuestionResponse customQuestionResponse : source.getCustomQuestionResponsesSorted() ){ CustomQuestionResponse customQuestionResponseAdd = new CustomQuestionResponse(); customQuestionResponseAdd.setResponse(customQuestionResponse.getResponse()); customQuestionResponseAdd.setCustomQuestion(customQuestionResponse.getCustomQuestion()); @@ -254,6 +256,18 @@ public String getResponseDate() { public List getCustomQuestionResponses() { return customQuestionResponses; } + + public List getCustomQuestionResponsesSorted(){ + + Collections.sort(customQuestionResponses, (CustomQuestionResponse cqr1, CustomQuestionResponse cqr2) -> { + int a = cqr1.getCustomQuestion().getDisplayOrder(); + int b = cqr2.getCustomQuestion().getDisplayOrder(); + return Integer.valueOf(a).compareTo(b); + }); + + + return customQuestionResponses; + } public void setCustomQuestionResponses(List customQuestionResponses) { this.customQuestionResponses = customQuestionResponses; @@ -317,7 +331,11 @@ public void setSessionId(String sessionId) { this.sessionId= sessionId; } - public String toHtmlFormattedResponse() { + public String toHtmlFormattedResponse(){ + return toHtmlFormattedResponse(null); + } + + public String toHtmlFormattedResponse(AuthenticatedUser requestor) { StringBuilder sb = new StringBuilder(); @@ -326,17 +344,25 @@ public String toHtmlFormattedResponse() { sb.append(BundleUtil.getStringFromBundle("dataset.guestbookResponse.respondent") + "
    \n
  • " + BundleUtil.getStringFromBundle("name") + ": " + getName() + "
  • \n
  • "); sb.append(" " + BundleUtil.getStringFromBundle("email") + ": " + getEmail() + "
  • \n
  • "); - sb.append( - " " + BundleUtil.getStringFromBundle("institution") + ": " + wrapNullAnswer(getInstitution()) + "
  • \n
  • "); - sb.append(" " + BundleUtil.getStringFromBundle("position") + ": " + wrapNullAnswer(getPosition()) + "
\n"); + sb.append(" " + BundleUtil.getStringFromBundle("institution") + ": " + wrapNullAnswer(getInstitution()) + "\n
  • "); + sb.append(" " + BundleUtil.getStringFromBundle("position") + ": " + wrapNullAnswer(getPosition()) + "
  • "); + + //Add requestor information to response to help dataset admin with request processing + if (requestor != null){ + sb.append("\n
  • " + BundleUtil.getStringFromBundle("dataset.guestbookResponse.requestor.id") + ": " + requestor.getId()+ "
  • "); + sb.append("\n
  • " + BundleUtil.getStringFromBundle("dataset.guestbookResponse.requestor.identifier") + ": " + requestor.getIdentifier()+ "
  • \n"); + } else { + sb.append("\n"); + } + sb.append(BundleUtil.getStringFromBundle("dataset.guestbookResponse.guestbook.additionalQuestions") + ":
      \n"); - for (CustomQuestionResponse cqr : getCustomQuestionResponses()) { + for (CustomQuestionResponse cqr : getCustomQuestionResponsesSorted()) { sb.append("
    • " + BundleUtil.getStringFromBundle("dataset.guestbookResponse.question") + ": " + cqr.getCustomQuestion().getQuestionString() + "
      " + BundleUtil.getStringFromBundle("dataset.guestbookResponse.answer") + ": " - + wrapNullAnswer(cqr.getResponse()) + "
    • \n"); + + wrapNullAnswer(cqr.getResponse()) + "\n
      "); } sb.append("
    "); return sb.toString(); diff --git a/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java index 1eee9c65501..7359ef8eb33 100644 --- a/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java @@ -456,7 +456,7 @@ public String getMessageTextBasedOnNotification(UserNotification userNotificatio GuestbookResponse gbr = far.getGuestbookResponse(); if (gbr != null) { messageText += MessageFormat.format( - BundleUtil.getStringFromBundle("notification.email.requestFileAccess.guestbookResponse"), gbr.toHtmlFormattedResponse()); + BundleUtil.getStringFromBundle("notification.email.requestFileAccess.guestbookResponse"), gbr.toHtmlFormattedResponse(requestor)); } return messageText; case GRANTFILEACCESS: diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index b7b2760e79b..f61ac341a4a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.search; import edu.harvard.iq.dataverse.*; +import edu.harvard.iq.dataverse.DatasetVersion.VersionState; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean; import edu.harvard.iq.dataverse.batch.util.LoggingUtil; @@ -12,6 +13,7 @@ import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil; import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; +import edu.harvard.iq.dataverse.search.IndexableDataset.DatasetState; import edu.harvard.iq.dataverse.settings.FeatureFlags; import edu.harvard.iq.dataverse.settings.JvmSettings; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; @@ -38,6 +40,7 @@ import java.util.concurrent.Future; import java.util.concurrent.Semaphore; import java.util.function.Function; +import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; import jakarta.annotation.PostConstruct; @@ -473,94 +476,160 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr * @todo should we use solrDocIdentifierDataset or * IndexableObject.IndexableTypes.DATASET.getName() + "_" ? */ - // String solrIdPublished = solrDocIdentifierDataset + dataset.getId(); String solrIdPublished = determinePublishedDatasetSolrDocId(dataset); String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix(); - // String solrIdDeaccessioned = IndexableObject.IndexableTypes.DATASET.getName() - // + "_" + dataset.getId() + - // IndexableDataset.DatasetState.DEACCESSIONED.getSuffix(); String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset); StringBuilder debug = new StringBuilder(); debug.append("\ndebug:\n"); - int numPublishedVersions = 0; - List versions = dataset.getVersions(); - List solrIdsOfFilesToDelete = new ArrayList<>(); - for (DatasetVersion datasetVersion : versions) { - Long versionDatabaseId = datasetVersion.getId(); - String versionTitle = datasetVersion.getTitle(); - String semanticVersion = datasetVersion.getSemanticVersion(); - DatasetVersion.VersionState versionState = datasetVersion.getVersionState(); - if (versionState.equals(DatasetVersion.VersionState.RELEASED)) { - numPublishedVersions += 1; - } - debug.append("version found with database id " + versionDatabaseId + "\n"); - debug.append("- title: " + versionTitle + "\n"); - debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n"); - List fileMetadatas = datasetVersion.getFileMetadatas(); - List fileInfo = new ArrayList<>(); - for (FileMetadata fileMetadata : fileMetadatas) { - String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId(); - /** - * It sounds weird but the first thing we'll do is preemptively - * delete the Solr documents of all published files. Don't - * worry, published files will be re-indexed later along with - * the dataset. We do this so users can delete files from - * published versions of datasets and then re-publish a new - * version without fear that their old published files (now - * deleted from the latest published version) will be - * searchable. See also - * https://github.com/IQSS/dataverse/issues/762 - */ - solrIdsOfFilesToDelete.add(solrIdOfPublishedFile); - fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel()); - } - try { - /** - * Preemptively delete *all* Solr documents for files associated - * with the dataset based on a Solr query. - * - * We must query Solr for this information because the file has - * been deleted from the database ( perhaps when Solr was down, - * as reported in https://github.com/IQSS/dataverse/issues/2086 - * ) so the database doesn't even know about the file. It's an - * orphan. - * - * @todo This Solr query should make the iteration above based - * on the database unnecessary because it the Solr query should - * find all files for the dataset. We can probably remove the - * iteration above after an "index all" has been performed. - * Without an "index all" we won't be able to find files based - * on parentId because that field wasn't searchable in 4.0. - * - * @todo We should also delete the corresponding Solr - * "permission" documents for the files. - */ - List allFilesForDataset = findFilesOfParentDataset(dataset.getId()); - solrIdsOfFilesToDelete.addAll(allFilesForDataset); - } catch (SearchException | NullPointerException ex) { - logger.fine("could not run search of files to delete: " + ex); + boolean reduceSolrDeletes = FeatureFlags.REDUCE_SOLR_DELETES.enabled(); + if (!reduceSolrDeletes) { + int numPublishedVersions = 0; + List versions = dataset.getVersions(); + List solrIdsOfFilesToDelete = new ArrayList<>(); + for (DatasetVersion datasetVersion : versions) { + Long versionDatabaseId = datasetVersion.getId(); + String versionTitle = datasetVersion.getTitle(); + String semanticVersion = datasetVersion.getSemanticVersion(); + DatasetVersion.VersionState versionState = datasetVersion.getVersionState(); + if (versionState.equals(DatasetVersion.VersionState.RELEASED)) { + numPublishedVersions += 1; + } + debug.append("version found with database id " + versionDatabaseId + "\n"); + debug.append("- title: " + versionTitle + "\n"); + debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n"); + List fileMetadatas = datasetVersion.getFileMetadatas(); + List fileInfo = new ArrayList<>(); + for (FileMetadata fileMetadata : fileMetadatas) { + String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId(); + /** + * It sounds weird but the first thing we'll do is preemptively + * delete the Solr documents of all published files. Don't + * worry, published files will be re-indexed later along with + * the dataset. We do this so users can delete files from + * published versions of datasets and then re-publish a new + * version without fear that their old published files (now + * deleted from the latest published version) will be + * searchable. See also + * https://github.com/IQSS/dataverse/issues/762 + */ + solrIdsOfFilesToDelete.add(solrIdOfPublishedFile); + fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel()); + } + try { + /** + * Preemptively delete *all* Solr documents for files associated + * with the dataset based on a Solr query. + * + * We must query Solr for this information because the file has + * been deleted from the database ( perhaps when Solr was down, + * as reported in https://github.com/IQSS/dataverse/issues/2086 + * ) so the database doesn't even know about the file. It's an + * orphan. + * + * @todo This Solr query should make the iteration above based + * on the database unnecessary because it the Solr query should + * find all files for the dataset. We can probably remove the + * iteration above after an "index all" has been performed. + * Without an "index all" we won't be able to find files based + * on parentId because that field wasn't searchable in 4.0. + * + * @todo We should also delete the corresponding Solr + * "permission" documents for the files. + */ + List allFilesForDataset = findFilesOfParentDataset(dataset.getId()); + solrIdsOfFilesToDelete.addAll(allFilesForDataset); + } catch (SearchException | NullPointerException ex) { + logger.fine("could not run search of files to delete: " + ex); + } + int numFiles = 0; + if (fileMetadatas != null) { + numFiles = fileMetadatas.size(); + } + debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n"); } - int numFiles = 0; - if (fileMetadatas != null) { - numFiles = fileMetadatas.size(); + debug.append("numPublishedVersions: " + numPublishedVersions + "\n"); + if (doNormalSolrDocCleanUp) { + IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete); + debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n"); } - debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n"); - } - debug.append("numPublishedVersions: " + numPublishedVersions + "\n"); - if (doNormalSolrDocCleanUp) { - IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete); - debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n"); } DatasetVersion latestVersion = dataset.getLatestVersion(); - String latestVersionStateString = latestVersion.getVersionState().name(); DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState(); + String latestVersionStateString = latestVersionState.name(); DatasetVersion releasedVersion = dataset.getReleasedVersion(); boolean atLeastOnePublishedVersion = false; if (releasedVersion != null) { atLeastOnePublishedVersion = true; - } else { - atLeastOnePublishedVersion = false; } + if (reduceSolrDeletes) { + List solrIdsOfDocsToDelete = null; + if (logger.isLoggable(Level.FINE)) { + writeDebugInfo(debug, dataset); + } + if (doNormalSolrDocCleanUp) { + try { + solrIdsOfDocsToDelete = findFilesOfParentDataset(dataset.getId()); + logger.fine("Existing file docs: " + String.join(", ", solrIdsOfDocsToDelete)); + if (!solrIdsOfDocsToDelete.isEmpty()) { + // We keep the latest version's docs unless it is deaccessioned and there is no + // published/released version + // So skip the loop removing those docs from the delete list except in that case + if ((!latestVersion.isDeaccessioned() || atLeastOnePublishedVersion)) { + List latestFileMetadatas = latestVersion.getFileMetadatas(); + String suffix = (new IndexableDataset(latestVersion)).getDatasetState().getSuffix(); + for (FileMetadata fileMetadata : latestFileMetadatas) { + String solrIdOfPublishedFile = solrDocIdentifierFile + + fileMetadata.getDataFile().getId() + suffix; + solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile); + } + } + if (releasedVersion != null && !releasedVersion.equals(latestVersion)) { + List releasedFileMetadatas = releasedVersion.getFileMetadatas(); + for (FileMetadata fileMetadata : releasedFileMetadatas) { + String solrIdOfPublishedFile = solrDocIdentifierFile + + fileMetadata.getDataFile().getId(); + solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile); + } + } + } + // Clear any unused dataset docs + if (!latestVersion.isDraft()) { + // The latest version is released, so should delete any draft docs for the + // dataset + solrIdsOfDocsToDelete.add(solrIdDraftDataset); + } + if (!atLeastOnePublishedVersion) { + // There's no released version, so should delete any normal state docs for the + // dataset + solrIdsOfDocsToDelete.add(solrIdPublished); + } + if (atLeastOnePublishedVersion || !latestVersion.isDeaccessioned()) { + // There's a released version or a draft, so should delete any deaccessioned + // state docs for the dataset + solrIdsOfDocsToDelete.add(solrIdDeaccessioned); + } + } catch (SearchException | NullPointerException ex) { + logger.fine("could not run search of files to delete: " + ex); + } + logger.fine("Solr docs to delete: " + String.join(", ", solrIdsOfDocsToDelete)); + + if (!solrIdsOfDocsToDelete.isEmpty()) { + List solrIdsOfPermissionDocsToDelete = new ArrayList<>(); + for (String file : solrIdsOfDocsToDelete) { + // Also remove associated permission docs + solrIdsOfPermissionDocsToDelete.add(file + discoverabilityPermissionSuffix); + } + solrIdsOfDocsToDelete.addAll(solrIdsOfPermissionDocsToDelete); + logger.fine("Solr docs and perm docs to delete: " + String.join(", ", solrIdsOfDocsToDelete)); + + IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService + .deleteMultipleSolrIds(solrIdsOfDocsToDelete); + debug.append("result of attempt to premptively deleted published files before reindexing: " + + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n"); + } + } + } + Map desiredCards = new LinkedHashMap<>(); /** * @todo refactor all of this below and have a single method that takes @@ -583,7 +652,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr .append(indexDraftResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(" (and files). Result: ") @@ -591,7 +660,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr } desiredCards.put(DatasetVersion.VersionState.RELEASED, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { String deletePublishedResults = removePublished(dataset); results.append("No published version. Attempting to delete traces of published version from index. Result: ") .append(deletePublishedResults).append("\n"); @@ -634,13 +703,13 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.RELEASED, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { String deletePublishedResults = removePublished(dataset); results.append("No published version. Attempting to delete traces of published version from index. Result: ").append(deletePublishedResults).append("\n"); } desiredCards.put(DatasetVersion.VersionState.DRAFT, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { List solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset); String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset); String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete); @@ -688,7 +757,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DRAFT, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { List solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset); String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset); String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete); @@ -697,7 +766,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr } desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("No need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult); @@ -748,7 +817,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - if (doNormalSolrDocCleanUp) { + if (!reduceSolrDeletes && doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("No need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult); @@ -790,11 +859,42 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr } } - private String deleteDraftFiles(List solrDocIdsForDraftFilesToDelete) { - String deleteDraftFilesResults = ""; - IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete); - deleteDraftFilesResults = indexResponse.toString(); - return deleteDraftFilesResults; + private void writeDebugInfo(StringBuilder debug, Dataset dataset) { + List versions = dataset.getVersions(); + int numPublishedVersions = 0; + for (DatasetVersion datasetVersion : versions) { + Long versionDatabaseId = datasetVersion.getId(); + String versionTitle = datasetVersion.getTitle(); + String semanticVersion = datasetVersion.getSemanticVersion(); + DatasetVersion.VersionState versionState = datasetVersion.getVersionState(); + if (versionState.equals(DatasetVersion.VersionState.RELEASED)) { + numPublishedVersions += 1; + } + debug.append("version found with database id " + versionDatabaseId + "\n"); + debug.append("- title: " + versionTitle + "\n"); + debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n"); + List fileInfo = new ArrayList<>(); + List fileMetadatas = datasetVersion.getFileMetadatas(); + + for (FileMetadata fileMetadata : fileMetadatas) { + /** + * It sounds weird but the first thing we'll do is preemptively delete the Solr + * documents of all published files. Don't worry, published files will be + * re-indexed later along with the dataset. We do this so users can delete files + * from published versions of datasets and then re-publish a new version without + * fear that their old published files (now deleted from the latest published + * version) will be searchable. See also + * https://github.com/IQSS/dataverse/issues/762 + */ + fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel()); + } + int numFiles = 0; + if (fileMetadatas != null) { + numFiles = fileMetadatas.size(); + } + debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n"); + } + debug.append("numPublishedVersions: " + numPublishedVersions + "\n"); } private IndexResponse indexDatasetPermissions(Dataset dataset) { @@ -872,14 +972,14 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set solrDocIdsForDraftFilesToDelete) { + String deleteDraftFilesResults = ""; + IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete); + deleteDraftFilesResults = indexResponse.toString(); + return deleteDraftFilesResults; + } private Dataverse findRootDataverseCached() { if (true) { @@ -2085,8 +2193,50 @@ public List findPermissionsInSolrOnly() throws SearchException { SolrDocumentList list = rsp.getResults(); for (SolrDocument doc: list) { long id = Long.parseLong((String) doc.getFieldValue(SearchFields.DEFINITION_POINT_DVOBJECT_ID)); + String docId = (String)doc.getFieldValue(SearchFields.ID); if(!dvObjectService.checkExists(id)) { - permissionInSolrOnly.add((String)doc.getFieldValue(SearchFields.ID)); + permissionInSolrOnly.add(docId); + } else { + DvObject obj = dvObjectService.findDvObject(id); + if (obj instanceof Dataset d) { + DatasetVersion dv = d.getLatestVersion(); + if (docId.endsWith("draft_permission")) { + if (!dv.isDraft()) { + permissionInSolrOnly.add(docId); + } + } else if (docId.endsWith("deaccessioned_permission")) { + if (!dv.isDeaccessioned()) { + permissionInSolrOnly.add(docId); + } + } else { + if (d.getReleasedVersion() == null) { + permissionInSolrOnly.add(docId); + } + } + } else if (obj instanceof DataFile f) { + List states = dataFileService.findVersionStates(f.getId()); + Set strings = states.stream().map(VersionState::toString).collect(Collectors.toSet()); + logger.fine("States for " + docId + ": " + String.join(", ", strings)); + if (docId.endsWith("draft_permission")) { + if (!states.contains(VersionState.DRAFT)) { + permissionInSolrOnly.add(docId); + } + } else if (docId.endsWith("deaccessioned_permission")) { + if (!states.contains(VersionState.DEACCESSIONED) && states.size() == 1) { + permissionInSolrOnly.add(docId); + } + } else { + if (!states.contains(VersionState.RELEASED)) { + permissionInSolrOnly.add(docId); + } else { + if(dataFileService.findFileMetadataByDatasetVersionIdAndDataFileId(f.getOwner().getReleasedVersion().getId(), f.getId()) == null) { + logger.fine("Adding doc " + docId + " to list of permissions in Solr only"); + permissionInSolrOnly.add(docId); + } + } + + } + } } } if (cursorMark.equals(nextCursorMark)) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java index 19695af931d..0ced9994678 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java @@ -58,6 +58,19 @@ public enum FeatureFlags { * @since Dataverse 6.3 */ ADD_PUBLICOBJECT_SOLR_FIELD("add-publicobject-solr-field"), + /** + * Dataverse normally deletes all solr documents related to a dataset's files + * when the dataset is reindexed. With this flag enabled, additional logic is + * added to the reindex process to delete only the solr documents that are no + * longer needed. (Required docs will be updated rather than deleted and + * replaced.) Enabling this feature flag should make the reindex process + * faster without impacting the search results. + * + * @apiNote Raise flag by setting + * "dataverse.feature.reduce-solr-deletes" + * @since Dataverse 6.3 + */ + REDUCE_SOLR_DELETES("reduce-solr-deletes"), /** * With this flag enabled, the Return To Author pop-up will not have a required * "Reason" field, and a reason will not be required in the @@ -68,7 +81,6 @@ public enum FeatureFlags { * @since Dataverse 6.3 */ DISABLE_RETURN_TO_AUTHOR_REASON("disable-return-to-author-reason"), - ; final String flag; diff --git a/src/main/java/edu/harvard/iq/dataverse/util/MailUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/MailUtil.java index ccec3e5f09b..36c249de834 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/MailUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/MailUtil.java @@ -35,7 +35,10 @@ public static String getSubjectTextBasedOnNotification(UserNotification userNoti case CREATEDV: return BundleUtil.getStringFromBundle("notification.email.create.dataverse.subject", rootDvNameAsList); case REQUESTFILEACCESS: - return BundleUtil.getStringFromBundle("notification.email.request.file.access.subject", Arrays.asList(rootDvNameAsList.get(0), datasetDisplayName)); + String userNameFirst = userNotification.getRequestor().getFirstName(); + String userNameLast = userNotification.getRequestor().getLastName(); + String userIdentifier = userNotification.getRequestor().getIdentifier(); + return BundleUtil.getStringFromBundle("notification.email.request.file.access.subject", Arrays.asList(rootDvNameAsList.get(0), userNameFirst, userNameLast, userIdentifier, datasetDisplayName)); case REQUESTEDFILEACCESS: return BundleUtil.getStringFromBundle("notification.email.requested.file.access.subject", Arrays.asList(rootDvNameAsList.get(0), datasetDisplayName)); case GRANTFILEACCESS: diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 05b3644fa92..c0705e70b1a 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -757,8 +757,8 @@ dashboard.card.datamove.dataset.command.error.indexingProblem=Dataset could not notification.email.create.dataverse.subject={0}: Your dataverse has been created notification.email.create.dataset.subject={0}: Dataset "{1}" has been created notification.email.dataset.created.subject={0}: Dataset "{1}" has been created -notification.email.request.file.access.subject={0}: Access has been requested for a restricted file in dataset "{1}" -notification.email.requested.file.access.subject={0}: You have requested access to a restricted file in dataset "{1}" +notification.email.request.file.access.subject={0}: {1} {2} ({3}) requested access to dataset "{4}" +notification.email.requested.file.access.subject={0}: You have requested access to a restricted file in dataset "{1}" notification.email.grant.file.access.subject={0}: You have been granted access to a restricted file notification.email.rejected.file.access.subject={0}: Your request for access to a restricted file has been rejected notification.email.submit.dataset.subject={0}: Dataset "{1}" has been submitted for review @@ -1400,6 +1400,8 @@ dataset.guestbookResponse.respondent=Respondent dataset.guestbookResponse.question=Q dataset.guestbookResponse.answer=A dataset.guestbookResponse.noResponse=(No Response) +dataset.guestbookResponse.requestor.id=authenticatedUserId +dataset.guestbookResponse.requestor.identifier=authenticatedUserIdentifier # dataset.xhtml diff --git a/src/test/java/edu/harvard/iq/dataverse/util/MailUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/MailUtilTest.java index 0756c4650fb..f9236ab8338 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/MailUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/MailUtilTest.java @@ -2,6 +2,7 @@ import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.UserNotification; +import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.branding.BrandingUtil; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; @@ -82,7 +83,12 @@ public void testSubjectRevokeRole() { @Test public void testSubjectRequestFileAccess() { userNotification.setType(UserNotification.Type.REQUESTFILEACCESS); - assertEquals("LibraScholar: Access has been requested for a restricted file in dataset \"\"", MailUtil.getSubjectTextBasedOnNotification(userNotification, null)); + AuthenticatedUser requestor = new AuthenticatedUser(); + requestor.setFirstName("Tom"); + requestor.setLastName("Jones"); + requestor.setUserIdentifier("TJ-1234"); + userNotification.setRequestor(requestor); + assertEquals("LibraScholar: Tom Jones (@TJ-1234) requested access to dataset \"\"", MailUtil.getSubjectTextBasedOnNotification(userNotification, null)); } @Test