3 changes: 3 additions & 0 deletions doc/release-notes/12122-archiving in sequence.md
@@ -0,0 +1,3 @@
This release introduces an additional setting related to archival Bag creation, ArchiveOnlyIfEarlierVersionsAreArchived (default false).
If it is true, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived.
This is intended to support use cases where deduplication of files between dataset versions will be done (e.g., by a third-party service running at the archival copy location) and is a step toward supporting the Oxford Common File Layout (OCFL) as an archival format.
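
To enable it (a sketch, assuming the standard admin settings API on a localhost installation): `curl http://localhost:8080/api/admin/settings/:ArchiveOnlyIfEarlierVersionsAreArchived -X PUT -d true`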
12 changes: 12 additions & 0 deletions doc/sphinx-guides/source/installation/config.rst
@@ -2263,6 +2263,13 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local

All current options support the :ref:`Archival Status API` calls, and the same status is shown in the dataset page version table (to contributors and others who can view the unpublished dataset, with more detail available to superusers).
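
For example, a superuser can check a version's archival status with a call like the following (a sketch of the API documented there; adjust the server URL, dataset id, and version number)::

  curl -H "X-Dataverse-key: $API_TOKEN" "http://localhost:8080/api/datasets/$DATASET_ID/$VERSION/archivalStatus"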

Two settings that can be used with all current Archivers are:

- \:BagGeneratorThreads - the number of threads to use when adding data files to the zipped bag. The default is 2. Values of 4 or more may increase performance on larger machines but may cause problems if file access is throttled.
- \:ArchiveOnlyIfEarlierVersionsAreArchived - when true, requires dataset versions to be archived in order by confirming that all prior versions have been successfully archived before allowing a new version to be archived. The default is false.

These settings must be included in the \:ArchiverSettings list for the Archiver to use them, as in the example below.
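
For example, a sketch using the standard admin settings API (any archiver-specific settings would be added to the same comma-separated list)::

  curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagGeneratorThreads, :ArchiveOnlyIfEarlierVersionsAreArchived"
  curl http://localhost:8080/api/admin/settings/:BagGeneratorThreads -X PUT -d "4"
  curl http://localhost:8080/api/admin/settings/:ArchiveOnlyIfEarlierVersionsAreArchived -X PUT -d "true"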

.. _Duracloud Configuration:

Duracloud Configuration
@@ -5333,6 +5340,11 @@ This setting specifies which storage system to use by identifying the particular

For examples, see the specific configuration above in :ref:`BagIt Export`.

:ArchiveOnlyIfEarlierVersionsAreArchived
++++++++++++++++++++++++++++++++++++++++

This setting, if true, only allows creation of an archival Bag for a dataset version if all prior versions have been successfully archived. The default is false (any version can be archived independently, as long as other settings allow it).
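
For example (a sketch, assuming the standard admin settings API)::

  curl http://localhost:8080/api/admin/settings/:ArchiveOnlyIfEarlierVersionsAreArchived -X PUT -d "true"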

:ArchiverSettings
+++++++++++++++++

54 changes: 38 additions & 16 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
@@ -387,7 +387,9 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) {
private boolean showIngestSuccess;

private Boolean archivable = null;
private Boolean versionArchivable = null;
private Boolean checkForArchivalCopy;
private Boolean supportsDelete;
private HashMap<Long,Boolean> versionArchivable = new HashMap<>();
private Boolean someVersionArchived = null;

public boolean isShowIngestSuccess() {
@@ -6147,41 +6149,61 @@ public boolean isArchivable() {
return archivable;
}

public boolean isVersionArchivable() {
if (versionArchivable == null) {
public boolean isVersionArchivable(Long id) {
Boolean thisVersionArchivable = versionArchivable.get(id);
if (thisVersionArchivable == null) {
// If this dataset isn't in an archivable collection return false
versionArchivable = false;
thisVersionArchivable = false;
boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived, false);
if (isArchivable()) {
boolean checkForArchivalCopy = false;
// Otherwise, we need to know if the archiver is single-version-only
// If it is, we have to check for an existing archived version to answer the
// question
String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null);
if (className != null) {
try {
Class<?> clazz = Class.forName(className);
Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class);
Object[] params = { settingsWrapper };
checkForArchivalCopy = (Boolean) m.invoke(null, params);

// Find the specific version by id
DatasetVersion targetVersion = dataset.getVersions().stream()
.filter(v -> v.getId().equals(id)).findFirst().orElse(null);
if (requiresEarlierVersionsToBeArchived) {
DatasetVersion priorVersion = DatasetUtil.getPriorVersion(targetVersion);

if (priorVersion == null || (isVersionArchivable(priorVersion.getId())
&& ArchiverUtil.isVersionArchived(priorVersion))) {
thisVersionArchivable = true;
}
}
if (checkForArchivalCopy == null) {
//Only check once
Class<?> clazz = Class.forName(className);
Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class);
Method m2 = clazz.getMethod("supportsDelete");
Object[] params = { settingsWrapper };
checkForArchivalCopy = (Boolean) m.invoke(null, params);
supportsDelete = (Boolean) m2.invoke(null);
}
if (checkForArchivalCopy) {
// If we have to check (single version archiving), we can't allow archiving if
// one version is already archived (or attempted - any non-null status)
versionArchivable = !isSomeVersionArchived();
thisVersionArchivable = !isSomeVersionArchived();
} else {
// If we allow multiple versions or didn't find one that has had archiving run
// on it, we can archive, so return true
versionArchivable = true;
// We can archive this version if archiving has not been run on it (status is
// null), or if a prior attempt is neither successful nor pending and the
// archiver supports deleting the earlier attempt.
String status = targetVersion.getArchivalCopyLocationStatus();
thisVersionArchivable = (status == null)
|| (!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)
&& !status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING) && supportsDelete);
}
} catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException
| InvocationTargetException | NoSuchMethodException | SecurityException e) {
logger.warning("Failed to call isSingleVersion on configured archiver class: " + className);
logger.warning("Failed to call methods on configured archiver class: " + className);
e.printStackTrace();
}
}
}
versionArchivable.put(id, thisVersionArchivable);
}
return versionArchivable;
return thisVersionArchivable;
}

public boolean isSomeVersionArchived() {
Expand Down
src/main/java/edu/harvard/iq/dataverse/DatasetVersionDifference.java
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse;

import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.dataset.DatasetUtil;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import jakarta.ejb.EJB;
import jakarta.ejb.Stateless;
@@ -95,18 +96,7 @@ private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, FileMeta
//TODO: this could use some refactoring to cut down on the number of for loops!
private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, DatasetVersion currentversion) {
List<DataFile> allfiles = allRelatedFiles(fileMetadata);
boolean foundCurrent = false;
DatasetVersion priorVersion = null;
for (DatasetVersion versionLoop : fileMetadata.getDatasetVersion().getDataset().getVersions()) {
if (foundCurrent) {
priorVersion = versionLoop;
break;
}
if (versionLoop.equals(currentversion)) {
foundCurrent = true;
}

}
DatasetVersion priorVersion = DatasetUtil.getPriorVersion(currentversion);
if (priorVersion != null && priorVersion.getFileMetadatasSorted() != null) {
for (FileMetadata fmdTest : priorVersion.getFileMetadatasSorted()) {
for (DataFile fileTest : allfiles) {
17 changes: 17 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java
@@ -740,4 +740,21 @@ public static String getLocaleCurationStatusLabelFromString(String label) {
}
return localizedName;
}

// Find the prior version - relies on getVersions() returning versions sorted newest-first by major/minor version number
public static DatasetVersion getPriorVersion(DatasetVersion version) {
boolean foundCurrent = false;
DatasetVersion priorVersion = null;
for (DatasetVersion versionLoop : version.getDataset().getVersions()) {
if (foundCurrent) {
priorVersion = versionLoop;
break;
}
if (versionLoop.equals(version)) {
foundCurrent = true;
}

}
return priorVersion;
}
}
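
As an editorial aside (not code from this PR), a minimal runnable sketch of the ordering assumption getPriorVersion relies on, with a stand-in record in place of DatasetVersion, since Dataset.getVersions() is expected to return versions newest-first:

import java.util.List;

public class PriorVersionExample {
    // Simplified stand-in for DatasetVersion, for illustration only.
    record Version(String number) {}

    static Version getPrior(List<Version> newestFirst, Version current) {
        boolean foundCurrent = false;
        for (Version v : newestFirst) {
            if (foundCurrent) {
                return v; // the element immediately after the match is the prior version
            }
            if (v.equals(current)) {
                foundCurrent = true;
            }
        }
        return null; // current was the earliest version, so there is no prior one
    }

    public static void main(String[] args) {
        List<Version> versions = List.of(new Version("2.0"), new Version("1.1"), new Version("1.0"));
        System.out.println(getPrior(versions, versions.get(0)).number()); // prints 1.1
        System.out.println(getPrior(versions, versions.get(2)));          // prints null
    }
}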
src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java
Expand Up @@ -3,7 +3,6 @@
import edu.harvard.iq.dataverse.DataCitation;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.DvObject;
import edu.harvard.iq.dataverse.SettingsWrapper;
import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.authorization.users.ApiToken;
@@ -15,23 +14,32 @@
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.pidproviders.doi.datacite.DOIDataCiteRegisterService;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key;
import edu.harvard.iq.dataverse.util.ListSplitUtil;
import edu.harvard.iq.dataverse.util.bagit.BagGenerator;
import edu.harvard.iq.dataverse.util.bagit.OREMap;
import edu.harvard.iq.dataverse.workflow.step.Failure;
import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult;

import jakarta.ejb.TransactionAttribute;
import jakarta.ejb.TransactionAttributeType;
import jakarta.json.Json;
import jakarta.json.JsonObjectBuilder;

import java.io.IOException;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.security.DigestInputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

@RequiredPermissions(Permission.PublishDataset)
public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand<DatasetVersion> {

private final DatasetVersion version;
private final Map<String, String> requestedSettings = new HashMap<String, String>();
protected final Map<String, String> requestedSettings = new HashMap<String, String>();
protected boolean success = false;
private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName());
private static final int MAX_ZIP_WAIT = 20000;
@@ -43,16 +51,17 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion
}

@Override
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public DatasetVersion execute(CommandContext ctxt) throws CommandException {

String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings);
String[] settingsArray = settings.split(",");
for (String setting : settingsArray) {
setting = setting.trim();
if (!setting.startsWith(":")) {
logger.warning("Invalid Archiver Setting: " + setting);
List<String> settingsList = ListSplitUtil.split(settings);
for (String settingName : settingsList) {
Key setting = Key.parse(settingName);
if (setting == null) {
logger.warning("Invalid Archiver Setting: " + settingName);
} else {
requestedSettings.put(setting, ctxt.settings().get(setting));
requestedSettings.put(settingName, ctxt.settings().getValueForKey(setting));
}
}

@@ -62,22 +71,76 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException {
//No un-expired token
token = ctxt.authentication().generateApiTokenForUser(user);
}
performArchiveSubmission(version, token, requestedSettings);
runArchivingProcess(version, token, requestedSettings);
return ctxt.em().merge(version);
}
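
// Editorial sketch of the settings resolution in execute() above, assuming
// ListSplitUtil.split trims whitespace and splits on commas and Key.parse
// returns null for names that do not match a known setting:
//   :ArchiverSettings = ":BagGeneratorThreads, :NoSuchSetting"
//   ListSplitUtil.split -> [":BagGeneratorThreads", ":NoSuchSetting"]
//   Key.parse(":BagGeneratorThreads") -> Key.BagGeneratorThreads (value copied into requestedSettings)
//   Key.parse(":NoSuchSetting") -> null (logged as an invalid Archiver Setting)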

/**
* Note that this method may be called from the execute method above OR from a
* workflow in which execute() is never called and therefore in which all
* variables must be sent as method parameters. (Nominally version is set in the
* constructor and could be dropped from the parameter list.)
*
* @param version - the DatasetVersion to archive
* @param token - an API Token for the user performing this action
* @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans).
*/
public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map<String, String> requestedSettings) {
// this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads)
this.requestedSettings.putAll(requestedSettings);
// Check if earlier versions must be archived first
String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived.toString());
boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue);
if (requireEarlierArchived) {

Dataset dataset = version.getDataset();
List<DatasetVersion> versions = dataset.getVersions();

boolean foundCurrent = false;

// versions are ordered, all versions after the current one have lower
// major/minor version numbers
for (DatasetVersion versionInLoop : versions) {
if (foundCurrent) {
// Once foundCurrent is true, we are looking at prior versions
// Check if this earlier version has been successfully archived
String archivalStatus = versionInLoop.getArchivalCopyLocationStatus();
if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS)
// || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE)
) {
JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder();
statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE);
statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE,
"Successful archiving of earlier versions is required.");
version.setArchivalCopyLocation(statusObjectBuilder.build().toString());
return new Failure("Earlier versions must be successfully archived first",
"Archival prerequisites not met");
}
}
if (versionInLoop.equals(version)) {
foundCurrent = true;
}

}
}
// Delegate to the archiver-specific implementation
return performArchiveSubmission(version, token);
}
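
// Editorial walk-through of the in-order check above (hypothetical statuses):
// archiving version 2.0 of a dataset with versions [2.0, 1.1, 1.0] (newest first)
// and :ArchiveOnlyIfEarlierVersionsAreArchived=true:
//   - the loop reaches 2.0 and sets foundCurrent
//   - 1.1 has archival status "success" -> continue checking earlier versions
//   - 1.0 has archival status null -> record a failure status and return
// Only when every earlier version reports ARCHIVAL_STATUS_SUCCESS does the call
// fall through to performArchiveSubmission(version, token).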


/**
* This method is the only one that should be overridden by other classes. Note
* that this method may be called from the execute method above OR from a
* workflow in which execute() is never called and therefore in which all
* variables must be sent as method parameters. (Nominally version is set in the
* constructor and could be dropped from the parameter list.)
*
* @param version - the DatasetVersion to archive
* @param token - an API Token for the user performing this action
*/
abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map<String, String> requestedSetttings);
protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token);
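
// Editorial sketch of a minimal subclass under this contract (hypothetical
// archiver, not part of this PR); archiver-specific settings are read from the
// protected requestedSettings map populated in execute()/runArchivingProcess:
//
//   public class ExampleSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand {
//       public ExampleSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) {
//           super(aRequest, version);
//       }
//
//       @Override
//       protected WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token) {
//           String host = requestedSettings.get(":ExampleArchiverHost"); // hypothetical setting
//           // ... stream the zipped Bag to the archive at 'host' and set the archival status ...
//           return WorkflowStepResult.OK;
//       }
//   }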

protected int getNumberOfBagGeneratorThreads() {
if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) {
@@ -110,8 +173,8 @@ public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInput
public void run() {
try (PipedOutputStream out = new PipedOutputStream(in)) {
// Generate bag
BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads());
BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
bagger.setNumConnections(getNumberOfBagGeneratorThreads());
bagger.setAuthenticationKey(token.getTokenString());
bagger.generateBag(out);
success = true;
@@ -183,4 +246,16 @@ public static boolean isSingleVersion(SettingsWrapper settingsWrapper) {
public static boolean isSingleVersion(SettingsServiceBean settingsService) {
return false;
}

/** Whether the archiver can delete existing archival files (and thus can retry when the existing files are incomplete/obsolete).
* A static version supports calls via reflection while the instance method supports inheritance for use on actual command instances (see DatasetPage for both use cases).
* @return true if this archiver can delete prior archival copies, false otherwise
*/
public static boolean supportsDelete() {
return false;
}

public boolean canDelete() {
return supportsDelete();
}
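
// Editorial sketch of the reflection-based call the static supportsDelete()
// enables (mirroring the DatasetPage usage shown earlier in this PR):
//   Class<?> clazz = Class.forName(archiverClassName);
//   Method m = clazz.getMethod("supportsDelete");
//   boolean canRetry = (Boolean) m.invoke(null); // static method, so no instance is needed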
}
src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java
@@ -82,8 +82,7 @@ public DRSSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion versi
}

@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token,
Map<String, String> requestedSettings) {
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) {
logger.fine("In DRSSubmitToArchiveCommand...");
JsonObject drsConfigObject = null;

@@ -113,7 +112,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t

JsonObject collectionConfig = adminMetadata.getJsonObject(COLLECTIONS).getJsonObject(alias);

WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token, requestedSettings);
WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token);

JsonObjectBuilder statusObject = Json.createObjectBuilder();
statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE);