diff --git a/doc/release-notes/9859-ORE and Bag updates.md b/doc/release-notes/9859-ORE and Bag updates.md
new file mode 100644
index 00000000000..dd3ae3bbbe1
--- /dev/null
+++ b/doc/release-notes/9859-ORE and Bag updates.md
@@ -0,0 +1,14 @@
+Dataverse's OAI_ORE Metadata Export format and archival BagIt exports
+(which include the OAI-ORE metadata export file) have been updated to include
+information about the dataset version state, e.g. RELEASED or DEACCESSIONED,
+and to indicate which version of Dataverse was used to create the archival Bag.
+As part of the latter, the current OAI_ORE Metadata format has been given a 1.0.0
+version designation. Any future changes to the OAI_ORE export format are expected
+to result in a version change, and tools such as DVUploader that can recreate
+datasets from archival Bags are expected to indicate which version(s) of the
+OAI_ORE format they can read.
+
+Dataverse installations that have been using archival Bags may wish to update any
+existing archival Bags they have, e.g. by deleting existing Bags and using the Dataverse
+[archival Bag export API](https://guides.dataverse.org/en/latest/installation/config.html#bagit-export-api-calls)
+to generate updated versions.
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/admin/integrations.rst b/doc/sphinx-guides/source/admin/integrations.rst
index 21adf8338d9..9a24cf0715c 100644
--- a/doc/sphinx-guides/source/admin/integrations.rst
+++ b/doc/sphinx-guides/source/admin/integrations.rst
@@ -217,7 +217,14 @@ Sponsored by the `Ontario Council of University Libraries (OCUL) `_
-A Dataverse installation can be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ bags to the `Chronopolis `_ via `DuraCloud `_, to a local file system, or to `Google Cloud Storage `_.
+A Dataverse installation can be configured to submit a copy of published Dataset versions, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ bags, to `Chronopolis `_ via `DuraCloud `_, to a local file system, to any S3 store, or to `Google Cloud Storage `_.
+Submission can be automated to occur upon publication, or it can be done periodically (via external scripting).
+The archival status of each Dataset version can be seen in the Dataset page version table and queried via API.
+
+The archival Bags include all of the files and metadata in a given dataset version and are sufficient to recreate the dataset, e.g. in a new Dataverse instance, or potentially in another RDA-conformant repository.
+Specifically, the archival Bags include an OAI-ORE map serialized as JSON-LD that describes the dataset and its files, as well as information about the version of Dataverse used to export the archival Bag.
+
+The `DVUploader `_ includes functionality to recreate a Dataset from an archival Bag produced by Dataverse (using the Dataverse API to do so).
 
 For details on how to configure this integration, see :ref:`BagIt Export` in the :doc:`/installation/config` section of the Installation Guide.
diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst
index fe4aacafdc4..56d245f97c0 100644
--- a/doc/sphinx-guides/source/api/native-api.rst
+++ b/doc/sphinx-guides/source/api/native-api.rst
@@ -2111,10 +2111,12 @@ The API call requires a Json body that includes the list of the fileIds that the
 
    curl -H "X-Dataverse-key: $API_TOKEN" -H "Content-Type:application/json" "$SERVER_URL/api/datasets/:persistentId/files/actions/:unset-embargo?persistentId=$PERSISTENT_IDENTIFIER" -d "$JSON"
 
+.. _Archival Status API:
+
 Get the Archival Status of a Dataset By Version
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Archiving is an optional feature that may be configured for a Dataverse installation. When that is enabled, this API call be used to retrieve the status. Note that this requires "superuser" credentials.
+Archival :ref:`BagIt Export` is an optional feature that may be configured for a Dataverse installation. When it is enabled, this API call can be used to retrieve the status. Note that this requires "superuser" credentials.
 
 ``GET /api/datasets/$dataset-id/$version/archivalStatus`` returns the archival status of the specified dataset version.
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index e2c24f0e8ac..bf7f8fa168c 100644
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -1,4 +1,3 @@
-=============
 Configuration
 =============
 
@@ -1427,24 +1426,25 @@ BagIt file handler configuration settings:
 
 BagIt Export
 ------------
 
-Your Dataverse installation may be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ archival Bags (sometimes called BagPacks) to `Chronopolis `_ via `DuraCloud `_ or alternately to any folder on the local filesystem.
+Your Dataverse installation may be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant `_ zipped `BagIt `_ archival Bags (sometimes called BagPacks), to one of several supported storage services.
+Supported services include `Chronopolis `_ via `DuraCloud `_, Google Cloud Storage, and any service that provides an S3 interface or can receive files transferred to a folder on the local filesystem.
 
-These archival Bags include all of the files and metadata in a given dataset version and are sufficient to recreate the dataset, e.g. in a new Dataverse instance, or postentially in another RDA-conformant repository.
+These archival Bags include all of the files and metadata in a given dataset version and are sufficient to recreate the dataset, e.g. in a new Dataverse instance, or potentially in another RDA-conformant repository.
 The `DVUploader `_ includes functionality to recreate a Dataset from an archival Bag produced by Dataverse. (Note that this functionality is distinct from the :ref:`BagIt File Handler`, which can be used to upload files to an existing Dataset via the Dataverse user interface.)
 
 The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface.
 
 At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand, which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS, also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and archiving only from specified collections (with collection-specific parameters).)
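+
+As a minimal sketch, a superuser can retrieve the archival status of a given dataset version via the :ref:`Archival Status API` (here ``$SERVER_URL``, ``$DATASET_ID``, and ``$VERSION`` are placeholders and ``$API_TOKEN`` holds a superuser API token):
+
+``curl -H "X-Dataverse-key: $API_TOKEN" "$SERVER_URL/api/datasets/$DATASET_ID/$VERSION/archivalStatus"``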
-All current options support the archival status APIs and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers).
+All current options support the :ref:`Archival Status API` calls, and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers).
 
 .. _Duracloud Configuration:
 
 Duracloud Configuration
 +++++++++++++++++++++++
 
-Also note that while the current Chronopolis implementation generates the archival Bag and submits it to the archive's DuraCloud interface, the step to make a 'snapshot' of the space containing the archival Bag (and verify it's successful submission) are actions a curator must take in the DuraCloud interface.
+The current Chronopolis implementation generates the archival Bag and submits it to the archive's DuraCloud interface. The steps to make a 'snapshot' of the space containing the archival Bag (and to verify its successful submission) are actions a curator must take in the DuraCloud interface.
 
-The minimal configuration to support an archiver integration involves adding a minimum of two Dataverse Software Keys and any required Payara jvm options. The example instructions here are specific to the DuraCloud Archiver\:
+The minimal configuration to support an archiver integration involves adding at least two Dataverse Software settings. Individual archivers may require additional settings and/or Payara JVM options and MicroProfile settings. The example instructions here are specific to the DuraCloud Archiver\:
 
 \:ArchiverClassName - the fully qualified class to be used for archiving. For example:
 
@@ -1454,7 +1454,7 @@ The minimal configuration to support an archiver integration involves adding a m
 
 ``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":DuraCloudHost, :DuraCloudPort, :DuraCloudContext, :BagGeneratorThreads"``
 
-The DPN archiver defines three custom settings, one of which is required (the others have defaults):
+The DuraCloud archiver defines three custom settings, one of which is required (the others have defaults):
 
 \:DuraCloudHost - the URL for your organization's Duracloud site. For example:
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java
index b3995b5957e..aa653a6e360 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java
@@ -8,6 +8,7 @@ import edu.harvard.iq.dataverse.DatasetFieldServiceBean;
 import edu.harvard.iq.dataverse.DatasetFieldType;
 import edu.harvard.iq.dataverse.DatasetVersion;
+import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
 import edu.harvard.iq.dataverse.Dataverse;
 import edu.harvard.iq.dataverse.DvObjectContainer;
 import edu.harvard.iq.dataverse.Embargo;
@@ -39,13 +40,31 @@ import org.apache.commons.lang3.exception.ExceptionUtils;
 
+/**
+ * This class is used to generate a JSON-LD representation of a Dataverse object leveraging the OAI_ORE and other community vocabularies. As of v1.0.0,
+ * the format is being versioned and ANY CHANGES TO THE OUTPUT of this class must be reflected in a version increment (see DATAVERSE_ORE_FORMAT_VERSION).
+ *
+ * The OREMap class is intended to record ALL the information needed to recreate an existing Dataverse dataset.
+ * As of v1.0.0, this is true with the exception that auxiliary files are not referenced in the OREMap. While many types of auxiliary files
+ * will be regenerated automatically based on datafile contents, Dataverse now allows manually uploaded auxiliary files, and these cannot be
+ * reproduced solely from the dataset/datafile contents.
+ */
 public class OREMap {
 
+    //Required Services
     static SettingsServiceBean settingsService;
     static DatasetFieldServiceBean datasetFieldService;
+    static SystemConfig systemConfig;
+
     private static final Logger logger = Logger.getLogger(OREMap.class.getCanonicalName());
 
     public static final String NAME = "OREMap";
+
+    //NOTE: Update this value whenever the output of this class is changed
+    private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.0";
+    private static final String DATAVERSE_SOFTWARE_NAME = "Dataverse";
+    private static final String DATAVERSE_SOFTWARE_URL = "https://github.com/iqss/dataverse";
+
+
     private Map localContext = new TreeMap();
     private DatasetVersion version;
     private Boolean excludeEmail = null;
@@ -114,6 +133,18 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) {
                 .add(JsonLDTerm.schemaOrg("name").getLabel(), version.getTitle())
                 .add(JsonLDTerm.schemaOrg("dateModified").getLabel(), version.getLastUpdateTime().toString());
         addIfNotNull(aggBuilder, JsonLDTerm.schemaOrg("datePublished"), dataset.getPublicationDateFormattedYYYYMMDD());
+        //Add version state info - DRAFT, RELEASED, DEACCESSIONED, ARCHIVED with extra info for DEACCESSIONED
+        VersionState vs = version.getVersionState();
+        if(vs.equals(VersionState.DEACCESSIONED)) {
+            JsonObjectBuilder deaccBuilder = Json.createObjectBuilder();
+            deaccBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), vs.name());
+            deaccBuilder.add(JsonLDTerm.DVCore("reason").getLabel(), version.getVersionNote());
+            addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("forwardUrl"), version.getArchiveNote());
+            aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), deaccBuilder);
+
+        } else {
+            aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), vs.name());
+        }
         TermsOfUseAndAccess terms = version.getTermsOfUseAndAccess();
         if (terms.getLicense() != null) {
@@ -269,10 +300,23 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) {
             return aggBuilder.add("@context", contextBuilder.build());
         } else {
             // Now create the overall map object with it's metadata
+
+            //Start with a reference to the Dataverse software
+            JsonObjectBuilder dvSoftwareBuilder = Json.createObjectBuilder()
+                    .add("@type", JsonLDTerm.schemaOrg("SoftwareApplication").getLabel())
+                    .add(JsonLDTerm.schemaOrg("name").getLabel(), DATAVERSE_SOFTWARE_NAME)
+                    .add(JsonLDTerm.schemaOrg("version").getLabel(), systemConfig.getVersion(true))
+                    .add(JsonLDTerm.schemaOrg("url").getLabel(), DATAVERSE_SOFTWARE_URL);
+
+            //Now the OREMap object itself
             JsonObjectBuilder oremapBuilder = Json.createObjectBuilder()
                     .add(JsonLDTerm.dcTerms("modified").getLabel(), LocalDate.now().toString())
                     .add(JsonLDTerm.dcTerms("creator").getLabel(), BrandingUtil.getInstallationBrandName())
                     .add("@type", JsonLDTerm.ore("ResourceMap").getLabel())
+                    //Add the version of our ORE format used
+                    .add(JsonLDTerm.schemaOrg("additionalType").getLabel(), DATAVERSE_ORE_FORMAT_VERSION)
+                    //Indicate which Dataverse version created it
+                    .add(JsonLDTerm.DVCore("generatedBy").getLabel(), dvSoftwareBuilder)
                     // Define an id for the map itself (separate from the @id of the dataset being
                     // described
                     .add("@id",
@@ -467,8 +511,10 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c
         }
     }
 
-    public static void injectSettingsService(SettingsServiceBean settingsSvc, DatasetFieldServiceBean datasetFieldSvc) {
+    //These are used to pick up various settings/constants from the application
+    public static void injectServices(SettingsServiceBean settingsSvc, DatasetFieldServiceBean datasetFieldSvc, SystemConfig systemCfg) {
         settingsService = settingsSvc;
         datasetFieldService = datasetFieldSvc;
+        systemConfig = systemCfg;
     }
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMapHelper.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMapHelper.java
index 4d63edac268..cca1e16b4f8 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMapHelper.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMapHelper.java
@@ -2,7 +2,7 @@
 import edu.harvard.iq.dataverse.DatasetFieldServiceBean;
 import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
-
+import edu.harvard.iq.dataverse.util.SystemConfig;
 import jakarta.annotation.PostConstruct;
 import jakarta.ejb.EJB;
 import jakarta.ejb.Singleton;
@@ -22,8 +22,11 @@ public class OREMapHelper {
     @EJB
     DatasetFieldServiceBean datasetFieldSvc;
 
+    @EJB
+    SystemConfig systemConfig;
+
     @PostConstruct
     public void injectService() {
-        OREMap.injectSettingsService(settingsSvc, datasetFieldSvc);
+        OREMap.injectServices(settingsSvc, datasetFieldSvc, systemConfig);
     }
 }
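
For illustration, with the OREMap changes above an exported ORE map would include status and provenance entries roughly like the following sketch (the exact property names come from the JsonLDTerm labels and the map's @context, and all values shown are placeholders). On the dataset aggregation, for a released version:

    "schema:creativeWorkStatus": "RELEASED"

or, for a deaccessioned version:

    "schema:creativeWorkStatus": {
        "schema:name": "DEACCESSIONED",
        "reason": "Superseded by a corrected dataset",
        "forwardUrl": "https://example.org/new-location"
    }

and on the ore:ResourceMap itself:

    "schema:additionalType": "Dataverse OREMap Format v1.0.0",
    "generatedBy": {
        "@type": "schema:SoftwareApplication",
        "schema:name": "Dataverse",
        "schema:version": "5.14",
        "schema:url": "https://github.com/iqss/dataverse"
    }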