<?xml version="1.0" encoding="UTF-8"?>
<!-- This is the configuration for CLARIN harvesting. -->
<config>
<!-- ### configuration settings ### -->
<settings>
  <!-- Directory the harvester works in. -->
  <workdir>workspace</workdir>

  <!-- How many times a record is attempted before the harvester gives up. -->
  <max-retry-count>2</max-retry-count>

  <!-- Pause between two attempts for the same record, in milliseconds.
       NOTE(review): the endpoint-specific override for archive.mpi.nl in the
       providers section uses retry-delay="10 30 60 360", which reads like
       seconds. Confirm which unit the harvester version in use expects. -->
  <retry-delay>10000</retry-delay>

  <!-- Upper bound on the number of harvester threads running concurrently. -->
  <max-jobs>6</max-jobs>

  <!-- How many resources are put into the resource pool. -->
  <resource-pool-size>4</resource-pool-size>

  <!-- Default timeout, in seconds, for a single HTTP request. It covers both
       connecting and reading. When left out, the timeout is INFINITE. -->
  <timeout>60</timeout>

  <!-- File in which harvesting times are logged. -->
  <state-file>state.xml</state-file>

  <!-- File holding the endpoint-to-directory mappings. -->
  <map-file>map.csv</map-file>

  <!-- Set to true to harvest incrementally. This only takes effect when
       state-file is defined, that file exists, and the provider in question
       has been harvested before. If any of these conditions is not met, the
       setting has no effect. -->
  <incremental>false</incremental>

  <!-- Harvesting scenario. Some endpoints in the providers section carry
       their own scenario attribute (ListIdentifiers); presumably that
       overrides the value given here. TODO confirm. -->
  <scenario>ListRecords</scenario>
</settings>
<!-- ### output directories (referenced in the action section) ### -->
<directories>
  <!-- Each 'id' is the name the action section uses in its 'dir' attribute.
       A non-zero 'max-files' causes subdirectories to be created so that no
       single directory holds more than that number of files. -->
  <dir id="rec" path="oai-rec" max-files="0"/>
  <dir id="oai" path="oai-pmh" max-files="0"/>
  <dir id="cmdi-1_1" path="results/cmdi-1_1" max-files="0"/>
  <dir id="cmdi-1_2" path="results/cmdi" max-files="0"/>
</directories>
<!-- ### actions to take on metadata formats (in order of preference) ### -->
<actions>
  <!-- Namespace http://www.clarin.eu/cmd/1: save to 'oai', split, save to
       'rec', strip, then save to 'cmdi-1_2'. No upgrade transform is listed
       for this format. -->
  <format match="namespace" value="http://www.clarin.eu/cmd/1">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="split"/>
    <action type="save" dir="rec" suffix=".xml"/>
    <action type="strip"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- Namespace http://www.clarin.eu/cmd/: same steps, but the stripped
       result is saved to 'cmdi-1_1' and then run through the 1_1-to-1_2
       upgrade stylesheet before being saved to 'cmdi-1_2'. -->
  <format match="namespace" value="http://www.clarin.eu/cmd/">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="split"/>
    <action type="save" dir="rec" suffix=".xml"/>
    <action type="strip"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- Metadata prefix 'olac': the OAI set name stylesheet is applied before
       splitting; each record is then converted with olac2cmdi.xsl, saved to
       'cmdi-1_1', upgraded, and saved to 'cmdi-1_2'. -->
  <format match="prefix" value="olac">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <action type="save" dir="rec" suffix=".xml"/>
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/olac-cmdi/olac-cmdi/olac2cmdi.xsl"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- Metadata prefix 'oai_dc': identical pipeline to 'olac' above,
       including the olac2cmdi.xsl conversion stylesheet. -->
  <format match="prefix" value="oai_dc">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <action type="save" dir="rec" suffix=".xml"/>
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/olac-cmdi/olac-cmdi/olac2cmdi.xsl"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>
</actions>
<!-- ### list of providers ### -->
<providers>
  <import>
    <registry url="https://centres.clarin.eu/restxml/"/>

    <!--
      To block harvesting from a specific provider defined in the registry
      element, supply its endpoint URL in a child element tagged 'exclude'.
      Multiple exclude elements may be added. Please note that the exclusion
      applies only to the registry element, not to the provider elements.
    -->
    <!--
    <exclude url=""/>
    -->

    <!-- Endpoint-specific configuration for providers defined in the registry. -->

    <!-- MPI sometimes needs a bit more time. -->
    <config url="https://archive.mpi.nl/oai2"
            max-retry-count="5"
            retry-delay="10 30 60 360"/>

    <!-- Leipzig does not like too much time between calls within one
         session; GetRecord calls are not affected. -->
    <config url="https://clarinoai.informatik.uni-leipzig.de/oaiprovider/oai"
            scenario="ListIdentifiers"/>

    <!-- BAS sometimes runs into memory problems when its big CMD records
         get processed in batch. -->
    <config url="http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl"
            scenario="ListIdentifiers"/>
  </import>

  <!-- Virtual Collection Registry -->
  <provider url="http://clarin.ids-mannheim.de/vcr/oai"/>

  <!--
    In addition to the registry entries, there are some "D centres" that
    do not belong in the centre registry. They are included manually here.
    If one is later added to the registry, its entry should be removed from
    this file.
    Listed:
      - Utrecht University Library
      - Het Nederlands Instituut voor Beeld en Geluid
    NOTE(review): both provider elements below sit inside this comment, so
    they are not active in this configuration.
    <provider url="http://dspace.library.uu.nl/oai/clarin"/>
    <provider url="http://oai.beeldengeluid.nl/academia/oai"/>
  -->

  <!-- Extra providers. -->
  <provider url="https://repository.ortolang.fr/api/oai"/>
  <provider url="https://dataverse.no/oai">
    <set>trolling</set>
  </provider>
  <provider url="https://lat1.lis.soas.ac.uk/ds/oaiprovider/oai2"/>
</providers>
</config>