<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.oasis-open.org/docbook/xml/5.0/rng/docbook.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
<book xmlns="http://docbook.org/ns/docbook" xmlns:xlink="http://www.w3.org/1999/xlink" version="5.0">
<info>
<title>PAULA XML Documentation</title>
<subtitle>Format version 1.1</subtitle>
<releaseinfo>Version: P1.1.2013.1.21a</releaseinfo>
<pubdate>21 Jan 2013</pubdate>
<authorgroup>
<author>
<personname>Amir Zeldes</personname>
<email>amir.zeldes@rz.hu-berlin.de</email>
<affiliation>
<orgname>SFB 632 D1</orgname>
<orgdiv>Humboldt-Universität zu Berlin </orgdiv>
</affiliation>
</author>
<author>
<personname>Florian Zipser</personname>
<email>f.zipser@gmx.de</email>
<affiliation>
<orgname>SFB 632 D1</orgname>
<orgdiv>Humboldt-Universität zu Berlin </orgdiv>
<orgdiv>INRIA</orgdiv>
</affiliation>
</author>
<author>
<personname>Arne Neumann</personname>
<email>arne.neumann@uni-potsdam.de</email>
<affiliation>
<orgname>SFB 632 D1</orgname>
<orgdiv>Universität Potsdam</orgdiv>
</affiliation>
</author>
</authorgroup>
</info>
<preface><title>Preamble</title>
<para>
<firstterm>PAULA XML</firstterm> or <firstterm>PAULA</firstterm> for short
(<firstterm>Potsdamer AUstauschformat Linguistischer Annotationen</firstterm>,
'Potsdam Exchange Format for Linguistic Annotations') is a standoff XML format designed
to represent a wide range of linguistically annotated textual and multi-modal corpora.
The format was created at Potsdam University and developed within SFB 632, the
collaborative research centre "Information Structure", subproject D1, "Linguistic
Database" at Potsdam University and Humboldt-Universität zu Berlin (see
<citation>Dipper2005</citation>, <citation>DipperGoetze2005</citation>,
<citation>ChiarcosEtAl2008</citation>). The description below represents the
normative documentation for PAULA version 1.1, with some notes on previous versions of
PAULA. For the latest documentation always check the PAULA Website which also contains an online HTML version of this documentation. </para>
<para>The standoff nature of PAULA refers to the fact that each layer of linguistic
annotation, such as part-of-speech annotations, lemmatizations, syntax trees,
coreference annotation etc., is stored in a separate XML file which refers to the same raw
data. In this manner annotations can easily be added, deleted and updated without
disturbing independent annotation layers, and discontinuous or hierarchically
conflicting structures can be represented. Additionally the format ensures the
retainment of unaltered raw data, including white space and other elements often lost
due to restrictions of the encoding format. As a generalized XML format, PAULA is
indifferent to particular names or semantics of annotation structures. It concentrates
instead on the representation of corpus data as a set of arbitrarily labeled directed
acyclic graphs (so called multi-DAGs, wherein annotation projects may contain cycles as
long as these are on different annotation levels).</para>
<para>This documentation is structured as follows: the next chapter gives an overview of
the overall <link xlink:href="#datamodel">data model</link> of the current PAULA format,
followed by a chapter on <link xlink:href="#corpus_structure">corpus structure</link>
for XML files and folders. Further chapters review different file types: the <link
xlink:href="#required_files">minimal necessary files</link> for PAULA documents,
<link xlink:href="#metadata">metadata</link>, <link xlink:href="#primary_text_data"
>primary text data</link>, <link xlink:href="#tokenization">tokenizations</link> and
<link xlink:href="#mark">span annotations</link>, <link xlink:href="#struct"
>hierarchical graphs</link> and <link xlink:href="#pointing_relations">pointing
relations</link>. The final chapters give additional information on the optional use
of <link xlink:href="#namespaces">namespaces</link>, some special scenarios such as
building <link xlink:href="#parallel_corpora">parallel corpora</link>, <link
xlink:href="#dialogue_data">dialogue corpora</link> and <link
xlink:href="#multimodal">multimodal corpora</link>, recommendations for <link
xlink:href="#naming_conventions">file naming conventions</link> and information on
<link xlink:href="#versions">older/deprecated elements</link> of the PAULA XML
standard focusing on differences to the current version.</para>
</preface>
<chapter xml:id="datamodel">
<title>Datamodel overview</title>
<para>PAULA projects are graphs dominated by a top level node referred
to as a <link xlink:href="#corpus"><classname>corpus</classname></link>. Corpus
objects comprise graphs of one or more annotated <link xlink:href="#document"
><classname>document</classname></link> objects, optionally organized within
a tree of <link xlink:href="#corpus"><classname>subcorpus</classname></link>
objects. The tree of corpus, subcorpora and documents corresponds to a file system
folder tree. Corpora, subcorpora and documents can all receive <link
xlink:href="#metadata"><classname>metadata</classname></link> annotations. </para>
<para>All documents must contain at least one source of <link
xlink:href="#primary_text_data"><classname>primary text data</classname></link>,
possibly more in cases of <link xlink:href="#parallel_corpora">parallel corpora</link>
or <link xlink:href="#dialogue_data">dialogue data</link>, and at least one <link
xlink:href="#tokenization"><classname>tokenization</classname></link> of this data.
Tokenized data may be annotated directly using features called <link xlink:href="#feat"
><classname>feat</classname></link>, such as parts-of-speech, lemmatization,
etc. Further hierarchical structures can be built on top of tokens using flat span
objects called <link xlink:href="#mark"><classname>mark</classname></link> (i.e.
<firstterm>markables</firstterm>) or hierarchically nestable objects called <link
xlink:href="#struct"><classname>struct</classname></link> (i.e.
<firstterm>structures</firstterm>), which may also be annotated with
<classname>feat</classname> objects. The type of node or annotation (part-of-speech,
phrase-category etc.) is given by the type attribute of each set of nodes or
annotations. </para>
<para>Beyond the edges resulting from the construction of hierarchies through structs,
further non-hierarchical edges may be defined between any two nodes in a document using
pointing relations. Both edges connecting structs to tokens or other structs and
pointing relations may be annotated using feats and given a type. All objects and
annotations below the document level may carry a PAULA <link xlink:href="#namespaces"
><classname>namespace</classname></link> bundling relevant annotation layers
which belong together under a common identifier (note that these are not identical with
XML namespaces). The following two figures give an overview of this general data model
for the corpus/document structure and the structure of objects within them. For details
and examples of the individual model elements and their specific XML serialization see
the next chapters.</para>
<para>
<figure xml:id="Figure_corpus_model">
<title>Datamodel for (sub)corpus and document tree</title>
<mediaobject>
<imageobject>
<imagedata fileref="figures/paula_corpusStructure.svg" scale="50"/>
</imageobject>
</mediaobject>
</figure>
<figure xml:id="Figure_doc_model">
<title>Datamodel for document-internal objects</title>
<mediaobject>
<imageobject>
<imagedata fileref="figures/paula_documentStructure.svg" scale="30"/>
</imageobject>
</mediaobject>
</figure>
</para>
</chapter>
<chapter xml:id="corpus_structure">
<title>Corpus structure</title>
<sect1 xml:id="corpus">
<title>Corpus and subcorpus</title>
<para>In PAULA a corpus structure is defined by means of a file system folder structure.
The name of the corpus is determined by the name of the top level directory of the
folder structure. The top level directory may contain further directories. If these
directories contain subdirectories themselves, then they are considered to be
subcorpora. Subcorpora are generally used to provide meaningful subdivisions of a
corpus, e.g. based on genre, period, language etc. These may be accompanied by
appropriate <link xlink:href="#metadata">metadata</link>.</para>
<para>Each subcorpus carries the name of its directory. It is possible, but not
recommended, to repeat subcorpus names at different levels of nesting. A directory
cannot contain two identically named subdirectories, and therefore it is impossible
for two sibling subcorpora to have the same name. Under *NIX systems it is possible
to have directories with identical names except for capitalization. This is not
recommended for compatibility with other operating systems. In addition to
directories, a top level corpus or a subcorpus may contain an
<classname>annoSet</classname> file, which lists the set of subfolders in the
same directory (see <link xlink:href="#annoset">annoSets</link>). This is not
required unless the corpus or subcorpus should receive metadata annotations (see
<link xlink:href="#metadata">metadata</link>).</para>
<para>
<figure xml:id="Figure_paula_dir_struct">
<title>Directory structure for a PAULA corpus</title>
<programlisting><![CDATA[
+-- mycorpus/
¦ +-- subcorpus1/
¦ ¦ +-- doc1/
¦ ¦ +-- doc2/
¦ ¦ +-- doc3/
¦ +-- subcorpus2/
¦ ¦ +-- doc4/
¦ ¦ +-- doc5/
¦ ¦ +-- ...
¦ +-- subcorpus3/
... ...
]]>
</programlisting>
</figure>
</para>
<para> A subdirectory which contains no further directories is a document. Every corpus
and subcorpus must contain at least one document (possibly nested within a lower
level folder); empty corpora or subcorpora are not allowed. The minimal structure
for a PAULA corpus is therefore a corpus folder containing a document folder, which
must contain the minimal document structure described under <link
xlink:href="#document">documents</link>.</para>
</sect1>
<sect1 xml:id="document">
<title>Documents</title>
<para>A PAULA <classname>document</classname> is a terminal directory within the
directory structure of the PAULA <classname><link xlink:href="#corpus"
>corpus</link></classname>, i.e. it is a folder that contains no subfolders.
Usually documents correspond to coherent texts (e.g. an article), but in some
contexts other divisions may be sensible (e.g. chapters of a book as individual
documents). The primary consideration is whether or not annotations need to cross
boundaries between segments of the annotated texts, since annotation nodes and edges
can only exist within a document. It is not possible for an element in one document
to refer to or include an element from another document.</para>
<para>The name of the document is determined by the name of the folder representing it.
A document must contain at least a <classname><link xlink:href="#primary_text_data"
>primary text data</link></classname> file, a <link
xlink:href="#tokenization"><classname>tokenization</classname></link>, an
<classname><link xlink:href="#annoset">annoSet</link></classname> file and
the relevant <link xlink:href="#DTD">DTDs</link> used in the document, unless these
are stored in a separate folder and referred to with appropriate relative paths. If
the document contains no <link xlink:href="#tokenization">tokenization</link> or
other annotations, then these will be <filename>paula_text.dtd</filename>,
<filename>paula_struct.dtd</filename> and <filename>paula_header.dtd</filename>.
Typically, however, a document almost always contains a tokenization of the primary
text data and some annotations, meaning at least <filename>paula_mark.dtd</filename>
and <filename>paula_feat.dtd</filename> (see <link xlink:href="#DTD">DTDs</link> for
more information). It is generally advisable to contain all DTDs used in a corpus in
every document, as redundant DTDs do not disrupt processing or validation. </para>
<para>By convention, all XML files within a document (i.e. all files except DTDs) share
the document name as part of the file name, which appears first except for possible
<link xlink:href="#namespaces">namespaces</link>, and is followed by annotation
layer-specific elements. For more information about recommended naming practices see
<link xlink:href="#naming_conventions">naming conventions</link>.</para>
</sect1>
<sect1 xml:id="annoset">
<title>AnnoSets</title>
<para>Each PAULA <classname><link xlink:href="#document">document</link></classname>
must contain an <classname>annoSet</classname> file which describes the set of
annotations contained in the document. The <classname>annoSet</classname> conforms
with the <link xlink:href="#DTD">DTD</link>
<filename>paula_struct.dtd</filename> and contains a
<classname>structList</classname> element which contains one or more
<classname>struct</classname> elements, each of which contains one or more
<classname>rel</classname> elements (these are the same elements used for the
description of <link xlink:href="#struct">hierarchical annotations</link> as well).
Every XML file within the document directory (but not DTDs and not the
<classname>annoSet</classname> file itself) must be the
<classname>@xlink:href</classname> attribute of some <classname>rel</classname>
in the <classname>annoSet</classname>, including the special
<classname>annoFeat</classname> file if it has been included (see <link
xlink:href="#annofeat">AnnoFeats</link>). There are therefore as many
<classname>rel</classname> elements in the <classname>annoSet</classname> as
there are XML files in the directory, minus one (since the
<classname>annoSet</classname> itself is not referenced). Different structs can
be used to group together files belonging to one logical annotation layer, such as
the <classname><link xlink:href="#primary_text_data">primary text
data</link></classname> and its <classname><link xlink:href="#tokenization"
>tokenization</link></classname>, or related annotations such as part of
speech and lemma. The following example shows some typical groupings following the
PAULA <link xlink:href="#naming_conventions">naming conventions</link>.</para>
<para>
<example xml:id="Example_annoset">
<title>An <classname>annoSet</classname> file for doc1 in mycorpus</title>
<programlisting><![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_struct.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1.anno" />
<structList xmlns:xlink="http://www.w3.org/1999/xlink"
type="annoSet">
<struct id="anno_1">
<rel id="rel_1" xlink:href="mycorpus.doc1.anno_feat.xml" />
</struct>
<struct id="anno_2">
<rel id="rel_2" xlink:href="mycorpus.doc1.text.xml" />
<rel id="rel_3" xlink:href="mycorpus.doc1.tok.xml" />
</struct>
<struct id="anno_3">
<rel id="rel_4" xlink:href="mycorpus.doc1.tok_pos.xml" />
<rel id="rel_5" xlink:href="mycorpus.doc1.tok_lemma.xml" />
</struct>
<struct id="anno_4">
<rel id="rel_6" xlink:href="mycorpus.doc1.phrase.xml" />
<rel id="rel_7" xlink:href="mycorpus.doc1.phrase_cat.xml" />
<rel id="rel_8" xlink:href="mycorpus.doc1.phrase_func.xml" />
</struct>
</structList>
</paula>]]></programlisting>
</example>
</para>
<para>Annotation layers within the same struct are often interdependent, such that
removing one of the files from the document may disrupt the annotation graph shared
with the others. Also note that since <link xlink:href="#namespaces"
>namespaces</link> are also used to group related annotation layers together,
often (but not necessarily always) layers with the same namespace will also be in
the same <classname>struct</classname> in the <classname>annoSet</classname>.</para>
<para>A second function of annoSets is to list the contents of corpora or subcorpora.
AnnoSets within subcorpus or corpus folders are optional, though if they are
missing, the contents of the folder cannot be validated against a list. AnnoSets in
corpora or subcorpora are only required if the corpus or subcorpus should receive
metadata annotations, in which case an <classname>annoSet</classname> to which the
metadata features must point is required (see <link xlink:href="#metadata"
>metadata</link> for more information). An <classname>annoSet</classname> for a
subcorpus or corpus can look like the following example.</para>
<para>
<example xml:id="Example_annoset_corpus">
<title>An <classname>annoSet</classname> file for the corpus
<filename>mycorpus</filename> with three documents</title>
<programlisting><![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_struct.dtd">
<paula version="1.1">
<header paula_id="mycorpus.anno" />
<structList xmlns:xlink="http://www.w3.org/1999/xlink"
type="annoSet">
<struct id="anno_1">
<rel id="rel_1" xlink:href="doc1/" />
<rel id="rel_2" xlink:href="doc2/" />
<rel id="rel_3" xlink:href="doc3/" />
</struct>
</structList>
</paula>]]></programlisting>
</example>
</para>
<para>Corpus or subcorpus annoSets generally place all child subcorpora or documents
within one <classname>struct</classname> element as in the example above, though it
is not prohibited to group some items into different <classname>struct</classname>
elements. It is also possible to mix subcorpora and documents within the same corpus
or subcorpus level folder. There is no difference in notation and all immediate
subfolders in the file system are simply listed: <filename>subcorpus1/</filename>,
<filename>doc1/</filename> etc.</para>
</sect1>
</chapter>
<chapter xml:id="required_files">
<title>Required files and DTDs</title>
<sect1><title>Minimal document structure</title>
<para> Every document within a PAULA corpus requires at least one instance of each of the
following three XML file types: a <classname><link xlink:href="#primary_text_data"
>primary text data</link></classname> file, a <link xlink:href="#tokenization"
><classname>tokenization</classname></link>, and an <classname><link
xlink:href="#annoset">annoSet</link></classname> file. These accordingly define
the raw data, a basic segmentation of the data into minimal units and a list of the
files in the directory (see documentation of the individual file types for
details).</para>
<para>Additionally, the relevant DTDs must be added which define these file types. At a
minimum, the DTDs necessary for the required files above are: </para>
<para><itemizedlist>
<listitem>
<para><filename>paula_header.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_struct.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_mark.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_text.dtd</filename></para>
</listitem>
</itemizedlist>
</para>
<para>The DTDs may be repeated in each document to simplify moving and adding documents at any point in the corpus structure (as in the examples in this documentation),
or else DTDs can be saved in one folder (e.g. the corpus root) and referred to from each document using a relative path.</para>
</sect1>
<sect1 xml:id="DTD"><title>Additional DTDs</title>
<para>Beyond the DTDs in the previous section, if the document contains any
<classname><link xlink:href="#feat">feat</link></classname> annotations or
an <classname><link xlink:href="#annofeat">annoFeat</link></classname> file, it will
require the DTD <filename>paula_feat.dtd</filename>, and if it contains <link
xlink:href="#pointing_relations">pointing relations</link> using the
<classname>rel</classname> element, the file <filename>paula_rel.dtd</filename>
will also be necessary. A further DTD, <filename>paula_multiFeat.dtd</filename>, is
needed if multiple feat annotations should be defined in one XML file, see <link
xlink:href="#multifeats">multifeats</link>.</para>
<para>Usually the necessary DTDs are repeatedly included in every document folder for
validation purposes, though it is possible to include them in only one folder and
refer to them from each document using a relative path (cf. the previous section).
It is not necessary to include <filename>paula_rel.dtd</filename> or
<filename>paula_feat.dtd</filename> for corpora or documents that do not contain
pointing relations or feature annotations, respectively, even if some other documents in the corpus do, though it may be
recommended to have the same DTDs or DTD references in all folders in case pointing
relations or feature annotations are added to further corpus documents later on. The
following full list of DTDs may therefore be included in every document:</para>
<para><itemizedlist>
<listitem>
<para><filename>paula_header.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_struct.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_mark.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_text.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_feat.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_rel.dtd</filename></para>
</listitem>
<listitem>
<para><filename>paula_multiFeat.dtd</filename></para>
</listitem>
</itemizedlist>
</para>
</sect1>
</chapter>
<chapter xml:id="metadata">
<title>Metadata</title>
<para>Metadata encompasses annotations that apply to an entire object in the corpus
structure, i.e. to a corpus, subcorpus or document. The metadata does not annotate
specific elements within a text, but rather characterizes the entire container object.
In PAULA XML metadata is realized in lists of <classname>feat</classname> elements
(features), which refer to the <classname>annoSet</classname> of the relevant object
(see <link xlink:href="#annoset">annoSets</link>). It is also possible for metadata
annotations to carry a <link xlink:href="#namespaces">namespace</link>, just like any
other form of annotation. </para>
<sect1>
<title>Corpus and subcorpus metadata</title>
<para>Corpus and subcorpus level metadata can optionally be added to any corpus or
subfolder containing an <classname><link xlink:href="#annoset"
>annoSet</link></classname>. It is not possible to add metadata to a folder not
containing an <classname>annoSet</classname>. The following example illustrates a
metadata annotation for the corpus <filename>mycorpus</filename>.</para>
<para>
<example xml:id="Example_corp_meta"><title>Metadata for the corpus <filename>mycorpus</filename></title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_feat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.meta_lang"/>
<featList xmlns:xlink="http://www.w3.org/1999/xlink"
type="lang" xml:base="mycorpus.anno.xml">
<feat xlink:href="#anno_1" value="eng"/><!-- English -->
</featList>
</paula>
]]></programlisting>
</example>
</para>
<para>Since the name of the metadata attribute is determined in the
<classname>@type</classname> attribute of the
<classname>featList</classname> element, it is necessary to define a
separate <classname>feat</classname> file for each metadata annotation, unless
<link xlink:href="#meta_multifeat">multiFeat</link> metadata files are used.
Note also that in this example the feat is only pointing at the
<classname>struct</classname> element "anno_1" from the
<classname>annoSet</classname> file <filename>mycorpus.anno.xml</filename>.
It is also possible to have multiple <classname>feat</classname> elements,
pointing to each one of the <classname>struct</classname> elements in the
<classname>annoSet</classname>. In the current version of PAULA this makes
no difference: once a metadata annotation has been applied to any
<classname>struct</classname> element in the <classname>annoSet</classname>,
it applies to the entire object described by the
<classname>annoSet</classname>.</para>
</sect1>
<sect1>
<title>Document metadata</title>
<para>Document metadata works exactly like corpus metadata: it is defined within a
<classname>feat</classname> file which has the annotation name in the
<classname>featList</classname>
<classname>@type</classname> attribute and the value in the
<classname>feat</classname>
<classname>@value</classname> attribute. The <classname>feat</classname> element
should point at a <classname>struct</classname> element from the document's
<classname><link xlink:href="#annoset">annoSet</link></classname>. It is
possible but not necessary to annotate all <classname>struct</classname>
elements in the <classname>annoSet</classname>. The following example
demonstrates this.</para>
<para>
<example xml:id="Example_doc_meta"><title>Metadata for the document <filename>mycorpus/doc1</filename></title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_feat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1.meta_year"/>
<featList xmlns:xlink="http://www.w3.org/1999/xlink" type="year"
xml:base="mycorpus.doc1.anno.xml">
<feat xlink:href="#anno_1" value="1999"/><!-- year 1999 -->
</featList>
</paula>
]]></programlisting>
</example>
</para>
<para>If the <classname>annoSet</classname> of doc1 contains several structs named
"anno_1", "anno_2" etc., it is possible to annotate them all using multiple
<classname>feat</classname> elements. This is identical to annotating just
one of the elements, as in the example above: the metadata annotation "year" has
been applied to the document and given the value "1999".</para>
</sect1>
<sect1 xml:id="meta_multifeat">
<title>Using multifeats in metadata</title>
<para>When using a large number of metadata annotations, it is sometimes more
convenient to use just one XML document to define all meta annotations. This is
made possible by using <classname>multiFeat</classname> files. The following
example illustrates the use of <classname>multiFeat</classname> annotations to
define metadata. For more detailed information on
<classname>multiFeat</classname> annotations see also <link
xlink:href="#multifeats">multiFeat annotations</link>.</para>
<para>
<example xml:id="Example_meta_multiFeat"><title>Multiple metadata annotations in one file using <classname>multiFeat</classname> elements.
</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_multiFeat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1.meta_multiFeat"/>
<multiFeatList xmlns:xlink="http://www.w3.org/1999/xlink"
type="multiFeat" xml:base="mycorpus.doc1.anno.xml">
<multiFeat xlink:href="#anno_1">
<feat name="year" value="2012"/>
<feat name="language" value="English"/>
<feat name="source_format" value="PAULA XML"/>
<!-- ... -->
</multiFeat>
</multiFeatList>
</paula>
]]></programlisting>
</example>
</para>
</sect1>
<sect1 xml:id="annofeat">
<title>AnnoFeats</title>
<para>Each PAULA document may optionally contain an <classname>annoFeat</classname> file
listing the types of all annotation files including <classname>mark</classname>,
<classname>feat</classname>, <classname>struct</classname> and
<classname>rel</classname> files, for validation purposes. Not including an
<classname>annoFeat</classname> file means that the annotation layers available
within the files specified in the <classname>annoSet</classname> cannot be
validated, though it may make it easier to update annotation layers dynamically. The
following example illustrates the use of the <classname>annoFeat</classname> file in
reference to <xref linkend="Example_annoset"/> in the previous section. </para>
<para>
<example xml:id="Example_annofeat">
<title>An <classname>annoFeat</classname> file for doc1 in mycorpus</title>
<programlisting><![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_feat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1.annoFeat" />
<featList type="annoFeat" xml:base="mycorpus.doc1.anno.xml"
xmlns:xlink="http://www.w3.org/1999/xlink">
<feat xlink:href="#rel_1" value="annoFeat" />
<feat xlink:href="#rel_2" value="text" />
<feat xlink:href="#rel_3" value="tok" />
<feat xlink:href="#rel_4" value="pos" />
<feat xlink:href="#rel_5" value="lemma" />
<feat xlink:href="#rel_6" value="phrase" />
<feat xlink:href="#rel_7" value="cat" />
<feat xlink:href="#rel_8" value="func" />
</featList>
</paula>]]></programlisting>
</example>
</para>
<para>Note that since the value of the <classname>feat</classname> is a string and not
an ID, it is possible for multiple rels to refer to the same annotation type name.
In order to disambiguate in such cases, it is possible to use <link
xlink:href="#namespaces">namespaces</link>, provided that these have been used
in the corresponding annotation files. The value then takes the form
"namespace:anno_name", e.g. "stts:pos".</para>
<para>The <classname>annoFeat</classname> file cannot be used in corpus and subcorpus
directories.</para>
</sect1>
</chapter>
<chapter xml:id="primary_text_data">
<title>Primary text data</title>
<para>The <classname>primary text data</classname> forms the lowest level of resource
representation, corresponding to the minimally analyzed linguistic data: a stretch of
untokenized plain text. The presence of at least one such file is obligatory in
every PAULA <classname><link xlink:href="#document">document</link></classname>.
Even if the resource to be annotated originates in spoken data for which a primary
recording exists, its textual transcription forms the primary data. A segment of a
recording is therefore seen to 'take place' in correspondence with a certain
stretch of text (see <link xlink:href="#AV_data">Aligned audio/video files</link>
for details). The primary data follows the schema definition in <filename><link
xlink:href="#req_XML">paula_text.dtd</link></filename>, which must be
present. The type of the file is "text", and by convention the file name ends with
the extension <filename>*.text.xml</filename> and its paula_id is the same as the
file name prefix, ending in <code>_text</code> instead of the file extension
<filename>*.text.xml</filename>. <xref linkend="Example_text"/> illustrates a
<classname>primary text data</classname> file called
<filename>mycorpus.doc1.text.xml</filename>.</para>
<para>
<example xml:id="Example_text"><title>A primary text data file</title><programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_text.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1_text" type="text"/>
<body>This is an example.</body>
</paula>]]></programlisting></example>
</para>
<para>A PAULA document can also contain more than one <classname>primary text
data</classname> file. There are at least two scenarios where this is recommended,
for which the respective sections should be consulted: <link
xlink:href="#parallel_corpora">parallel corpora</link> with aligned texts in
multiple languages and <link xlink:href="#dialogue_data">dialogue data</link> with
multiple simultaneous speakers.</para>
<para>As with other PAULA XML files, the first segment of text before a period within
the filename of the <classname>primary text data</classname> file can be interpreted
as a PAULA <classname><link xlink:href="#namespaces">namespace</link></classname>.
In documents with only one such file, this is usually not important, but it is
possible to use namespaces to group together text from different languages or
speakers in parallel corpora or dialogue data respectively. </para>
</chapter>
<chapter xml:id="mark">
<title>Spans and markables</title>
<sect1><title>Introduction to spans and markables</title>
<para> In PAULA it is possible to define spans of data for further annotation. Spans are
defined using the <classname>mark</classname> element, which stands for
<firstterm>markable</firstterm> and has two primary functions: defining a
<link xlink:href="#tokenization"
>tokenization</link> for a primary text data and defining a
non-terminal <link xlink:href="#span_anno">annotation span</link> node above the token
level. </para>
</sect1>
<sect1 xml:id="tokenization"><title>Tokenizations and token markables</title>
<para>A <classname>tokenization</classname> forms a minimal level
of analysis that segments a <classname><link xlink:href="#primary_text_data"
>primary text data</link></classname> file into units that can be
annotated further. It is not possible to directly annotate text that is not
tokenized, and every PAULA document must contain at least one
<classname>tokenization</classname>. It is possible to include whitespace
characters within the primary data and then ignore these characters while
tokenizing, so that adjacent tokens are not interrupted by any characters on the
tokenized level. <xref linkend="Example_tok"/> illustrates this
principle.</para>
<para>
<example xml:id="Example_tok"><title>Tokenization of the <classname>primary text data</classname> "This is an example."</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_mark.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1_tok"/>
<markList xmlns:xlink="http://www.w3.org/1999/xlink" type="tok"
xml:base="mycorpus.doc1.text.xml">
<mark id="tok_1"
xlink:href="#xpointer(string-range(//body,'',1,4))"/><!-- This -->
<mark id="tok_2"
xlink:href="#xpointer(string-range(//body,'',6,2))"/><!-- is -->
<mark id="tok_3"
xlink:href="#xpointer(string-range(//body,'',9,2))"/><!-- an -->
<mark id="tok_4"
xlink:href="#xpointer(string-range(//body,'',12,7))"/><!--example-->
<mark id="tok_5"
xlink:href="#xpointer(string-range(//body,'',19,1))"/><!-- . -->
</markList>
</paula>
]]></programlisting>
</example>
</para>
<para>The first token element with the id "tok_1" begins at the first character of
the text (the letter "T") and covers a total of 4 characters: "This".
Character 5 is a space, which has not been tokenized. The next token, "tok_2",
begins at character 6, covering 2 characters: "is". It is also possible to define
tokens with no textual extension, i.e. empty tokens. Such tokens have a string range
spanning zero characters. However, they must still have an anchor position within
the text. The following example illustrates an empty token in the sentence "he takes
people out to fish", where the unrealized subject of "to fish" is tokenized between
"out" and "to" with a character span of zero characters.</para>
<para>
<example xml:id="Example_tok_fish"><title>Tokenization of the primary data "he takes people out to fish"</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_mark.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc2_tok"/>
<markList xmlns:xlink="http://www.w3.org/1999/xlink" type="tok"
xml:base="mycorpus.doc2.text.xml">
<mark id="tok_1"
xlink:href="#xpointer(string-range(//body,'',1,2))"/><!-- he -->
<mark id="tok_2"
xlink:href="#xpointer(string-range(//body,'',4,5))"/><!-- takes -->
<mark id="tok_3"
xlink:href="#xpointer(string-range(//body,'',10,6))"/><!--people-->
<mark id="tok_4"
xlink:href="#xpointer(string-range(//body,'',17,3))"/><!-- out -->
<mark id="tok_5"
xlink:href="#xpointer(string-range(//body,'',21,0))"/><!-- -->
<mark id="tok_6"
xlink:href="#xpointer(string-range(//body,'',22,2))"/><!-- to -->
<mark id="tok_7"
xlink:href="#xpointer(string-range(//body,'',25,4))"/><!--fish-->
</markList>
</paula>
]]></programlisting>
</example>
</para>
<para>Although a PAULA tokenization file is defined with reference to the general
markable DTD <filename><link xlink:href="#DTD">paula_mark.dtd</link></filename>, it is distinguished from other types
of markables, specifically <link xlink:href="#span_anno">annotation
markables</link>, in two ways. Firstly, the <classname>@type</classname>
attribute of the element <classname>markList</classname> must be set to
the value <classname>tok</classname>. Secondly, tokenization can only refer to a
<classname>primary text data</classname> file. It is not possible to define
a token pointing to a more complex structure (e.g. another markable or
token).</para>
<para>As of PAULA version 1.1 it is possible to have multiple <classname>primary
text data</classname> files, each of which must then be tokenized. Multiple
tokenizations of the same <classname>primary text data</classname> are not
possible in PAULA 1.1, but are planned as part of a future version of PAULA XML. </para>
</sect1>
<sect1 xml:id="span_anno"><title>Annotation span markables</title>
<para>The element <classname>mark</classname> may be used to group together a set of
<link xlink:href="#tokenization">tokens</link> for further annotation. This is
usually done in order to annotate a certain feature-value pair which applies to
these tokens. Span annotations therefore have the semantics of
<firstterm>attribution</firstterm> within the graph structure, i.e. stating that
an area of the data has a certain property or attribute. These attributes are
realized in PAULA using <classname><link xlink:href="#feat">feat</link></classname>
annotation files, one or more of which can apply to any span defined by a markable.
Span markables are defined with reference to the DTD <filename><link
xlink:href="#DTD">paula_mark.dtd</link></filename>. The type of markable
being annotated (e.g. a referent or referring expression in a discourse, a chunk for
chunking annotation, etc.) is given by the <classname>@type</classname> attribute of
the <classname>markList</classname> element, and may be any string value other than
"tok" which is reserved for <link xlink:href="#tokenization">tokenizations</link>.
Other values are not ruled out by the format, but it is recommended to use types
that follow XML element naming conventions, i.e. strings that contain only
alphanumeric ASCII characters with no spaces and beginning with an alphabetic
character.</para>
<para>Markables may be continuous or discontinuous, i.e. they may apply to a set of
consecutive tokens or to non-consecutive tokens. The following example
illustrates both types of markables in a single file with the type
"chunk".</para>
<para>
<example xml:id="Example_mark"><title>Markables of the type "chunk" above a set of six tokens "I" "'ve" "picked" "the" "kids" "up"</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_mark.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1_chunk_seg"/>
<markList xmlns:xlink="http://www.w3.org/1999/xlink" type="chunk"
xml:base="mycorpus.doc1.tok.xml">
<!-- I -->
<mark id="chunk_1" xlink:href="#tok_1"/>
<!-- 've picked...up -->
<mark id="chunk_2"
xlink:href="(#xpointer(id('tok_2')/range-to(id('tok_3'))),#tok_6)"/>
<!-- the kids -->
<mark id="chunk_3"
xlink:href="#xpointer(id('tok_4')/range-to(id('tok_5')))"/>
</markList>
</paula>
]]></programlisting>
</example>
</para>
<para>In the example, three markables have been defined which refer to six tokens in
the token file <classname>mycorpus.doc1.tok.xml</classname>, as entered in the
<classname>markList</classname> element's <classname>@xml:base</classname>
attribute. The first markable, "chunk_1" points to "#tok_1" in the token file which
covers the string "I". The third markable, "chunk_3", points to a range of
consecutive tokens, from "tok_4" to "tok_5", which covers the words "the kids". The
chunk in the middle, "chunk_2", points to a discontinuous set of tokens, namely a
range "tok_2" to "tok_3" and a further individual token "tok_6", corresponding to
the tokens "'ve picked" and a later token "up". These markables cannot be annotated
further within this file (e.g. with the type of chunk as nominal, verbal, etc.).
Further annotation of the markables beyond the markable list
<classname>@type</classname> must be added in separate files as <classname><link
xlink:href="#feat">feat</link></classname> annotations.</para>
<para>Note that the markable type is set once in the <classname>markList</classname>
element for all markables in the file. To define markables of a different type,
a separate markable file must be generated. Separate files are not required to
have the same segmentations and constitute independent layers of
annotation.</para>
</sect1>
<sect1 xml:id="feat"><title>Feats</title>
<para>The element <classname>feat</classname> and corresponding feat files represent
arbitrary key-value feature annotations which may be applied to a variety of
elements, such as parts of speech or syntactic categories, but also metadata.
They can be applied to mark elements to annotate <link xlink:href="#span_anno"
>spans of tokens</link> or even <link xlink:href="#tokenization"
>tokens</link> directly, but also to <link xlink:href="#struct"
><classname>struct</classname></link> elements as part of
non-hierarchical annotations or metadata annotation of <classname><link
xlink:href="#annoset">annoSet</link></classname> elements. The following
two examples illustrate feature annotation of spans and tokens. For other uses
see <link xlink:href="#metadata">metadata</link> and <link
xlink:href="#struct_feat">annotating structs</link>. In <xref
linkend="Example_tok_feat"/> a <classname>featList</classname> with the
<classname>@type</classname> "pos" contains six <classname>feat</classname>
elements, each annotating a single token with its part of speech in the
<classname>@value</classname> attribute. </para>
<para>
<example xml:id="Example_tok_feat"><title>Annotating tokens with <classname>feat</classname> annotations for part of speech</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_feat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1_pos"/>
<featList xmlns:xlink="http://www.w3.org/1999/xlink" type="pos"
xml:base="mycorpus.doc1.tok.xml">
<feat xlink:href="#tok_1" value="PP"/><!-- I -->
<feat xlink:href="#tok_2" value="VBP"/><!-- 've -->
<feat xlink:href="#tok_3" value="VBN"/><!-- picked -->
<feat xlink:href="#tok_4" value="DT"/><!-- the -->
<feat xlink:href="#tok_5" value="NNS"/><!-- kids -->
<feat xlink:href="#tok_6" value="RP"/><!-- up -->
</featList>
</paula>
]]></programlisting>
</example>
</para>
<para>It is also possible to annotate more than one token at a time by using <link
xlink:href="#span_anno">annotation span markables</link>, which cover one or
more tokens each. In this case the features do not refer to a token file, but to
a markable file which refers to some tokens in itself. The following example
illustrates the annotation of such spans, which works in much the same way as
the annotation of tokens.</para>
<para>
<example xml:id="Example_mark_feat"><title>Annotating spans from a markable file with <classname>feat</classname> annotations for chunk
type</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_feat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1_chunk_seg_chunk_type"/>
<featList xmlns:xlink="http://www.w3.org/1999/xlink"
type="chunk_type" xml:base="mycorpus.doc1.chunk_seg.xml">
<feat xlink:href="#chunk_1" value="N"/><!-- I -->
<feat xlink:href="#chunk_2" value="V"/><!-- 've picked _ up -->
<feat xlink:href="#chunk_3" value="N"/><!-- the kids -->
</featList>
</paula>
]]></programlisting>
</example>
</para>
<para>In this case, three features of the type "chunk_type" have been assigned to
three markables in the file <filename>mycorpus.doc1.chunk_seg.xml</filename>.
The "chunk_type" of the first markable is given the value "N". The second
markable receives the "chunk_type" "V" and the third is "N" again. Note that the
tokens covered by the respective markables are not defined here, though comments
to the right of each element can help keep track of the text covered by each
annotation. The actual tokens covered by each markable are defined in the
separate file <filename>mycorpus.doc1.chunk_seg.xml</filename>. There is also no
necessary connection between the type of feature and the type of markable,
though in many cases it makes sense to give them similar names, e.g. markables
called "chunk" and an annotation "chunk_type" (see also <link
xlink:href="#naming_conventions">naming conventions</link>).</para>
</sect1>
<sect1 xml:id="multifeats"><title>Multifeats</title>
<para> In cases where multiple annotations always apply to the same nodes, it may be
more economical to specify multiple, usually related annotations in the same file.
This is made possible by the use of <classname>multiFeat</classname> files,
together with the associated <filename>paula_multiFeat.dtd</filename>. Each
multiFeat contains multiple feat annotations applying to the element specified
in the <classname>@xlink:href</classname> attribute of the
<classname>multiFeat</classname> element. Since the
<classname>multiFeat</classname> itself is not an actual annotation, but a
container for other annotations, the <classname>multiFeatList</classname>
element is conventionally given the type "multiFeat". The example below
illustrates the use of multiFeat annotations. </para>
<para>
<example xml:id="Example_multiFeat"><title>Annotating multiple annotations using <classname>multiFeat</classname> elements.
</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_multiFeat.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc1.tok_multiFeat"/>
<multiFeatList xmlns:xlink="http://www.w3.org/1999/xlink"
type="multiFeat" xml:base="mycorpus.doc1.tok.xml">
<multiFeat xlink:href="#tok_1"> <!-- I -->
<feat name="pos" value="PPER"/>
<feat name="lemma" value="I"/>
</multiFeat>
<multiFeat xlink:href="#tok_2"> <!-- 've -->
<feat name="pos" value="VBP"/>
<feat name="lemma" value="have"/>
</multiFeat>
<!-- ... -->
</multiFeatList>
</paula>
]]></programlisting>
</example>
</para>
<para> Note that there is no difference from the data model point of view between
the use of multiple <classname>feat</classname> files or one
<classname>multiFeat</classname> file specifying the same annotation types.
Note also that when using <link xlink:href="#namespaces">namespaces</link>, all
annotations in a <classname>multiFeat</classname> have the same namespace,
determined by the <classname>multiFeat</classname> file name. While it is
possible to have different annotations in different
<classname>multiFeat</classname> elements in the same file, it is
recommended to avoid this, as it can quickly become confusing. The use of
<classname>multiFeat</classname> annotations can also make it potentially
difficult to add, remove and edit annotations after the fact, since separate
annotation layers are mixed in one XML file.</para>
</sect1>
</chapter>
<chapter>
<title>Hierarchical structures</title>
<para>Hierarchical structures are used in PAULA for two different purposes: for the
creation of hierarchically nested annotation graphs (e.g. syntax trees, rhetorical
structure annotation, hierarchical topological fields) and for the definition of
structured <classname>annoSet</classname> objects (see <link xlink:href="#annoset"
>annoSets</link>). Hierarchical structures express the graph semantic property
that a parent node <firstterm>consists of</firstterm> its children, or in reverse,
that children nodes <firstterm>constitute</firstterm> their parent nodes. The
semantics of hierarchical edges is also called <firstterm>dominance</firstterm> (a
parent node <firstterm>dominates</firstterm> a child node), and they are
consequently known as <firstterm>dominance edges</firstterm> as well. This chapter
describes hierarchical annotation graphs. For non-hierarchical annotations see also
<link xlink:href="#mark">spans and markables</link>. </para>
<sect1 xml:id="struct"><title>Structs</title>
<para>To form hierarchically nested (i.e. recursive) non-terminal nodes above the
token level, the <classname>struct</classname> element should be used.
<firstterm>Directed acyclic graphs</firstterm> (DAGs) of struct elements may
be defined in struct files according to <filename><link xlink:href="#DTD"
>paula_struct.dtd</link></filename>. The <classname>struct</classname>
element is embedded within a <classname>structList</classname> which determines
the <classname>@type</classname> for all structs in the file. It has only one
attribute, an <classname>@id</classname> which allows it to become the target of
incoming edges. Outgoing edges are annotated using the child element
<classname>rel</classname>, which has its own <classname>@type</classname>
(the type of edge) and an attribute <classname>@xlink:href</classname>
determining the target's id, as well as its own <classname>@id</classname>
attribute for further annotation (see annotating structs and rels). The
following example illustrates a simple syntax tree for the sentence "he takes people out to fish". The
corresponding syntax tree is also visualized in <xref linkend="Figure_fish_tree"
/>. </para>
<para>
<example xml:id="Example_struct"><title>Constructing a hierarchical syntax tree with <classname>struct</classname> elements
</title>
<programlisting><![CDATA[<?xml version="1.0" standalone="no"?>
<!DOCTYPE paula SYSTEM "paula_struct.dtd">
<paula version="1.1">
<header paula_id="mycorpus.doc2_phrase"/>
<structList xmlns:xlink="http://www.w3.org/1999/xlink"
type="phrase">
<struct id="phrase_1"> <!-- NP -->
<!-- he -->
<rel id="rel_1" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_1"/>
</struct>
<struct id="phrase_2"> <!-- VP -->
<!-- takes -->
<rel id="rel_2" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_2"/>
<rel id="rel_3" type="edge" xlink:href="#phrase_3"/>
<rel id="rel_4" type="edge" xlink:href="#phrase_4"/>
<rel id="rel_5" type="edge" xlink:href="#phrase_5"/>
</struct>
<struct id="phrase_3"> <!-- NP -->
<!-- people -->
<rel id="rel_6" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_3"/>
<!-- _ -->
<rel id="rel_7" type="secedge" xlink:href="mycorpus.doc2.tok.xml#tok_5"/>
</struct>
<struct id="phrase_4"> <!-- PRT -->
<!-- out -->
<rel id="rel_8" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_4"/>
</struct>
<struct id="phrase_5"> <!-- S -->
<rel id="rel_9" type="edge" xlink:href="#phrase_6"/>
<rel id="rel_10" type="edge" xlink:href="#phrase_7"/>
</struct>
<struct id="phrase_6"> <!-- NP -->
<!-- _ -->
<rel id="rel_11" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_5"/>
</struct>
<struct id="phrase_7"> <!-- VP -->
<!-- to -->
<rel id="rel_12" type="edge" xlink:href="mycorpus.doc2.tok.xml#tok_6"/>
<rel id="rel_13" type="edge" xlink:href="#phrase_8"/>
</struct>