forked from grouin/medina
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lanceur.sh
129 lines (103 loc) · 5.57 KB
/
lanceur.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env bash
#
# Demonstration of the whole processing chain: conversion of BRAT
# annotations, CRF training and labelling with Wapiti, evaluation,
# export back to BRAT, and pseudonymization of the predicted entities.
#
# All paths below are relative: run this script from the directory that
# contains scripts/, corpus/ and config/.
#
# NOTE(review): there is no `set -e`; a failing step does not stop the
# following ones. Check the output of each step when adapting the script.
#
# We assume the existence of BRAT annotations (*txt and *ann files) in
# train and test directories. In this demonstration, we use the files
# provided in the corpus/clinique/jorf/ directory (annotated documents
# from the French Official Journal).

# Creation of a list in data/ directory of forms, lemma, and POS
# tags, based on lists produced by ABU CNAM, and named
# forme-lemme-pos.tab (to be done only once)
#bash scripts/pre_creeDictionnaire.bash
#perl scripts/pre_releveNgrammes.pl

# Conversion from BRAT to embedded annotations (files *.tag)
# - one argument: path to *{ann,txt} files
perl scripts/zero_alignement.pl corpus/clinique/jorf/train/
perl scripts/zero_alignement.pl corpus/clinique/jorf/test/

# Production of tabular file using the BIO schema for linear chain CRF
# - four arguments: path to embedded annotations files, file extension
#   for those files (tag), name of output tabular file, and type of
#   annotation schema to be used (IO BIO BIO2 BIO2H BWEMO BWEMO+)
# - in addition, a fifth argument may be used to indicate all labels
#   to be kept in the tabular file; e.g., Personne,Ville to only keep
#   annotations of Persons and Towns; if this argument is not used,
#   all existing annotations will be kept
# - annotation schema BIO2 is the schema commonly used (default value)
#   but we achieved better results using the BWEMO+ annotation schema
perl scripts/zero_tabulaire.pl corpus/clinique/jorf/train/ tag tab_train.zero BWEMO+
perl scripts/zero_tabulaire.pl corpus/clinique/jorf/test/ tag tab_test.zero BWEMO+

# Check-up of produced tabular files
perl scripts/pre_verification-tabulaire.pl tab_train.zero
perl scripts/pre_verification-tabulaire.pl tab_test.zero

# Over-training reduction by deletion of unannotated lines when those
# lines are not in a local context of annotated lines (e.g., more than
# 17 lines). The output is a new tabular file with fewer unannotated
# lines, to be used to train the model in the next step. Only for the
# training stage
perl scripts/zero_supprimeO.pl tab_train.zero 17 >tab_reduc.zero

# Statistical model building using the Wapiti tool
wapiti train -t 2 -a rprop- -1 1 -c -p config/config_zero.tpl tab_reduc.zero modele-zero

# Model application on test data
wapiti label -p -m modele-zero tab_test.zero >sortie-zero.wap

# If a lexicon file is provided with mandatory annotations to be made
# (especially in case of regular sensitive data), the previous
# prediction file will be completed with annotations from the lexicon;
# a lexicon may be inferred from file names if those names are composed
# of first names and last names of patients
# NOTE(review): presumably reads ./*.wap and writes the matching *.out
# file (sortie-zero.out is used by the next steps) - confirm in the script
perl scripts/post_lexique.pl -l lexique.tab -r ./ -e wap -s out

# A propagation of annotations already made can be done, but this
# process does not take into account the context (perhaps dangerous)
#perl -CSDA scripts/post_propagation.pl sortie-zero.out >sortie-zero.prop

# Prediction output evaluation (script from the conll challenge); a
# conversion step from BWEMO annotation schema to a more classical
# BIO2 one is mandatory
perl scripts/post_conversion.pl sortie-zero.out
perl scripts/conlleval.pl -d '\t' <sortie-zero.out

# False positive and false negative analysis
#perl scripts/post_differences.pl sortie-zero

# Single annotated files production from output (files *sgml in output/)
perl scripts/crf-output-splitter.pl sortie-zero.out output

# Gather source texts and predictions in brat/ and convert them to BRAT
# format; -p keeps the step quiet when brat/ is left from an earlier run
mkdir -p brat/
cp corpus/clinique/jorf/test/*txt brat/
cp output/*sgml brat/
perl scripts/conversion-brat.pl brat/
#cp brat/*{ann,txt} path/to/brat/data/

# Post-processing steps to pseudonymize texts, based on previously
# identified entities (date shifting in the past, pseudonyms for
# person names, fake phone number, and replacement of other
# predictions by a generic tag)
perl scripts/post_antidatation.pl -r output/ -e sgml
perl scripts/post_pseudonymization.pl -r output/ -e dat

# Cleanup of intermediate files. The globs are spelled out instead of
# using brace expansion so that they also work when the script is run
# through a plain POSIX sh; -f avoids an error when nothing is left to
# delete.
rm -f -- corpus/clinique/jorf/train/*tag corpus/clinique/jorf/test/*tag
rm -f -- output/*dat output/*dat.log
################################################################
#
# To train a model specifically for one category, and to force Wapiti
# decoding using two models, use the following steps:
#
# Prerequisites not produced by this recipe: tab_test.zero (tabular test
# file built by the main pipeline above) and modele-deid (an existing
# Wapiti model).
#

# Rebuild the embedded annotations of the training set, then produce a
# tabular file restricted to the "Personne" label (fifth argument)
perl scripts/zero_alignement.pl corpus/clinique/jorf/train/
perl scripts/zero_tabulaire.pl corpus/clinique/jorf/train/ tag tab_train.zero BWEMO+ Personne

# Reduction of unannotated lines, here with a value of 40 instead of 17
perl scripts/zero_supprimeO.pl tab_train.zero 40 >tab_reduc.zero

# Training of the category-specific model (timed)
time wapiti train -t 2 -a rprop- -1 1 -c -p config/config_zero.tpl tab_reduc.zero modele-Pers

# First pass with the category-specific model; a trailing "O" on each
# output line is rewritten as "NUL".
# NOTE(review): presumably this leaves those tokens open for the second,
# forced pass - confirm against the Wapiti documentation of --force.
# The perl program is single-quoted on purpose: inside double quotes the
# shell would expand $_ itself before perl ever sees it.
wapiti label -p -m modele-Pers tab_test.zero | perl -pe 's/O$/NUL/' >temp

# Second pass: forced decoding with the general model
wapiti label --force -p -m modele-deid temp >temp2

# Keep columns 1 to 14 and column 16 (column 15 is dropped)
cut -f 1-14,16 <temp2 >sortie-zero

# Evaluation (after the mandatory BWEMO to BIO2 conversion) and
# production of single annotated files in output
perl scripts/post_conversion.pl sortie-zero
perl scripts/conlleval.pl -d '\t' <sortie-zero
perl scripts/crf-output-splitter.pl sortie-zero output

# Remove only the two intermediate files created above; a "temp*" glob
# could also delete unrelated files of the working directory
rm -- temp temp2
################################################################
#
# To train a model on new data (in-house data) while reusing the
# existing deid model (in order to combine its existing performance
# with what is learnt from the new dataset), we need to check that
# there are no new labels (no new classes may be used). We assume the
# new annotated data (*txt and *ann files) are in the "input" folder.
#
# Prerequisites not produced by this recipe: modele-deid (the existing
# model) and tab_test.zero (tabular test file built by the main
# pipeline above).
#

# BRAT to embedded annotations, tabular file, reduction of unannotated
# lines: same steps as for the main pipeline, applied to input/
perl scripts/zero_alignement.pl input/
perl scripts/zero_tabulaire.pl input/ tag tab_train.zero BWEMO+
perl scripts/zero_supprimeO.pl tab_train.zero 17 >tab_reduc.zero

# NOTE(review): presumably the label check announced above (no new
# classes compared with the existing model) - confirm in the script
perl scripts/poursuiteEntrainement.pl tab_reduc.zero

# Training on the new data, starting from modele-deid (-m); the
# resulting model is written to modele-inhouse
wapiti train -t 2 -a rprop- -1 1 -c tab_reduc.zero -m modele-deid modele-inhouse

# Application of the new model on the test data
wapiti label -p -m modele-inhouse tab_test.zero >sortie-zero

# The tabular files use the BWEMO+ schema: as in the other recipes of
# this file, the output is converted to BIO2 before it is split and
# evaluated (the conversion is required by the evaluation script)
perl scripts/post_conversion.pl sortie-zero
perl scripts/crf-output-splitter.pl sortie-zero output
perl scripts/conlleval.pl -d '\t' <sortie-zero

# Cleanup of the intermediate tabular files
rm -- tab_train.zero tab_reduc.zero