# cldfbench_barrier-islands-mentawai-wlist1853.py
# This code looks messy and contains a lot of commented-out code because:
# - this is my first time experimenting with Python + cldfbench,
# - so I worked by trial and error, adapting and fixing processing code from different examples in the Lexibank repository.
# At the moment, the active code works for me, provided the raw data and etc/ files follow the same structure (column names, capitalisation, etc.).
# I intend to write more detailed documentation of how I got this workflow running (including the required data structures).
import pathlib
import re
# import json
import attr
from clldutils.misc import slug
from pylexibank import Language, FormSpec, Concept, Lexeme
from pylexibank.dataset import Dataset as BaseDataset
# from pylexibank.util import progressbar


@attr.s
class CustomLanguage(Language):
    Sources = attr.ib(default=None)


# Adds a custom column to forms.csv (following Barlow, Russell & Don Killian. 2023.
# CLDF dataset derived from Barlow and Killian’s "Tomoip Wordlist" from 2023.
# Zenodo. https://doi.org/10.5281/zenodo.8437515).
@attr.s
class CustomLexeme(Lexeme):
    CommonTranscription = attr.ib(default=None)


@attr.s
class CustomConcept(Concept):
    Number = attr.ib(default=None)


def cln(word):
    """Strip annotation marks (†, digits, ., *, ?, ~) from a raw word."""
    return re.sub(r"[†\d.*?~]", "", word)
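# A quick illustration of cln() on a made-up input (not a form from the data):
#   cln("†bagi2.*") -> "bagi"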


class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "barrier-islands-mentawai-wlist1853"
    language_class = CustomLanguage
    lexeme_class = CustomLexeme  # adds the CommonTranscription column to forms.csv (see CustomLexeme above)
    # form_spec = FormSpec(missing_data=["∅", "#", "NA", 'XX', '*#', '<NA>'], normalize_unicode="NFC")
    # The separators below work by default when args.writer.add_forms_from_value()
    # is used (i.e., the Form is split automatically on the separators specified
    # here). However, the separators do not seem to split multi-form cells when
    # args.writer.add_form_with_segments() is used.
    form_spec = FormSpec(normalize_unicode="NFC", separators=",")
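    # A sketch of the splitting behaviour described above, with a made-up value:
    # given Value="abak, kalapa" and separators=",", add_forms_from_value() should
    # yield two forms, "abak" and "kalapa", rather than one.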
    concept_class = CustomConcept

    # def cldf_specs(self):  # A dataset must declare all CLDF sets it creates.
    #     from cldfbench import CLDFSpec
    #     return CLDFSpec(dir=self.cldf_dir, module='Wordlist')
    def cmd_download(self, args):
        """
        Download files to the raw/ directory. You can use helper methods of `self.raw_dir`, e.g.

        >>> self.raw_dir.download(url, fname)
        """
        pass
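        # A sketch of what an actual download step could look like here; the URL
        # is a placeholder, not the real source of the raw data:
        #   self.raw_dir.download("https://example.org/mentawai1853.tsv", "mentawai1853.tsv")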
    # An earlier draft kept for reference:
    # from pathlib import Path
    # import pylexibank
    # from clldutils.misc import slug
    #
    # class Dataset(pylexibank.Dataset):
    #     dir = Path(__file__).parent
    #     id = "barrier-islands-mentawai-wlist1853"
    #
    #     def cmd_makecldf(self, args):
    #         args.writer.add_sources()
    #         language_lookup = args.writer.add_languages(lookup_factory="Name")
    #         concept_lookup = args.writer.add_concepts(
    #             id_factory=lambda x: x.number + "_" + slug(x.english), lookup_factory="Name"
    #         )
    #         for entry in pylexibank.progressbar(
    #             self.raw_dir.read_csv("mentawai1853.tsv", delimiter="\t", dicts=True)
    #         ):
    #             args.writer.add_forms_from_value(
    #                 Language_ID=language_lookup[entry["Source"]],
    #                 Parameter_ID=concept_lookup[entry["English"]],
    #                 Value=entry["Mentawai"],
    #                 Source=["VonRosenberg1853"],
    #             )
    def cmd_makecldf(self, args):
        args.writer.add_sources()
        args.log.info("added sources")
        # add languages; the lookup key is a (Name, Sources) tuple so that a
        # language -> source map can be derived below
        languages = args.writer.add_languages(
            id_factory=lambda l: l["Name"], lookup_factory=lambda l: (l["Name"], l["Sources"])
        )
        sources = {k[0]: k[1] for k in languages}
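        # e.g. a row in etc/languages.tsv with Name "Mentawai1853" and Sources
        # "VonRosenberg1853" (illustrative values, not necessarily the actual rows)
        # would give sources == {"Mentawai1853": "VonRosenberg1853"}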
        # add concepts
        concepts = {}
        for i, concept in enumerate(self.concepts):
            idx = str(i + 1) + "_" + slug(concept["Gloss"])
            args.writer.add_concept(
                ID=idx,
                Name=concept["Gloss"],
                Number=concept["Number"],
                Concepticon_ID=concept["Concepticon_ID"],
                Concepticon_Gloss=concept["Concepticon_Gloss"]
            )
            concepts[concept["Gloss"], concept["Concepticon_ID"]] = idx
            # concepts[concept["Concepticon_ID"]] = idx
        args.log.info("added concepts")
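        # Illustration of the lookup key with made-up values: if the first concept
        # in etc/concepts.tsv had Gloss "hand" and Concepticon_ID "1277", then
        # concepts[("hand", "1277")] == "1_hand".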
        # for row in progressbar(self.raw_dir.read_csv("mentawai1853.tsv", dicts=True, delimiter="\t")):
        #     args.writer.add_forms_from_value(
        #         Local_ID=row["ID"],
        #         Language_ID=row["Doculect"],
        #         # Parameter_ID=idx[row],
        #         Parameter_ID=concepts[cln(row["English"]), cln(row["Gloss"])],
        #         Value=row["Commons"]
        #         # Source=sources[row["Doculect"]],
        #     )
        for idx, row in enumerate(self.raw_dir.read_csv(
                "mentawai1853.tsv", delimiter="\t", dicts=True)):
            # args.writer.add_form_with_segments() is used to include the Segments
            # info from the raw data (that function also requires the Form variable
            # to be stated explicitly); args.writer.add_forms_from_value() is used
            # when the Segments variable can be ignored.
            args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=row["Doculect"],
                # For the Parameter_ID lookup below to work, the CONCEPTICON_ID in
                # the raw/main data must match the one in the concepts.tsv data
                # exactly. That is, we cannot have a CONCEPTICON_ID of "NA" in the
                # raw/main data but an empty cell in concepts.tsv; that mismatch
                # throws a KeyError such as `KeyError: ('this and that', 'NA')`,
                # because the key ('this and that', 'NA') found in the raw data
                # differs from ('this and that', '') in the concepts dictionary.
                Parameter_ID=concepts[row["English"], row["CONCEPTICON_ID"]],
                CommonTranscription=row["CommonsNotSegmented"],  # the non-tokenised common transcription
                Value=row["Mentawai"],  # the original transcription
                # When args.writer.add_form_with_segments() is used instead, specify
                # these explicitly:
                #   Value=row["Mentawai"],
                #   Form=row["CommonsNotSegmented"],
                #   Segments=list(row["IPA"]),  # Segments needs a list, not a string, hence list()
                Source="VonRosenberg1853")
    # A commented-out draft that also assigns cognates:
    # def cmd_makecldf(self, args):
    #     # add bib
    #     args.writer.add_sources()
    #     args.log.info("added sources")
    #     # read in data
    #     data = self.raw_dir.read_csv(
    #         "mentawai1853.tsv", delimiter="\t",
    #     )
    #     header = data[0]
    #     header[0] = "Gloss"
    #     cognates = {}
    #     cogidx = 1
    #     for i in range(2, len(data), 2):
    #         words = dict(zip(header, data[i]))
    #         cogs = dict(zip(header, data[i + 1]))
    #         concept = data[i][0]
    #         for language in languages:
    #             entry = words.get(language).strip()
    #             cog = cogs.get(language).strip()
    #             if entry.replace('#', '').strip():
    #                 if concept + '-' + cog not in cognates:
    #                     cognates[concept + '-' + cog] = cogidx
    #                     cogidx += 1
    #                 cogid = cognates[concept + '-' + cog]
    #                 for lex in args.writer.add_forms_from_value(
    #                     Language_ID=language,
    #                     Parameter_ID=concepts[concept],
    #                     Value=entry,
    #                     Source=sources[language],
    #                     Cognacy=cogid
    #                 ):
    #                     args.writer.add_cognate(
    #                         lexeme=lex,
    #                         Cognateset_ID=cogid,
    #                         Source="VonRosenberg1853"
    #                     )
# Another earlier draft kept for reference:
# from pathlib import Path
#
# import attr
# from clldutils.misc import slug
# from pylexibank import Language, FormSpec
# from pylexibank.dataset import Dataset as BaseDataset
# from pylexibank.util import progressbar
#
#
# @attr.s
# class CustomLanguage(Language):
#     Source = attr.ib(default=None)
#
#
# class Dataset(BaseDataset):
#     dir = Path(__file__).parent
#     id = "barrier-islands-mentawai-wlist1853"
#     language_class = CustomLanguage
#     form_spec = FormSpec(separators=",")
#
#     def cmd_download(self, args):
#         pass
#
#     def cmd_makecldf(self, args):
#         args.writer.add_sources()
#         # add concepts
#         concepts = {}
#         for concept in self.concepts:
#             idx = concept["NUMBER"] + "_" + slug(concept["ENGLISH"])
#             concepts[concept["ENGLISH"]] = idx
#             args.writer.add_concept(
#                 ID=idx,
#                 Name=concept["ENGLISH"],
#                 Concepticon_ID=concept["CONCEPTICON_ID"],
#                 Concepticon_Gloss=concept["CONCEPTICON_GLOSS"],
#             )
#         languages = args.writer.add_languages(
#             id_factory=lambda l: l["Name"], lookup_factory=lambda l: (l["Name"], l["Source"])
#         )
#         sources = {k[0]: k[1] for k in languages}  # language: source map
#         for row in progressbar(self.raw_dir.read_csv("mentawai1853.tsv", dicts=True, delimiter="\t")):
#             args.writer.add_forms_from_value(
#                 Local_ID=row["ID"],
#                 Language_ID=row["Source"],
#                 Parameter_ID=concepts[row["English"]],
#                 Value=row["Mentawai"],
#                 Source=sources[row["Source"]],
#             )