Skip to content

Commit

Permalink
fix duplicate canonical forms #148
Browse files Browse the repository at this point in the history
(use wn:otherForm for alternate representations)
  • Loading branch information
simongray committed Dec 4, 2024
1 parent d26c10c commit d8c85a3
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 1 deletion.
61 changes: 61 additions & 0 deletions src/main/dk/cst/dannet/db/bootstrap.clj
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,62 @@
(println "... adding" (count triples-to-add) "connotation sentiment triples")
(db/safe-add! g triples-to-add))))

(defn new-reps
  "Splits a label containing slash-separated alternatives into one
  representation per alternative.

  The label is stringified, apostrophes at the very start of the string or
  directly after a space are stripped (apostrophes inside a token are left
  alone), and all double quotes are removed. The first run of slash-separated
  parts (parts contain no spaces or parentheses) is then located in the
  cleaned label. For each part, a copy of the cleaned label with the whole run
  replaced by that part is passed through `da`.

  Returns nil when the cleaned label contains no such slash-separated run."
  [label]
  (let [cleaned     (-> (str label)
                        (str/replace #"(^| )'+" "$1")
                        (str/replace "\"" ""))
        ;; re-find returns [whole-match last-group] here; only the whole match
        ;; is needed, and destructuring nil simply yields nil.
        [slash-run] (re-find #"[^ \(\)]+(/[^ \(\)]+)+" cleaned)]
    (when slash-run
      (for [alternative (str/split slash-run #"/")]
        (da (str/replace cleaned slash-run alternative))))))

(defn fix-canonical-reps!
  "Replaces canonical forms that carry more than one written representation.

  Queries the graph at `prefix/dn-uri` in `dataset` for words whose canonical
  form has at least two distinct ontolex:writtenRep values. For every such
  word whose label yields alternatives via `new-reps`, the first alternative
  becomes the written representation of a new canonical form and each
  remaining alternative that of a new ontolex:otherForm (one word is handled
  as a hard-coded special case). The old canonical form triple and the matched
  written representations are removed in one transaction; the replacement
  triples are then added in a second transaction."
  [dataset]
  (letfn [(canonical-triples [word rep]
            ;; New canonical form node, named after the word.
            (let [form (symbol (str "_form_" (name word)))]
              [[word :ontolex/canonicalForm form]
               [form :ontolex/writtenRep rep]]))

          (other-triples [word n rep]
            ;; New other-form node; `n` keeps the node names distinct.
            (let [form (symbol (str "_form_" (name word) "_" n))]
              [[word :ontolex/otherForm form]
               [form :ontolex/writtenRep rep]]))

          (word-changes [[word matches]]
            ;; The label is taken from the first query match for the word.
            (let [label (get (first matches) '?label)]
              (when-let [reps (new-reps label)]
                {:add    (if (= word :dn/word-51001426) ; m/k'er special case
                           [[:dn/word-51001426 :ontolex/canonicalForm '_mker_form]
                            ['_mker_form :ontolex/writtenRep (da "m/k'er")]]
                           (into (canonical-triples word (first reps))
                                 (mapcat (partial other-triples word)
                                         (range)
                                         (rest reps))))
                 ;; NOTE(review): '_ is presumably treated as a wildcard by
                 ;; db/remove!; if so, the writtenRep patterns below are not
                 ;; restricted to this word's form -- confirm that this is
                 ;; intended.
                 :remove (into [[word :ontolex/canonicalForm '_]]
                               (map (fn [rep] ['_ :ontolex/writtenRep rep]))
                               (set (map '?rep matches)))})))]
    (let [query-graph       (db/get-graph dataset prefix/dn-uri)
          q                 (op/sparql
                              "SELECT ?w ?form ?label ?rep
WHERE {
?w ontolex:canonicalForm ?form .
?form ontolex:writtenRep ?rep .
?form ontolex:writtenRep ?rep2 .
FILTER (?rep != ?rep2) .
?w rdfs:label ?label .
}")
          w->ms             (group-by '?w (q/run query-graph q))
          changes           (map word-changes w->ms)
          ;; Same named graph as above, looked up again for the write step.
          g                 (db/get-graph dataset prefix/dn-uri)
          model             (db/get-model dataset prefix/dn-uri)
          triples-to-add    (mapcat :add changes)
          triples-to-remove (mapcat :remove changes)]
      (txn/transact-exec model
        (println "... removing old form triples:" (count triples-to-remove))
        (run! (fn [triple] (db/remove! model triple)) triples-to-remove))
      (txn/transact-exec g
        (println "... adding" (count triples-to-add) "updated form triples")
        (db/safe-add! g triples-to-add)))))

(h/defn make-release-changes!
"This function tracks all changes made in this release, i.e. deletions and
additions to either of the export datasets.
Expand All @@ -357,6 +413,9 @@

;; ==== The block of changes for this particular release. ====

;; Remove duplicate canonical forms, add other forms instead #148
(fix-canonical-reps! dataset)

;; Rename dns:supersense -> wn:lexfile #146
(db/update-triples! prefix/dn-uri dataset
'[:bgp
Expand Down Expand Up @@ -510,6 +569,8 @@
(existing-sentiment (:graph @dk.cst.dannet.web.resources/db))
(connotation-rows)

(fix-canonical-reps! (:dataset @dk.cst.dannet.web.resources/db))

;; data needed by Bolette
(let [g (:graph @dk.cst.dannet.web.resources/db)
existing (existing-sentiment g)
Expand Down
2 changes: 1 addition & 1 deletion src/main/dk/cst/dannet/query/operation.clj
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@
"SELECT ?form ?word ?synset ?label ?shortLabel ?definition ?ontoType
WHERE {
?form ontolex:writtenRep \"" lemma "\"@da .
?word ontolex:canonicalForm ?form ;
?word ontolex:canonicalForm|ontolex:otherForm ?form ;
ontolex:evokes ?synset .
OPTIONAL {
?synset rdfs:label ?label .
Expand Down
2 changes: 2 additions & 0 deletions src/main/dk/cst/dannet/web/resources.clj
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,8 @@
(update-vals (merge fixed-theme (zipmap rels colors)) deref)))

(comment
(bootstrap/fix-canonical-reps! (:dataset @db))

;; Generate the theme used for e.g. radial diagrams
(generate-synset-rels-theme)

Expand Down

0 comments on commit d8c85a3

Please sign in to comment.