From d2b64ba054c7e2f5b8ad64021169c4b7a8808388 Mon Sep 17 00:00:00 2001 From: Simon Gray Date: Tue, 30 May 2023 16:19:01 +0200 Subject: [PATCH] #3 - add the remaining old English synonym links as ILI references --- .gitignore | 1 + src/main/dk/cst/dannet/bootstrap.clj | 31 ++++++++++++++++++++----- src/main/dk/cst/dannet/web/section.cljc | 11 +++++++-- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 91d777be..e468bea4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ bootstrap/other/sentiment/sense_polarities.tsv # The open English WordNet bootstrap/other/english/english-wordnet-2022.ttl bootstrap/other/english/ili.ttl +bootstrap/other/english/ili-map-pwn20.tab bootstrap/other/english/yaml bootstrap/other/english/senseidx.edn diff --git a/src/main/dk/cst/dannet/bootstrap.clj b/src/main/dk/cst/dannet/bootstrap.clj index 77030fd6..084d2b32 100644 --- a/src/main/dk/cst/dannet/bootstrap.clj +++ b/src/main/dk/cst/dannet/bootstrap.clj @@ -30,6 +30,9 @@ [dk.cst.dannet.query :as q] [dk.cst.dannet.query.operation :as op])) +(declare read-triples) +(declare cor-k-pos) + ;; TODO: missing labels ;; http://localhost:3456/dannet/data/synset-48454 ;; http://localhost:3456/dannet/data/synset-49086 @@ -557,6 +560,17 @@ (def senseidx->english-synset (delay (edn/read-string (slurp "bootstrap/other/english/senseidx.edn")))) +(def wn20-id->ili + (delay (->> (read-triples [identity + "bootstrap/other/english/ili-map-pwn20.tab" + :encoding "UTF-8" + :separator \tab]) + (filter (fn [[_ _ confidence]] + (= confidence "1"))) + (map (fn [[ili-id wn-id _]] + [(str "ENG20-" wn-id) (keyword "ili" ili-id)])) + (into {})))) + (h/defn ->english-link-triples "Convert a `row` from 'relations.csv' to triples. @@ -564,10 +578,12 @@ [[subj-id _ rel obj-id _ _ :as row]] ;; Ignores eq_has_hyponym and eq_has_hyperonym, no equivalent in GWA schema. ;; This loses us 123 of the original 5000l links to the Princton WordNet. 
+ ;; TODO: implement dns relations for this as we apparently use those in the new data too... (when (= "eq_has_synonym" rel) - ;; TODO: need backup for IDs that match e.g. "ENG20-07945291-n" - (when-let [obj (get @senseidx->english-synset obj-id)] - #{[(synset-uri subj-id) :wn/eq_synonym obj]}))) + (if-let [obj (get @senseidx->english-synset obj-id)] + #{[(synset-uri subj-id) :wn/eq_synonym obj]} + (when-let [ili-obj (get @wn20-id->ili obj-id)] + #{[(synset-uri subj-id) :wn/ili ili-obj]})))) ;; TODO: can we create new forms/words/synsets rather than overload writtenRep? (defn explode-written-reps @@ -600,9 +616,6 @@ [s & after] (apply str "\"" s "\"" after)) -(declare read-triples) -(declare cor-k-pos) - (def sense-properties (let [row->kv (fn [[dannetsemid lemma hom pos dn_lemma id gloss]] (let [[_ pos'] (re-matches #"([^\.]+)\.?" pos) @@ -1103,6 +1116,7 @@ :separator \tab :preprocess (comp mark-duplicate-senses rest)] + ;; TODO: publish as a separate dataset? ;; Links to the Open English WordNet :oewn-links [->english-link-triples "bootstrap/dannet/DanNet-2.5.1_csv/relations.csv"]} @@ -1258,6 +1272,11 @@ (->> (read-triples (get-in imports [prefix/dn-uri :relations])) (take 10)) + ;; Example English WordNet or ILI links + (->> (read-triples (get-in imports [prefix/dn-uri :oewn-links])) + (remove nil?) 
+ (take 10)) + ;; Example links to the Open English WordNet (->> (read-triples (get-in imports [prefix/dn-uri :en-links])) (take 10)) diff --git a/src/main/dk/cst/dannet/web/section.cljc b/src/main/dk/cst/dannet/web/section.cljc index 5c7b905e..603296c7 100644 --- a/src/main/dk/cst/dannet/web/section.cljc +++ b/src/main/dk/cst/dannet/web/section.cljc @@ -11,7 +11,9 @@ [[nil [:rdf/type :owl/sameAs :skos/definition + :wn/definition :rdfs/comment + :wn/partOfSpeech :lexinfo/partOfSpeech :lexinfo/senseExample :dns/sentiment @@ -29,10 +31,15 @@ :ontolex/sense :ontolex/isSenseOf :ontolex/lexicalizedSense - :ontolex/isLexicalizedSenseOf]] + :ontolex/isLexicalizedSenseOf + :wn/ili + :wn/eq_synonym]] [#{(->LangStr "Semantic relations" "en") (->LangStr "Betydningsrelationer" "da")} - (some-fn (prefix/with-prefix 'wn :except #{:wn/partOfSpeech}) + (some-fn (prefix/with-prefix 'wn :except #{:wn/partOfSpeech + :wn/definition + :wn/ili + :wn/eq_synonym}) (comp #{:dns/usedFor :dns/usedForObject :dns/nearAntonym