Skip to content

Commit

Permalink
#3 - import open English WordNet
Browse files Browse the repository at this point in the history
- add relevant schemas
- move hash functions to separate ns
- also include hash of prefix data
  • Loading branch information
simongray committed Dec 19, 2022
1 parent ac3a4ab commit 2de5293
Show file tree
Hide file tree
Showing 8 changed files with 1,521 additions and 126 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ bootstrap/other/cor/ddo_bet_corextlink.csv
# File sent by Sanni in email on 2022-05-23 (renamed).
bootstrap/other/sentiment/sense_polarities.tsv

# The open English WordNet
bootstrap/other/english/english-wordnet-2021.ttl

.cpcache
.DS_Store

Expand All @@ -30,4 +33,4 @@ resources/public/js/compiled
out/
target/
dannet.jar
db/
/db/
832 changes: 832 additions & 0 deletions resources/schemas/external/lime.xml

Large diffs are not rendered by default.

513 changes: 513 additions & 0 deletions resources/schemas/external/synsem.xml

Large diffs are not rendered by default.

75 changes: 11 additions & 64 deletions src/main/dk/cst/dannet/bootstrap.clj
Original file line number Diff line number Diff line change
Expand Up @@ -15,70 +15,17 @@
(:require [clojure.set :as set]
[clojure.java.io :as io]
[clojure.string :as str]
[clojure.walk :as walk]
[clojure.data.csv :as csv]
[ont-app.vocabulary.lstr :refer [->LangStr]]
[better-cond.core :as better]
[dk.cst.dannet.hash :as h]
[dk.cst.dannet.web.components :as com]
[dk.cst.dannet.prefix :as prefix])
(:import [java.util Date]))

;; TODO: sense mapping seems wrong http://localhost:3456/dannet/external/cor/COR.30123
;; TODO: weird? http://localhost:3456/dannet/data/synset-47363

;; Via jpmonettas: https://clojurians.slack.com/archives/C03S1KBA2/p1670838328124429
;; Copy-pasted from: https://github.com/jpmonettas/hansel/blob/master/src/hansel/instrument/forms.clj#L829-L865
(defn normalize-gensyms
  "Replace the reader-generated parameter symbols of #(...) literals in `form`
  with stable names and return the rewritten form.

  The reader numbers these symbols using a global id, so the same source text
  reads differently every time, e.g.

    (fn* [p1__37935#] (+ p1__37935# p1__37935#))
    (fn* [p1__37939#] (+ p1__37939# p1__37939#))

  Every distinct generated symbol is renamed to p__N, where N counts up from 0
  in the order the symbols are first visited by the walk, e.g.

    (fn* [p__0] (+ p__0 p__0))

  Positional parameters of any index (read as pN__ID#) as well as the rest
  parameter %& (read as rest__ID#) are normalized.
  Useful for generating stable form hashes."
  [form]
  (let [psym->id (atom {})
        ;; Matches pN__ID# for any N, and rest__ID# (the %& parameter).
        gensym?  (fn [x]
                   (and (symbol? x)
                        (re-matches #"^(?:p\d+|rest)__\d+#$" (name x))))
        normal   (fn [psym]
                   ;; Ids are handed out contiguously from 0, so the size of
                   ;; the map is always the next free id. Lookup and insert
                   ;; happen in a single swap!.
                   (let [ids (swap! psym->id
                                    (fn [m]
                                      (if (contains? m psym)
                                        m
                                        (assoc m psym (count m)))))]
                     (symbol (str "p__" (get ids psym)))))]
    (walk/postwalk
      (fn [x]
        (if (gensym? x)
          (normal x)
          x))
      form)))

(defn hash-form
  "Hash the sequential `form` coll such that the result is the same across
  restarts: reader-generated symbols are normalized first, then every
  top-level element is stringified and the resulting vector is hashed."
  [form]
  (->> (normalize-gensyms form)
       (mapv str)
       (hash)))

(defmacro def-hashed
  "Like a regular def, but additionally hashes the quoted argument forms and
  stores the result under :hash in the metadata of the defined var."
  [& [sym :as decl]]
  `(do
     (def ~@decl)
     (alter-meta! (var ~sym) assoc :hash (hash-form '~decl))))

(defmacro defn-hashed
  "Like a regular defn, but additionally hashes the quoted argument forms and
  stores the result under :hash in the metadata of the defined var."
  [& [sym :as decl]]
  `(do
     (defn ~@decl)
     (alter-meta! (var ~sym) assoc :hash (hash-form '~decl))))

(defn da
  "Wrap `s` in a LangStr carrying the language tag da."
  [s]
  (->LangStr s "da"))
Expand All @@ -102,7 +49,7 @@
"The RDF resource URI for the DanNet/EuroWordNet concepts."
(prefix/prefix->rdf-resource 'dnc))

(def-hashed metadata-triples
(h/def metadata-triples
"Metadata for the DanNet dataset is defined here since it doesn't have a
associated .ttl file. The Dublin Core Terms NS is used below which supersedes
the older DC namespace (see: https://www.dublincore.org/schemas/rdfs/ )."
Expand Down Expand Up @@ -274,7 +221,7 @@
token
(recur tokens))))))

(defn-hashed examples
(h/defn examples
"Convert a `row` from 'synsets.csv' to example key-value pairs."
[[synset-id label gloss _ :as row]]
(when-let [[_ example-str] (re-find brug gloss)]
Expand Down Expand Up @@ -406,7 +353,7 @@
(->> (clean-ontological-type "LanguageRepresentation+Artifact+Object")
(explode-ontological-type synset)))))

(defn-hashed ->synset-triples
(h/defn ->synset-triples
"Convert a `row` from 'synsets.csv' to triples."
[[synset-id label gloss ontological-type :as row]]
(if (= synset-id "8715")
Expand Down Expand Up @@ -441,7 +388,7 @@
[inherit :dns/inheritedFrom from]
[inherit :dns/inheritedRelation rel]})))

(defn-hashed ->relation-triples
(h/defn ->relation-triples
"Convert a `row` from 'relations.csv' to triples.
Note: certain rows are unmapped, so the relation will remain a string!"
Expand Down Expand Up @@ -490,7 +437,7 @@
(apply str "\"" s "\"" after))

;; TODO: investigate semantics of ' in input forms of multiword expressions
(defn-hashed ->word-triples
(h/defn ->word-triples
"Convert a `row` from 'words.csv' to triples."
[[word-id form pos _ :as row]]
(when (and (= (count row) 4)
Expand Down Expand Up @@ -537,7 +484,7 @@
(re-find #"slang" register)
(conj [sense :lexinfo/register :lexinfo/slangRegister]))))

(defn-hashed ->sense-triples
(h/defn ->sense-triples
"Convert a `row` from 'wordsenses.csv' to triples."
[[sense-id word-id synset-id register _ :as row]]
(when (and (= (count row) 5)
Expand Down Expand Up @@ -573,7 +520,7 @@
(get #{"n" "nx" "nxx"} pol) :marl/Negative
:else :marl/Neutral))

(defn-hashed ->sentiment-triples
(h/defn ->sentiment-triples
"Convert a `row` from 'sense_polarities.tsv' to Opinion triples.
In ~2000 cases a sense-id will be missing (it has the same ID as the word-id).
Expand Down Expand Up @@ -632,7 +579,7 @@
"kolon" nil})

;; http://dsn.dk/sprogets-udvikling/sprogteknologi-og-fagsprog/cor#
(defn-hashed ->cor-k-triples
(h/defn ->cor-k-triples
"Convert a `row` from the COR-K ID file to triples; assumes that the
rows have been preprocessed by 'preprocess-cor-k' beforehand.
Expand Down Expand Up @@ -672,11 +619,11 @@
rep-id
(conj [lexical-form :rdfs/seeAlso full]))))

(defn-hashed ->cor-ext-triples
(h/defn ->cor-ext-triples
[[id lemma comment _ _ _ grammar form :as row]]
(->cor-k-triples (with-meta [id lemma comment grammar form] (meta row))))

(defn-hashed ->cor-link-triples
(h/defn ->cor-link-triples
[[id word-id sense-id :as row]]
(let [[_ cor-ns lemma-id _ _] (re-matches cor-id id)
cor-word (cor-uri cor-ns lemma-id)
Expand Down
29 changes: 24 additions & 5 deletions src/main/dk/cst/dannet/db.clj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
[dk.cst.dannet.db.csv :as db.csv]
[dk.cst.dannet.prefix :as prefix]
[dk.cst.dannet.web.components :as com]
[dk.cst.dannet.bootstrap :as bootstrap :refer [defn-hashed]]
[dk.cst.dannet.bootstrap :as bootstrap]
[dk.cst.dannet.hash :as h]
[dk.cst.dannet.query :as q]
[dk.cst.dannet.query.operation :as op]
[dk.cst.dannet.transaction :as txn])
Expand All @@ -38,9 +39,15 @@
(->> (for [{:keys [alt uri export]} (vals prefix/schemas)]
(when-not export
(if alt
(if (or (str/starts-with? alt "http://")
(str/starts-with? alt "https://"))
(cond
(= alt :no-schema)
nil

(or (str/starts-with? alt "http://")
(str/starts-with? alt "https://"))
alt

:else
(io/resource alt))
uri)))
(filter some?)))
Expand Down Expand Up @@ -186,7 +193,7 @@
[^Dataset dataset ^String model-uri]
(.getGraph (get-model dataset model-uri)))

(defn-hashed add-bootstrap-import!
(h/defn add-bootstrap-import!
"Add the `bootstrap-imports` of the old DanNet CSV files to a Jena `dataset`."
[dataset bootstrap-imports]
(let [{:keys [examples]} (get bootstrap-imports prefix/dn-uri)
Expand Down Expand Up @@ -286,6 +293,15 @@

dataset))

(h/defn add-open-english-wordnet!
  "Read the Open English WordNet .ttl file into the https://en-word.net/ graph
  of a Jena `dataset`, printing a progress message before and after."
  [dataset]
  (let [graph-uri "https://en-word.net/"
        ttl-path  "bootstrap/other/english/english-wordnet-2021.ttl"]
    (println "Importing Open English Wordnet...")
    ;; The graph is looked up inside the transaction, as in the read itself.
    (txn/transact-exec dataset
      (aristotle/read (get-graph dataset graph-uri) ttl-path))
    (println "Open English Wordnet imported!")))

(defn ->dataset
"Get a Dataset object of the given `db-type`. TDB also requires a `db-path`."
[db-type & [db-path]]
Expand Down Expand Up @@ -336,7 +352,9 @@
:or {db-type :in-mem} :as opts}]
(let [files (bootstrap-files bootstrap-imports)
fn-hashes (conj bootstrap/hashes
(:hash (meta #'add-bootstrap-import!)))
(:hash (meta #'add-bootstrap-import!))
(:hash (meta #'add-open-english-wordnet!))
(hash prefix/schemas))
;; Undo potentially negative number by bit-shifting.
files-hash (pos-hash files)
bootstrap-hash (pos-hash fn-hashes)
Expand All @@ -354,6 +372,7 @@
(do
(println "Data input has changed -- rebuilding database...")
(add-bootstrap-import! dataset bootstrap-imports)
(add-open-english-wordnet! dataset)
(println new-entry)
(spit log-path (str new-entry "\n----\n") :append true)))
(println "WARNING: no imports!"))
Expand Down
64 changes: 64 additions & 0 deletions src/main/dk/cst/dannet/hash.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
(ns dk.cst.dannet.hash
"Functions for hashing program data; used to invoke database rebuilds.
The :hash key of the metadata attached to any hashed def/defn forms can be
checked at runtime. If this hash differs from the last recorded hash, the form
must have changed in some way.
The key functions and data in DanNet have been decorated with hashes.
These hashes are checked when instantiating an instance of a DanNet database."
(:require [clojure.walk :as walk])
(:refer-clojure :exclude [defn]))

;; Via jpmonettas: https://clojurians.slack.com/archives/C03S1KBA2/p1670838328124429
;; Copy-pasted from: https://github.com/jpmonettas/hansel/blob/master/src/hansel/instrument/forms.clj#L829-L865
(clojure.core/defn normalize-gensyms
  "Replace the reader-generated parameter symbols of #(...) literals in `form`
  with stable names and return the rewritten form.

  The reader numbers these symbols using a global id, so the same source text
  reads differently every time, e.g.

    (fn* [p1__37935#] (+ p1__37935# p1__37935#))
    (fn* [p1__37939#] (+ p1__37939# p1__37939#))

  Every distinct generated symbol is renamed to p__N, where N counts up from 0
  in the order the symbols are first visited by the walk, e.g.

    (fn* [p__0] (+ p__0 p__0))

  Positional parameters of any index (read as pN__ID#) as well as the rest
  parameter %& (read as rest__ID#) are normalized.
  Useful for generating stable form hashes."
  [form]
  (let [psym->id (atom {})
        ;; Matches pN__ID# for any N, and rest__ID# (the %& parameter).
        gensym?  (fn [x]
                   (and (symbol? x)
                        (re-matches #"^(?:p\d+|rest)__\d+#$" (name x))))
        normal   (fn [psym]
                   ;; Ids are handed out contiguously from 0, so the size of
                   ;; the map is always the next free id. Lookup and insert
                   ;; happen in a single swap!.
                   (let [ids (swap! psym->id
                                    (fn [m]
                                      (if (contains? m psym)
                                        m
                                        (assoc m psym (count m)))))]
                     (symbol (str "p__" (get ids psym)))))]
    (walk/postwalk
      (fn [x]
        (if (gensym? x)
          (normal x)
          x))
      form)))

(clojure.core/defn hash-form
  "Hash the sequential `form` coll such that the result is the same across
  restarts: reader-generated symbols are normalized first, then every
  top-level element is stringified and the resulting vector is hashed."
  [form]
  (->> (normalize-gensyms form)
       (mapv str)
       (hash)))

(defmacro def
  "Like a regular def, but additionally hashes the quoted argument forms and
  stores the result under :hash in the metadata of the defined var."
  [& [sym :as decl]]
  `(do
     ;; def is a special form, so syntax-quote leaves it unqualified here.
     (def ~@decl)
     (alter-meta! (var ~sym) assoc :hash (hash-form '~decl))))

(defmacro defn
  "Like a regular defn, but additionally hashes the quoted argument forms and
  stores the result under :hash in the metadata of the defined var."
  [& [sym :as decl]]
  `(do
     ;; Fully qualified, since this namespace excludes clojure.core/defn and
     ;; a bare defn would resolve to this macro itself.
     (clojure.core/defn ~@decl)
     (alter-meta! (var ~sym) assoc :hash (hash-form '~decl))))
Loading

0 comments on commit 2de5293

Please sign in to comment.