Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix uncertain insertion #10

Merged
merged 6 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject clj-hgvs "0.4.7"
(defproject clj-hgvs "0.5.0-SNAPSHOT"
:description "Clojure(Script) library for handling HGVS"
:url "https://github.com/chrovis/clj-hgvs"
:license {:name "Apache License, Version 2.0"
Expand Down
132 changes: 94 additions & 38 deletions src/clj_hgvs/mutation.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@
;;; g.122_123ins123_234inv (TODO)
;;; g.122_123ins213_234invinsAins123_211inv (TODO)
;;; g.549_550insN
;;; g.1134_1135ins(100)
;;; g.1134_1135insN[100]
;;; g.?_?insNC_000023.10:(12345_23456)_(34567_45678)

(defrecord DNAInsertion [coord-start coord-end alt]
Expand All @@ -445,10 +445,10 @@
(string? alt) (case ins-format
:auto (if (and (every? #(= % \N) alt)
(>= (count alt) 10))
(str "(" (count alt) ")")
(str "N[" (count alt) "]")
alt)
:bases alt
:count (str "(" (count alt) ")"))
:count (str "N[" (count alt) "]"))
(map? alt) [(:transcript alt)
":"
(coord/format (:coord-start alt))
Expand Down Expand Up @@ -478,7 +478,7 @@
(defn- parse-dna-insertion-alt
[s kind]
(or (re-matches #"[A-Z]+" s)
(some-> (re-matches #"\((\d+)\)" s)
(some-> (re-matches #"N\[(\d+)\]" s)
(second)
(intl/parse-long)
(repeat "N")
Expand Down Expand Up @@ -624,20 +624,28 @@
;;; e.g. g.6775delinsGA
;;; g.6775delTinsGA
;;; c.145_147delinsTGG
;;; c.145_147delinsN[10]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although only the reference page for DNA deletion-insertion has an example of an uncertain insertion, in theory RNA deletion-insertions and protein deletion-insertions can also have uncertain insertions. Mutalyzer supports delinsn, e.g. https://mutalyzer.nl/normalizer/NM_004006.2:r.2623_2803delinsn. Implementing the uncertain insertion only for DNA deletion-insertion seems incomplete 🤔


(defrecord DNAIndel [coord-start coord-end ref alt]
Mutation
(format [this] (format this nil))
(format [this {:keys [show-bases?] :or {show-bases? false}}]
(format [this {:keys [show-bases? ins-format] :or {show-bases? false ins-format :auto}}]
(apply str (flatten [(coord/format coord-start)
(if (and coord-end
(or (not (coord/comparable-coordinates? coord-start coord-end))
(neg? (compare coord-start coord-end))))
(when (and coord-end
(or (not (coord/comparable-coordinates? coord-start coord-end))
(neg? (compare coord-start coord-end))))
["_" (coord/format coord-end)])
"del"
(if show-bases? ref)
(when show-bases? ref)
"ins"
alt])))
(when (string? alt)
(case ins-format
:auto (if (and (every? #(= % \N) alt)
(>= (count alt) 10))
(str "N[" (count alt) "]")
alt)
:bases alt
:count (str "N[" (count alt) "]")))])))
(plain [this]
(into {:mutation "dna-indel"} (plain-coords this)))

Expand All @@ -663,6 +671,15 @@
:clj-hgvs.mutation.dna-indel/ref
:clj-hgvs.mutation.dna-indel/alt])))

(defn- parse-dna-indel-alt
  "Parses the inserted part of a DNA deletion-insertion string.

  When s consists solely of upper-case letters it is returned unchanged.
  When s is the letter N followed by a bracketed decimal count, the count is
  read with intl/parse-long and a string of that many N characters is
  returned. Any other input yields nil."
  [s]
  (if-let [bases (re-matches #"[A-Z]+" s)]
    bases
    ;; Counted form: expand the bracketed number into a run of "N".
    (when-let [[_ digits] (re-matches #"N\[(\d+)\]" s)]
      (when-let [cnt (intl/parse-long digits)]
        (apply str (repeat cnt "N"))))))

(defn dna-indel
"Constructor of DNAIndel. Throws an exception if any input is illegal."
[coord-start coord-end ref alt]
Expand All @@ -673,13 +690,16 @@
(DNAIndel. coord-start coord-end ref alt))

(def ^:private dna-indel-re
#"([\d\-\+\*\?]+)(?:_([\d\-\+\*\?]+))?del([A-Z]+)?ins([A-Z]+)")
#"([\d\-\+\*\?]+)(?:_([\d\-\+\*\?]+))?del([A-Z]+)?ins(N\[\d+\]|[A-Z]+)")

(defn parse-dna-indel
[s kind]
(let [[_ coord-s coord-e ref alt] (re-matches dna-indel-re s)
parse-coord (coord-parser kind)]
(dna-indel (parse-coord coord-s) (some-> coord-e parse-coord) ref alt)))
(dna-indel (parse-coord coord-s)
(some-> coord-e parse-coord)
ref
(parse-dna-indel-alt alt))))

(defmethod restore "dna-indel"
[m]
Expand Down Expand Up @@ -1036,20 +1056,25 @@
;;; RNA - insertion
;;;
;;; e.g. r.756_757insacu
;;; r.431_432ins(5)
;;; r.431_432insn[5]
;;; r.123_124insL37425.1:23_361

(defrecord RNAInsertion [coord-start coord-end alt]
Mutation
(format [this] (format this nil))
(format [this _]
(format [this {:keys [ins-format] :or {ins-format :auto}}]
(str (coord/format coord-start)
"_"
(coord/format coord-end)
"ins"
(cond
(map? alt) (str (:genbank alt) ":" (:coord-start alt) "_" (:coord-end alt))
(re-matches #"n{2,}" alt) (str "(" (count alt) ")")
(re-matches #"n{2,}" alt) (case ins-format
:auto (if (>= (count alt) 10)
(str "n[" (count alt) "]")
alt)
:bases alt
:count (str "n[" (count alt) "]"))
:else alt)))
(plain [this]
(into {:mutation "rna-insertion"} (plain-coords this))))
Expand All @@ -1074,9 +1099,9 @@

(defn- parse-rna-alt-n
[s]
(if-let [n (some-> (re-find #"\((\d)\)" s)
second
intl/parse-long)]
(when-let [n (some-> (re-find #"n\[(\d+)\]" s)
second
intl/parse-long)]
(apply str (repeat n "n"))))

(def ^:private genbank-re
Expand All @@ -1098,8 +1123,8 @@
(rna-insertion (coord/parse-rna-coordinate coord-s)
(some-> coord-e coord/parse-rna-coordinate)
(cond
(re-find #"n\[\d+\]" alt) (parse-rna-alt-n alt)
(re-find #"[a-z]+" alt) alt
(re-find #"\(\d\)" alt) (parse-rna-alt-n alt)
:else (parse-rna-alt-genbank alt)))))

(defmethod restore "rna-insertion"
Expand Down Expand Up @@ -1218,20 +1243,28 @@
;;; e.g. r.775delinsga
;;; r.775deluinsga
;;; r.775_777delinsc
;;; r.775_777delinsn[10]

(defrecord RNAIndel [coord-start coord-end ref alt]
Mutation
(format [this] (format this nil))
(format [this {:keys [show-bases?] :or {show-bases? false}}]
(format [this {:keys [show-bases? ins-format] :or {show-bases? false ins-format :auto}}]
(str (coord/format coord-start)
(if (and coord-end
(or (not (coord/comparable-coordinates? coord-start coord-end))
(neg? (compare coord-start coord-end))))
(when (and coord-end
(or (not (coord/comparable-coordinates? coord-start coord-end))
(neg? (compare coord-start coord-end))))
(str "_" (coord/format coord-end)))
"del"
(if show-bases? ref)
(when show-bases? ref)
"ins"
alt))
(if (re-matches #"n{2,}" alt)
(case ins-format
:auto (if (>= (count alt) 10)
(str "n[" (count alt) "]")
alt)
:bases alt
:count (str "n[" (count alt) "]"))
alt)))
(plain [this]
(into {:mutation "rna-indel"} (plain-coords this)))

Expand Down Expand Up @@ -1267,15 +1300,17 @@
(RNAIndel. coord-start coord-end ref alt))

(def ^:private rna-indel-re
#"([\d\-\+\*]+)(?:_([\d\-\+\*]+))?del([a-z]+)?ins([a-z]+)")
#"([\d\-\+\*]+)(?:_([\d\-\+\*]+))?del([a-z]+)?ins([a-z\d\[\]]+)")

(defn parse-rna-indel
[s]
(let [[_ coord-s coord-e ref alt] (re-matches rna-indel-re s)]
(rna-indel (coord/parse-rna-coordinate coord-s)
(some-> coord-e coord/parse-rna-coordinate)
ref
alt)))
(if (re-find #"n\[\d+\]" alt)
(parse-rna-alt-n alt)
alt))))

(defmethod restore "rna-indel"
[m]
Expand Down Expand Up @@ -1724,12 +1759,12 @@
;;; Protein - insertion
;;;
;;; e.g. Lys23_Leu24insArgSerGln
;;; Arg78_Gly79ins23
;;; Arg78_Gly79insX[23]

(defrecord ProteinInsertion [ref-start coord-start ref-end coord-end alts]
Mutation
(format [this] (format this nil))
(format [this {:keys [amino-acid-format] :or {amino-acid-format :long}}]
(format [this {:keys [amino-acid-format ins-format] :or {amino-acid-format :long ins-format :auto}}]
(apply str (flatten [(cond-> ref-start
(= amino-acid-format :short) ->short-amino-acid)
(coord/format coord-start)
Expand All @@ -1739,7 +1774,14 @@
(coord/format coord-end)
"ins"
(if (every? #(= % "Xaa") alts)
(count alts)
(let [alts (cond->> alts
(= amino-acid-format :short) (map ->short-amino-acid))]
(case ins-format
:auto (if (>= (count alts) 10)
(str "X[" (count alts) "]")
alts)
:amino-acids alts
:count (str "X[" (count alts) "]")))
(cond->> alts
(= amino-acid-format :short) (map ->short-amino-acid)))])))
(plain [this]
Expand Down Expand Up @@ -1770,10 +1812,14 @@
[s]
(condp re-matches s
#"([A-Z*]([a-z]{2})?)+" (mapv ->long-amino-acid (re-seq #"[A-Z*](?:[a-z]{2})?" s))
#"\d+" (vec (repeat (intl/parse-long s) "Xaa"))))
#"X\[\d+\]" (-> (re-find #"X\[(\d+)\]" s)
second
intl/parse-long
(repeat "Xaa")
vec)))

(def ^:private protein-insertion-re
#"([A-Z](?:[a-z]{2})?)(\d+)_([A-Z](?:[a-z]{2})?)(\d+)ins([\da-zA-Z*]+)")
#"([A-Z](?:[a-z]{2})?)(\d+)_([A-Z](?:[a-z]{2})?)(\d+)ins([\da-zA-Z*\[\]]+)")

(defn parse-protein-insertion
[s]
Expand All @@ -1793,22 +1839,32 @@
;;;
;;; e.g. Cys28delinsTrpVal
;;; Cys28_Lys29delinsTrp
;;; Cys28_Lys29delinsX[10]

(defrecord ProteinIndel [ref-start coord-start ref-end coord-end alts]
Mutation
(format [this] (format this nil))
(format [this {:keys [amino-acid-format] :or {amino-acid-format :long}}]
(format [this {:keys [amino-acid-format ins-format] :or {amino-acid-format :long ins-format :auto}}]
(apply str (flatten [(cond-> ref-start
(= amino-acid-format :short) ->short-amino-acid)
(coord/format coord-start)
(if (should-show-end? ref-start coord-start ref-end coord-end)
(when (should-show-end? ref-start coord-start ref-end coord-end)
["_"
(cond-> ref-end
(= amino-acid-format :short) ->short-amino-acid)
(coord/format coord-end)])
"delins"
(cond->> alts
(= amino-acid-format :short) (map ->short-amino-acid))])))
(if (every? #(= % "Xaa") alts)
(let [alts (cond->> alts
(= amino-acid-format :short) (map ->short-amino-acid))]
(case ins-format
:auto (if (>= (count alts) 10)
(str "X[" (count alts) "]")
alts)
:amino-acids alts
:count (str "X[" (count alts) "]")))
(cond->> alts
(= amino-acid-format :short) (map ->short-amino-acid)))])))
(plain [this]
(into {:mutation "protein-indel"} (plain-coords this))))

Expand All @@ -1835,7 +1891,7 @@
(ProteinIndel. ref-start coord-start ref-end coord-end alts))

(def ^:private protein-indel-re
#"([A-Z](?:[a-z]{2})?)(\d+)(?:_([A-Z](?:[a-z]{2})?)(\d+))?delins([A-Z*][a-zA-Z*]*)?")
#"([A-Z](?:[a-z]{2})?)(\d+)(?:_([A-Z](?:[a-z]{2})?)(\d+))?delins([A-Z*][a-zA-Z*\[\]\d]*)?")

(defn parse-protein-indel
[s]
Expand All @@ -1844,7 +1900,7 @@
(coord/parse-protein-coordinate coord-s)
(->long-amino-acid ref-e)
(some-> coord-e coord/parse-protein-coordinate)
(mapv ->long-amino-acid (some->> alts (re-seq #"[A-Z*](?:[a-z]{2})?"))))))
(mapv ->long-amino-acid (some->> alts parse-protein-insertion-alts)))))

(defmethod restore "protein-indel"
[m]
Expand Down
15 changes: 15 additions & 0 deletions src/clj_hgvs/repairer.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,20 @@
(string/replace s #"del\d+ins" "delins")
s))

;; g.1134_1135ins(100) -> g.1134_1135insN[100]
;; r.431_432ins(5) -> r.431_432insn[5]
;; p.R78_G79ins23 -> p.R78_G79insX[23]
(defn ^:no-doc replace-uncertain-bases
  "Rewrites a trailing legacy uncertain-insertion count in s into the
  bracketed form, choosing the placeholder symbol from kind.

  For :genome, :mitochondria, :coding-dna, :non-coding-dna and :circular-dna
  a trailing parenthesised count after ins becomes N[count]; for :rna it
  becomes n[count]; for :protein a trailing bare count after ins becomes
  X[count]. For any other kind, s is returned unchanged."
  [s kind]
  (let [dna-kinds #{:genome :mitochondria :coding-dna :non-coding-dna :circular-dna}
        ;; Pick the pattern/replacement pair for this kind, if there is one.
        [pattern replacement]
        (cond
          (contains? dna-kinds kind) [#"ins\((\d+)\)$" "insN[$1]"]
          (= kind :rna)              [#"ins\((\d+)\)$" "insn[$1]"]
          (= kind :protein)          [#"ins(\d+)$" "insX[$1]"])]
    (if pattern
      (string/replace s pattern replacement)
      s)))

;; c.112GAT(14) -> c.112GAT[14]
(defn ^:no-doc replace-repeated-seqs-parens1
[s kind]
Expand Down Expand Up @@ -318,6 +332,7 @@
remove-alternative
remove-inv-bases
remove-del-count-from-delins
replace-uncertain-bases
replace-repeated-seqs-parens1
replace-repeated-seqs-parens2
remove-genomic-bases-from-protein
Expand Down
5 changes: 5 additions & 0 deletions test/clj_hgvs/core_test.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,11 @@
;; remove-del-count-from-delins
"c.123_124del2insCTGA" "c.123_124delinsCTGA"

;; replace-uncertain-bases
"g.1134_1135ins(100)" "g.1134_1135insN[100]"
"r.431_432ins(5)" "r.431_432insn[5]"
"p.R78_G79ins23" "p.R78_G79insX[23]"

;; replace-repeated-seqs-parens1
"c.112GAT(14)" "c.112GAT[14]"

Expand Down
Loading
Loading