Skip to content

Commit

Permalink
Add bcf support for vcf validator.
Browse files Browse the repository at this point in the history
  • Loading branch information
niyarin committed Jul 27, 2023
1 parent dfd9762 commit 0f0c36e
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 45 deletions.
83 changes: 59 additions & 24 deletions src/cljam/io/vcf/util/validator.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.vcf.util.validator
(:require [cljam.io.vcf.util :as vcf-util]))
(:require [cljam.io.vcf.util :as vcf-util]
[clojure.string :as cstr]))

(defn- valid-ref? [s]
(and (string? s)
Expand All @@ -17,7 +18,24 @@
(and (sequential? filt)
(every? keyword? filt))))

(defn- check-base-records [variant meta-included-contig? meta-included-format?]
(defn- validate-alleles
  "Builds the error messages for the ALT alleles in `alt`, checked against
  the reference allele `vref`.

  Returns nil when `alt` is not sequential. Otherwise returns a lazy
  sequence with one message per offending allele, in index order. An allele
  is reported when it is a non-nil value that is not a string, when it is
  the empty string, or when `vcf-util/inspect-allele` reports its `:type`
  as `:other`. A nil allele skips the first two checks and is handed to
  `inspect-allele` as is."
  [alt vref]
  (letfn [(allele-error [idx allele]
            (cond
              ;; nil is tolerated here; any other non-string value is rejected.
              (and (some? allele) (not (string? allele)))
              (format "An allele must be string but got other type at index %d."
                      idx)

              (= "" allele)
              (format "An allele cannot be empty but got an empty allele at index %d."
                      idx)

              (= :other (:type (vcf-util/inspect-allele vref allele)))
              (format "Contains bad allele at index %d." idx)))]
    (when (sequential? alt)
      ;; keep-indexed drops the nil results, i.e. the alleles with no error.
      (keep-indexed allele-error alt))))

(defn- check-base-records [vcf-or-bcf variant meta-included-contig?
meta-included-filter? meta-included-format?]
(reduce-kv
(fn [m k vs]
(reduce
Expand All @@ -30,18 +48,24 @@
[[(complement (partial some #{\< \> \, \space \tab}))
(format "Must not contain whitespaces commas or angle brackets, but got %s."
(str (vec (distinct (keep #{\< \> \, \space \tab} (:chr variant))))))]
[meta-included-contig? (format (str "CHROM %s must be declared in a "
"##contig field in the header.")
(:chr variant))]],
[#(or (= vcf-or-bcf :vcf)
(meta-included-contig? %))
(format (str "CHROM %s must be declared in a "
"##contig field in the header.")
(:chr variant))]],
:pos [[integer? "Position must be Integer."]],
:ref [[valid-ref? "Must consist of ACGTNacgtn"]]
:ref [[valid-ref? "Must consist of ACGTNacgtn."]]
:alt [[(every-pred seq sequential?) "Must be a sequence."]
[(every-pred
sequential?
(partial every? #(not= :other (:type (vcf-util/inspect-allele (:ref variant) %)))))
"Invalid allele."]]
[#(not (seq (validate-alleles % (:ref variant))))
(cstr/join ", " (validate-alleles (:alt variant) (:ref variant)))]]
:qual [[valid-qual? "Qual must be float."]]
:filter [[valid-filter? "Invalid filter."]]
:filter [[valid-filter? "Invalid filter."]
[#(or (= vcf-or-bcf :vcf)
(every? meta-included-filter? %))
(and (sequential? (:filter variant))
(format "Filter identifiers %s must in meta-info."
(vec (remove meta-included-filter?
(:filter variant)))))]]
:format [[meta-included-format? "Must be contain meta"]]}))

(defn- check-entry-type [entry type-str]
Expand Down Expand Up @@ -97,6 +121,10 @@
(let [contigs (into #{} (map :id) contig)]
#(or (nil? %) (contigs %))))

(defn- make-header-contained-filter-validator
  "Returns a predicate over a single filter identifier.

  `filter` is the collection of filter meta-info maps; their `:id` values
  are converted to keywords and collected into a set once. The predicate is
  truthy when its argument is nil, is `:PASS`, or is a member of that set,
  and returns nil otherwise."
  [filter]
  (let [declared-ids (set (map #(keyword (:id %)) filter))]
    (fn [filter-id]
      (or (nil? filter-id)
          (= filter-id :PASS)
          (declared-ids filter-id)))))

(defn- make-format-validator
  "Returns a predicate that checks a value against the declared FORMAT ids.

  `format` is the collection of format meta-info maps; their `:id` values
  are converted to keywords and collected into a set. The predicate is
  truthy when its argument is nil or is a member of that set, and returns
  nil otherwise."
  [format]
  ;; Build the id set once, not on every call of the returned predicate.
  (let [format-ids (into #{} (map (comp keyword :id)) format)]
    #(or (nil? %) (format-ids %))))

Expand All @@ -116,31 +144,38 @@
nil
info))))

(defn- make-validator* [{:keys [contig format info]} header]
(defn- make-validator* [vcf-or-bcf {:keys [contig format info filter]} header]
(let [meta-included-contig? (make-contig-validator contig)
meta-included-filter? (make-header-contained-filter-validator filter)
meta-included? (make-format-validator format)
info-validator (make-info-validator info)
samples (drop 8 header)]
(fn [variant]
(merge (check-base-records variant meta-included-contig? meta-included?)
(merge (check-base-records vcf-or-bcf variant meta-included-contig? meta-included-filter? meta-included?)
(info-validator (:info variant) (count (:alt variant)))
(check-each-samples variant samples format)))))

(defn invalid-variant?
"Check variant and return returns a map that explains bad positions"
[variant meta-info header]
((make-validator* meta-info header) variant))
"Check variant and return returns a map that explains bad positions.
If you want to check writing bcf, pass :bcf as vcf-or-bcf."
([variant meta-info header]
(invalid-variant? variant meta-info header :vcf))
([variant meta-info header vcf-or-bcf]
((make-validator* vcf-or-bcf meta-info header) variant)))

(defn validate-variants
"Checks if there is any invalid variant in the given sequence `variants`.
The validity of a variant is defined by `meta-info` and `header`.
Returns a lazy sequence of the same elements of the input if there are no
invalid variant. The validation is evaluated lazily and throws an exception
at the first invalid variant."
[meta-info header variants]
(let [validator (make-validator* meta-info header)]
(map (fn [v]
(when-let [info (validator v)]
(throw (ex-info (str "VCF validatoin failed: " (keys info))
(assoc info :variant v))))
v) variants)))
at the first invalid variant.
If you want to check writing bcf, pass :bcf as vcf-or-bcf."
([meta-info header variants]
(validate-variants meta-info :vcf header variants))
([meta-info vcf-or-bcf header variants]
(let [validator (make-validator* vcf-or-bcf meta-info header)]
(map (fn [v]
(when-let [info (validator v)]
(throw (ex-info (str "VCF validatoin failed: " (keys info))
(assoc info :variant v))))
v) variants))))
85 changes: 64 additions & 21 deletions test/cljam/io/vcf/util/validator_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@
base-header))
["Must not contain whitespaces commas or angle brackets, but got [\\space]."]))
(is (= (:chr
(validator/invalid-variant? {:chr "chr10" :pos 10 :ref "A" :alt ["AT"]}
{} base-header))
(validator/invalid-variant? {:chr "chr10" :pos 10 :ref "A" :alt ["AT"]}
{} base-header :bcf))
["CHROM chr10 must be declared in a ##contig field in the header."])))
(testing "ref check"
(is (= (:ref
Expand All @@ -101,29 +101,40 @@
:ref "AP"}
{:contig [{:id "chr10"}]}
base-header))
["Not include invalid char."]))
["Must consist of ACGTNacgtn."]))
(is (= (:ref
(validator/invalid-variant?
{:chr "chr10" :pos 10
:ref "" :alt ["GT"]}
{:contig [{:id "chr10"}]}
base-header))
["Not include invalid char."])))
["Must consist of ACGTNacgtn."])))
(testing "alt check"
(is (=
(:alt
(validator/invalid-variant?
{:chr "chr10" :pos 10 :ref "A" :alt "AT"}
{:contig [{:id "chr10"}]}
base-header))
["Must be a sequence." "Invalid allele."]))
(is (=
(:alt
(validator/invalid-variant?
{:chr "chr10" :pos 10 :ref "A" :alt ""}
{:contig [{:id "chr10"}]}
base-header))
["Must be a sequence." "Invalid allele."]))
["Must be a sequence."]))
(is (= (:alt
(validator/invalid-variant?
{:chr "chr10" :pos 10 :ref "A" :alt ""}
{:contig [{:id "chr10"}]}
base-header))
["Must be a sequence."]))
(is (= (:alt (validator/invalid-variant?
{:chr "chr10" :pos 10 :ref "A" :alt ["AP"]}
{:contig [{:id "chr10"}]}
base-header))
["Contains bad allele at index 0."]))
(is (= (:alt (validator/invalid-variant?
{:chr "chr10" :pos 10 :ref "A" :alt ["" 1]}
{:contig [{:id "chr10"}]}
base-header))
[(str "An allele cannot be empty but got an empty allele at index 0"
"., "
"An allele must be string but got other type at index 1.")]))
(is (nil?
(validator/invalid-variant?
{:chr "chr10" :pos 10 :alt ["A" "<DEL>" nil]
Expand All @@ -145,11 +156,11 @@
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}]}
base-header))))
(testing "filter check"
(testing "vcf filter check"
(is (nil?
(validator/invalid-variant?
{:chr "chr10" :pos 10
:filter :PASS
:filter [:PASS]
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}]}
base-header)))
Expand All @@ -160,6 +171,32 @@
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}]}
base-header))))
(testing "bcf filter check"
(is (= (:filter
(validator/invalid-variant?
{:chr "chr10" :pos 10
:filter [:q10]
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}]}
base-header
:bcf))
["Filter identifiers [:q10] must in meta-info."]))
(is (nil?
(validator/invalid-variant?
{:chr "chr10" :pos 10
:filter [:q10]
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}] :filter [{:id "q10"}]}
base-header
:bcf)))
(is (nil?
(validator/invalid-variant?
{:chr "chr10" :pos 10
:filter [:PASS]
:ref "G" :alt ["GT"]}
{:contig [{:id "chr10"}] :filter [{:id "q10"}]}
base-header
:bcf))))

(testing "info check"
(is (= (validator/invalid-variant?
Expand All @@ -178,10 +215,16 @@
base-header)))))

(deftest make-validator-test
(is (thrown? clojure.lang.ExceptionInfo
(doall (validator/validate-variants {} base-header [{:chr "<chr10>" :pos 100}]))))
(let [org [{:chr "chr10" :pos 10 :ref "A" :alt ["AT"]}]]
(is (= (validator/validate-variants
{:contig [{:id "chr10"}]}
base-header
org) org))))
(testing "bcf"
  ;; :bcf must be passed explicitly; the 3-arity call validates as :vcf.
  ;; The variant carries ref and alt so that the contig missing from the
  ;; empty meta-info is the expected cause of the exception.
  (is (thrown? clojure.lang.ExceptionInfo
               (doall (validator/validate-variants
                       {} :bcf base-header
                       [{:chr "chr10" :pos 100 :ref "A" :alt ["AT"]}])))))
(testing "vcf"
(is (thrown? clojure.lang.ExceptionInfo
(doall (validator/validate-variants
{} base-header [{:chr "<chr10>" :pos 100}]))))
(let [org [{:chr "chr10" :pos 10 :ref "A" :alt ["AT"]}]]
(is (= (validator/validate-variants
{:contig [{:id "chr10"}]}
base-header
org) org)))))

0 comments on commit 0f0c36e

Please sign in to comment.