Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validator for writing sam. #278

Merged
merged 7 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 255 additions & 0 deletions src/cljam/io/sam/util/validator.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
(ns cljam.io.sam.util.validator)

(defn- error
  "Builds a validation result holding a single error message for `path`.

  `msg` is treated as a `format` template only when `args` are supplied. With
  no `args` it is used verbatim, so a message that was already formatted and
  happens to contain a `%` (for example one echoing a reference name or a
  printed alignment) is not pushed through `format` a second time.

  Returns `{:errors {path [message]}}`."
  [path msg & args]
  {:errors {path [(if (seq args)
                    (apply format msg args)
                    msg)]}})

(defn- merge-validation-results
  "Deep-merges validation results.

  nil acts as the identity. Two maps are merged key by key, recursively, and
  two non-map collections found under the same key are joined with `into`.
  With no arguments returns nil; with one argument returns it unchanged."
  ([] nil)
  ([res] res)
  ([res1 res2]
   (letfn [(combine [a b]
             (if (or (nil? a) (nil? b))
               ;; one side is missing: keep whichever is present
               (if (nil? a) b a)
               (if (map? a)
                 (merge-with combine a b)
                 (into a b))))]
     (combine res1 res2)))
  ([res1 res2 res3 & more]
   ;; fold left to right, exactly like reducing over all arguments
   (reduce merge-validation-results
           (merge-validation-results res1 res2)
           (cons res3 more))))

(defn- validate-pos*
  "Validates a position value against the reference named `rname`.

  - rname:  key used to look up the reference in `refmap`
  - pos:    the value to check
  - refmap: map of reference name -> map that may carry the length under :LN

  Returns a vector with one message when `pos` is not an integer, is outside
  [0, 2147483647], or exceeds the :LN of the reference (when one is found).
  Returns nil when `pos` is acceptable.

  Comparisons are done without coercing to `int`, so integers outside the int
  range are reported as a range error instead of throwing."
  [rname pos refmap]
  (let [max-len (get-in refmap [rname :LN])]
    (cond
      (not (integer? pos))
      ["Must be an integer."]

      (not (<= 0 pos Integer/MAX_VALUE))
      ["Must be in the [0, 2147483647]."]

      ;; the length check only applies when the reference declares :LN
      (and max-len (> pos max-len))
      [(format "Must be less than or equal to %d." max-len)])))

(defn- validate-pos
  "Checks the :pos field of an alignment against the reference named by its
  :rname. Returns an error result keyed by :pos, or nil when it is valid."
  [{:keys [refmap]} {:keys [pos rname]}]
  (some->> (validate-pos* rname pos refmap)
           (apply error :pos)))

(defn- validate-pnext
  "Checks the :pnext field of an alignment. Returns an error result keyed by
  :pnext, or nil when it is valid.

  NOTE(review): the upper bound is taken from the reference named by :rname,
  not :rnext -- confirm this is intended."
  [{:keys [refmap]} {:keys [pnext rname]}]
  (some->> (validate-pos* rname pnext refmap)
           (apply error :pnext)))

(defn- validate-rname*
  "Validates a reference name against `refmap`.

  Returns a vector with one message when `rname` is not a string, or when it
  is neither \"*\" nor a key of `refmap`. Returns nil otherwise."
  [rname refmap]
  (if-not (string? rname)
    ["Must be a string."]
    (when-not (or (= rname "*")
                  (get refmap rname))
      [(format (str "Must be declared as the SN value in the SQ line within the"
                    " header. (%s)")
               rname)])))

(defn- validate-rname
  "Checks the :rname field of an alignment against the validator's reference
  map. Returns an error result keyed by :rname, or nil when it is valid."
  [{:keys [refmap]} {:keys [rname]}]
  (some->> (validate-rname* rname refmap)
           (apply error :rname)))

(defn- validate-rnext
  "Checks the :rnext field of an alignment against the validator's reference
  map. Returns an error result keyed by :rnext, or nil when it is valid.

  \"=\" is accepted as-is: in SAM it stands for \"same reference as RNAME\"
  and is therefore never listed among the SQ lines."
  [{:keys [refmap]} {:keys [rnext]}]
  (when-not (= rnext "=")
    (when-let [err (validate-rname* rnext refmap)]
      ;; `err` is a vector of messages; spread it so the message string, not
      ;; the vector itself, reaches `error`
      (apply error :rnext err))))

(defn- validate-qname
  "Checks the :qname field of an alignment.

  A non-string yields a single \"Must be a string.\" error. A string is checked
  for length (at most 254 characters) and for its character set; both problems
  can be reported at once. Returns nil when the name is valid."
  [_ {:keys [qname]}]
  (if-not (string? qname)
    (error :qname "Must be a string.")
    (let [too-long? (> (count qname) 254)
          illegal? (nil? (re-matches #"^[!-?A-~]+$" qname))
          ;; starting from nil, `conj` builds a list, so the message added
          ;; last ends up first
          msgs (cond-> nil
                 too-long? (conj "Must be less than or equal to 254 characters.")
                 illegal? (conj "Must not contain illegal characters."))]
      (when msgs
        {:errors {:qname msgs}}))))

(defn- validate-mapq
  "Checks the :mapq field of an alignment: it must be an integer in [0, 255].
  Returns an error result keyed by :mapq, or nil when it is valid.

  The range comparison is done without coercing to `int`, so integers outside
  the int range are reported as a range error instead of throwing."
  [_ {:keys [mapq]}]
  (cond
    (not (integer? mapq))
    (error :mapq "Must be an integer.")

    (not (<= 0 mapq 255))
    (error :mapq "Must be in the [0-255].")))

(defn- validate-cigar
  "Checks the :cigar field of an alignment: it must be a string that is either
  \"*\" or a run of <count><op> pairs with op in MIDNSHPX=.
  Returns an error result keyed by :cigar, or nil when it is valid."
  [_ {:keys [cigar]}]
  (if (string? cigar)
    (when-not (re-matches #"^\*|([0-9]+[MIDNSHPX=])+$" cigar)
      (error :cigar "Invalid format."))
    (error :cigar "Must be a string.")))

(defn- validate-tlen
  "Checks the :tlen field of an alignment: it must be an integer in
  [-2147483647, 2147483647].
  Returns an error result keyed by :tlen, or nil when it is valid."
  [_ {:keys [tlen]}]
  (if-not (integer? tlen)
    (error :tlen "Must be integer.")
    (let [limit Integer/MAX_VALUE]
      (when (or (< tlen (- limit))
                (> tlen limit))
        (error :tlen "Must be in the [-2147483647,2147483647].")))))

(defn- validate-qual
  "Checks the :qual field of an alignment: it must be a non-empty string made
  only of characters in [!-~].
  Returns an error result keyed by :qual, or nil when it is valid."
  [_ {:keys [qual]}]
  (if-not (string? qual)
    (error :qual "Must be a string.")
    (when-not (re-matches #"[!-~]+" qual)
      (error :qual "Must be composed only of ASCII characters within the valid phred33 range [!-~]."))))

(defn- validate-seq
  "Checks the :seq field of an alignment. It must be a string; its allowed
  alphabet depends on the validator's :file-type (:bam or :sam). For any other
  file type only the string check applies.
  Returns an error result keyed by :seq, or nil when it is valid."
  [{:keys [file-type]} {seq' :seq}]
  (let [allowed (case file-type
                  :bam #"\*|[=ACMGRSVTWYHKDBN]+"
                  :sam #"\*|[A-Za-z=.]+"
                  nil)]
    (cond
      (not (string? seq'))
      (error :seq "Must be a string.")

      (and allowed (not (re-matches allowed seq')))
      (error :seq "Must not contain bad character."))))

(defn- validate-sam-option
  "Validates one optional field, given as a map with :type and :value, using
  the type codes accepted for SAM output (A, i, f, Z, H, B).

  Returns a vector with one message when the value does not fit its type or
  the type code is unknown; returns nil when the option is valid.

  Element checks for type H compare without coercing to `int`, so a very large
  integer is reported as invalid instead of throwing."
  [{:keys [value] type' :type}]
  (case type'
    "A" (when-not (and (char? value) (<= (int \!) (int value) (int \~)))
          ["Must be a char [!-~]."])
    ;; accepts the union of the signed and unsigned 32 bit ranges
    "i" (when-not (and (integer? value) (<= -2147483648 value 4294967295))
          ["Must be 32 bit signed integer."])
    "f" (when-not (or (float? value) (integer? value))
          ["Must be a float."])
    "Z" (when-not (and (string? value) (re-matches #"[ !-~]*" value))
          ["Must be a printable string [ !-~]*"])
    "H" (when-not (and (sequential? value)
                       (every? (every-pred integer? #(<= -255 % 255))
                               value))
          ["Must be a byte array."])
    "B" (when-not (and (string? value)
                       (re-matches #"[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)*"
                                   value))
          ["Must be a string of comma-separated array of numbers."])
    [(format "Type %s is invalid" (str type'))]))

(defn- validate-bam-option
  "Validates one optional field, given as a map with :type and :value, using
  the type codes accepted for BAM output (A, c, C, s, S, i, I, f, Z, H, B).

  Returns a vector with one message when the value does not fit its type or
  the type code is unknown; returns nil when the option is valid.

  Integer ranges follow the width and signedness of each type code:
  c = int8, C = uint8, s = int16, S = uint16, i = int32, I = uint32."
  [{:keys [value] type' :type}]
  (case type'
    "A" (when-not (and (char? value) (<= (int \!) (int value) (int \~)))
          ["Must be a char [!-~]."])
    "c" (when-not (and (integer? value) (<= -128 value 127))
          ["Must be 8 bit signed integer."])
    "C" (when-not (and (integer? value) (<= 0 value 255))
          ["Must be 8 bit unsigned integer."])
    "s" (when-not (and (integer? value) (<= -32768 value 32767))
          ["Must be 16 bit signed integer."])
    "S" (when-not (and (integer? value) (<= 0 value 65535))
          ["Must be 16 bit unsigned integer."])
    "i" (when-not (and (integer? value) (<= -2147483648 value 2147483647))
          ["Must be 32 bit signed integer."])
    "I" (when-not (and (integer? value) (<= 0 value 4294967295))
          ["Must be 32 bit unsigned integer."])
    "f" (when-not (or (float? value) (integer? value))
          ["Must be a float."])
    "Z" (when-not (and (string? value) (re-matches #"[ !-~]*" value))
          ["Must be a printable string [ !-~]*"])
    ;; compared without `int` coercion so huge integers are rejected, not thrown on
    "H" (when-not (and (sequential? value)
                       (every? (every-pred integer? #(<= -255 % 255))
                               value))
          ["Must be a byte array."])
    "B" (when-not (and (string? value)
                       (re-matches #"[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)*"
                                   value))
          ["Must be a string of comma-separated array of numbers."])
    [(format "Type %s is invalid" (str type'))]))

(defn- validate-option
  "Validates a single optional field `v` with the rules of `file-type`
  (:sam or :bam). Any other file type makes `case` throw, as no default
  clause is given."
  [v file-type]
  (let [check (case file-type
                :sam validate-sam-option
                :bam validate-bam-option)]
    (check v)))

(defn- validate-options
  "Validates every element of the alignment's :options.

  Returns a lazy sequence with one entry per option: an error result keyed by
  `[:options index]` for an invalid option, nil for a valid one."
  [{:keys [file-type]} {:keys [options]}]
  (map-indexed (fn [idx option]
                 (when-let [messages (validate-option option file-type)]
                   (apply error [:options idx] messages)))
               options))

(defn- validate-data-record
  "Runs every field validator and the option validators on `alignment` and
  merges their results into one validation result (nil when nothing is wrong).
  When `alignment` is not a map, returns a single error under the path `[]`."
  [validator alignment]
  (if-not (map? alignment)
    (error [] (str "Alignment must be a map, but got " (pr-str alignment)))
    (let [field-checks [validate-qname
                        validate-rname
                        validate-rnext
                        validate-pos
                        validate-pnext
                        validate-mapq
                        validate-cigar
                        validate-qual
                        validate-tlen
                        validate-seq]
          ;; field validators run first, in the order listed above
          field-results (mapv #(% validator alignment) field-checks)
          option-results (validate-options validator alignment)]
      (apply merge-validation-results
             (concat field-results option-results)))))

(defn make-validator
  "Creates a SAM validator, the context required by the alignment validation
  functions of this namespace.
  Takes the following two arguments:
    - header: SAM header map; the maps under :SQ are indexed by their :SN value
    - options: Validation options (optional)
  The available validation options are:
    - :file-type  Specify the file type (either of :sam and :bam).
                  Defaults to :sam
  Returns a map with :file-type and :refmap (reference name -> SQ map)."
  ([header] (make-validator header {}))
  ([header {:keys [file-type] :or {file-type :sam}}]
   {:file-type file-type
    :refmap (into {} (map (juxt :SN identity)) (:SQ header))}))

(defn validate-alignment
  "Checks if the given alignment data is in the format cljam expects, and
  returns a validation result map pointing out the problematic portion of data
  that does not conform to the format. Otherwise returns nil.
  The validation result map looks like:
    {:errors    {:pos [\"...\"]
                 [:options 0] [\"...\"]}
     :alignment { ... alignment data ... }}
  Keys of :errors are the offending field, or `[:options index]` for an
  optional field. A :warnings entry, when present, has the same shape as
  :errors."
  [validator alignment]
  (when-let [res (not-empty (validate-data-record validator alignment))]
    ;; attach the offending input so callers can report it
    (assoc res :alignment alignment)))

(defn validate-alignments
  "Applies `validate-alignment` to each element of the given sequence and
  collects the non-nil validation results into a lazy sequence.
  Returns a transducer if `alignments` is not specified."
  ([validator]
   (keep #(validate-alignment validator %)))
  ([validator alignments]
   (sequence (validate-alignments validator) alignments)))

(defn- stringify-validation-result-messages
  "Renders a map of path -> messages as a bulleted text block.

  Each entry starts with \" - <path>: <first message>\"; further messages of
  the same entry go on their own lines, indented so they line up under the
  first one. Lines are joined with newlines, with no trailing newline."
  [m]
  (letfn [(entry-lines [[path msgs]]
            (let [pad (apply str (repeat (+ (count (str path)) 4) \space))]
              (cons (format " - %s: %s" path (first msgs))
                    (map #(format "%s %s" pad %) (rest msgs)))))]
    (apply str (interpose "\n" (mapcat entry-lines m)))))

(defn check-alignment
  "Validates `alignment` and returns it unchanged when no error is found.
  Validation warnings, if any, are printed to stderr. Validation errors make
  this function throw an `ex-info` whose message lists the errors and whose
  data is the validation result."
  [validator alignment]
  (let [{:keys [warnings errors] v :alignment :as res}
        (validate-alignment validator alignment)
        summary-keys [:qname :rname :rnext :pos :pnext :mapq :cigar :qual :tlen :seq]
        ;; printed form of the offending alignment, limited to the mandatory
        ;; fields when it is a map
        describe (fn []
                   (pr-str (cond-> v
                             (map? v) (select-keys summary-keys))))]
    (when warnings
      (binding [*out* *err*]
        (printf "Alignment validation warning at %s\n%s"
                (describe)
                (stringify-validation-result-messages warnings))
        (newline)))
    (when errors
      (throw (ex-info (format "Alignment validation error at %s\n%s"
                              (describe)
                              (stringify-validation-result-messages errors))
                      res)))
    alignment))

(defn check-alignments
  "Applies `check-alignment` to each element of the given sequence.
  Returns a lazy sequence of the same elements as the input if there is no
  invalid alignment. The validation is evaluated lazily and throws an
  exception at the first invalid alignment.
  Returns a transducer if `alignments` is not specified."
  ([validator]
   (map #(check-alignment validator %)))
  ([validator alignments]
   (sequence (check-alignments validator) alignments)))
101 changes: 101 additions & 0 deletions test/cljam/io/sam/util/validator_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
(ns cljam.io.sam.util.validator-test
(:require [clojure.test :refer [deftest is are testing]]
[cljam.io.sam.util.validator :as validator]))

(deftest validate-option-test
  ;; `check` reaches the private validate-option through its var
  (let [check (fn [option file-type]
                (#'validator/validate-option option file-type))]
    (testing "bad type"
      (is (= (check {:type "!" :value \!} :sam)
             ["Type ! is invalid"])))
    (testing "type A"
      (is (nil? (check {:type "A" :value \!} :sam)))
      (is (= (check {:type "A" :value 100} :sam)
             ["Must be a char [!-~]."])))
    (testing "type c bam"
      (is (nil? (check {:type "c" :value 10} :bam)))
      (is (= (check {:type "c" :value 300} :bam)
             ["Must be 8 bit signed integer."])))
    (testing "type i"
      (is (nil? (check {:type "i" :value 10} :sam)))
      (is (= (check {:type "i" :value "10"} :sam)
             ["Must be 32 bit signed integer."]))
      (is (= (check {:type "i" :value 100000000000} :sam)
             ["Must be 32 bit signed integer."])))
    (testing "type f"
      (is (nil? (check {:type "f" :value 10} :sam)))
      (is (nil? (check {:type "f" :value 10.1} :sam)))
      (is (= (check {:type "f" :value "A"} :sam)
             ["Must be a float."])))
    (testing "type Z"
      (is (nil? (check {:type "Z" :value "!@abc"} :sam)))
      (is (= (check {:type "Z" :value 10} :sam)
             ["Must be a printable string [ !-~]*"])))
    (testing "type H"
      (is (nil? (check {:type "H" :value [1,2]} :sam)))
      (is (= (check {:type "H" :value "A"} :sam)
             ["Must be a byte array."])))
    (testing "type B"
      (is (nil? (check {:type "B" :value "f,-0.3,0.0,0.3"} :sam)))
      (is (= (check {:type "B" :value "W"} :sam)
             ["Must be a string of comma-separated array of numbers."])))))

;; Table-driven test of the private validate-data-record: each row replaces one
;; field `k` of an otherwise valid alignment with `v` and expects `ans` as the
;; messages found under [:errors k] of the validation result.
(deftest validate-data-record-test
(let [validator (validator/make-validator {:SQ [{:SN "ref", :LN 45}]})
valid-align
{:rname "ref" :pos 10 :qname "a" :mapq 10 :cigar "16M"
:rnext "*" :tlen 0 :pnext 0 :seq "ATGC" :qual "*"
:options {}}]
(are [k v ans]
(= (get-in (#'validator/validate-data-record validator (assoc valid-align k v))
[:errors k])
ans)
:qname 100 ["Must be a string."]
:qname (apply str (repeat 255 \a))
["Must be less than or equal to 254 characters."]

:qname "@@" ["Must not contain illegal characters."]

;; both qname problems at once: two messages are expected
:qname (apply str (repeat 255 \@))
["Must not contain illegal characters."
"Must be less than or equal to 254 characters."]

:rname 10 ["Must be a string."]
:rname "NOT-FOUND" ["Must be declared as the SN value in the SQ line within the header. (NOT-FOUND)"]
:pos "ABC" ["Must be an integer."]
:pos 100000000 ["Must be less than or equal to 45."]
:pos 46 ["Must be less than or equal to 45."]
:pos -100 ["Must be in the [0, 2147483647]."]
:mapq "A" ["Must be an integer."]
:mapq 300 ["Must be in the [0-255]."]
:cigar 10 ["Must be a string."]
:cigar "3Y" ["Invalid format."]
;; NOTE(review): this row repeats the `:rname 10` case above; possibly meant
;; to cover :rnext -- confirm.
:rname 10 ["Must be a string."]
:pnext 100000000 ["Must be less than or equal to 45."]
:pnext "A" ["Must be an integer."]
:tlen -9900000000 ["Must be in the [-2147483647,2147483647]."]
:qual 10 ["Must be a string."]
:qual "bad qual" ["Must be composed only of ASCII characters within the valid phred33 range [!-~]."]
:seq 100 ["Must be a string."]
:seq [\A \B] ["Must be a string."]
:seq "A!TGC" ["Must not contain bad character."])
;; errors of optional fields are keyed by [:options index]
(is (= (get-in (#'validator/validate-data-record
validator
(assoc valid-align :options [{:type "!" :value \!}]))
[:errors [:options 0]])
["Type ! is invalid"]))))

(deftest check-alignment-test
  (let [sam-validator (validator/make-validator {:SQ [{:SN "ref", :LN 45}]})
        valid-alignment {:rname "ref" :pos 10 :qname "a" :mapq 10 :cigar "16M"
                         :rnext "*" :tlen 0 :pnext 0 :seq "ATGC" :qual "*"
                         :options {}}
        ;; same record, but positioned far beyond the 45-base reference
        out-of-range (assoc valid-alignment :pos 10000000)]
    ;; the result is lazy, so it must be realized for the exception to surface
    (is (thrown? clojure.lang.ExceptionInfo
                 (doall (validator/check-alignments sam-validator
                                                    [out-of-range]))))
    (let [input [valid-alignment]]
      (is (= (validator/check-alignments sam-validator input)
             input)))))