diff --git a/src/cljam/algo/cram_indexer.clj b/src/cljam/algo/cram_indexer.clj new file mode 100644 index 00000000..df272f02 --- /dev/null +++ b/src/cljam/algo/cram_indexer.clj @@ -0,0 +1,24 @@ +(ns cljam.algo.cram-indexer + (:require [cljam.io.crai :as crai] + [cljam.io.cram.core :as cram.core] + [cljam.io.cram.reader :as cram.reader] + [cljam.io.sam.util.header :as sam.header])) + +(defn create-index + "Creates a CRAM index file from the given CRAM file. + + Takes the following options: + - skip-sort-order-check?: By default, the CRAM indexer checks if the header + is declared as `SO:coordinate` and raises an error if not. + If this option is set to true, the CRAM indexer will skip the header check + and create an index file regardless of the header declaration." + [in-cram out-crai & {:keys [skip-sort-order-check?]}] + (with-open [r (cram.core/reader in-cram {}) + w (crai/writer out-crai)] + (let [header @(.-header r)] + (when (and (not skip-sort-order-check?) + (not= (sam.header/sort-order header) sam.header/order-coordinate)) + (throw + (ex-info "Cannot create CRAM index file for CRAM file not declared as sorted by coordinate" + {:sort-order (sam.header/sort-order header)}))) + (crai/write-index-entries w (cram.reader/generate-index-entries r))))) diff --git a/src/cljam/io/crai.clj b/src/cljam/io/crai.clj index 8cbc1052..07dc8046 100644 --- a/src/cljam/io/crai.clj +++ b/src/cljam/io/crai.clj @@ -2,24 +2,36 @@ (:require [cljam.util :as util] [cljam.util.intervals :as intervals] [clojure.java.io :as io] - [clojure.string :as str])) + [clojure.string :as str]) + (:import [java.io Closeable OutputStreamWriter PrintWriter])) + +(defn- read-index-entries [rdr] + (->> (line-seq rdr) + (map (fn [line] + (let [[^long seq-id start span container-offset slice-offset size] + (map #(Long/parseLong %) (str/split line #"\t")) + unmapped? (neg? seq-id)] + {:ref-seq-id seq-id + :start (if unmapped? 0 start) + :span (if unmapped? 0 span) + :container-offset container-offset + :slice-offset slice-offset + :size size}))))) (defn read-index "Reads a CRAI file `f` and creates an index." [f refs] (let [refs (vec refs)] (with-open [rdr (io/reader (util/compressor-input-stream f))] - (->> (line-seq rdr) - (map (fn [line] - (let [[^long seq-id ^long start ^long span container-offset slice-offset size] - (map #(Long/parseLong %) (str/split line #"\t")) - unmapped? (neg? seq-id)] - {:chr (if unmapped? "*" (:name (nth refs seq-id))) - :start (if unmapped? 0 start) - :end (if unmapped? 0 (+ start span)) - :container-offset container-offset - :slice-offset slice-offset - :size size}))) + (->> (read-index-entries rdr) + (map (fn [{:keys [^long ref-seq-id ^long start ^long span] :as entry}] + (-> entry + (assoc :chr + (if (neg? ref-seq-id) + "*" + (:name (nth refs ref-seq-id))) + :end (+ start span)) + (dissoc :ref-seq-id :span)))) intervals/index-intervals)))) (defn find-overlapping-entries @@ -27,3 +39,49 @@ region." [idx chr start end] (intervals/find-overlap-intervals idx chr start end)) + +(deftype CRAIWriter [^PrintWriter writer] + Closeable + (close [_] + (.close writer))) + +(defn writer + "Creates and returns a CRAI writer for the file `f`." + ^Closeable [f] + (-> (util/compressor-output-stream f :gzip) + OutputStreamWriter. + PrintWriter. + ->CRAIWriter)) + +(defn- write-index-entry + [^CRAIWriter wtr {:keys [^long ref-seq-id start span container-offset slice-offset size]}] + (let [unmapped? (neg? ref-seq-id)] + (doto ^PrintWriter (.-writer wtr) + (.print ref-seq-id) + (.print \tab) + (.print (if unmapped? 0 (long start))) + (.print \tab) + (.print (if unmapped? 0 (long span))) + (.print \tab) + (.print (long container-offset)) + (.print \tab) + (.print (long slice-offset)) + (.print \tab) + (.print (long size)) + (.print \newline)))) + +(defn write-index-entries + "Writes CRAM index entries. + + Each CRAM index entry consists of the following keys: + - ref-seq-id: Reference sequence id (-1 for slices containing only unmapped records) + - start: Alignment start + - span: Alignment span + - container-offset: Absolute byte offset of the container header in the file + - slice-offset: Relative byte offset of the slice header block + - size: Slice size in bytes + + Note that if :ref-seq-id equals to -1 (unmapped), the CRAI writer will ignore + the actual values of :start/:span and write zeros for both fields." + [^CRAIWriter wtr index-entries] + (run! (partial write-index-entry wtr) index-entries)) diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj index 0c8596ff..aba6bbe2 100644 --- a/src/cljam/io/cram.clj +++ b/src/cljam/io/cram.clj @@ -55,9 +55,16 @@ The function also takes an optional argument `option`, which is a map that consists of: - reference: A string representing the path to the reference file, or - a sequence reader that reads sequences from the reference file. - This may be omitted only when the CRAM file to be read does not - require a reference file." + a sequence reader that reads sequences from the reference file. + This may be omitted only when the CRAM file to be read does not require + a reference file. + - create-index?: If true, creates a .crai index file in the course of CRAM + file writing. + - skip-sort-order-check?: When creating a CRAM index for the CRAM file, + the CRAM writer, by default, checks if the header is declared as + `SO:coordinate` and raises an error if not. + If this option is set to true, the CRAM writer will skip the header check + and create an index file regardless of the header declaration." (^CRAMWriter [f] (writer f {})) (^CRAMWriter [f option] (cram/writer f option))) diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj index f89b770c..08fd0e32 100644 --- a/src/cljam/io/cram/core.clj +++ b/src/cljam/io/cram/core.clj @@ -10,7 +10,7 @@ [clojure.string :as str]) (:import [cljam.io.cram.reader CRAMReader] [cljam.io.cram.writer CRAMWriter] - [java.io FileNotFoundException] + [java.io DataOutputStream FileNotFoundException] [java.net URL] [java.nio.channels FileChannel] [java.nio.file OpenOption StandardOpenOption])) @@ -78,14 +78,23 @@ "Creates a new CRAM writer that writes to a CRAM file f. Takes an option map as the second argument. An option map consists of: - - reference: a string representing the path to a reference file" - ^CRAMWriter [f {:keys [reference]}] + - reference: A string representing the path to a reference file + - create-index?: If true, creates a .crai index file in the course of CRAM + file writing. + - skip-sort-order-check?: When creating a CRAM index for the CRAM file, + the CRAM writer, by default, checks if the header is declared as + `SO:coordinate` and raises an error if not. + If this option is set to true, the CRAM writer will skip the header check + and create an index file regardless of the header declaration." + ^CRAMWriter [f {:keys [reference create-index?] :as opts}] (let [file (cio/file f) url (cio/as-url file) url' (str url) file-id (subs url' 0 (min 20 (count url'))) - out (cio/output-stream file) + out (DataOutputStream. (cio/output-stream file)) + index-writer (when create-index? + (crai/writer (util/as-url (str url' ".crai")))) seq-resolver (some-> reference resolver/seq-resolver resolver/cached-resolver) - wtr (writer/->CRAMWriter url out seq-resolver)] + wtr (writer/->CRAMWriter url out seq-resolver index-writer opts)] (writer/write-file-definition wtr file-id) wtr)) diff --git a/src/cljam/io/cram/encode/alignment_stats.clj b/src/cljam/io/cram/encode/alignment_stats.clj index dbf3d139..3c30fbc9 100644 --- a/src/cljam/io/cram/encode/alignment_stats.clj +++ b/src/cljam/io/cram/encode/alignment_stats.clj @@ -1,4 +1,13 @@ -(ns cljam.io.cram.encode.alignment-stats) +(ns cljam.io.cram.encode.alignment-stats + (:import [java.util HashMap])) + +(defn- update-single-alignment-span! [^longs cur-span ^long pos ^long end] + (let [cur-pos (aget cur-span 0)] + (when (pos? pos) + (when (or (zero? cur-pos) (< pos cur-pos)) + (aset cur-span 0 pos)) + (when (< (aget cur-span 1) end) + (aset cur-span 1 end))))) (defprotocol IAlignmentStatsBuilder (update! [_ ri pos end nbases nrecords]) @@ -9,39 +18,33 @@ (deftype AlignmentStatsBuilder [^:unsynchronized-mutable first? ^:unsynchronized-mutable ^long ref-index - ^:unsynchronized-mutable ^long start - ^:unsynchronized-mutable ^long end ^:unsynchronized-mutable ^long nbases - ^:unsynchronized-mutable ^long nrecords] + ^:unsynchronized-mutable ^long nrecords + ^longs span] IAlignmentStatsBuilder (build [_] (let [ri (if first? -1 ref-index)] (->AlignmentStats ri - (if (neg? ri) 0 start) - (if (neg? ri) 0 end) + (if (neg? ri) 0 (aget span 0)) + (if (neg? ri) 0 (aget span 1)) nbases nrecords))) (update! [this ri pos end nbases nrecords] - (let [ri (long ri) - pos (long pos) - end (long end)] + (let [ri (long ri)] (cond first? (do (set! ref-index ri) (set! first? false)) (not= ri ref-index) (set! ref-index -2)) - (when (and (>= ref-index 0) (pos? pos)) - (when (or (zero? (.-start this)) (< pos (.-start this))) - (set! (.-start this) pos)) - (when (< (.-end this) end) - (set! (.-end this) end))) + (when (>= ref-index 0) + (update-single-alignment-span! span pos end)) (set! (.-nbases this) (+ (.-nbases this) (long nbases))) (set! (.-nrecords this) (+ (.-nrecords this) (long nrecords)))))) (defn make-alignment-stats-builder "Creates a new alignment stats builder." [] - (->AlignmentStatsBuilder true 0 0 0 0 0)) + (->AlignmentStatsBuilder true 0 0 0 (long-array 2))) (defn merge-stats "Merges multiple alignment stats into one." @@ -51,3 +54,32 @@ (update! builder ri start end nbases nrecords)) ss) (build builder))) + +(defprotocol IAlignmentSpansBuilder + (build-spans [_]) + (update-span! [_ ri pos end])) + +(defn- ref-index-comparator [^long ri1 ^long ri2] + (cond (= ri1 ri2) 0 + (= ri1 -1) 1 + (= ri2 -1) -1 + :else (compare ri1 ri2))) + +(defn make-alignment-spans-builder + "Creates a new alignment spans builder." + [] + (let [ri->span (HashMap.)] + (reify IAlignmentSpansBuilder + (build-spans [_] + (into (sorted-map-by ref-index-comparator) + (map (fn [[ri ^longs span]] + (let [start (aget span 0) + end (aget span 1)] + [ri {:start start, :span (inc (- end start))}]))) + ri->span)) + (update-span! [_ ri pos end] + (let [span (or (get ri->span ri) + (let [span (long-array 2)] + (.put ri->span ri span) + span))] + (update-single-alignment-span! span pos end)))))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index 6b80a3c7..ffa4ed01 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -112,6 +112,40 @@ (BA (aget bs i))) (encode-qual record QS)))) +(defn- build-cram-record-encoder + [cram-header rname->idx tag-dict {:keys [BF CF] :as ds-encoders} tag-encoders stats-builder] + (let [pos-encoder (build-positional-data-encoder cram-header ds-encoders) + name-encoder (build-read-name-encoder ds-encoders) + mate-encoder (build-mate-read-encoder rname->idx ds-encoders) + tags-encoder (build-auxiliary-tags-encoder tag-dict ds-encoders tag-encoders) + mapped-encoder (build-mapped-read-encoder ds-encoders) + unmapped-encoder (build-unmapped-read-encoder ds-encoders)] + (fn [record] + (let [bf (bit-and (long (:flag record)) + (bit-not (sam.flag/encoded #{:next-reversed :next-unmapped})))] + (stats/update! stats-builder + (::ref-index record) (:pos record) (::end record) + (count (:seq record)) 1) + (BF bf) + (CF (::flag record)) + (pos-encoder record) + (name-encoder record) + (mate-encoder record) + (tags-encoder record) + (if (sam.flag/unmapped? (:flag record)) + (unmapped-encoder record) + (mapped-encoder record)))))) + +(defn encode-slice-records + "Encodes CRAM records in a slice all at once using the given data series encoders + and tag encoders. Returns the alignment stats for this slice." + [cram-header rname->idx tag-dict ds-encoders tag-encoders records] + (let [stats-builder (stats/make-alignment-stats-builder) + record-encoder (build-cram-record-encoder cram-header rname->idx tag-dict + ds-encoders tag-encoders stats-builder)] + (run! record-encoder records) + (stats/build stats-builder))) + (defn- add-mismatches [n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] (loop [i (long n), rpos (long rpos), spos (long spos), fs fs] @@ -163,52 +197,20 @@ \P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n})))) [(persistent! fs) (dec rpos)]))))) -(defn- build-cram-record-encoder - [seq-resolver cram-header tag-dict subst-mat {:keys [BF CF] :as ds-encoders} - tag-encoders stats-builder] - (let [rname->idx (into {} - (map-indexed (fn [i {:keys [SN]}] [SN i])) - (:SQ cram-header)) - pos-encoder (build-positional-data-encoder cram-header ds-encoders) - name-encoder (build-read-name-encoder ds-encoders) - mate-encoder (build-mate-read-encoder rname->idx ds-encoders) - tags-encoder (build-auxiliary-tags-encoder tag-dict ds-encoders tag-encoders) - mapped-encoder (build-mapped-read-encoder ds-encoders) - unmapped-encoder (build-unmapped-read-encoder ds-encoders)] - (fn [record] - (let [bf (bit-and (long (:flag record)) - (bit-not (sam.flag/encoded #{:next-reversed :next-unmapped}))) - ;; these flag bits of CF are hard-coded at the moment: - ;; - 0x01: quality scores stored as array (true) - ;; - 0x02: detached (true) - ;; - 0x04: has mate downstream (false) - cf (cond-> 0x03 - (= (:seq record) "*") (bit-or 0x08)) - ri (ref-index rname->idx (:rname record)) - [fs end] (calculate-read-features&end seq-resolver subst-mat record) - record' (assoc record ::flag cf ::ref-index ri ::features fs)] - (stats/update! stats-builder ri (:pos record') end (count (:seq record')) 1) - (BF bf) - (CF cf) - (pos-encoder record') - (name-encoder record') - (mate-encoder record') - (tags-encoder record') - (if (sam.flag/unmapped? (:flag record')) - (unmapped-encoder record') - (mapped-encoder record')))))) - -(defn encode-slice-records - "Encodes CRAM records in a slice all at once using the given data series encoders - and tag encoders. Returns the alignment stats for this slice." - [seq-resolver cram-header tag-dict subst-mat ds-encoders tag-encoders records] - (let [stats-builder (stats/make-alignment-stats-builder) - record-encoder (build-cram-record-encoder seq-resolver - cram-header - tag-dict - subst-mat - ds-encoders - tag-encoders - stats-builder)] - (run! record-encoder records) - (stats/build stats-builder))) +(defn preprocess-slice-records + "Preprocesses slice records to calculate some record fields prior to record + encoding that are necessary for the CRAM writer to generate some header + components." + [seq-resolver rname->idx subst-mat ^objects records] + (dotimes [i (alength records)] + (let [record (aget records i) + ;; these flag bits of CF are hard-coded at the moment: + ;; - 0x01: quality scores stored as array (true) + ;; - 0x02: detached (true) + ;; - 0x04: has mate downstream (false) + cf (cond-> 0x03 + (= (:seq record) "*") (bit-or 0x08)) + ri (ref-index rname->idx (:rname record)) + [fs end] (calculate-read-features&end seq-resolver subst-mat record) + record' (assoc record ::flag cf ::ref-index ri ::end end ::features fs)] + (aset records i record')))) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index a0c40eef..dd5f53a4 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -3,6 +3,7 @@ [cljam.io.cram.data-series :as ds] [cljam.io.cram.decode.record :as record] [cljam.io.cram.decode.structure :as struct] + [cljam.io.cram.encode.alignment-stats :as stats] [cljam.io.cram.seq-resolver.protocol :as resolver] [cljam.io.protocols :as protocols] [cljam.io.util.byte-buffer :as bb] @@ -81,21 +82,25 @@ (inc (- (long end) offset)))))) (.-seq-resolver rdr)))) -(defn- read-slice-records [^CRAMReader rdr bb compression-header slice-header] - (let [blocks (into [] (map (fn [_] (struct/decode-block bb))) - (range (:blocks slice-header))) - core-block (first (filter #(zero? (long (:content-id %))) blocks)) - bs-decoder (when core-block - (bs/make-bit-stream-decoder (:data core-block))) - ds-decoders (ds/build-data-series-decoders compression-header bs-decoder blocks) - tag-decoders (ds/build-tag-decoders compression-header bs-decoder blocks)] - (record/decode-slice-records (seq-resolver-for-slice rdr slice-header blocks) - (.-qname-generator rdr) - @(.-header rdr) - compression-header - slice-header - ds-decoders - tag-decoders))) +(defn- read-slice-records + ([rdr bb compression-header slice-header] + (read-slice-records rdr nil bb compression-header slice-header)) + ([^CRAMReader rdr seq-resolver bb compression-header slice-header] + (let [blocks (into [] (map (fn [_] (struct/decode-block bb))) + (range (:blocks slice-header))) + core-block (first (filter #(zero? (long (:content-id %))) blocks)) + bs-decoder (when core-block + (bs/make-bit-stream-decoder (:data core-block))) + ds-decoders (ds/build-data-series-decoders compression-header bs-decoder blocks) + tag-decoders (ds/build-tag-decoders compression-header bs-decoder blocks)] + (record/decode-slice-records (or seq-resolver + (seq-resolver-for-slice rdr slice-header blocks)) + (.-qname-generator rdr) + @(.-header rdr) + compression-header + slice-header + ds-decoders + tag-decoders)))) (defn- with-each-slice-header [^ByteBuffer bb f slice-offsets] (let [container-header-end (.position bb) @@ -225,3 +230,73 @@ (if @(.-index rdr) (read-alignments-in-region-with-index rdr chr start end) (read-alignments-in-region-without-index rdr chr start end))) + +(defn- slice-index-entries + [^CRAMReader rdr seq-resolver rname->idx bb compression-header slice-header] + (if (= (:ref-seq-id slice-header) -2) + (let [slice-records (read-slice-records rdr seq-resolver bb compression-header slice-header) + spans-builder (stats/make-alignment-spans-builder)] + (run! (fn [{:keys [rname] :as record}] + (let [ri (if (= rname "*") -1 (get rname->idx rname))] + (stats/update-span! spans-builder ri (:pos record) (:end record)))) + slice-records) + (mapv (fn [[ri {:keys [start span]}]] + {:ref-seq-id ri, :start start, :span span}) + (stats/build-spans spans-builder))) + [(select-keys slice-header [:ref-seq-id :start :span])])) + +(defn- with-each-slice-header&offset [container-header ^ByteBuffer bb f] + (let [container-header-end (.position bb) + compression-header (struct/decode-compression-header-block bb) + slice-offsets (vec (:landmarks container-header))] + (mapcat (fn [^long slice-offset ^long slice-end] + (let [slice-start (+ container-header-end slice-offset) + _ (.position ^Buffer bb slice-start) + slice-header (struct/decode-slice-header-block bb) + slice-size (- slice-end slice-offset)] + (f compression-header slice-header slice-offset slice-size))) + slice-offsets + (-> slice-offsets + (conj (:length container-header)) + rest)))) + +;; The current implementation of the CRAM index entry generator decodes each +;; entire CRAM record for records in multiple reference slices, which requires +;; the reference file to restore the read sequence of those records although +;; it's unnecessary in theory. +;; The following stub implementation of the seq resolver allows the CRAM index +;; entry generator to decode the slice records without the limitation of requiring +;; the real reference file. +(defn- stub-seq-resolver [] + (reify resolver/ISeqResolver + (resolve-sequence [_ _]) + (resolve-sequence [_ _ start end] + (byte-array (inc (- (long end) (long start))) (byte (int \N)))))) + +(defn generate-index-entries + "Generates CRAM index entries for the given CRAM file, returning them as a sequence + of maps which is intended to be consumed with `cljam.io.crai/write-index-entries`." + [^CRAMReader rdr] + (let [^FileChannel ch (.-channel rdr) + seq-resolver (stub-seq-resolver) + rname->idx (into {} + (map-indexed (fn [i {:keys [SN]}] [SN i])) + (:SQ @(.-header rdr)))] + (letfn [(read-fn [container-offset] + (fn [container-header bb] + (when-not (struct/eof-container? container-header) + (with-each-slice-header&offset container-header bb + (fn [compression-header slice-header slice-offset slice-size] + (->> (slice-index-entries rdr seq-resolver rname->idx bb + compression-header slice-header) + (map #(assoc % + :container-offset container-offset + :slice-offset slice-offset + :size slice-size)))))))) + (step [] + (let [container-offset (.position ch)] + (when (< container-offset (.size ch)) + (when-let [entries (read-container-with rdr (read-fn container-offset))] + (concat entries (lazy-seq (step)))))))] + (.position ch (long @(.-offset rdr))) + (step)))) diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index bcc96b56..d76ef76c 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -1,22 +1,26 @@ (ns cljam.io.cram.writer - (:require [cljam.io.cram.data-series :as ds] + (:require [cljam.io.crai :as crai] + [cljam.io.cram.data-series :as ds] [cljam.io.cram.encode.alignment-stats :as stats] [cljam.io.cram.encode.record :as record] [cljam.io.cram.encode.structure :as struct] [cljam.io.cram.seq-resolver.protocol :as resolver] - [cljam.io.protocols :as protocols]) - (:import [java.io Closeable OutputStream] + [cljam.io.protocols :as protocols] + [cljam.io.sam.util.header :as sam.header]) + (:import [java.io Closeable DataOutputStream OutputStream] [java.security MessageDigest])) (declare write-header write-alignments) -(deftype CRAMWriter [url stream seq-resolver] +(deftype CRAMWriter [url stream seq-resolver index-writer options] Closeable (close [_] (struct/encode-eof-container stream) (.close ^OutputStream stream) (when seq-resolver - (.close ^Closeable seq-resolver))) + (.close ^Closeable seq-resolver)) + (when index-writer + (.close ^Closeable index-writer))) protocols/IWriter (writer-url [_] url) protocols/IAlignmentWriter @@ -36,15 +40,19 @@ (defn write-header "Writes the CRAM header." [^CRAMWriter wtr header] + (when (and (.-index-writer wtr) + (not (:skip-sort-order-check? (.-options wtr))) + (not= (sam.header/sort-order header) sam.header/order-coordinate)) + (throw + (ex-info "Cannot create CRAM index file for CRAM file not declared as sorted by coordinate" + {:sort-order (sam.header/sort-order header)}))) (struct/encode-cram-header-container (.-stream wtr) header)) -(defn- reference-md5 [seq-resolver header {:keys [^long ri ^long start ^long end]}] - (if (neg? ri) - (byte-array 16) - (let [chr (:SN (nth (:SQ header) ri)) - ref-bases (resolver/resolve-sequence seq-resolver chr start end) - md5 (MessageDigest/getInstance "md5")] - (.digest md5 ref-bases)))) +(defn- preprocess-records + [seq-resolver rname->idx subst-mat ^objects container-records] + (dotimes [i (alength container-records)] + (let [slice-records (aget container-records i)] + (record/preprocess-slice-records seq-resolver rname->idx subst-mat slice-records)))) (defn- build-tag-dictionary [^objects container-records] (let [tags->index (volatile! {})] @@ -98,6 +106,14 @@ m entry)) {} tag-dict)) +(defn- reference-md5 [seq-resolver cram-header {:keys [^long ri ^long start ^long end]}] + (if (neg? ri) + (byte-array 16) + (let [chr (:SN (nth (:SQ cram-header) ri)) + ref-bases (resolver/resolve-sequence seq-resolver chr start end) + md5 (MessageDigest/getInstance "md5")] + (.digest md5 ref-bases)))) + (defn- stats->header-base [{:keys [ri ^long start ^long end nbases nrecords]}] {:ref-seq-id ri :start start @@ -106,10 +122,10 @@ :records nrecords}) (defn- generate-slice - [seq-resolver header counter tag-dict subst-mat tag-encodings slice-records] + [seq-resolver cram-header rname->idx counter tag-dict tag-encodings slice-records] (let [ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings) tag-encoders (ds/build-tag-encoders tag-encodings) - stats (record/encode-slice-records seq-resolver header tag-dict subst-mat + stats (record/encode-slice-records cram-header rname->idx tag-dict ds-encoders tag-encoders slice-records) ds-results (mapcat #(%) (vals ds-encoders)) tag-results (for [[_tag v] tag-encoders @@ -126,28 +142,28 @@ vals (cons {:content-id 0 :data (struct/generate-block :raw 5 0 (byte-array 0))})) - ref-md5 (reference-md5 seq-resolver header stats) - header-block (struct/generate-slice-header-block - (assoc (stats->header-base stats) - :counter counter - :embedded-reference -1 - :reference-md5 ref-md5) - blocks) + ref-md5 (reference-md5 seq-resolver cram-header stats) + header (assoc (stats->header-base stats) + :counter counter + :embedded-reference -1 + :reference-md5 ref-md5) + header-block (struct/generate-slice-header-block header blocks) block-data (mapv :data blocks)] - {:header-block header-block - :data-blocks block-data + {:header header :stats stats + :header-block header-block + :data-blocks block-data :counter (+ (long counter) (long (:nrecords stats))) :size (->> block-data (map #(alength ^bytes %)) (apply + (alength header-block)))})) (defn- generate-slices - [seq-resolver header counter tag-dict subst-mat tag-encodings container-records] + [seq-resolver cram-header rname->idx counter tag-dict tag-encodings container-records] (loop [[slice-records & more] container-records, counter counter, acc []] (if slice-records - (let [slice (generate-slice seq-resolver header counter tag-dict subst-mat - tag-encodings slice-records)] + (let [slice (generate-slice seq-resolver cram-header rname->idx counter + tag-dict tag-encodings slice-records)] (recur more (:counter slice) (conj acc slice))) acc))) @@ -169,22 +185,58 @@ (apply + 1)) :landmarks landmarks))) -(defn- write-container [^CRAMWriter wtr header counter container-records] - (let [^OutputStream out (.-stream wtr) +(defn- slice-index-entries [slice-header slice-records] + (if (= (:ref-seq-id slice-header) -2) + (let [spans-builder (stats/make-alignment-spans-builder)] + (run! (fn [record] + (stats/update-span! spans-builder (::record/ref-index record) + (:pos record) (::record/end record))) + slice-records) + (mapv (fn [[ri {:keys [start span]}]] + {:ref-seq-id ri, :start start, :span span}) + (stats/build-spans spans-builder))) + [(select-keys slice-header [:ref-seq-id :start :span])])) + +(defn- container-index-entries + [container-offset container-header slices container-records] + (for [[offset {:keys [header size]} records] (map vector + (:landmarks container-header) + slices + container-records) + entry (slice-index-entries header records)] + (assoc entry + :container-offset container-offset + :slice-offset offset + :size size))) + +(defn- write-index-entries + [index-writer container-offset container-header slices container-records] + (let [entries (container-index-entries container-offset container-header + slices container-records)] + (crai/write-index-entries index-writer entries))) + +(defn- write-container [^CRAMWriter wtr cram-header counter container-records] + (let [^DataOutputStream out (.-stream wtr) ds-encodings ds/default-data-series-encodings + seq-resolver (.-seq-resolver wtr) + rname->idx (into {} + (map-indexed (fn [i {:keys [SN]}] [SN i])) + (:SQ cram-header)) subst-mat {\A {\T 0, \G 1, \C 2, \N 3} \T {\A 0, \G 1, \C 2, \N 3} \G {\A 0, \T 1, \C 2, \N 3} \C {\A 0, \T 1, \G 2, \N 3} \N {\A 0, \T 1, \G 2, \C 3}} + _ (preprocess-records seq-resolver rname->idx subst-mat container-records) tag-dict (build-tag-dictionary container-records) tag-encodings (build-tag-encodings tag-dict) - slices (generate-slices (.-seq-resolver wtr) header counter tag-dict - subst-mat tag-encodings container-records) + slices (generate-slices seq-resolver cram-header rname->idx + counter tag-dict tag-encodings container-records) compression-header-block (struct/generate-compression-header-block {:RN true, :AP false, :RR true} subst-mat tag-dict ds-encodings tag-encodings) container-header (generate-container-header compression-header-block slices) + container-offset (.size out) counter' (:counter (peek slices))] (struct/encode-container-header out (assoc container-header :counter counter)) (.write out compression-header-block) @@ -192,6 +244,9 @@ (.write out header-block) (run! #(.write out ^bytes %) data-blocks)) slices) + (when-let [index-writer (.-index-writer wtr)] + (write-index-entries index-writer container-offset container-header + slices container-records)) counter')) (defn- partition-alignments [slices-per-container records-per-slice alns] diff --git a/test-resources/cram/test.sorted.cram b/test-resources/cram/test.sorted.cram new file mode 100644 index 00000000..bb18b934 Binary files /dev/null and b/test-resources/cram/test.sorted.cram differ diff --git a/test-resources/cram/test.sorted_with_unknown_so.cram b/test-resources/cram/test.sorted_with_unknown_so.cram new file mode 100644 index 00000000..4ab7bd02 Binary files /dev/null and b/test-resources/cram/test.sorted_with_unknown_so.cram differ diff --git a/test/cljam/algo/cram_indexer_test.clj b/test/cljam/algo/cram_indexer_test.clj new file mode 100644 index 00000000..b155f5ed --- /dev/null +++ b/test/cljam/algo/cram_indexer_test.clj @@ -0,0 +1,35 @@ +(ns cljam.algo.cram-indexer-test + (:require [cljam.algo.cram-indexer :as indexer] + [cljam.io.crai :as crai] + [cljam.io.cram :as cram] + [cljam.test-common :as common] + [cljam.util :as util] + [clojure.java.io :as io] + [clojure.test :refer [deftest is]])) + +(defn- read-index-entries [f] + (with-open [r (io/reader (util/compressor-input-stream f))] + (doall (#'crai/read-index-entries r)))) + +(deftest create-index-test + (let [f (io/file common/temp-dir "medium.cram.crai")] + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} + (is (thrown-with-msg? Exception #"Cannot create CRAM index file .*" + (indexer/create-index common/medium-cram-file f))) + (indexer/create-index common/medium-cram-file f + :skip-sort-order-check? true) + (is (= (read-index-entries common/medium-crai-file) + (read-index-entries f)))))) + +(deftest cram-without-alignments-test + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} + (let [header {:HD {:SO "coordinate"} + :SQ [{:SN "chr1", :LN 100}]} + target (io/file common/temp-dir "no_aln.cram") + target-crai (io/file common/temp-dir "no_aln.cram.crai")] + (with-open [w (cram/writer target)] + (cram/write-header w header) + (cram/write-refs w header)) + (is (common/not-throw? (indexer/create-index target target-crai)))))) diff --git a/test/cljam/io/crai_test.clj b/test/cljam/io/crai_test.clj index a8889560..197e68b3 100644 --- a/test/cljam/io/crai_test.clj +++ b/test/cljam/io/crai_test.clj @@ -1,7 +1,9 @@ (ns cljam.io.crai-test (:require [cljam.io.crai :as crai] [cljam.test-common :as common] - [clojure.test :refer [deftest are]])) + [cljam.util :as util] + [clojure.java.io :as io] + [clojure.test :refer [are deftest is]])) (def ^:private test-refs (->> (concat (range 1 23) ["X" "Y"]) @@ -116,3 +118,24 @@ :container-offset 378365 :slice-offset 190037 :size 12326}]))) + +(deftest write-index-entries-test + (let [entries [{:ref-seq-id 0, :start 546609, :span 205262429, + :container-offset 324, :slice-offset 563, :size 22007} + {:ref-seq-id 0, :start 206547069, :span 42644506, + :container-offset 324, :slice-offset 22570, :size 7349} + {:ref-seq-id 1, :start 67302, :span 231638920, + :container-offset 30272, :slice-offset 563, :size 21618} + {:ref-seq-id -1, :start 0, :span 0, + :container-offset 354657, :slice-offset 563, :size 23119} + {:ref-seq-id -1, :start 0, :span 0, + :container-offset 378365, :slice-offset 171, :size 23494} + {:ref-seq-id -1, :start 0, :span 0, + :container-offset 378365, :slice-offset 23665, :size 23213}] + f (io/file common/temp-dir "test.cram.crai")] + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} + (with-open [w (crai/writer f)] + (crai/write-index-entries w entries)) + (with-open [r (io/reader (util/compressor-input-stream f))] + (is (= entries (#'crai/read-index-entries r))))))) diff --git a/test/cljam/io/cram/encode/alignment_stats_test.clj b/test/cljam/io/cram/encode/alignment_stats_test.clj index 00551aa9..c113564b 100644 --- a/test/cljam/io/cram/encode/alignment_stats_test.clj +++ b/test/cljam/io/cram/encode/alignment_stats_test.clj @@ -49,3 +49,22 @@ (into {} (stats/merge-stats [{:ri -1, :start 0, :end 0, :nbases 10000, :nrecords 20} {:ri -1, :start 0, :end 0, :nbases 15000, :nrecords 30}])))))) + +(deftest make-alignment-spans-builder-test + (let [builder (stats/make-alignment-spans-builder) + records [{:ri 0, :start 1, :end 100} + {:ri 0, :start 51, :end 150} + {:ri 1, :start 51, :end 150} + {:ri 1, :start 151, :end 300} + {:ri 2, :start 51, :end 150} + {:ri 2, :start 200, :end 300} + {:ri -1, :start 0, :end 0} + {:ri -1, :start 0, :end 0}]] + (run! (fn [{:keys [ri start end]}] + (stats/update-span! builder ri start end)) + records) + (is (= {0 {:start 1, :span 150} + 1 {:start 51, :span 250} + 2 {:start 51, :span 250} + -1 {:start 0, :span 1}} + (stats/build-spans builder))))) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index 87664a7d..0793584d 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -68,6 +68,38 @@ [[{:code :softclip, :pos 1, :bases (mapv int "TT")}] 4])) +(deftest preprocess-slice-records-test + (let [rname->idx {"ref" 0} + records (object-array + [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"} + {:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"} + {:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"} + {:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"} + {:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"} + {:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*"}])] + (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records) + (is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH" + ::record/flag 0x03, ::record/ref-index 0, ::record/end 5 + ::record/features [{:code :subst, :pos 3 :subst 0}]} + {:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC" + ::record/flag 0x03, ::record/ref-index 0, ::record/end 7 + ::record/features [{:code :softclip, :pos 1, :bases [(int \C) (int \C)]}]} + {:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF" + ::record/flag 0x03, ::record/ref-index 0, ::record/end 14 + ::record/features []} + {:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF" + ::record/flag 0x03, ::record/ref-index 0, ::record/end 19 + ::record/features [{:code :insertion, :pos 2, :bases [(int \A)]} + {:code :deletion, :pos 4, :len 1}]} + {:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE" + ::record/flag 0x03, ::record/ref-index -1, ::record/end 0 + ::record/features []} + {:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*" + ::record/flag 0x0b, ::record/ref-index -1, ::record/end 10 + ::record/features []}] + (walk/prewalk #(if (.isArray (class %)) (vec %) %) + records))))) + (deftest encode-slice-records-test (testing "mapped reads" (let [cram-header {:SQ @@ -76,6 +108,9 @@ :RG [{:ID "rg001"} {:ID "rg002"}]} + rname->idx (into {} + (map-indexed (fn [i {:keys [SN]}] [SN i])) + (:SQ cram-header)) tag-dict [[] [{:tag :MD, :type \Z} {:tag :NM, :type \c}]] @@ -87,34 +122,36 @@ :NM {\c {:codec :byte-array-len :len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]} :val-encoding {:codec :external, :content-id 5131619}}}}) - records [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0, - :cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" - :options [{:RG {:type "Z", :value "rg001"}} - {:MD {:type "Z", :value "2C2"}} - {:NM {:type "c", :value 1}}] - ::record/tags-index 1} - {:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15, - :cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" - :options [{:RG {:type "Z", :value "rg001"}} - {:MD {:type "Z", :value "3"}} - {:NM {:type "c", :value 0}}] - ::record/tags-index 1} - {:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60, - :cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" - :options [{:RG {:type "Z", :value "rg002"}} - {:MD {:type "Z", :value "5"}} - {:NM {:type "c", :value 0}}] - ::record/tags-index 1} - {:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15, - :cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" - :options [{:RG {:type "Z", :value "rg002"}} - {:MD {:type "Z", :value "3^T2"}} - {:NM {:type "c", :value 2}}] - ::record/tags-index 1} - {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, - :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" - :options [], ::record/tags-index 0}] - stats (record/encode-slice-records test-seq-resolver cram-header tag-dict subst-mat + records (object-array + [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0, + :cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "2C2"}} + {:NM {:type "c", :value 1}}] + ::record/tags-index 1} + {:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15, + :cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "3"}} + {:NM {:type "c", :value 0}}] + ::record/tags-index 1} + {:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60, + :cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}] + ::record/tags-index 1} + {:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15, + :cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "3^T2"}} + {:NM {:type "c", :value 2}}] + ::record/tags-index 1} + {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, + :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" + :options [], ::record/tags-index 0}]) + _ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records) + stats (record/encode-slice-records cram-header rname->idx tag-dict ds-encoders tag-encoders records) ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders) tag-res (walk/prewalk #(if (fn? %) (%) %) tag-encoders)] @@ -269,25 +306,30 @@ (let [cram-header {:SQ [{:SN "ref"} {:SN "ref2"}]} + rname->idx (into {} + (map-indexed (fn [i {:keys [SN]}] [SN i])) + (:SQ cram-header)) tag-dict [[]] ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings) - records [{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, - :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF" - :options [], ::record/tags-index 0} - {:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, - :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD" - :options [], ::record/tags-index 0} - {:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, - :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF" - :options [], ::record/tags-index 0} - {:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, - :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD" - :options [], ::record/tags-index 0} - {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, - :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" - :options [], ::record/tags-index 0}] - stats (record/encode-slice-records test-seq-resolver cram-header tag-dict - subst-mat ds-encoders {} records) + records (object-array + [{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF" + :options [], ::record/tags-index 0} + {:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD" + :options [], ::record/tags-index 0} + {:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF" + :options [], ::record/tags-index 0} + {:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD" + :options [], ::record/tags-index 0} + {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" + :options [], ::record/tags-index 0}]) + _ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records) + stats (record/encode-slice-records cram-header rname->idx tag-dict + ds-encoders {} records) ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)] (is (= {:ri -1, :start 0, :end 0, :nbases 25, :nrecords 5} (into {} stats))) diff --git a/test/cljam/io/cram/writer_test.clj b/test/cljam/io/cram/writer_test.clj new file mode 100644 index 00000000..d4145f21 --- /dev/null +++ b/test/cljam/io/cram/writer_test.clj @@ -0,0 +1,70 @@ +(ns cljam.io.cram.writer-test + (:require [cljam.io.cram.encode.record :as record] + [cljam.io.cram.writer :as writer] + [clojure.test :refer [deftest is testing]])) + +(deftest container-index-entries-test + (testing "no multiple reference slices" + (is (= [{:ref-seq-id 0, :start 123, :span 456 + :container-offset 100, :slice-offset 200, :size 1000} + {:ref-seq-id 0, :start 987, :span 654 + :container-offset 100, :slice-offset 1200, :size 900} + {:ref-seq-id 1, :start 1234, :span 5678 + :container-offset 100, :slice-offset 2100, :size 1100}] + (#'writer/container-index-entries + 100 + {:landmarks [200 1200 2100]} + [{:header {:ref-seq-id 0, :start 123, :span 456} + :size 1000} + {:header {:ref-seq-id 0, :start 987, :span 654} + :size 900} + {:header {:ref-seq-id 1, :start 1234, :span 5678} + :size 1100}] + [[{::record/ref-index 0, :pos 123 ::record/end 273} + {::record/ref-index 0, :pos 428 ::record/end 578}] + [{::record/ref-index 0, :pos 987 ::record/end 1137} + {::record/ref-index 0, :pos 1490 ::record/end 1640}] + [{::record/ref-index 1, :pos 1234 ::record/end 1384} + {::record/ref-index 1, :pos 6761 ::record/end 6911}]]))) + (is (= [{:ref-seq-id -1, :start 0, :span 0 + :container-offset 100, :slice-offset 200, :size 1000} + {:ref-seq-id -1, :start 0, :span 0 + :container-offset 100, :slice-offset 1200, :size 900}] + (#'writer/container-index-entries + 100 + {:landmarks [200 1200]} + [{:header {:ref-seq-id -1, :start 0, :span 0} + :size 1000} + {:header {:ref-seq-id -1, :start 0, :span 0} + :size 900}] + [[{::record/ref-index -1, :pos 0 ::record/end 0} + {::record/ref-index -1, :pos 0 ::record/end 0}] + [{::record/ref-index -1, :pos 0 ::record/end 0} + {::record/ref-index -1, :pos 0 ::record/end 0}]])))) + (testing "some multiple reference slices" + (is (= [{:ref-seq-id 0, :start 123, :span 456 + :container-offset 100, :slice-offset 200, :size 1000} + {:ref-seq-id 0, :start 987, :span 654 + :container-offset 100, :slice-offset 1200, :size 900} + {:ref-seq-id 1, :start 1234, :span 5678 + :container-offset 100, :slice-offset 1200, :size 900} + ;; Both of :start/:span should be zero for unmapped slices in respect of + ;; the CRAM specification, but it's guaranteed by the CRAI writer, + ;; so it doesn't matter what values these keys actually take here + {:ref-seq-id -1, :start 0, :span 1 + :container-offset 100, :slice-offset 1200, :size 900}] + (#'writer/container-index-entries + 100 + {:landmarks [200 1200]} + [{:header {:ref-seq-id 0, :start 123, :span 456} + :size 1000} + {:header {:ref-seq-id -2, :start 0, :span 0} + :size 900}] + [[{::record/ref-index 0, :pos 123 ::record/end 273} + {::record/ref-index 0, :pos 428 ::record/end 578}] + [{::record/ref-index 0, :pos 987 ::record/end 1138} + {::record/ref-index 0, :pos 1490 ::record/end 1640} + {::record/ref-index 1, :pos 1234 ::record/end 1384} + {::record/ref-index 1, :pos 6761 ::record/end 6911} + {::record/ref-index -1, :pos 0 ::record/end 0} + {::record/ref-index -1, :pos 0 ::record/end 0}]]))))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index 8ff2902e..7e8d6269 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -9,6 +9,8 @@ [clojure.test :refer [are deftest is testing]])) (def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) +(def ^:private temp-cram-file-2 (io/file common/temp-dir "test2.cram")) +(def ^:private temp-cram-file-3 (io/file common/temp-dir "test3.cram")) (defn- fixup-bam-aln [aln] (-> (into {} aln) @@ -105,3 +107,42 @@ (cram/read-header r'))) (is (= (cram/read-alignments r) (cram/read-alignments r')))))) + +(deftest writer-index-options-test + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (testing "A CRAM index file won't be created by default" + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file-2 + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (is (not (.exists (io/file (str temp-cram-file-2 ".crai")))))) + (testing "A CRAM index file will be created if `:create-index?` is set to true" + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file-2 + {:reference common/test-fa-file + :create-index? true})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (is (.exists (io/file (str temp-cram-file-2 ".crai"))))) + (testing "Error when trying to create an index file for a CRAM file not declared as `SO:coordinate`" + (with-open [r (cram/reader common/test-sorted-with-unknown-so-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file-3 + {:reference common/test-fa-file + :create-index? true})] + (is (thrown-with-msg? Exception #"Cannot create CRAM index file for CRAM file not declared as sorted by coordinate" + (cram/write-header w (cram/read-header r)))))) + (testing "`:skip-sort-order-check?` skips the header check when creating an index file" + (with-open [r (cram/reader common/test-sorted-with-unknown-so-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file-3 + {:reference common/test-fa-file + :create-index? true + :skip-sort-order-check? true})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (is (.exists (io/file (str temp-cram-file-3 ".crai"))))))) diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index 5eb2ac4f..91524567 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -166,6 +166,8 @@ ;; ### CRAM files (def test-cram-file "test-resources/cram/test.cram") +(def test-sorted-cram-file "test-resources/cram/test.sorted.cram") +(def test-sorted-with-unknown-so-cram-file "test-resources/cram/test.sorted_with_unknown_so.cram") (def medium-cram-file "test-resources/cram/medium.cram") (def medium-with-standard-tags-cram-file "test-resources/cram/medium_with_standard_tags.cram") (def medium-without-index-cram-file "test-resources/cram/medium_without_index.cram")