From 77c63483110144f9b0f0ea63da13d7a401e37d7e Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 16 Apr 2026 17:16:44 +1000 Subject: [PATCH] added wasm module --- rust/Cargo.lock | 24 ++ rust/Cargo.toml | 1 + rust/bioscript-formats/src/alignment.rs | 274 ++++++++++++---------- rust/bioscript-formats/src/genotype.rs | 58 ++++- rust/bioscript-formats/src/inspect.rs | 202 +++++++++++++++- rust/bioscript-formats/src/lib.rs | 5 +- rust/bioscript-wasm/Cargo.toml | 25 ++ rust/bioscript-wasm/src/js_reader.rs | 106 +++++++++ rust/bioscript-wasm/src/lib.rs | 296 ++++++++++++++++++++++++ 9 files changed, 864 insertions(+), 127 deletions(-) create mode 100644 rust/bioscript-wasm/Cargo.toml create mode 100644 rust/bioscript-wasm/src/js_reader.rs create mode 100644 rust/bioscript-wasm/src/lib.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 7be5391..055f9fd 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -153,6 +153,20 @@ dependencies = [ "serde_yaml", ] +[[package]] +name = "bioscript-wasm" +version = "0.1.0" +dependencies = [ + "bioscript-core", + "bioscript-formats", + "console_error_panic_hook", + "js-sys", + "noodles", + "serde", + "serde_json", + "wasm-bindgen", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -355,6 +369,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + [[package]] name = "const-oid" version = "0.10.2" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 537e055..ca3d8b4 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -7,6 +7,7 @@ members = [ "bioscript-formats", "bioscript-runtime", "bioscript-schema", + "bioscript-wasm", ] [profile.dev] diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index a40a44a..c319d0d 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -1,6 +1,6 @@ use std::{ collections::{BTreeMap, HashSet}, - io::Seek as _, + io::{BufRead, Read, Seek}, path::Path, }; @@ -19,7 +19,7 @@ use bioscript_core::{GenomicLocus, RuntimeError}; use crate::genotype::GenotypeLoadOptions; #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) enum AlignmentOpKind { +pub enum AlignmentOpKind { Match, Insertion, Deletion, @@ -32,13 +32,13 @@ pub(crate) enum AlignmentOpKind { } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) struct AlignmentOp { +pub struct AlignmentOp { pub kind: AlignmentOpKind, pub len: usize, } #[derive(Debug, Clone)] -pub(crate) struct AlignmentRecord { +pub struct AlignmentRecord { pub start: i64, pub end: i64, pub is_unmapped: bool, @@ -56,38 +56,46 @@ pub(crate) fn for_each_cram_record( options: &GenotypeLoadOptions, reference_file: &Path, locus: &GenomicLocus, - mut on_record: F, + on_record: F, ) -> Result<(), RuntimeError> where F: FnMut(AlignmentRecord) -> Result, { let repository = build_reference_repository(reference_file)?; - let mut builder = - cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); - - if let Some(index_path) = options.input_index.as_ref() { - let index = crai::fs::read(index_path).map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM index {} for {}: {err}", - index_path.display(), - path.display() - )) - })?; - builder = builder.set_index(index); - } + let mut reader = build_cram_indexed_reader_from_path(path, options, repository)?; + let label = path.display().to_string(); + for_each_cram_record_with_reader(&mut reader, &label, locus, on_record) +} - let mut reader = builder.build_from_path(path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open indexed CRAM {}: {err}", - path.display() - )) +pub(crate) fn query_cram_records( + path: &Path, + options: &GenotypeLoadOptions, + reference_file: &Path, + locus: &GenomicLocus, +) -> Result, RuntimeError> { + let mut records = Vec::new(); + for_each_cram_record(path, options, reference_file, locus, |record| { + records.push(record); + Ok(true) })?; + Ok(records) +} +/// Iterate decoded alignment records intersecting `locus`, streaming from an +/// already-built CRAM `IndexedReader`. This is the reader-based entry point +/// used by non-filesystem callers (e.g. wasm with a JS ReadAt shim). +pub fn for_each_cram_record_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + mut on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(AlignmentRecord) -> Result, +{ let header = reader.read_header().map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM header {}: {err}", - path.display() - )) + RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")) })?; let region = build_region(&header, locus).ok_or_else(|| { @@ -100,55 +108,32 @@ where let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; stream_selected_alignment_records( - path, - &mut reader, + label, + reader, &header, ®ion, locus.end, &selected_containers, &mut on_record, - )?; - - Ok(()) + ) } -pub(crate) fn for_each_raw_cram_record( - path: &Path, - options: &GenotypeLoadOptions, - reference_file: &Path, +/// Iterate raw CRAM records intersecting `locus`, streaming from an +/// already-built CRAM `IndexedReader`. The raw variant preserves the +/// `cram::Record` handle so callers can pull base+quality at a specific +/// reference position (needed for SNP pileups). +pub fn for_each_raw_cram_record_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, locus: &GenomicLocus, mut on_record: F, ) -> Result<(), RuntimeError> where + R: Read + Seek, F: FnMut(cram::Record<'_>) -> Result, { - let repository = build_reference_repository(reference_file)?; - let mut builder = - cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); - - if let Some(index_path) = options.input_index.as_ref() { - let index = crai::fs::read(index_path).map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM index {} for {}: {err}", - index_path.display(), - path.display() - )) - })?; - builder = builder.set_index(index); - } - - let mut reader = builder.build_from_path(path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open indexed CRAM {}: {err}", - path.display() - )) - })?; - let header = reader.read_header().map_err(|err| { - RuntimeError::Io(format!( - "failed to read CRAM header {}: {err}", - path.display() - )) + RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")) })?; let region = build_region(&header, locus).ok_or_else(|| { @@ -161,33 +146,94 @@ where let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; stream_selected_cram_records( - path, - &mut reader, + label, + reader, &header, ®ion, locus.end, &selected_containers, &mut on_record, - )?; + ) +} - Ok(()) +/// Build a CRAM `IndexedReader` over any `Read + Seek` source given a parsed +/// CRAI index and a reference repository. Mirrors `build_from_path` but with +/// an externally-provided reader — the wasm path uses this with a JS-backed +/// reader; native paths still go through the path-based helper below. +pub fn build_cram_indexed_reader_from_reader( + reader: R, + crai_index: crai::Index, + repository: fasta::Repository, +) -> Result, RuntimeError> +where + R: Read, +{ + cram::io::indexed_reader::Builder::default() + .set_reference_sequence_repository(repository) + .set_index(crai_index) + .build_from_reader(reader) + .map_err(|err| RuntimeError::Io(format!("failed to build indexed CRAM reader: {err}"))) } -pub(crate) fn query_cram_records( +/// Build a FASTA `Repository` over any `BufRead + Seek + Send + Sync` source +/// given a parsed FAI index. The `Send + Sync + 'static` bounds come from +/// `fasta::Repository`'s internal `Arc>` +/// cache — on single-threaded wasm32 these can be met via `unsafe impl`. +pub fn build_reference_repository_from_readers( + reader: R, + fai_index: fasta::fai::Index, +) -> fasta::Repository +where + R: BufRead + Seek + Send + Sync + 'static, +{ + let indexed = fasta::io::IndexedReader::new(reader, fai_index); + fasta::Repository::new(FastaIndexedReader::new(indexed)) +} + +/// Parse a CRAM index (`.crai`) from an in-memory byte buffer. Used by wasm +/// callers that receive the small index inline while the big CRAM stays on a +/// JS-backed reader. +pub fn parse_crai_bytes(bytes: &[u8]) -> Result { + crai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse CRAM index bytes: {err}"))) +} + +/// Parse a FASTA index (`.fai`) from an in-memory byte buffer. +pub fn parse_fai_bytes(bytes: &[u8]) -> Result { + fasta::fai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}"))) +} + +pub(crate) fn build_cram_indexed_reader_from_path( path: &Path, options: &GenotypeLoadOptions, - reference_file: &Path, - locus: &GenomicLocus, -) -> Result, RuntimeError> { - let mut records = Vec::new(); - for_each_cram_record(path, options, reference_file, locus, |record| { - records.push(record); - Ok(true) - })?; - Ok(records) + repository: fasta::Repository, +) -> Result, RuntimeError> { + let mut builder = + cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); + + if let Some(index_path) = options.input_index.as_ref() { + let index = crai::fs::read(index_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM index {} for {}: {err}", + index_path.display(), + path.display() + )) + })?; + builder = builder.set_index(index); + } + + builder.build_from_path(path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open indexed CRAM {}: {err}", + path.display() + )) + }) } -fn build_reference_repository(reference_file: &Path) -> Result { +pub(crate) fn build_reference_repository(reference_file: &Path) -> Result { let reader = fasta::io::indexed_reader::Builder::default() .build_from_path(reference_file) .map_err(|err| { @@ -245,9 +291,9 @@ fn select_query_containers( .collect()) } -fn stream_selected_alignment_records( - path: &Path, - reader: &mut cram::io::indexed_reader::IndexedReader, +fn stream_selected_alignment_records( + label: &str, + reader: &mut cram::io::indexed_reader::IndexedReader, header: &sam::Header, region: &Region, locus_end: i64, @@ -255,25 +301,26 @@ fn stream_selected_alignment_records( on_record: &mut F, ) -> Result<(), RuntimeError> where + R: Read + Seek, F: FnMut(AlignmentRecord) -> Result, { stream_selected_cram_records( - path, + label, reader, header, region, locus_end, selected_containers, &mut |record| { - let alignment_record = build_alignment_record_from_cram(path, &record)?; + let alignment_record = build_alignment_record_from_cram(label, &record)?; on_record(alignment_record) }, ) } -fn stream_selected_cram_records( - path: &Path, - reader: &mut cram::io::indexed_reader::IndexedReader, +fn stream_selected_cram_records( + label: &str, + reader: &mut cram::io::indexed_reader::IndexedReader, header: &sam::Header, region: &Region, locus_end: i64, @@ -281,6 +328,7 @@ fn stream_selected_cram_records( on_record: &mut F, ) -> Result<(), RuntimeError> where + R: Read + Seek, F: FnMut(cram::Record<'_>) -> Result, { let interval = region.interval(); @@ -292,16 +340,14 @@ where .seek(std::io::SeekFrom::Start(offset)) .map_err(|err| { RuntimeError::Io(format!( - "failed to seek CRAM container at offset {offset} in {}: {err}", - path.display() + "failed to seek CRAM container at offset {offset} in {label}: {err}" )) })?; let mut container = Container::default(); let len = reader.read_container(&mut container).map_err(|err| { RuntimeError::Io(format!( - "failed to read CRAM container at offset {offset} in {}: {err}", - path.display() + "failed to read CRAM container at offset {offset} in {label}: {err}" )) })?; @@ -311,8 +357,7 @@ where let compression_header = container.compression_header().map_err(|err| { RuntimeError::Io(format!( - "failed to decode CRAM compression header from {}: {err}", - path.display() + "failed to decode CRAM compression header from {label}: {err}" )) })?; @@ -324,16 +369,13 @@ where for (index, slice_result) in container.slices().enumerate() { let slice = slice_result.map_err(|err| { RuntimeError::Io(format!( - "failed to read CRAM slice from {}: {err}", - path.display() + "failed to read CRAM slice from {label}: {err}" )) })?; let Some(&landmark_i32) = landmarks.get(index) else { return Err(RuntimeError::Io(format!( - "missing CRAM slice landmark {} in {}", - index, - path.display() + "missing CRAM slice landmark {index} in {label}" ))); }; let Ok(landmark) = u64::try_from(landmark_i32) else { @@ -345,8 +387,7 @@ where let (core_data_src, external_data_srcs) = slice.decode_blocks().map_err(|err| { RuntimeError::Io(format!( - "failed to decode CRAM slice blocks from {}: {err}", - path.display() + "failed to decode CRAM slice blocks from {label}: {err}" )) })?; @@ -359,7 +400,7 @@ where &external_data_srcs, true, |record| { - let alignment_record = match build_alignment_record_from_cram(path, record) { + let alignment_record = match build_alignment_record_from_cram(label, record) { Ok(r) => r, Err(e) => { callback_err = Some(e); @@ -394,13 +435,10 @@ where Ok(()) => {} Err(err) if is_reference_md5_mismatch(&err) => { eprintln!( - "[bioscript] warning: CRAM reference MD5 mismatch for {} slice landmark {} — \ + "[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \ retrying without checksum validation. Results may be incorrect if the \ supplied reference differs from the one used to encode this CRAM. \ - Details: {}", - path.display(), - landmark, - err + Details: {err}" ); callback_err = None; stop = false; @@ -414,7 +452,7 @@ where false, |record| { let alignment_record = - match build_alignment_record_from_cram(path, record) { + match build_alignment_record_from_cram(label, record) { Ok(r) => r, Err(e) => { callback_err = Some(e); @@ -449,15 +487,13 @@ where ) .map_err(|err| { RuntimeError::Io(format!( - "failed to decode CRAM slice records from {} (unchecked): {err}", - path.display() + "failed to decode CRAM slice records from {label} (unchecked): {err}" )) })?; } Err(err) => { return Err(RuntimeError::Io(format!( - "failed to decode CRAM slice records from {}: {err}", - path.display() + "failed to decode CRAM slice records from {label}: {err}" ))); } } @@ -485,13 +521,12 @@ fn is_reference_md5_mismatch(err: &std::io::Error) -> bool { } fn build_alignment_record_from_cram( - path: &Path, + label: &str, record: &cram::Record<'_>, ) -> Result { let flags = record.flags().map_err(|err| { RuntimeError::Io(format!( - "failed to read CRAM record flags from {}: {err}", - path.display() + "failed to read CRAM record flags from {label}: {err}" )) })?; let is_unmapped = flags.is_unmapped(); @@ -499,14 +534,12 @@ fn build_alignment_record_from_cram( let start = match record.alignment_start() { Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { RuntimeError::Unsupported(format!( - "record alignment start exceeds i64 range in {}", - path.display() + "record alignment start exceeds i64 range in {label}" )) })?, Some(Err(err)) => { return Err(RuntimeError::Io(format!( - "failed to read CRAM alignment_start from {}: {err}", - path.display() + "failed to read CRAM alignment_start from {label}: {err}" ))); } None => 0, @@ -515,14 +548,12 @@ fn build_alignment_record_from_cram( let end = match record.alignment_end() { Some(Ok(pos)) => i64::try_from(usize::from(pos)).map_err(|_| { RuntimeError::Unsupported(format!( - "record alignment end exceeds i64 range in {}", - path.display() + "record alignment end exceeds i64 range in {label}" )) })?, Some(Err(err)) => { return Err(RuntimeError::Io(format!( - "failed to read CRAM alignment_end from {}: {err}", - path.display() + "failed to read CRAM alignment_end from {label}: {err}" ))); } None => start, @@ -534,8 +565,7 @@ fn build_alignment_record_from_cram( .map(|result| { result.map(map_op).map_err(|err| { RuntimeError::Io(format!( - "failed to read record CIGAR from {}: {err}", - path.display() + "failed to read record CIGAR from {label}: {err}" )) }) }) diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 74df992..794f5f8 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -2,13 +2,14 @@ use std::{ collections::{BTreeMap, BTreeSet, HashMap}, fmt::Write as _, fs::File, - io::{BufRead, BufReader}, + io::{BufRead, BufReader, Read, Seek}, path::{Path, PathBuf}, str::FromStr, }; use noodles::bgzf; use noodles::core::Position; +use noodles::cram; use noodles::sam::alignment::Record as _; use zip::ZipArchive; @@ -852,6 +853,20 @@ fn observe_snp_pileup( locus: &GenomicLocus, reference: char, alternate: char, +) -> Result { + let repository = alignment::build_reference_repository(reference_file)?; + let mut reader = + alignment::build_cram_indexed_reader_from_path(cram_path, options, repository)?; + let label = cram_path.display().to_string(); + snp_pileup_with_reader(&mut reader, &label, locus, reference, alternate) +} + +fn snp_pileup_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, ) -> Result { let mut counts = SnpPileupCounts::default(); let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { @@ -860,7 +875,7 @@ fn observe_snp_pileup( .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; let reference_base = reference as u8; - alignment::for_each_raw_cram_record(cram_path, options, reference_file, locus, |record| { + alignment::for_each_raw_cram_record_with_reader(reader, label, locus, |record| { let flags = record .flags() .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; @@ -938,6 +953,45 @@ fn observe_snp_pileup( Ok(counts) } +/// Observe a SNP at `locus` over an already-built CRAM `IndexedReader` and +/// reference repository (held by the reader). Mirrors the internal +/// `CramBackend::observe_snp` but reader-based, so non-filesystem callers +/// (e.g. wasm with a JS-backed reader) don't need a `GenotypeStore` or paths. +/// +/// `matched_rsid` and `assembly` are passed through to the returned +/// observation unchanged — callers that already know them (e.g. from +/// compiling a YAML variant) should supply them; otherwise `None`. +pub fn observe_cram_snp_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + matched_rsid: Option, + assembly: Option, +) -> Result { + let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate)?; + let ref_count = pileup.filtered_ref_count; + let alt_count = pileup.filtered_alt_count; + let depth = pileup.filtered_depth; + let evidence = pileup.evidence_lines(&describe_locus(locus), locus.start); + + Ok(VariantObservation { + backend: "cram".to_owned(), + matched_rsid, + assembly, + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: pileup.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, + }) +} + fn normalize_pileup_base(base: u8) -> Option { match (base as char).to_ascii_uppercase() { 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index ff0aada..42739aa 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -2,9 +2,37 @@ use std::{ fs::File, io::{BufRead, BufReader, Cursor, Read}, path::{Path, PathBuf}, - time::Instant, }; +#[cfg(not(target_arch = "wasm32"))] +use std::time::Instant; + +// std::time::Instant::now() panics on wasm32-unknown-unknown ("time not +// implemented on this platform"). duration_ms is diagnostic-only, so on wasm +// we stub the timer instead of pulling in a perf/date shim crate. +#[cfg(target_arch = "wasm32")] +struct Instant; + +#[cfg(target_arch = "wasm32")] +impl Instant { + fn now() -> Self { + Self + } + fn elapsed(&self) -> StubDuration { + StubDuration + } +} + +#[cfg(target_arch = "wasm32")] +struct StubDuration; + +#[cfg(target_arch = "wasm32")] +impl StubDuration { + fn as_millis(&self) -> u128 { + 0 + } +} + use bioscript_core::{Assembly, RuntimeError}; use noodles::bgzf; use zip::ZipArchive; @@ -122,6 +150,178 @@ impl FileInspection { } } +/// Classify a file from in-memory bytes. Mirrors `inspect_file` but sources +/// its sample lines / zip entries from a byte buffer instead of the +/// filesystem. Needed by wasm targets where `std::fs` isn't available. +/// +/// `name` is used for extension-based detection (.cram / .bam / .fa / .zip / +/// .vcf.gz) and vendor sniffing from the filename. `bytes` is the file +/// contents; for zips we scan the central directory out of these bytes. +pub fn inspect_bytes( + name: &str, + bytes: &[u8], + options: &InspectOptions, +) -> Result { + let started = Instant::now(); + let lower = name.to_ascii_lowercase(); + let mut evidence = Vec::new(); + let mut warnings = Vec::new(); + let path = Path::new(name); + + if lower.ends_with(".zip") { + let selected_entry = select_zip_entry_from_bytes(bytes)?; + let sample_lines = read_zip_sample_lines_from_bytes(bytes, &selected_entry)?; + let mut inspection = inspect_from_textual_sample( + path, + FileContainer::Zip, + &selected_entry, + &sample_lines, + options, + ); + inspection.duration_ms = started.elapsed().as_millis(); + return Ok(inspection); + } + + let detected_kind = if lower.ends_with(".cram") { + evidence.push("extension .cram".to_owned()); + DetectedKind::AlignmentCram + } else if lower.ends_with(".bam") { + evidence.push("extension .bam".to_owned()); + DetectedKind::AlignmentBam + } else if is_reference_path(path) { + evidence.push("reference fasta extension".to_owned()); + DetectedKind::ReferenceFasta + } else { + let sample_lines = read_plain_sample_lines_from_bytes(&lower, bytes)?; + let sample_lower = sample_lines.join("\n").to_ascii_lowercase(); + if looks_like_vcf_lines(&sample_lines) { + evidence.push("vcf header markers".to_owned()); + DetectedKind::Vcf + } else if looks_like_genotype_text(&sample_lines) { + if sample_lower.contains("rsid") || sample_lower.contains("allele1") { + evidence.push("genotype-like sampled rows and headers".to_owned()); + } else { + evidence.push("genotype-like sampled rows".to_owned()); + } + DetectedKind::GenotypeText + } else { + warnings.push("file did not match known textual heuristics".to_owned()); + DetectedKind::Unknown + } + }; + + let sample_lines = match detected_kind { + DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { + Vec::new() + } + _ => read_plain_sample_lines_from_bytes(&lower, bytes)?, + }; + let source = detect_source(&lower, &sample_lines, detected_kind); + let assembly = detect_assembly(&lower, &sample_lines); + let phased = (detected_kind == DetectedKind::Vcf) + .then(|| detect_vcf_phasing(&sample_lines)) + .flatten(); + // Index discovery is filesystem-only; wasm callers pass indexes separately + // through `InspectOptions.input_index` / `reference_index` and we surface + // whichever is provided. + let has_index = options + .input_index + .as_ref() + .or(options.reference_index.as_ref()) + .map(|_| true); + let index_path = options + .input_index + .clone() + .or_else(|| options.reference_index.clone()); + let confidence = classify_confidence(detected_kind, &sample_lines, source.as_ref()); + + Ok(FileInspection { + path: path.to_path_buf(), + container: FileContainer::Plain, + detected_kind, + confidence, + source, + assembly, + phased, + selected_entry: None, + has_index, + index_path, + reference_matches: None, + evidence, + warnings, + duration_ms: started.elapsed().as_millis(), + }) +} + +fn read_plain_sample_lines_from_bytes( + lower_name: &str, + bytes: &[u8], +) -> Result, RuntimeError> { + if lower_name.ends_with(".vcf.gz") { + return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(Cursor::new( + bytes, + )))); + } + read_sample_lines_from_reader(BufReader::new(Cursor::new(bytes))) +} + +fn read_zip_sample_lines_from_bytes( + bytes: &[u8], + selected_entry: &str, +) -> Result, RuntimeError> { + let mut archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; + let mut entry = archive.by_name(selected_entry).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {selected_entry} from bytes: {err}" + )) + })?; + if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { + let mut inner = Vec::new(); + entry.read_to_end(&mut inner).map_err(|err| { + RuntimeError::Io(format!( + "failed to read compressed zip entry {selected_entry}: {err}" + )) + })?; + let reader = bgzf::io::Reader::new(Cursor::new(inner)); + return read_sample_lines_from_reader(BufReader::new(reader)); + } + read_sample_lines_from_reader(BufReader::new(entry)) +} + +fn select_zip_entry_from_bytes(bytes: &[u8]) -> Result { + let mut archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; + let mut fallback = None; + for idx in 0..archive.len() { + let entry = archive + .by_index(idx) + .map_err(|err| RuntimeError::Io(format!("failed to inspect zip bytes: {err}")))?; + if entry.is_dir() { + continue; + } + let name = entry.name().to_owned(); + if name.starts_with("__MACOSX/") { + continue; + } + let lower = name.to_ascii_lowercase(); + if lower.ends_with(".vcf") + || lower.ends_with(".vcf.gz") + || lower.ends_with(".txt") + || lower.ends_with(".tsv") + || lower.ends_with(".csv") + { + return Ok(name); + } + if fallback.is_none() { + fallback = Some(name); + } + } + fallback.ok_or_else(|| { + RuntimeError::Unsupported("zip archive does not contain a supported file".to_owned()) + }) +} + pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result { let started = Instant::now(); let lower = path.to_string_lossy().to_ascii_lowercase(); diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index 687c969..065c099 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -7,16 +7,17 @@ clippy::unused_self )] -mod alignment; +pub mod alignment; mod genotype; mod inspect; mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, + observe_cram_snp_with_reader, }; pub use inspect::{ DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, - SourceMetadata, inspect_file, + SourceMetadata, inspect_bytes, inspect_file, }; pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; diff --git a/rust/bioscript-wasm/Cargo.toml b/rust/bioscript-wasm/Cargo.toml new file mode 100644 index 0000000..7a4dce2 --- /dev/null +++ b/rust/bioscript-wasm/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "bioscript-wasm" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["cdylib", "rlib"] + +[dependencies] +bioscript-core = { path = "../bioscript-core" } +bioscript-formats = { path = "../bioscript-formats" } +noodles = { version = "0.109.0", features = ["cram", "fasta"] } +wasm-bindgen = "0.2" +js-sys = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +console_error_panic_hook = { version = "0.1", optional = true } + +[features] +default = ["console_error_panic_hook"] + +[profile.release] +opt-level = "z" +lto = true +codegen-units = 1 diff --git a/rust/bioscript-wasm/src/js_reader.rs b/rust/bioscript-wasm/src/js_reader.rs new file mode 100644 index 0000000..e28af35 --- /dev/null +++ b/rust/bioscript-wasm/src/js_reader.rs @@ -0,0 +1,106 @@ +//! A `Read + Seek` shim backed by a JS `readAt(offset, length) -> Uint8Array` +//! callback. The host JS (Node or browser) owns the file handle; we ask for +//! byte ranges on demand so a 20 GB CRAM never needs to load into memory. +//! +//! On wasm32-unknown-unknown there are no real threads, so the `Send + Sync` +//! unsafe impls below are sound — `fasta::Repository` requires them for its +//! `Arc>` cache. + +use std::io::{self, Read, Seek, SeekFrom}; + +use js_sys::{Function, Uint8Array}; +use wasm_bindgen::JsValue; + +pub struct JsReader { + read_at: Function, + length: u64, + position: u64, + label: String, +} + +impl JsReader { + pub fn new(read_at: Function, length: u64, label: impl Into) -> Self { + Self { + read_at, + length, + position: 0, + label: label.into(), + } + } +} + +impl Read for JsReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if buf.is_empty() || self.position >= self.length { + return Ok(0); + } + let remaining = self.length - self.position; + let want = u64::try_from(buf.len()).unwrap_or(u64::MAX).min(remaining); + let result = self + .read_at + .call2( + &JsValue::NULL, + &JsValue::from_f64(self.position as f64), + &JsValue::from_f64(want as f64), + ) + .map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!( + "{} readAt({}, {}) threw: {:?}", + self.label, self.position, want, err + ), + ) + })?; + let array = Uint8Array::from(result); + let got = array.byte_length() as usize; + if got == 0 { + return Ok(0); + } + if got > buf.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "{} readAt returned {} bytes but caller asked for {}", + self.label, + got, + buf.len() + ), + )); + } + array.copy_to(&mut buf[..got]); + self.position += got as u64; + Ok(got) + } +} + +impl Seek for JsReader { + fn seek(&mut self, pos: SeekFrom) -> io::Result { + let new_pos = match pos { + SeekFrom::Start(n) => n as i128, + SeekFrom::End(n) => self.length as i128 + n as i128, + SeekFrom::Current(n) => self.position as i128 + n as i128, + }; + if new_pos < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("{} seek before start of stream", self.label), + )); + } + self.position = new_pos as u64; + Ok(self.position) + } +} + +// SAFETY: wasm32-unknown-unknown is single-threaded. `JsValue`/`Function` are +// `!Send + !Sync` in the general case because shared access from multiple OS +// threads would race, but under single-threaded wasm that scenario can't +// happen. The `fasta::Repository` cache requires `Send + Sync` on its adapter +// to satisfy `Arc>` — this unsafe impl lets us satisfy that bound +// without the runtime ever actually crossing a thread boundary. +// +// Applied unconditionally: on native targets this crate is only ever built +// for type-checking (there's no real `js_sys::Function` available anyway), so +// the `Send + Sync` impls are equally unobservable there. +unsafe impl Send for JsReader {} +unsafe impl Sync for JsReader {} diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs new file mode 100644 index 0000000..6a38bae --- /dev/null +++ b/rust/bioscript-wasm/src/lib.rs @@ -0,0 +1,296 @@ +//! Browser-facing bindings around the existing bioscript Rust code. +//! See docs/architecture/bioscript-is-source-of-truth.md — the app layer +//! must not reimplement file parsing or lookups in TS/JS. It goes through here. +//! +//! Current surface: +//! - `inspectBytes(name, bytes, options)` — file classification / vendor sniff +//! - `lookupCramVariants(cramReadAt, cramLen, craiBytes, fastaReadAt, fastaLen, +//! faiBytes, variantsJson)` — SNP lookups against an indexed CRAM + FASTA +//! through JS-supplied random-read callbacks. +//! +//! Pending (see migration checklist in the architecture doc): +//! - `loadGenotypesBytes(name, bytes)` / `lookupVariants(storeId, planJson)` +//! - `compileVariantYaml(yamlText)` +//! - Index-less fallback (linear scan or on-the-fly index build). +//! - Indel / deletion observations on CRAM. + +mod js_reader; + +use std::{io::BufReader, path::PathBuf}; + +use bioscript_core::GenomicLocus; +use bioscript_formats::{ + DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, + SourceMetadata, alignment, inspect_bytes as inspect_bytes_rs, observe_cram_snp_with_reader, +}; +use serde::{Deserialize, Serialize}; +use wasm_bindgen::prelude::*; + +use crate::js_reader::JsReader; + +#[wasm_bindgen(start)] +pub fn start() { + #[cfg(feature = "console_error_panic_hook")] + console_error_panic_hook::set_once(); +} + +#[derive(Default, Deserialize)] +struct InspectOptionsJs { + input_index: Option, + reference_file: Option, + reference_index: Option, +} + +/// Classify bytes as a known genomic file. Mirrors `bioscript-formats::inspect::inspect_bytes`. +/// Returns JSON matching the `Inspection` shape the app already uses. +#[wasm_bindgen(js_name = inspectBytes)] +pub fn inspect_bytes(name: &str, bytes: &[u8], options_json: Option) -> Result { + let options_js: InspectOptionsJs = match options_json { + Some(text) if !text.is_empty() => serde_json::from_str(&text) + .map_err(|err| JsError::new(&format!("invalid InspectOptions JSON: {err}")))?, + _ => InspectOptionsJs::default(), + }; + let options = InspectOptions { + input_index: options_js.input_index.map(PathBuf::from), + reference_file: options_js.reference_file.map(PathBuf::from), + reference_index: options_js.reference_index.map(PathBuf::from), + }; + + let inspection = inspect_bytes_rs(name, bytes, &options) + .map_err(|err| JsError::new(&format!("inspect_bytes failed: {err:?}")))?; + + let resp = InspectionJs::from(inspection); + serde_json::to_string(&resp) + .map_err(|err| JsError::new(&format!("failed to encode response: {err}"))) +} + +#[derive(Deserialize)] +struct VariantInput { + name: String, + chrom: String, + // 1-based genomic position of the SNP. + pos: i64, + #[serde(rename = "ref")] + ref_base: String, + #[serde(rename = "alt")] + alt_base: String, + #[serde(default)] + rsid: Option, + #[serde(default)] + assembly: Option, +} + +#[derive(Serialize)] +struct VariantObservationJs { + name: String, + backend: String, + #[serde(rename = "matchedRsid", skip_serializing_if = "Option::is_none")] + matched_rsid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + assembly: Option, + #[serde(skip_serializing_if = "Option::is_none")] + genotype: Option, + #[serde(rename = "refCount", skip_serializing_if = "Option::is_none")] + ref_count: Option, + #[serde(rename = "altCount", skip_serializing_if = "Option::is_none")] + alt_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + depth: Option, + #[serde(rename = "rawCounts")] + raw_counts: std::collections::BTreeMap, + #[serde(skip_serializing_if = "Option::is_none")] + decision: Option, + evidence: Vec, +} + +/// Observe a list of SNP variants against an indexed CRAM + reference FASTA, +/// with the bulk bytes pulled on demand via JS-supplied `readAt(offset, len)` +/// callbacks. The small index payloads (`.crai`, `.fai`) are passed inline. +/// +/// Both callbacks must return a `Uint8Array` synchronously (or via a Node +/// sync read) — wasm's `Read + Seek` contract is synchronous. Async reads are +/// a follow-up that needs buffered pre-fetch on the JS side. +#[wasm_bindgen(js_name = lookupCramVariants)] +pub fn lookup_cram_variants( + cram_read_at: js_sys::Function, + cram_len: f64, + crai_bytes: &[u8], + fasta_read_at: js_sys::Function, + fasta_len: f64, + fai_bytes: &[u8], + variants_json: &str, +) -> Result { + let crai_index = alignment::parse_crai_bytes(crai_bytes) + .map_err(|err| JsError::new(&format!("parse crai: {err:?}")))?; + let fai_index = alignment::parse_fai_bytes(fai_bytes) + .map_err(|err| JsError::new(&format!("parse fai: {err:?}")))?; + + let fasta_reader = BufReader::new(JsReader::new(fasta_read_at, fasta_len as u64, "fasta")); + let repository = alignment::build_reference_repository_from_readers(fasta_reader, fai_index); + + let cram_reader = JsReader::new(cram_read_at, cram_len as u64, "cram"); + let mut indexed = alignment::build_cram_indexed_reader_from_reader( + cram_reader, + crai_index, + repository, + ) + .map_err(|err| JsError::new(&format!("build cram reader: {err:?}")))?; + + let variants: Vec = serde_json::from_str(variants_json) + .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?; + + let mut results = Vec::with_capacity(variants.len()); + for variant in variants { + let ref_char = variant + .ref_base + .chars() + .next() + .ok_or_else(|| JsError::new(&format!("variant {}: empty ref", variant.name)))?; + let alt_char = variant + .alt_base + .chars() + .next() + .ok_or_else(|| JsError::new(&format!("variant {}: empty alt", variant.name)))?; + let assembly = variant + .assembly + .as_deref() + .and_then(parse_assembly_str); + let locus = GenomicLocus { + chrom: variant.chrom.clone(), + start: variant.pos, + end: variant.pos, + }; + let observation = observe_cram_snp_with_reader( + &mut indexed, + &variant.name, + &locus, + ref_char, + alt_char, + variant.rsid.clone(), + assembly, + ) + .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", variant.name)))?; + results.push(VariantObservationJs { + name: variant.name, + backend: observation.backend, + matched_rsid: observation.matched_rsid, + assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), + genotype: observation.genotype, + ref_count: observation.ref_count, + alt_count: observation.alt_count, + depth: observation.depth, + raw_counts: observation.raw_counts, + decision: observation.decision, + evidence: observation.evidence, + }); + } + + serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + +fn parse_assembly_str(s: &str) -> Option { + match s.to_ascii_lowercase().as_str() { + "grch37" | "hg19" | "b37" => Some(bioscript_core::Assembly::Grch37), + "grch38" | "hg38" => Some(bioscript_core::Assembly::Grch38), + _ => None, + } +} + +// Wire types — we flatten the Rust FileInspection into the shape the app's +// TS Inspection type already expects (matches widgets/FilePicker/types.ts). +#[derive(Serialize)] +struct InspectionJs { + #[serde(rename = "fileName")] + file_name: String, + container: &'static str, + #[serde(rename = "detectedKind")] + detected_kind: &'static str, + confidence: &'static str, + assembly: Option<&'static str>, + phased: Option, + source: Option, + #[serde(rename = "selectedEntry", skip_serializing_if = "Option::is_none")] + selected_entry: Option, + #[serde(rename = "hasIndex", skip_serializing_if = "Option::is_none")] + has_index: Option, + #[serde(rename = "referenceMatches", skip_serializing_if = "Option::is_none")] + reference_matches: Option, + evidence: Vec, + warnings: Vec, + #[serde(rename = "durationMs")] + duration_ms: u128, +} + +#[derive(Serialize)] +struct SourceJs { + vendor: String, + #[serde(rename = "platformVersion", skip_serializing_if = "Option::is_none")] + platform_version: Option, + confidence: &'static str, + evidence: Vec, +} + +impl From for InspectionJs { + fn from(i: FileInspection) -> Self { + InspectionJs { + file_name: i.path.display().to_string(), + container: render_container(i.container), + detected_kind: render_kind(i.detected_kind), + confidence: render_confidence(i.confidence), + assembly: i.assembly.map(render_assembly), + phased: i.phased, + source: i.source.map(SourceJs::from), + selected_entry: i.selected_entry, + has_index: i.has_index, + reference_matches: i.reference_matches, + evidence: i.evidence, + warnings: i.warnings, + duration_ms: i.duration_ms, + } + } +} + +impl From for SourceJs { + fn from(s: SourceMetadata) -> Self { + SourceJs { + vendor: s.vendor.unwrap_or_default(), + platform_version: s.platform_version, + confidence: render_confidence(s.confidence), + evidence: s.evidence, + } + } +} + +fn render_container(c: FileContainer) -> &'static str { + match c { + FileContainer::Plain => "plain", + FileContainer::Zip => "zip", + } +} + +fn render_kind(k: DetectedKind) -> &'static str { + match k { + DetectedKind::GenotypeText => "genotype_text", + DetectedKind::Vcf => "vcf", + DetectedKind::AlignmentCram => "alignment_cram", + DetectedKind::AlignmentBam => "alignment_bam", + DetectedKind::ReferenceFasta => "reference_fasta", + DetectedKind::Unknown => "unknown", + } +} + +fn render_confidence(c: DetectionConfidence) -> &'static str { + match c { + DetectionConfidence::Authoritative => "authoritative", + DetectionConfidence::StrongHeuristic => "strong_heuristic", + DetectionConfidence::WeakHeuristic => "weak_heuristic", + DetectionConfidence::Unknown => "unknown", + } +} + +fn render_assembly(a: bioscript_core::Assembly) -> &'static str { + match a { + bioscript_core::Assembly::Grch37 => "grch37", + bioscript_core::Assembly::Grch38 => "grch38", + } +}