
Commit f386944

add support for compression level 1
1 parent cf4a7a8 commit f386944

File tree

6 files changed (+253 -42 lines)

Cargo.lock

Lines changed: 21 additions & 14 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ parquet = { version = "53.0.0", optional = true }
 serde = { version = "1.0.210", features = ["derive"], optional = true }
 serde_json = { version = "1.0.128", optional = true }
 timscompress = {version = "0.1.0", optional=true}
+lzf = "1.0.0"
 
 [features]
 tdf = ["rusqlite"]
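
Note: the newly added lzf crate exposes two free functions, compress and decompress. A minimal round-trip sketch (assuming the lzf 1.0.0 API; not part of this commit):

    use lzf::{compress, decompress};

    fn main() {
        // Repetitive input: lzf::compress returns an error
        // (NoCompressionPossible) for data it cannot shrink.
        let original: Vec<u8> =
            b"timsrust".iter().cycle().take(1024).copied().collect();
        let packed = compress(&original).expect("compressible input");
        // decompress takes the expected decompressed size as an upper
        // bound, mirroring the `max_peak_count * 4 * 2` bound used in
        // the blob reader below.
        let unpacked =
            decompress(&packed, original.len()).expect("valid LZF data");
        assert_eq!(original, unpacked);
    }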

src/io/readers/file_readers/tdf_blob_reader.rs

Lines changed: 176 additions & 15 deletions
@@ -1,5 +1,6 @@
 mod tdf_blobs;
 
+use lzf::decompress as lzf_decompress;
 use memmap2::Mmap;
 use std::fs::File;
 use std::io;
@@ -9,6 +10,7 @@ use zstd::decode_all;
 use crate::readers::{TimsTofFileType, TimsTofPathError, TimsTofPathLike};
 
 const U32_SIZE: usize = std::mem::size_of::<u32>();
+
 const HEADER_SIZE: usize = 2;
 
 #[derive(Debug)]
@@ -23,7 +25,14 @@ impl TdfBlobReader {
         Ok(reader)
     }
 
-    pub fn get(&self, offset: usize) -> Result<TdfBlob, TdfBlobReaderError> {
+    /// Returns a TDF blob with uncompressed data.
+    ///
+    pub fn get(
+        &self,
+        offset: usize,
+        compression_type: u8,
+        max_peaks_per_scan: usize,
+    ) -> Result<TdfBlob, TdfBlobReaderError> {
         let offset = self.bin_file_reader.global_file_offset + offset;
         let byte_count = self
             .bin_file_reader
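
Note: with the widened signature, callers now thread the compression scheme and the per-scan peak limit from the TDF metadata into get. A hypothetical caller sketch (the GlobalMetadata key names TimsCompressionType and MaxNumPeaksPerScan are assumptions based on the Bruker schema, not confirmed by this diff):

    use std::collections::HashMap;

    // Hypothetical helper, not part of this commit.
    fn read_frame_blob(
        reader: &TdfBlobReader,
        frame_offset: usize,
        global_metadata: &HashMap<String, String>,
    ) -> Result<TdfBlob, TdfBlobReaderError> {
        let compression_type: u8 = global_metadata
            .get("TimsCompressionType")
            .and_then(|v| v.parse().ok())
            .unwrap_or(2); // default to the current zstd-based scheme
        let max_peaks_per_scan: usize = global_metadata
            .get("MaxNumPeaksPerScan")
            .and_then(|v| v.parse().ok())
            .unwrap_or(0); // only consulted for compression type 1
        reader.get(frame_offset, compression_type, max_peaks_per_scan)
    }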
@@ -36,10 +45,126 @@
         if data.len() == 0 {
             return Err(TdfBlobReaderError::EmptyData);
         }
-        let bytes =
-            decode_all(data).map_err(|_| TdfBlobReaderError::Decompression)?;
-        let blob = TdfBlob::new(bytes)?;
-        Ok(blob)
+        if compression_type == 1 {
+            let bytes = self.decompress_v1(
+                offset,
+                byte_count,
+                data,
+                max_peaks_per_scan,
+            )?;
+            let blob = TdfBlob::new(bytes)?;
+            Ok(blob)
+        } else {
+            let bytes = decode_all(data)
+                .map_err(|_| TdfBlobReaderError::Decompression)?;
+            let blob: TdfBlob = TdfBlob::new(bytes)?;
+            Ok(blob)
+        }
+    }
+
+    /// Decompress a TDF blob compressed with version 1 of the scheme.
+    /// Basically a reimplementation of the alphatims implementation.
+    /// Returns the uncompressed data in the layout:
+    /// * scan_count: 4 bytes
+    /// * scan_indices: scan_count * 4 bytes
+    /// * scan: remaining bytes
+    ///
+    /// # Arguments
+    /// * `offset` - The offset of the blob in the binary file
+    /// * `byte_count` - The size of the blob in bytes
+    /// * `data` - The compressed data
+    /// * `max_peaks_per_scan` - The maximum number of peaks per scan from the metadata
+    fn decompress_v1(
+        &self,
+        offset: usize,
+        byte_count: usize,
+        data: &[u8],
+        max_peaks_per_scan: usize,
+    ) -> Result<Vec<u8>, TdfBlobReaderError> {
+        // bin_size = int.from_bytes(infile.read(4), "little")
+        // bin_size == byte_count
+
+        // scan_count = int.from_bytes(infile.read(4), "little")
+        let scan_count = self
+            .bin_file_reader
+            .get_scan_count(offset)
+            .ok_or(TdfBlobReaderError::NoScanCount)?;
+
+        // TODO: frame_end - frame_start should be equal to bin_size/byte_count?
+        // max_peak_count = min(
+        //     max_peaks_per_scan,
+        //     frame_end - frame_start
+        // )
+        let max_peak_count = std::cmp::min(max_peaks_per_scan, byte_count);
+
+        // compression_offset = 8 + (scan_count + 1) * 4
+        // The 8-byte blob header is already skipped here, since the `data`
+        // returned by get_data starts after it.
+        let compression_offset = (scan_count + 1) * U32_SIZE;
+
+        // TODO: For some reason the scan offsets were i32, not u32. Convert
+        // to u32, then to usize for easier indexing.
+        // scan_offsets = np.frombuffer(
+        //     infile.read((scan_count + 1) * 4),
+        //     dtype=np.int32
+        // ) - compression_offset
+        let mut scan_offsets = self
+            .bin_file_reader
+            .get_scan_offsets(offset, scan_count)
+            .ok_or(TdfBlobReaderError::CorruptData)?;
+        scan_offsets = scan_offsets.iter_mut().map(|x| *x - compression_offset).collect();
+
+        // bin_size + scan_count + scan_offsets + compressed_data
+        // (approximated as scan_count * max_peak_count)
+        let tdf_bytes_capacity = U32_SIZE + U32_SIZE + scan_offsets.len() * U32_SIZE + scan_count * max_peak_count;
+
+        // This is basically the uncompressed frame:
+        // scan_count: 4 bytes
+        // scan_indices: scan_count * 4 bytes
+        // scan: remaining bytes
+        let mut tdf_bytes = Vec::with_capacity(tdf_bytes_capacity);
+        // Written as u32 so the header field stays 4 bytes wide
+        // (usize::to_le_bytes would write 8 bytes on 64-bit targets).
+        tdf_bytes.extend_from_slice(&(byte_count as u32).to_le_bytes());
+
+        let mut scan_indexes: Vec<u8> = Vec::with_capacity(scan_count * U32_SIZE);
+        let mut scans: Vec<u8> = Vec::with_capacity(byte_count);
+
+        let mut scan_start: u32 = 0;
+        // for scan_index in range(scan_count):
+        for scan_index in 0..scan_count {
+            // start = scan_offsets[scan_index]
+            let start = scan_offsets[scan_index];
+
+            // end = scan_offsets[scan_index + 1]
+            let end = scan_offsets[scan_index + 1];
+
+            // if start == end:
+            //     continue
+            if start == end {
+                continue;
+            }
+
+            // decompressed_bytes = lzf.decompress(
+            //     compressed_data[start: end],
+            //     max_peak_count * 4 * 2
+            // )
+            let mut decompressed_bytes = match lzf_decompress(
+                &data[start..end],
+                max_peak_count * U32_SIZE * 2,
+            ) {
+                Ok(bytes) => bytes,
+                Err(_) => return Err(TdfBlobReaderError::Decompression),
+            };
+
+            if decompressed_bytes.len() % U32_SIZE != 0 {
+                return Err(TdfBlobReaderError::CorruptData);
+            }
+
+            scan_indexes.extend_from_slice(&scan_start.to_le_bytes());
+            scan_start = decompressed_bytes.len() as u32;
+            scans.append(&mut decompressed_bytes);
+        }
+
+        tdf_bytes.append(&mut scan_indexes);
+        tdf_bytes.append(&mut scans);
+
+        Ok(tdf_bytes)
     }
 }
 
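
Note: a test for decompress_v1 needs a synthetic type-1 blob. A sketch of the inverse operation, following the layout the comments above describe (illustrative only; build_v1_blob is not part of this commit, and the layout is inferred from the code, not from Bruker documentation):

    // Build an in-memory type-1 blob from raw scans.
    fn build_v1_blob(scans: &[&[u8]]) -> Vec<u8> {
        // lzf::compress fails on data it cannot shrink, so tests should
        // feed it compressible scan contents.
        let compressed: Vec<Vec<u8>> = scans
            .iter()
            .map(|s| lzf::compress(s).expect("compressible scan"))
            .collect();
        let scan_count = scans.len() as u32;
        // Scan offsets are absolute within the blob, i.e. they include the
        // 8-byte header and the (scan_count + 1)-entry offset table.
        let mut pos = (8 + (scans.len() + 1) * 4) as u32;
        let mut offsets = vec![pos];
        for c in &compressed {
            pos += c.len() as u32;
            offsets.push(pos);
        }
        let bin_size = pos; // total blob size in bytes
        let mut blob = Vec::with_capacity(bin_size as usize);
        blob.extend_from_slice(&bin_size.to_le_bytes());
        blob.extend_from_slice(&scan_count.to_le_bytes());
        for o in &offsets {
            blob.extend_from_slice(&o.to_le_bytes());
        }
        for c in &compressed {
            blob.extend_from_slice(c);
        }
        blob
    }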

@@ -68,6 +193,8 @@ impl TdfBinFileReader {
         Ok(reader)
     }
 
+    /// Get byte count, the first 4 bytes of the blob.
+    ///
     fn get_byte_count(&self, offset: usize) -> Option<usize> {
         let start = offset as usize;
         let end = start + U32_SIZE as usize;
@@ -77,14 +204,33 @@
         Some(byte_count)
     }
 
-    // fn get_scan_count(&self, offset: usize) -> Option<usize> {
-    //     let start = (offset + U32_SIZE) as usize;
-    //     let end = start + U32_SIZE as usize;
-    //     let raw_scan_count = self.mmap.get(start..end)?;
-    //     let scan_count =
-    //         u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
-    //     Some(scan_count)
-    // }
+    /// Get scan count, the second 4 bytes of the blob.
+    ///
+    fn get_scan_count(&self, offset: usize) -> Option<usize> {
+        let start = (offset + U32_SIZE) as usize;
+        let end = start + U32_SIZE as usize;
+        let raw_scan_count = self.mmap.get(start..end)?;
+        let scan_count =
+            u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
+        Some(scan_count)
+    }
+
+    /// Get scan offsets: the (scan_count + 1) u32 values starting at the
+    /// third 4-byte word of the blob.
+    fn get_scan_offsets(&self, offset: usize, scan_count: usize) -> Option<Vec<usize>> {
+        let start = (offset + U32_SIZE * 2) as usize;
+        let end = start + U32_SIZE * (scan_count + 1) as usize;
+        let raw_scan_offsets = self.mmap.get(start..end)?;
+        if raw_scan_offsets.len() % U32_SIZE != 0 {
+            return None;
+        }
+        let scan_offsets = raw_scan_offsets
+            .chunks_exact(U32_SIZE)
+            .map(|x| u32::from_le_bytes(x.try_into().unwrap()))
+            .map(|x| x as usize)
+            .collect::<Vec<usize>>();
+        Some(scan_offsets)
+    }
 
     fn get_data(&self, offset: usize, byte_count: usize) -> Option<&[u8]> {
         let start = offset + HEADER_SIZE * U32_SIZE;
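
Note: taken together, the three helpers above imply the following little-endian header layout for a type-1 blob (inferred from the code):

    offset + 0 .. 4                bin_size     (u32)  -> get_byte_count
    offset + 4 .. 8                scan_count   (u32)  -> get_scan_count
    offset + 8 .. 8 + (n + 1) * 4  scan offsets (u32), n = scan_count  -> get_scan_offsets
    offset + 8 .. bin_size         payload  -> get_data

get_data starts right after the 8-byte header, so for type 1 its slice still begins with the offset table; this is why decompress_v1 subtracts only (scan_count + 1) * U32_SIZE rather than the full 8 + (scan_count + 1) * 4 used in the Python reference.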
@@ -106,10 +252,11 @@ impl IndexedTdfBlobReader {
         path: impl TimsTofPathLike,
         binary_offsets: Vec<usize>,
     ) -> Result<Self, IndexedTdfBlobReaderError> {
+
         let blob_reader = TdfBlobReader::new(path)?;
         let reader = Self {
             binary_offsets,
-            blob_reader: blob_reader,
+            blob_reader,
         };
         Ok(reader)
     }
@@ -122,7 +269,13 @@
             .binary_offsets
             .get(index)
             .ok_or(IndexedTdfBlobReaderError::InvalidIndex(index))?;
-        let blob = self.blob_reader.get(offset)?;
+        let blob = self.blob_reader.get(
+            offset,
+            // TODO: Compression type 1 seems to be irrelevant for minitdf. Correct?
+            // Use compression type 2 (the latest scheme) and set max peaks to 0,
+            // which is only relevant for type 1.
+            2,
+            0,
+        )?;
         Ok(blob)
     }
 }
@@ -145,6 +298,14 @@ pub enum TdfBlobReaderError {
     TimsTofPathError(#[from] TimsTofPathError),
     #[error("No binary file found")]
     NoBinary,
+    #[error("No scan count found")]
+    NoScanCount,
+    #[error("No binary size found")]
+    NoBinarySize,
+    #[error("Scan offset error")]
+    ScanOffsetError,
+    #[error("No scan offsets found")]
+    NoScanOffsets,
 }
 
 #[derive(Debug, thiserror::Error)]
