11mod tdf_blobs;
22
3+ use lzf:: decompress as lzf_decompress;
34use memmap2:: Mmap ;
45use std:: fs:: File ;
56use std:: io;
@@ -9,6 +10,7 @@ use zstd::decode_all;
910use crate :: readers:: { TimsTofFileType , TimsTofPathError , TimsTofPathLike } ;
1011
1112const U32_SIZE : usize = std:: mem:: size_of :: < u32 > ( ) ;
13+
1214const HEADER_SIZE : usize = 2 ;
1315
1416#[ derive( Debug ) ]
@@ -23,7 +25,14 @@ impl TdfBlobReader {
2325 Ok ( reader)
2426 }
2527
26- pub fn get ( & self , offset : usize ) -> Result < TdfBlob , TdfBlobReaderError > {
28+ /// Returns a TDF blob with uncompressed data
29+ ///
30+ pub fn get (
31+ & self ,
32+ offset : usize ,
33+ compression_type : u8 ,
34+ max_peaks_per_scan : usize ,
35+ ) -> Result < TdfBlob , TdfBlobReaderError > {
2736 let offset = self . bin_file_reader . global_file_offset + offset;
2837 let byte_count = self
2938 . bin_file_reader
@@ -36,10 +45,126 @@ impl TdfBlobReader {
3645 if data. len ( ) == 0 {
3746 return Err ( TdfBlobReaderError :: EmptyData ) ;
3847 }
39- let bytes =
40- decode_all ( data) . map_err ( |_| TdfBlobReaderError :: Decompression ) ?;
41- let blob = TdfBlob :: new ( bytes) ?;
42- Ok ( blob)
48+ if compression_type == 1 {
49+ let bytes = self . decompress_v1 (
50+ offset,
51+ byte_count,
52+ data,
53+ max_peaks_per_scan,
54+ ) ?;
55+ let blob = TdfBlob :: new ( bytes) ?;
56+ Ok ( blob)
57+ } else {
58+ let bytes = decode_all ( data)
59+ . map_err ( |_| TdfBlobReaderError :: Decompression ) ?;
60+ let blob: TdfBlob = TdfBlob :: new ( bytes) ?;
61+ Ok ( blob)
62+ }
63+ }
64+
65+ /// Get a TDF blob compressed with version 1
66+ /// Basically a reimplementation of the alphatims implementation
67+ /// Returns the uncompressed data compatible
68+ /// * scan_count: 4 bytes
69+ /// * scan_indices: (scan_count) * 4 bytes
70+ /// * scan: remaining bytes
71+ ///
72+ /// # Arguments
73+ /// * `offset` - The offset of the blob in the binary file
74+ /// * `max_peaks_per_scan` - The maximum number of peaks per scan
75+ /// * `data` - The compressed data
76+ /// * `max_peaks_per_scan` - The maximum number of peaks per scan from the metadata
77+ fn decompress_v1 (
78+ & self ,
79+ offset : usize ,
80+ byte_count : usize ,
81+ data : & [ u8 ] ,
82+ max_peaks_per_scan : usize ,
83+ ) -> Result < Vec < u8 > , TdfBlobReaderError > {
84+ // bin_size = int.from_bytes(infile.read(4), "little")
85+ // bin_size == byte_count
86+
87+ // scan_count = int.from_bytes(infile.read(4), "little")
88+ let scan_count = self
89+ . bin_file_reader
90+ . get_scan_count ( offset)
91+ . ok_or ( TdfBlobReaderError :: NoScanCount ) ?;
92+
93+ // TODO: frame_end - frame_start should be equal to bin_size/byte_count?
94+ // max_peak_count = min(
95+ // max_peaks_per_scan,
96+ // frame_end - frame_start
97+ // )
98+ let max_peak_count = std:: cmp:: min ( max_peaks_per_scan, byte_count) ;
99+
100+ // compression_offset = 8 + (scan_count + 1) * 4
101+ let compression_offset = ( scan_count + 1 ) * U32_SIZE ;
102+
103+ // TODO: For some reason scan offsets were i32 not u32. Convert to u32 than to usize for easier indexing
104+ // scan_offsets = np.frombuffer(
105+ // infile.read((scan_count + 1) * 4),
106+ // dtype=np.int32
107+ // ) - compression_offset
108+ let mut scan_offsets = self
109+ . bin_file_reader
110+ . get_scan_offsets ( offset, scan_count)
111+ . ok_or ( TdfBlobReaderError :: CorruptData ) ?;
112+ scan_offsets = scan_offsets. iter_mut ( ) . map ( |x| * x - compression_offset) . collect ( ) ;
113+
114+ // bin_size + scan_count + scan_offsets + compressed_data (which is max_peak_count * 4)
115+ let tdf_bytes_capacity = U32_SIZE + U32_SIZE + scan_offsets. len ( ) * U32_SIZE + scan_count * max_peak_count;
116+
117+ // this is basically the uncompressed frame
118+ // scan_count: 4 bytes
119+ // scan_indices: (scan_count) * 4 bytes
120+ // scan: remaining bytes
121+ let mut tdf_bytes = Vec :: with_capacity ( tdf_bytes_capacity) ;
122+ tdf_bytes. extend_from_slice ( & byte_count. to_le_bytes ( ) ) ;
123+
124+
125+ let mut scan_indexes: Vec < u8 > = Vec :: with_capacity ( scan_count * U32_SIZE ) ;
126+ let mut scans: Vec < u8 > = Vec :: with_capacity ( byte_count) ;
127+
128+ let mut scan_start: u32 = 0 ;
129+ // for scan_index in range(scan_count):
130+ for scan_index in 0 ..scan_count {
131+ //start = scan_offsets[scan_index]
132+ let start = scan_offsets[ scan_index] ;
133+
134+ //end = scan_offsets[scan_index + 1]
135+ let end = scan_offsets[ scan_index + 1 ] ;
136+
137+ //if start == end:
138+ // continue
139+ if start == end {
140+ continue ;
141+ }
142+
143+ //decompressed_bytes = lzf.decompress(
144+ // compressed_data[start: end],
145+ // max_peak_count * 4 * 2
146+ //)
147+ let mut decompressed_bytes = match lzf_decompress (
148+ & data[ start as usize ..end as usize ] ,
149+ max_peak_count * U32_SIZE * 2 ,
150+ ) {
151+ Ok ( bytes) => bytes,
152+ Err ( _) => return Err ( TdfBlobReaderError :: Decompression ) ,
153+ } ;
154+
155+ if decompressed_bytes. len ( ) % U32_SIZE != 0 {
156+ return Err ( TdfBlobReaderError :: CorruptData ) ;
157+ }
158+
159+ scan_indexes. extend_from_slice ( & scan_start. to_le_bytes ( ) ) ;
160+ scan_start = decompressed_bytes. len ( ) as u32 ;
161+ scans. append ( & mut decompressed_bytes) ;
162+ }
163+
164+ tdf_bytes. append ( & mut scan_indexes) ;
165+ tdf_bytes. append ( & mut scans) ;
166+
167+ Ok ( tdf_bytes)
43168 }
44169}
45170
@@ -68,6 +193,8 @@ impl TdfBinFileReader {
68193 Ok ( reader)
69194 }
70195
196+ /// Get byte count, first 4 bytes of the blob
197+ ///
71198 fn get_byte_count ( & self , offset : usize ) -> Option < usize > {
72199 let start = offset as usize ;
73200 let end = start + U32_SIZE as usize ;
@@ -77,14 +204,33 @@ impl TdfBinFileReader {
77204 Some ( byte_count)
78205 }
79206
80- // fn get_scan_count(&self, offset: usize) -> Option<usize> {
81- // let start = (offset + U32_SIZE) as usize;
82- // let end = start + U32_SIZE as usize;
83- // let raw_scan_count = self.mmap.get(start..end)?;
84- // let scan_count =
85- // u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
86- // Some(scan_count)
87- // }
207+ /// Get scan count, second 4 bytes of the blob
208+ ///
209+ fn get_scan_count ( & self , offset : usize ) -> Option < usize > {
210+ let start = ( offset + U32_SIZE ) as usize ;
211+ let end = start + U32_SIZE as usize ;
212+ let raw_scan_count = self . mmap . get ( start..end) ?;
213+ let scan_count =
214+ u32:: from_le_bytes ( raw_scan_count. try_into ( ) . ok ( ) ?) as usize ;
215+ Some ( scan_count)
216+ }
217+
218+ /// Get scan offsets, third 4 bytes of the blob
219+ ///
220+ fn get_scan_offsets ( & self , offset : usize , scan_count : usize ) -> Option < Vec < usize > > {
221+ let start = ( offset + U32_SIZE * 2 ) as usize ;
222+ let end = start + U32_SIZE * ( scan_count + 1 ) as usize ;
223+ let raw_scan_offsets = self . mmap . get ( start..end) ?;
224+ if raw_scan_offsets. len ( ) % U32_SIZE != 0 {
225+ return None ;
226+ }
227+ let scan_offsets = raw_scan_offsets
228+ . chunks_exact ( U32_SIZE )
229+ . map ( |x| u32:: from_le_bytes ( x. try_into ( ) . unwrap ( ) ) )
230+ . map ( |x| x as usize )
231+ . collect :: < Vec < usize > > ( ) ;
232+ Some ( scan_offsets)
233+ }
88234
89235 fn get_data ( & self , offset : usize , byte_count : usize ) -> Option < & [ u8 ] > {
90236 let start = offset + HEADER_SIZE * U32_SIZE ;
@@ -106,10 +252,11 @@ impl IndexedTdfBlobReader {
106252 path : impl TimsTofPathLike ,
107253 binary_offsets : Vec < usize > ,
108254 ) -> Result < Self , IndexedTdfBlobReaderError > {
255+
109256 let blob_reader = TdfBlobReader :: new ( path) ?;
110257 let reader = Self {
111258 binary_offsets,
112- blob_reader : blob_reader ,
259+ blob_reader,
113260 } ;
114261 Ok ( reader)
115262 }
@@ -122,7 +269,13 @@ impl IndexedTdfBlobReader {
122269 . binary_offsets
123270 . get ( index)
124271 . ok_or ( IndexedTdfBlobReaderError :: InvalidIndex ( index) ) ?;
125- let blob = self . blob_reader . get ( offset) ?;
272+ let blob = self . blob_reader . get (
273+ offset,
274+ // TODO: Compression type 1 seems to be irrelevant for minitdf. Correct?
275+ // Set compression to type 2 for latest compression and max peaks to 0 which is only relevant for type 1
276+ 2 ,
277+ 0
278+ ) ?;
126279 Ok ( blob)
127280 }
128281}
@@ -145,6 +298,14 @@ pub enum TdfBlobReaderError {
145298 TimsTofPathError ( #[ from] TimsTofPathError ) ,
146299 #[ error( "No binary file found" ) ]
147300 NoBinary ,
301+ #[ error( "No scan count found" ) ]
302+ NoScanCount ,
303+ #[ error( "No binary size found" ) ]
304+ NoBinarySize ,
305+ #[ error( "Scan offset error" ) ]
306+ ScanOffsetError ,
307+ #[ error( "No scan offsets found" ) ]
308+ NoScanOffsets ,
148309}
149310
150311#[ derive( Debug , thiserror:: Error ) ]
0 commit comments