r247: fixed an off-by-1 bug in #4

lh3 · Oct 18, 2024 · e55a3d2 · e55a3d2
1 parent d675136
commit e55a3d2
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 30 deletions.
diff --git a/main.c b/main.c
@@ -5,7 +5,7 @@
 #include "io.h"
 #include "ketopt.h"
 
-#define RB3_VERSION "3.7-r246-dirty"
+#define RB3_VERSION "3.7-r247-dirty"
 
 int main_build(int argc, char *argv[]);
 int main_merge(int argc, char *argv[]);

diff --git a/ssa.c b/ssa.c
@@ -178,7 +178,7 @@ int64_t rb3_ssa_multi(void *km, const rb3_fmi_t *f, const rb3_ssa_t *ssa, int64_
 		rb3_fmi_rank2a(f, x.lo, x.hi, ok, ol);
 		for (l = ok[0]; l < ol[0]; ++l) { // reaching sentinels
 			aux.sa[aux.n_sa].sid = ssa->r2i[l];
-			aux.sa[aux.n_sa].pos = x.off - 1;
+			aux.sa[aux.n_sa].pos = x.off;
 			aux.n_sa++;
 			if (aux.n_sa == aux.max_sa) goto end_ssa_multi;
 		}

diff --git a/tex/ropebwt3.bib b/tex/ropebwt3.bib
@@ -441,3 +441,14 @@ @article{DBLP:journals/csur/Navarro21
 	title = {Indexing Highly Repetitive String Collections, Part {II:} Compressed Indexes},
 	volume = {54},
 	year = {2022}}
+
+@inproceedings{DBLP:conf/icalp/NishimotoT21,
+	author = {Takaaki Nishimoto and Yasuo Tabei},
+	booktitle = {48th International Colloquium on Automata, Languages, and Programming, {ICALP} 2021, July 12-16, 2021, Glasgow, Scotland (Virtual Conference)},
+	editor = {Nikhil Bansal and Emanuela Merelli and James Worrell},
+	pages = {101:1--101:15},
+	publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
+	series = {LIPIcs},
+	title = {Optimal-Time Queries on BWT-Runs Compressed Indexes},
+	volume = {198},
+	year = {2021}}
diff --git a/tex/ropebwt3.tex b/tex/ropebwt3.tex
@@ -80,7 +80,8 @@ \section{Introduction}
 they still take large space.
 The largest lossless k-mer index consists of a few terabases~\citep{Karasikov2020.10.01.322164}.
 
-Compressed full-text indices, such as FM-index~\citep{DBLP:conf/focs/FerraginaM00} and r-index~\citep{DBLP:conf/soda/GagieNP18,DBLP:journals/tcs/BannaiGI20,DBLP:journals/jacm/GagieNP20},
+Compressed full-text indices, such as FM-index~\citep{DBLP:conf/focs/FerraginaM00} r-index~\citep{DBLP:conf/soda/GagieNP18,DBLP:journals/tcs/BannaiGI20,DBLP:journals/jacm/GagieNP20}
+and move index~\citep{DBLP:conf/icalp/NishimotoT21},
 provide an alternative way for fast sequence search~\citep{DBLP:journals/csur/Navarro21}.
 The core component of these data structures is often Burrows-Wheeler Transform (BWT; \citealt*{Burrows:1994aa})
 which is a lossless transformation of strings.
@@ -136,7 +137,7 @@ \section{Introduction}
 
 \section{Methods}
 
-In a nutshell, ropebwt3 computes the partial multi-string BWT of a subset of sequences with libsais
+In a nutshell, ropebwt3 computes the partial multi-string BWT of a subset of sequences with libsais (\url{https://github.com/IlyaGrebnov/libsais})
 and merges the partial BWT into the existing BWT run-length encoded as a B+-tree~\citep{Li:2014ab}.
 It repeats this procedure until all input sequences are processed.
 The BWT by default includes input sequences on both strands.
@@ -209,6 +210,33 @@ \subsection{Basic concepts}
 {\bf (c)} The prefix directed acyclic word graph (DAWG) of $T$ by merging nodes with identical suffix array intervals.}\label{fig:1}
 \end{figure}
 
+\begin{algorithm}[tb]
+	\caption{A naive bwa-aln algorithm with PSSM}\label{algo:bwa-aln}
+	\begin{algorithmic}[1]
+		\Procedure{BwaAlnPSSM}{$B,P,s,g$}\Comment{$g$ is gap penalty}
+			\State $H\gets\mbox{empty heap prioritized on the 1st value}$
+			\State $H.{\rm push}(0,|P|,0,|T|)$
+			\While{$H$ is not empty}
+				\State $(p,i,l,h)\gets H.{\rm pop}()$\Comment{element with the smallest $p$}
+				\If{$i=0$}
+					\State \Return $[l,h)$\Comment{this is the best match}
+				\EndIf
+				\State $H.{\rm push}(p+g_{i-1},i-1,l,h)$\Comment{insertion}
+				\State $a\gets P[i-1]$
+				\For{$c\in\Sigma$}\Comment{match, mismatch or deletion}
+					\State $l'\gets C_B(c)+{\rm rank}_B(c,l)$\Comment{backward search}
+					\State $h'\gets C_B(c)+{\rm rank}_B(c,h)$
+					\If{$l'<h'$}\Comment{$[l',h')$ is a child of $[l,h)$}
+						\State $H.{\rm push}(p+s_i(a|c),i-1,l',h')$\Comment{(mis)match}
+						\State $H.{\rm push}(p+g_i,i,l',h')$\Comment{deletion}
+					\EndIf
+				\EndFor
+			\EndWhile
+			\State \Return $\emptyset$\Comment{no match}
+		\EndProcedure
+	\end{algorithmic}
+\end{algorithm}
+
 %\begin{algorithm}[!b]
 %	\caption{Retrieve the $i$-th sequence, $0\le i<m$}\label{algo:get}
 %	\begin{algorithmic}[1]
@@ -229,7 +257,8 @@ \subsection{BWT construction}
 	\caption{Insert BWT $B_2$ into BWT $B_1$}\label{algo:merge}
 	\begin{algorithmic}[1]
 		\Procedure{AppendBWT}{$B_1,B_2$}
-			\State $m_1\gets|\{k:B_1[k]=\$\}|$; $m_2\gets|\{k:B_2[k]=\$\}|$
+			\State $m_1\gets|\{k:B_1[k]=\$\}|$
+			\State $m_2\gets|\{k:B_2[k]=\$\}|$
 			\For{$k\gets 0$ {\bf to} $|B_2|-1$}
 				\State $a\gets B_2[k]$
 				\State $R(k)\gets (a,C_{B_2}(a)+{\rm rank}_{B_2}(a,k))$
@@ -264,8 +293,8 @@ \subsection{BWT construction}
 The basic idea behind the algorithm is well known~\citep{DBLP:conf/latin/FerraginaGM10}
 but implementations vary~\citep{DBLP:conf/dcc/Siren16,DBLP:conf/dcc/Oliva0SMKGB21}.
 In ropebwt3, we encode BWT with a B+-tree (Fig.~\ref{fig:2}a).
-This yields $O(\log r)$ rank query (line 12) and insertion (line 16), where $r$ is the number of runs in the merged BWT.
-The bottleneck of the algorithm lies in rank calculation (line 7), which can be parallelized if $m_2>1$.
+This yields $O(\log r)$ rank query (line 13) and insertion (line 17), where $r$ is the number of runs in the merged BWT.
+The bottleneck of the algorithm lies in rank calculation (line 8), which can be parallelized if $m_2>1$.
 
 This online BWT construction algorithm does not use temporary disk space.
 The overall time complexity is $O(n\log r)$.
@@ -278,18 +307,20 @@ \subsection{BWT construction}
 Ropebwt2~\citep{Li:2014ab} uses the same B+-tree to encode BWT and has the same time complexity.
 However, it inserts sequences, not partial BWTs, into existing BWT.
 It cannot be efficiently parallelized for long strings.
-Note that independent of our earlier work, \citet{DBLP:journals/jda/OhnoSTIS18} also used B+-tree to encode BWT.
+Note that independent of our earlier work, \citet{DBLP:journals/jda/OhnoSTIS18} also used a B+-tree to encode BWT.
 Its implementation~\citep{DBLP:journals/tcs/BannaiGI20} is several times slower than ropebwt2 on 152 bacterial genomes from \citet{Li:2024ab},
 possibly because ropebwt2 is optimized for the small DNA alphabet.
 
 \begin{figure}[bt]
 \centering
 \includegraphics[width=.49\textwidth]{fig2}
 \caption{Examples of binary BWT encoding.
+The alphabet is $\{{\tt A},{\tt C},{\tt G}\}$.
 {\bf (a)} Encoded as a B+-tree.
 An internal node keeps the marginal counts of symbols in its descendants.
 An external node keeps the run-length encoded substring in BWT.
 A run length may be encoded in one, two, four or eight bytes in a scheme inspired by UTF-8.
+A B+-tree organized this way resembles the rope data structure.
 {\bf (b)} Encoded as a bit-packed array.
 The first two rows store run-length encoded BWT interleaved with marginal counts in each block.
 The Elias delta encoding is used to represent run lengths.
@@ -300,7 +331,7 @@ \subsection{BWT construction}
 \subsection{Suffix array interval and backward search}
 
 For a string $P\in\Sigma^*$ (i.e. not including sentinels), let ${\rm occ}(P)$ be the number of occurrences of $P$ in $T$.
-Define ${\rm lo}(P)$ to be the number of suffixes that lexicographically smaller than $P$
+Define ${\rm lo}(P)$ to be the number of suffixes that are lexicographically smaller than $P$
 and ${\rm hi}(P)\triangleq {\rm lo}(P)+{\rm occ}(P)$.
 $[{\rm lo}(P),{\rm hi}(P))$ is called the \emph{suffix array interval} of $P$, or \emph{SA interval} in brief.
 An SA interval is \emph{important} if there exists $P$ such that it is the SA interval of $P$.
@@ -340,10 +371,10 @@ \subsection{Double-strand BWT}\label{sec:ds-bwt}
 	\begin{algorithmic}[1]
 		\Procedure{BackwardExt}{$B,(k,k',s),a$}
 			\ForAll{$b<\overline{a}$}\Comment{$b$ can be $\$$}
-				\State $k'\gets k'+\big[{\rm rank}(\overline{b},k+s)-{\rm rank}(\overline{b},k)\big]$
+				\State $k'\gets k'+\big[{\rm rank}_B(\overline{b},k+s)-{\rm rank}_B(\overline{b},k)\big]$
 			\EndFor
-			\State $s\gets {\rm rank}(a,k+s)-{\rm rank}(a,k)$
-			\State $k\gets C(a)+{\rm rank}(a,k)$
+			\State $s\gets {\rm rank}_B(a,k+s)-{\rm rank}_B(a,k)$
+			\State $k\gets C_B(a)+{\rm rank}_B(a,k)$
 			\State \Return{$(k,k',s)$}
 		\EndProcedure
 		\Procedure{ForwardExt}{$B,(k,k',s),a$}
@@ -366,13 +397,13 @@ \subsection{Double-strand BWT}\label{sec:ds-bwt}
 
 \subsection{Finding supermaximal exact matches}
 
-An \emph{exact match} between string $T$ and $P$ is a 3-tuple $(l,t,p)$ such that $T[t,t+l)=P[p,p+l)$.
+An \emph{exact match} between strings $T$ and $P$ is a 3-tuple $(t,p,l)$ such that $T[t,t+l)=P[p,p+l)$.
 A \emph{maximal exact match} (MEM) is an exact match that cannot be extended in either direction.
 A MEM may be contained in another MEM on the pattern string $P$.
 For example, suppose $T={\tt GACCTCCG}$ and $P={\tt ACCT}$.
-MEM $(2,6,1)$ is contained in MEM $(4,1,0)$ on the pattern string.
+MEM $(5,1,2)$ is contained in MEM $(1,0,4)$ on the pattern string.
 A \emph{supermaximal exact match} (SMEM) is a MEM that is not contained in other MEMs on the pattern string.
-In the example above, only $(4,1,0)$ is a SMEM.
+In the example above, only $(1,0,4)$ is a SMEM.
 There are usually much fewer SMEMs than MEMs.
 \citet{DBLP:conf/dlt/Gagie24} recently proposed a new algorithm to find long SMEMs (Algorithm~\ref{algo:smem})
 that is faster than our earlier algorithm~\citep{Li:2012fk}
@@ -409,7 +440,7 @@ \subsection{Finding supermaximal exact matches}
 	\end{algorithmic}
 \end{algorithm}
 
-\subsection{Finding inexact matches}
+\subsection{Finding inexact matches with BWA-SW}
 
 The prefix trie of $T$ is a tree that encodes all the prefixes of $T$ (Fig.~\ref{fig:1}b).
 Each edge in the tree is labeled with a symbol in $\Sigma$.
@@ -476,6 +507,32 @@ \subsection{Finding inexact matches}
 	\end{algorithmic}
 \end{algorithm}
 
+%\begin{algorithm}[tb]
+%	\caption{Locate a hit given suffix array samples}\label{algo:locate}
+%	\begin{algorithmic}[1]
+%		\Procedure{Locate1}{$B,S,R,{\rm lo},{\rm hi},s$}
+%			\State $I\gets\{({\rm lo},{\rm hi},0)\}$
+%			\While{$I\not=\emptyset$}
+%				\State $(l,h,o)\gets\mbox{longest interval in $I$}$
+%				\State $I\gets I\setminus \{(l,h,o)\}$
+%				\If{$\exists k$ such that $k\cdot s\in[l,h)$}
+%					\State \Return $S(ks)+o$
+%				\EndIf
+%				\If{${\rm rank}_B(\$,l)<{\rm rank}_B(\$,h)$}
+%					\State \Return $R({\rm rank}_B(\$,l))+o$
+%				\EndIf
+%				\For{$a\in\Sigma$}
+%					\State $l'\gets C(a)+{\rm rank}_B(a,l)$
+%					\State $h'\gets C(a)+{\rm rank}_B(a,h)$
+%					\If{$l'<h'$}
+%						\State $I\gets I\cup\{(l',h',o+1)\}$
+%					\EndIf
+%				\EndFor
+%			\EndWhile
+%		\EndProcedure
+%	\end{algorithmic}
+%\end{algorithm}
+
 \subsection{Estimating local haplotype diversity}
 
 BWA-SW with the banding heuristic may miss the best matching haplotype especially given an index consisting of similar haplotypes.
@@ -492,7 +549,7 @@ \subsection{Estimating local haplotype diversity}
 $\mathcal{M}$ may include suboptimal haplotypes caused by small variants as well as the optimal one.
 Importantly, $P$ may be aligned to similar positions on $T$ with slightly different gap placements and alignment scores.
 In theory, we can identify this case by locating the position of each alignment.
-This procedure is slow as the ``locate'' operation is costly.
+This procedure is slow as such locating operation is costly.
 We instead leverage the bi-directionality to identify redundancy heuristically.
 Suppose $P$ can be aligned to SA bi-intervals $(k_1,k'_1,s_1)$ and $(k_2,k'_2,s_2)$ and the alignment score to the first interval is higher.
 We filter out the second interval if $[k_2,k_2+s_2)\subset[k_1,k_1+s_1)$ or $[k'_2,k'_2+s_2)\subset[k'_1,k'_1+s_1)$.
@@ -528,11 +585,11 @@ \section{Results}
 Dataset        & Algorithm     & Elapsed$^1$ & CPU$^1$  &     RAM \\
 \midrule
 human100       & grlBWT        &  8.3 h       &  29.6 h &  84.8 GB \\
-		       & pfp-thres$^2$ & 51.7 h       &  51.5 h & 788.1 GB \\
+		       & pfp-thres$^2$ &<51.7 h       & <51.5 h & 788.1 GB \\
                & ropebwt3      & 20.5 h       & 469.4 h &  83.0 GB \\
 			   & metagraph$^3$ & 16.9 h       & 314.8 h & 251.0 GB \\
 			   & fulgor$^4$ (lossy)& 1.2 h    &  32.7 h & 165.2 GB \\
-CommonBacteria & ropebwt3      & 26.5 d       & 830.3 d &  67.3 GB \\
+CommonBacteria & ropebwt3      & 25.6 d       & 830.3 d &  67.3 GB \\
 \botrule
 \end{tabular*}
 \begin{tablenotes}\setlength\itemsep{0.0em}
@@ -547,11 +604,11 @@ \section{Results}
 \subsection{Performance on index construction}
 
 We evaluated the performance of BWT construction on 100 haplotype-resolved human assemblies collected in~\citet{Li:2024ab}.
-As we included both strands (Section~\ref{sec:ds-bwt}), each BWT construction algorithm took about 600 billion as input.
+As we included both strands (Section~\ref{sec:ds-bwt}), each BWT construction algorithm took about 600 billion bases as input.
 The average run length is 141.6.
 grlBWT (commit 5b6d26a; \citealt*{DBLP:journals/iandc/DiazDominguezN23}) is the fastest algorithm (Table~\ref{tab:index})
 at the cost of more than a terabyte of working disk space including decompressed sequences.
-Ropebwt3 took 21 hours from input sequences in gzip'd FASTA to the final index.
+Ropebwt3 took 21 hours from input sequences in gzip'd FASTA to the final index, of which 7.7 hours were spent on libsais.
 It does not use working disk space and can append new sequences to an existing BWT.
 However, hardcoded for the DNA alphabet, ropebwt3 does not work with more general alphabets.
 pfp-thresholds~\citep{Rossi:2022aa} used more memory than the input sequences.
@@ -570,9 +627,8 @@ \subsection{Performance on index construction}
 
 To test the scalability of ropebwt3 in preparation for larger incoming pangenome datasets,
 we constructed the BWT of AllTheBacteria~\citep{Hunt2024.03.08.584059} excluding the ``dustbin'' and ``unknown'' categories that include relatively unique genomes.
-This subset, which we call as CommonBacteria, consists of 278.4 million sequences in 7.33 Tb.
+This subset, which we call CommonBacteria, consists of 278.4 million sequences in 7.33 Tb.
 An older version of ropebwt3 took less than a month to construct the double-strand BWT with 14.66 trillion symbols.
-We expect the latest version to reduce the elapsed time to about 17 days.
 The final index in the fermi format is 27.6 GB in size.
 Indexing the entire AllTheBacteria dataset will probably take 100--200 GB as unique sequences are not compressed well.
 
@@ -582,11 +638,11 @@ \subsection{Performance on index construction}
 \toprule
 Data   & Algorithm     &Type$^4$&Speed$^5$ (kb/s)&RAM (GB) \\
 \midrule
-SR$+^1$& ropebwt3      & MEM31  & 1758.5       &  10.6 \\
+SR$+^1$& ropebwt3$^6$  & MEM31  & 1758.5       &  10.6 \\
        &               & MEM51  & 1907.5       &  10.6 \\
 	   &               & SW10   & 84.1         &  15.2 \\
 %	   &               & suffix & 2216.4       &  10.3 \\
-	   & MONI$^6$      & MEM$-$ & 453.2        &  68.4 \\
+	   & MONI$^7$      & MEM$-$ & 453.2        &  68.4 \\
 	   &               & extend & 348.2        &  68.4 \\
        & Movi          & PML    & 5894.0       &  47.6 \\
 %	   &               & suffix & 6792.0       &  79.4 \\
@@ -619,7 +675,8 @@ \subsection{Performance on index construction}
 PML: pseudo-matching length; PA: pseudo-alignment; PA$+$: pseudo-alignment with contig names;
 SW$y$: BWA-SW with bandwidth $y$
 \item[$^5$] kilobases processed per CPU second. Index loading time excluded
-\item[$^6$] The MONI index includes both strands.
+\item[$^6$] index in the binary fermi format
+\item[$^7$] The MONI index includes both strands.
 We modified MONI such that extension is performed on the forward query strand only
 \end{tablenotes}
 \end{table}
@@ -635,7 +692,8 @@ \subsection{Query performance}
 Although the fastest, it finds the longest exact matches for only 195 out of one million short reads
 and it misses the longest exact matches for all long reads.
 The longest PML of each read is on average 27\% shorter than the longest exact match for LR$+$ and 22\% shorter for SR$+$.
-Nonetheless, we believe it should be possible to implement SMEM finding based on the Movi data structure with a minor performance overhead.
+Nonetheless, PML is not designed for finding the longest matches.
+We also believe it should be possible to implement SMEM finding based on the Movi data structure with a minor performance overhead.
 
 MONI and ropebwt3 can also find inexact matches.
 Implementing r-index, MONI can relatively cheaply locate one SMEM.
@@ -653,7 +711,7 @@ \subsection{Identifying novel sequences}
 
 As a biological application, we used the pangenome index to identify novel sequences in reads that are absent from other human genomes.
 For this, we downloaded the PacBio HiFi reads for tumor sample COLO829 (\url{https://downloads.pacbcloud.com/public/revio/2023Q2/COLO829/COLO829/}),
-mapped them to the pangenome index with 100 human genomes
+mapped them to the pangenome index with 100 human genomes, which includes T2T-CHM13,
 and extracted $\ge$1kb regions on reads that are not covered by SMEMs of 51bp or longer.
 We found 95kb sequences in 43 reads.
 These sequences could not be assembled.
@@ -662,11 +720,11 @@ \subsection{Identifying novel sequences}
 
 When we mapped the COLO829 reads to a single T2T-CHM13 genome~\citep{Nurk:2022up} and applied the same procedure,
 we found 55.9Mb of ``novel'' sequences in 25,365 reads.
-The much larger number is probably caused by regions with dense SNPs that prevented long exact matches.
+The much larger number is caused by regions with dense SNPs that prevented long exact matches.
 Counterintuitively, mapping these reads to T2T-CHM13 with minimap2~\citep{Li:2018ab} resulted in more ``novel'' sequences at 78.6Mb,
 probably because minimap2 ignores seeds occurring thousands of times in T2T-CHM13 and may miss more alignments.
 Capable of finding SMEMs at the pangenome scale, ropebwt3 is more effective for identifying known sequences.
-It is also faster than full minimap2 alignment against a single genome.
+It is also 16\% faster and uses less memory than full minimap2 alignment against a single genome.
 
 \subsection{Haplotype diversity around C4 genes}