Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: support check --upgrade to upgrade old versions of RocksDB/RevIndex #581

Merged
merged 16 commits into from
Jan 11, 2025
Merged
Next Next commit
MRG: support multiple input files for singlesketch
  • Loading branch information
ctb committed Jan 9, 2025
commit 99d504229ad994cfa4c1486b4103d35460ee3514
8 changes: 5 additions & 3 deletions doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
| command | functionality | docs |
| -------- | -------- | -------- |
| `manysketch` | Rapidly build sketches for many input files | [link](#Running-manysketch) |
| `singlesketch` | Sketch a single sequence file | [link](#Running-singlesketch) |
| `singlesketch` | Sketch a single sample | [link](#Running-singlesketch) |
| `fastgather` | Multithreaded `gather` of **one** metagenome against a database | [link](#Running-fastgather) |
| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link](#Running-fastmultigather) |
| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link](#Running-manysearch) |
Expand Down Expand Up @@ -259,19 +259,21 @@ In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were mad

## Running `singlesketch`

The `singlesketch` command generates a sketch for a single sequence file.
The `singlesketch` command generates a sketch for a single sample from one or more input FASTA/FASTQ files.

### Basic Usage

```bash
sourmash scripts singlesketch input.fa -p k=21,scaled=1000,dna -o output.sig --name signature_name
```

### Using `stdin/stdout`

You can use `-` for `stdin` and output the result to `stdout`:
```bash
cat input.fa | sourmash scripts singlesketch - -o -
```


### Running `multisearch` and `pairwise`

The `multisearch` command compares one or more query genomes against one or more subject genomes. It differs from `manysearch` because it loads everything into memory.
Expand Down
6 changes: 3 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,15 +324,15 @@ fn do_manysketch(
}

#[pyfunction]
#[pyo3(signature = (input_filename, input_moltype, param_str, output, name))]
#[pyo3(signature = (input_filenames, input_moltype, param_str, output, name))]
fn do_singlesketch(
input_filename: String,
input_filenames: Vec<String>,
input_moltype: String,
param_str: String,
output: String,
name: String,
) -> anyhow::Result<u8> {
match singlesketch::singlesketch(input_filename, input_moltype, param_str, output, name) {
match singlesketch::singlesketch(input_filenames, input_moltype, param_str, output, name) {
Ok(_) => Ok(0),
Err(e) => {
eprintln!("Error: {e}");
Expand Down
14 changes: 8 additions & 6 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,11 +602,13 @@ def main(self, args):

class Branchwater_SingleSketch(CommandLinePlugin):
command = "singlesketch"
description = "sketch a single sequence file"
description = "sketch a single sample"

def __init__(self, p):
super().__init__(p)
p.add_argument("input_filename", help="input FASTA file or '-' for stdin")
p.add_argument(
"input_filenames", help="input file(s); use '-' for stdin", nargs="+"
)
p.add_argument(
"-o",
"--output",
Expand Down Expand Up @@ -660,19 +662,19 @@ def main(self, args):
args.name
if args.name
else (
os.path.basename(args.input_filename)
if args.input_filename != "-"
os.path.basename(args.input_filenames[0])
if args.input_filenames[0] != "-"
else ""
)
)

notify(
f"sketching file '{args.input_filename}' ({args.input_moltype}) with params '{args.param_string}' and name '{signature_name}' using a single thread"
f"sketching {len(args.input_filenames)} files ({args.input_moltype}) with params '{args.param_string}' and name '{signature_name}' using a single thread"
)

super().main(args)
status = sourmash_plugin_branchwater.do_singlesketch(
args.input_filename,
args.input_filenames,
args.input_moltype,
args.param_string,
args.output,
Expand Down
52 changes: 52 additions & 0 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1630,3 +1630,55 @@ def test_singlesketch_stdin(runtmp):
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "dna,scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes


def test_singlesketch_multifiles(runtmp, capfd):
    """Check that `singlesketch` merges several input files into one signature.

    Two FASTA files are passed to `sourmash scripts singlesketch`; the test
    expects exactly one signature in the output, named after the first input
    file, and equal to the signature `sourmash sketch dna` builds from the
    same two files under the same name.

    Args:
        runtmp: fixture used to run sourmash commands and build output paths.
        capfd: pytest fixture used to read what the command wrote to stderr.
    """
    fa1 = get_test_data("short.fa")
    fa2 = get_test_data("short2.fa")

    output = runtmp.output("db.zip")

    runtmp.sourmash(
        "scripts",
        "singlesketch",
        fa1,
        fa2,
        "-o",
        output,
        "--param-str",
        "dna,k=31,scaled=1",
    )

    assert os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    captured = capfd.readouterr()
    # Echo the captured streams so they show up in the report if an assert fails.
    print(captured.out)
    print(captured.err)
    assert "calculated 1 signatures for 2 sequences in 2 files" in captured.err

    idx = sourmash.load_file_as_index(output)
    sigs = list(idx.signatures())
    print(sigs)
    assert len(sigs) == 1
    made_sig = sigs[0]
    # No --name was given, so the name is expected to be the first input's basename.
    assert made_sig.name == "short.fa"

    # Build the reference signature with `sourmash sketch`, naming it explicitly
    # so that the whole-signature comparison below can succeed.
    s1 = runtmp.output("short.sig")
    runtmp.sourmash(
        "sketch",
        "dna",
        fa1,
        fa2,
        "-o",
        s1,
        "--param-str",
        "k=31,scaled=1",
        "--name",
        "short.fa",
    )
    sig1 = sourmash.load_one_signature(s1)

    assert made_sig == sig1
16 changes: 11 additions & 5 deletions src/singlesketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::utils::buildutils::BuildCollection;
use anyhow::{bail, Result};

pub fn singlesketch(
input_filename: String,
input_filenames: Vec<String>,
input_moltype: String,
param_str: String,
output: String,
Expand All @@ -26,14 +26,20 @@ pub fn singlesketch(
bail!("No signatures to build for the given parameters.");
}

let sequence_count =
sigs.build_sigs_from_file_or_stdin(&input_moltype, name, input_filename.clone())?;
let mut sequence_count = 0;
for input_filename in input_filenames.iter() {
sequence_count += sigs.build_sigs_from_file_or_stdin(
&input_moltype,
name.clone(),
input_filename.clone(),
)?;
}

eprintln!(
"calculated {} signatures for {} sequences in {}",
"calculated {} signatures for {} sequences in {} files",
sigs.size(),
sequence_count,
input_filename
input_filenames.len(),
);

// Write signatures to stdout or output file
Expand Down
Loading