Commit

add standardization script

maxall41 committed Jan 7, 2025
1 parent 46de180 commit f6cbbf5
Showing 2 changed files with 21 additions and 10 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -21,3 +21,14 @@ To do so create a new CSV file in `scripts/` with the column: `sequence` for the
```bash
python scripts/VIPER_run.py --input input_csv_file.csv --output out.csv
```
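For reference, a minimal sketch of building the expected input file with pandas. The column names are taken from `SEQ_COL` (`'sequence'`) and `SMILES_COL` (`'smiles'`) in `scripts/VIPER_run.py`; the sequence and SMILES values themselves are purely illustrative:

```python
import pandas as pd

# Hypothetical input for scripts/VIPER_run.py; values are illustrative.
# Column names follow SEQ_COL ('sequence') and SMILES_COL ('smiles')
# as defined in the script.
pd.DataFrame({
    "sequence": ["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"],
    "smiles": ["CC(=O)OC1=CC=CC=C1C(=O)O"],
}).to_csv("input_csv_file.csv", index=False)
```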

## Data standardization

You can run our full data standardization pipeline with the `scripts/data/standardize.py` script like this:

```bash
python scripts/data/standardize.py input_csv_path output_csv_path path_to_xtb_executable
```

Required DataFrame columns: `SMILES`
Added DataFrame columns: `XTB_STANDARDIZED_SMILES`
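
For example, a minimal end-to-end sketch of driving the pipeline from Python. The file names, SMILES values, and xtb path below are placeholders for your local setup, not part of the repository:

```python
import subprocess

import pandas as pd

# Write a minimal input file; the SMILES values are illustrative.
pd.DataFrame({"SMILES": ["CCO", "c1ccccc1O"]}).to_csv("standardize_in.csv", index=False)

# Run the standardization script; the last argument points at your xtb binary.
subprocess.run(
    [
        "python", "scripts/data/standardize.py",
        "standardize_in.csv", "standardize_out.csv", "/usr/local/bin/xtb",
    ],
    check=True,
)

# The pipeline appends the standardized column to the original data.
out = pd.read_csv("standardize_out.csv")
assert "XTB_STANDARDIZED_SMILES" in out.columns
```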
20 changes: 10 additions & 10 deletions scripts/VIPER_run.py
@@ -1,12 +1,12 @@
 import sys
 
 import ankh
-import torch
-import numpy as np
-from deps.molformer import compute_molformer_emb
-from io import StringIO
-from deps.model import create_model
-import os
+import fire
+import numpy as np
+import pandas as pd
+import torch
+from deps.model import create_model
+from deps.molformer import compute_molformer_emb
 
 SEQ_COL = 'sequence'
 SMILES_COL = 'smiles'
@@ -22,10 +22,10 @@ def gen_ankh(seq):
     model, tokenizer = ankh.load_base_model()
     model.eval()
     model.cuda()
-    outputs = tokenizer.batch_encode_plus([list(seq)],
-                                    add_special_tokens=True,
-                                    padding=True,
-                                    is_split_into_words=True,
+    outputs = tokenizer.batch_encode_plus([list(seq)],
+                                          add_special_tokens=True,
+                                          padding=True,
+                                          is_split_into_words=True,
                                           return_tensors="pt")
     with torch.no_grad():
         embeddings = model(input_ids=outputs['input_ids'].cuda(), attention_mask=outputs['attention_mask'].cuda())
