Skip to content

Commit 930ec61

Browse files
author
joshhorton
committed
fixed the hill formula to construct in alphabetical order.
1 parent cb104da commit 930ec61

File tree

4 files changed

+36
-28
lines changed

4 files changed

+36
-28
lines changed

openforcefield/tests/test_molecule.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,15 @@ def test_hill_formula(self):
538538
# make sure is not order dependent
539539
molecule_smiles_reverse = create_reversed_ethanol()
540540
assert molecule_smiles.hill_formula == molecule_smiles_reverse.hill_formula
541+
# make sure single element names are put first
542+
order_mol = Molecule.from_smiles('C(Br)CB')
543+
assert order_mol.hill_formula == 'C2H6BBr'
544+
# test molecule with no carbon
545+
no_carb_mol = Molecule.from_smiles('OS(=O)(=O)O')
546+
assert no_carb_mol.hill_formula == 'H2O4S'
547+
# test no carbon and hydrogen
548+
br_i = Molecule.from_smiles('BrI')
549+
assert br_i.hill_formula == 'BrI'
541550
# make sure files and smiles match
542551
molecule_file = Molecule.from_file(get_data_file_path('molecules/ethanol.sdf'))
543552
assert molecule_smiles.hill_formula == molecule_file.hill_formula
@@ -549,6 +558,7 @@ def test_hill_formula(self):
549558
# make sure the networkx matches
550559
assert molecule_smiles.hill_formula == Molecule.to_hill_formula(molecule_smiles.to_networkx())
551560

561+
552562
def test_isomorphic_general(self):
553563
"""Test the matching using different input types"""
554564
# check that hill formula fails are caught

openforcefield/tests/test_topology.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ def test_from_openmm_missing_conect(self):
429429

430430
molecules = []
431431
molecules.append(Molecule.from_smiles('CCO'))
432-
with pytest.raises(ValueError, match='No match found for molecule C1. This would be a '
432+
with pytest.raises(ValueError, match='No match found for molecule C. This would be a '
433433
'very unusual molecule to try and parameterize, '
434434
'and it is likely that the data source it was '
435435
'read from does not contain connectivity '

openforcefield/topology/molecule.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2864,6 +2864,7 @@ def to_hill_formula(molecule):
28642864
"""
28652865

28662866
from openforcefield.topology import TopologyMolecule
2867+
import collections
28672868
# check for networkx then assuming we have a Molecule or TopologyMolecule instance just try and
28682869
# extract the info. Note we do not type check the TopologyMolecule due to cyclic dependencies
28692870
if isinstance(molecule, nx.Graph):
@@ -2881,32 +2882,29 @@ def to_hill_formula(molecule):
28812882
f'openforcefield.topology.topology.TopologyMolecule or networkx representaion '
28822883
f'of the molecule.')
28832884

2884-
# Now sort the elements, method taken from topology._networkx_to_hill_formula
2885-
# Count the number of instances of each atomic number
2886-
at_num_to_counts = dict([(unq, atom_nums.count(unq)) for unq in atom_nums])
2887-
2888-
symbol_to_counts = {}
2889-
# Check for C and H first, to make a correct hill formula (remember dicts in python 3.6+ are ordered)
2890-
if 6 in at_num_to_counts:
2891-
symbol_to_counts['C'] = at_num_to_counts[6]
2892-
del at_num_to_counts[6]
2893-
2894-
if 1 in at_num_to_counts:
2895-
symbol_to_counts['H'] = at_num_to_counts[1]
2896-
del at_num_to_counts[1]
2897-
2898-
# Now count instances of all elements other than C and H, in order of ascending atomic number
2899-
sorted_atom_nums = sorted(at_num_to_counts.keys())
2900-
for atom_num in sorted_atom_nums:
2901-
symbol_to_counts[Element.getByAtomicNumber(atom_num).symbol] = at_num_to_counts[atom_num]
2902-
2903-
# Finally format the formula as string
2904-
formula = ''
2905-
for ele, count in symbol_to_counts.items():
2906-
if count == 1:
2907-
count = ''
2908-
formula += f'{ele}{count}'
2909-
return formula
2885+
# make a correct hill formula representation following this guide
2886+
# https://en.wikipedia.org/wiki/Chemical_formula#Hill_system
2887+
2888+
# create the counter dictionary using chemical symbols
2889+
atom_symbol_counts = collections.Counter(Element.getByAtomicNumber(atom_num).symbol for atom_num in atom_nums)
2890+
2891+
formula = []
2892+
# Check for C and H first, to make a correct hill formula
2893+
for el in ['C', 'H']:
2894+
if el in atom_symbol_counts:
2895+
count = atom_symbol_counts.pop(el)
2896+
formula.append(el)
2897+
if count > 1:
2898+
formula.append(str(count))
2899+
2900+
# now get the rest of the elements in alphabetical ordering
2901+
for el in sorted(atom_symbol_counts.keys()):
2902+
count = atom_symbol_counts.pop(el)
2903+
formula.append(el)
2904+
if count > 1:
2905+
formula.append(str(count))
2906+
2907+
return "".join(formula)
29102908

29112909
def chemical_environment_matches(self,
29122910
query,

openforcefield/topology/topology.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1653,7 +1653,7 @@ def from_openmm(cls, openmm_topology, unique_molecules=None):
16531653
if match_found is False:
16541654
hill_formula = Molecule.to_hill_formula(omm_mol_G)
16551655
msg = f'No match found for molecule {hill_formula}. '
1656-
probably_missing_conect = ['C1', 'H1', 'O1', 'N1', 'P1', 'S1', 'F1', 'Cl1', 'Br1']
1656+
probably_missing_conect = ['C', 'H', 'O', 'N', 'P', 'S', 'F', 'Cl', 'Br']
16571657
if hill_formula in probably_missing_conect:
16581658
msg += ('This would be a very unusual molecule to try and parameterize, '
16591659
'and it is likely that the data source it was read from does not '

0 commit comments

Comments
 (0)