Merge pull request #11 from Materials-Data-Science-and-Informatics/speed_up_defect_calculation

Speed up defect detection calculation using Approximate Nearest Neighbor
Materials-Data-Science-and-Informatics · May 7, 2024 · 360ceeb · 360ceeb
2 parents b3bf736 + 8bbdd4c
commit 360ceeb
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 64 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,7 @@
+# Changelog
+
+Here we provide notes that summarize the most important changes in each released version.
+
+## v0.3.3
+
+* Added an optional approximate nearest-neighbor search (`method="annoy"`) for point defect detection.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "atomID"
-version = "0.3.2"
+version = "0.3.3"
 description = "Python package to identify and annotate crystal structure data files"
 authors = ["Ninad Bhat"]
 readme = "README.md"

diff --git a/src/atomid/annotate.py b/src/atomid/annotate.py
@@ -1,5 +1,7 @@
 """Annotate crystal class."""
 
+from typing import Optional
+
 import atomrdf as ardf
 from ase.io import read as ase_read
 
@@ -57,7 +59,9 @@ def annotate_crystal_structure(self) -> None:
                 lattice_constant=lattice_constants,
             )
 
-    def identify_defects(self, reference_data_file: str, ref_format: str) -> dict:
+    def identify_defects(
+        self, reference_data_file: str, ref_format: str, method: Optional[str] = None
+    ) -> dict:
         """Identify defects in the crystal structure using the reference data file.
 
         Parameters
@@ -77,12 +81,15 @@ def identify_defects(self, reference_data_file: str, ref_format: str) -> dict:
         ref_positions = ref_ase.positions
 
         defects: dict[str, dict[str, float]] = analyze_defects(
-            reference_positions_list=ref_positions,
-            actual_positions_list=actual_positions,
+            reference_positions=ref_positions,
+            actual_positions=actual_positions,
+            method=method,
         )
         return defects
 
-    def annotate_defects(self, reference_data_file: str, ref_format: str) -> None:
+    def annotate_defects(
+        self, reference_data_file: str, ref_format: str, method: Optional[str] = None
+    ) -> None:
         """Annotate defects in the crystal structure using the reference data file.
 
         Parameters
@@ -93,7 +100,7 @@ def annotate_defects(self, reference_data_file: str, ref_format: str) -> None:
             The format of the file. If None, the format is guessed from the file extension
 
         """
-        defects = self.identify_defects(reference_data_file, ref_format)
+        defects = self.identify_defects(reference_data_file, ref_format, method)
 
         vacancies = defects.get("Vacancies", {"count": 0, "fraction": 0})
 

diff --git a/src/atomid/atomid.py b/src/atomid/atomid.py
@@ -58,7 +58,7 @@ def identify_defects_in_crystal_structure(
     ref_positions = ref_ase.positions
 
     defects: dict[str, dict[str, float]] = analyze_defects(
-        reference_positions_list=ref_positions, actual_positions_list=actual_positions
+        reference_positions=ref_positions, actual_positions=actual_positions
     )
 
     return defects

diff --git a/src/atomid/point_defect_analysis/wigner_seitz_method.py b/src/atomid/point_defect_analysis/wigner_seitz_method.py
@@ -1,105 +1,137 @@
-"""Point defect identification using the Wigner-Seitz method."""
+"""Wigner-Seitz method for point defect analysis."""
 
-from typing import Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple
 
 import numpy as np
 
 
-def find_nearest_atom(
-    atom: tuple, atom_positions: np.ndarray
-) -> Tuple[np.signedinteger, list]:
-    """Find the nearest atom to a given defect position.
+def analyze_defects(
+    reference_positions: List[Tuple[float, float, float]],
+    actual_positions: List[Tuple[float, float, float]],
+    species_ref: Optional[List[str]] = None,
+    species_actual: Optional[List[str]] = None,
+    method: Optional[str] = None,
+) -> Dict[str, Dict[str, float]]:
+    """Analyze the lattice for vacancy, interstitial, and substitution defects.
 
     Parameters
     ----------
-    atom : tuple
-        The position of the defect atom.
-    atom_positions : np.ndarra
-        The positions of the atoms in the lattice.
+    reference_positions : list of tuples
+        The expected positions of the atoms in the lattice.
+    actual_positions : list of tuples
+        The actual positions of the atoms in the lattice.
+    species_ref : list of str, optional
+        Species at each reference position.
+    species_actual : list of str, optional
+        Species at each actual position.
+    method : str, optional
+        The method to find nearest positions ('annoy' for using AnnoyIndex).
 
     Returns
     -------
-    nearest_index : int
-        The index of the nearest atom.
-    distance : float
-        The distance between the defect and the nearest atom.
+    dict
+        A dictionary containing the counts and fractions of vacancies, interstitials, and substitutions.
     """
-    distances = np.linalg.norm(atom_positions - atom, axis=1)
-    nearest_index: np.signedinteger = np.argmin(distances)
-    return nearest_index, distances[nearest_index]
+    reference_array = np.array(reference_positions)
+    actual_array = np.array(actual_positions)
 
+    atom_position_count = np.zeros(len(reference_array))
+    substitution_count = np.zeros(len(reference_array))
+    index_finder = create_index_finder(reference_array, method)
 
-def analyze_defects(
-    reference_positions_list: list,
-    actual_positions_list: list,
-    species_ref: Optional[list] = None,
-    species_actual: Optional[list] = None,
-) -> dict[str, dict[str, float]]:
-    """Analyze the lattice for vacancy and interstitial defects.
+    for i, actual in enumerate(actual_array):
+        nearest_index = index_finder(actual)
+        atom_position_count[nearest_index] += 1
+        if (
+            species_ref
+            and species_actual
+            and species_actual[i] != species_ref[nearest_index]
+        ):
+            substitution_count[nearest_index] += 1
+
+    defects: dict = calculate_defects(
+        reference_array, atom_position_count, substitution_count
+    )
+    return defects
+
+
+def create_index_finder(
+    reference_array: np.ndarray, method: Optional[str] = None
+) -> Callable:
+    """Create a function to find the index of the nearest reference position.
 
     Parameters
     ----------
-    reference_positions : list of tuples
-        The expected positions of the atoms in the lattice.
-    actual_positions : list of tuples
-        The actual positions of the atoms in the lattice.
+    reference_array : np.ndarray
+        The reference positions of the atoms.
+    method : str, optional
+        The method to find nearest positions ('annoy' for using AnnoyIndex).
 
     Returns
     -------
-    defect_analysis : dict
-        A dictionary containing the vacancy and interstitial defects.
-
+    function
+        A function that takes an actual position and returns the index of the nearest reference position.
     """
-    reference_positions: np.ndarray = np.array(reference_positions_list)
-    actual_positions: np.ndarray = np.array(actual_positions_list)
-    atom_position_count = np.zeros(len(reference_positions))
-    substitution_count = np.zeros(len(reference_positions))
-
-    # Process actual positions and compare with reference to identify defects
-    for i, actual in enumerate(actual_positions):
-        nearest_index, _ = find_nearest_atom(actual, reference_positions)
-        atom_position_count[nearest_index] += 1
+    if method == "annoy":
+        from annoy import AnnoyIndex
 
-        # Check for substitutions if species information is provided
-        if species_actual and species_ref:
-            if species_actual[i] != species_ref[nearest_index]:
-                substitution_count[nearest_index] += 1
+        t = AnnoyIndex(len(reference_array[0]), "euclidean")
+        for i, ref in enumerate(reference_array):
+            t.add_item(i, ref)
+        t.build(10)
+        return lambda x: t.get_nns_by_vector(x, 1)[0]
+    else:
+        return lambda x: np.argmin(np.sum((reference_array - x) ** 2, axis=1))
 
-    # Determine vacancies taking into account both atom positions and substitutions
-    vacancies = [
-        (i, tuple(pos))
-        for i, pos in enumerate(reference_positions)
-        if atom_position_count[i] == 0
-        and (not species_actual or substitution_count[i] == 0)
-    ]
 
+def calculate_defects(
+    reference_array: np.ndarray,
+    atom_position_count: np.ndarray,
+    substitution_count: np.ndarray,
+) -> Dict:
+    """Calculate the number and fraction of vacancies, interstitials, and substitutions.
+
+    Parameters
+    ----------
+    reference_array : np.ndarray
+        The reference positions of the atoms.
+    atom_position_count : np.ndarray
+        The number of atoms at each reference position.
+    substitution_count : np.ndarray
+        The number of substitutions at each reference position.
+
+    Returns
+    -------
+    dict
+        A dictionary containing the counts and fractions of vacancies, interstitials, and substitutions.
+    """
     vacancies = [
         (i, tuple(pos))
-        for i, pos in enumerate(reference_positions)
+        for i, pos in enumerate(reference_array)
         if atom_position_count[i] == 0
     ]
     interstitials = [
         (i, tuple(pos))
-        for i, pos in enumerate(actual_positions)
+        for i, pos in enumerate(reference_array)
         if atom_position_count[i] > 1
     ]
     substitutions = [
         (i, tuple(pos))
-        for i, pos in enumerate(reference_positions)
+        for i, pos in enumerate(reference_array)
         if substitution_count[i] > 0
     ]
 
     return {
         "Vacancies": {
             "count": len(vacancies),
-            "fraction": len(vacancies) / len(reference_positions),
+            "fraction": len(vacancies) / len(reference_array),
         },
         "Interstitials": {
             "count": len(interstitials),
-            "fraction": len(interstitials) / len(actual_positions),
+            "fraction": len(interstitials) / len(reference_array),
         },
         "Substitutions": {
             "count": len(substitutions),
-            "fraction": len(substitutions) / len(reference_positions),
+            "fraction": len(substitutions) / len(reference_array),
         },
     }