Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions ppmat/datasets/jarvis_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,9 +897,15 @@ def filter_unvalid_by_property(self):
data = self.property_data[property_name]
reserve_idx = []
for i, data_item in enumerate(data):
if isinstance(data_item, str) or (
data_item is not None and not math.isnan(data_item)
):
# Convert 'na' strings to NaN for proper filtering
if isinstance(data_item, str):
if data_item.lower() in ['na', 'nan', 'none', '']:
data_item = np.nan
else:
# Skip non-numeric strings (they're invalid for numeric properties)
continue
# Keep only valid numeric values (not None, not NaN)
if data_item is not None and not math.isnan(data_item):
reserve_idx.append(i)
for key in self.property_data.keys():
self.property_data[key] = [
Expand Down Expand Up @@ -979,9 +985,22 @@ def __getitem__(self, idx: int):
data["structure_array"] = self.get_structure_array(structure)
for property_name in self.property_names:
if property_name in self.property_data:
data[property_name] = np.array(
[self.property_data[property_name][idx]]
).astype("float32")
value = self.property_data[property_name][idx]
# Check for 'na' strings - these should have been filtered out during initialization
if isinstance(value, str) and value.lower() in ['na', 'nan', 'none', '']:
raise ValueError(
f"Found invalid property value '{value}' at index {idx} for property "
f"'{property_name}'. This should have been filtered out during dataset "
f"initialization. Please ensure 'filter_unvalid=True' is set and "
f"consider clearing the cache to regenerate filtered data."
)
# Check for NaN values - these should also have been filtered out
if value is not None and (isinstance(value, float) and math.isnan(value)):
raise ValueError(
f"Found NaN value at index {idx} for property '{property_name}'. "
f"This should have been filtered out during dataset initialization."
)
data[property_name] = np.array([value]).astype("float32")
else:
raise KeyError(f"Property {property_name} not found.")

Expand Down