1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -3,6 +3,7 @@
## [0.5.9] - 2025-02-01
### Added
- Add support for storing the NSRR token in an environment variable or the user config ([#243](https://github.com/cbrnr/sleepecg/pull/243) by [Simon Pusterhofer](https://github.com/simon-p-2000))
- Add support for downloading and storing activity counts for the MESA dataset ([#249](https://github.com/cbrnr/sleepecg/pull/249) by [Simon Pusterhofer](https://github.com/simon-p-2000))
- Add Python 3.13+ support by building wheels with the stable ABI (abi3) ([#251](https://github.com/cbrnr/sleepecg/pull/251) by [Eric Larson](https://github.com/larsoner))

### Changed
4 changes: 3 additions & 1 deletion docs/datasets.md
@@ -44,7 +44,7 @@ Instead of always using [`set_nsrr_token()`](sleepecg.set_nsrr_token), you can s

SleepECG checks for the NSRR token in the following order:

1. Token set via [`set_nsrr_token()`][sleepecg.set_nsrr_token]
1. Token set via [`set_nsrr_token()`](sleepecg.set_nsrr_token)
2. Token set via environment variable `NSRR_TOKEN`
3. Token set in the user configuration
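For illustration, a minimal sketch of option 2 (not taken from the documentation above; it assumes the variable is set before SleepECG needs the token):

```python
import os

# Provide the token via the NSRR_TOKEN environment variable instead of
# calling set_nsrr_token() explicitly.
os.environ["NSRR_TOKEN"] = "<your-download-token-here>"

from sleepecg import read_mesa

mesa = read_mesa(records_pattern="00*")  # token is picked up from the environment
```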

@@ -59,6 +59,8 @@ set_nsrr_token("<your-download-token-here>")
mesa = read_mesa(records_pattern="00*") # note that this is a generator
```

SleepECG supports downloading and storing activity counts for the MESA dataset. These counts quantify a subject's movement and are derived from accelerometer recordings processed with a proprietary algorithm in Philips Actiware. To access activity counts, call [`read_mesa()`](sleepecg.read_mesa) with `activity_source='actigraphy'` to download the data, or with `activity_source='cached'` to use previously downloaded counts.
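As an illustration (a hedged sketch, not part of the documentation text above; it assumes a valid NSRR token), downloading and reusing the activity counts could look like this:

```python
from sleepecg import read_mesa, set_nsrr_token

set_nsrr_token("<your-download-token-here>")

# Download activity counts alongside the polysomnography data.
mesa = read_mesa(records_pattern="00*", activity_source="actigraphy")
rec = next(mesa)  # reader functions are generators, so consume them explicitly
print(rec.activity_counts)  # NumPy array of activity counts for this record

# On later runs, reuse the locally stored counts instead of downloading again.
mesa = read_mesa(records_pattern="00*", activity_source="cached")
```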

!!! note
    Reader functions are generators, so they do not return the data directly. To access the data, you need to consume the generator, either by iterating over it or by calling `next()` repeatedly.

5 changes: 2 additions & 3 deletions examples/classifiers/ws_gru_mesa.py
@@ -15,7 +15,7 @@
    set_nsrr_token,
)

set_nsrr_token("25042-5JxoCwc8KQ3uV3ubyK-D")
set_nsrr_token("your-token-here")

TRAIN = True # set to False to skip training and load classifier from disk

@@ -27,7 +27,7 @@
if TRAIN:
    print("‣ Starting training...")
    print("‣‣ Extracting features...")
    records = list(read_mesa(offline=False, activity_source="actigraphy"))
    records = list(read_mesa(offline=False))

    feature_extraction_params = {
        "lookback": 240,
@@ -38,7 +38,6 @@
"recording_start_time",
"age",
"gender",
"activity_counts"
],
"min_rri": 0.3,
"max_rri": 2,
75 changes: 0 additions & 75 deletions examples/classifiers/ws_lda_mesa.py

This file was deleted.

1 change: 0 additions & 1 deletion src/sleepecg/__init__.py
@@ -6,7 +6,6 @@
    list_classifiers,
    load_classifier,
    prepare_data_keras,
    prepare_data_sklearn,
    print_class_balance,
    save_classifier,
    stage,
49 changes: 0 additions & 49 deletions src/sleepecg/classification.py
@@ -21,55 +21,6 @@
from sleepecg.io.sleep_readers import SleepRecord, SleepStage
from sleepecg.utils import _STAGE_NAMES, _merge_sleep_stages

def prepare_data_sklearn(
    features: list[np.ndarray],
    stages: list[np.ndarray],
    feature_ids: list[str],
    stages_mode: str,
    remove_nan: str = 'none',
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Prepare sleep records for a sklearn model.

    The following steps are performed:

    - Merge sleep stages in `stages` according to `stages_mode`.
    - Stack the features and stages of all records into single arrays.
    - Remove all samples whose stage is `SleepStage.UNDEFINED` (i.e. `0`).

    Parameters
    ----------
    features : list[np.ndarray]
        Each 2D array in this list is a feature matrix of shape `(n_samples, n_features)`
        corresponding to a single record as returned by `extract_features()`.
    stages : list[np.ndarray]
        Each 1D array in this list contains the sleep stages of a single record as
        returned by `extract_features()`.
    feature_ids : list[str]
        A list containing the identifiers of the extracted features. Feature groups passed
        in `feature_selection` are expanded to all individual features they contain. The
        order matches the column order of the feature matrix.
    stages_mode : str
        Identifier of the grouping mode. Can be any of `'wake-sleep'`, `'wake-rem-nrem'`,
        `'wake-rem-light-n3'`, `'wake-rem-n1-n2-n3'`.

    Returns
    -------
    features_stacked : np.ndarray
        A 2D array of shape `(total_samples, n_features)`.
    stages_stacked : np.ndarray
        A 1D array containing the annotated sleep stage for each sample. The sleep stages
        are merged based on the `stages_mode` parameter.
    record_ids : np.ndarray
        A 1D array containing, for each valid sample, the index of the record it belongs
        to.
    """
    record_ids = np.hstack([i * np.ones(len(X)) for i, X in enumerate(features)])
    features_stacked = np.vstack(features)
    stages_stacked = np.hstack(_merge_sleep_stages(stages, stages_mode))
    valid = stages_stacked != SleepStage.UNDEFINED

    return features_stacked[valid], stages_stacked[valid], record_ids[valid]
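For context, a hypothetical usage sketch (not part of this diff) of how the removed helper would have been combined with `extract_features()` and a scikit-learn estimator:

```python
# Hypothetical sketch only: names and arguments are illustrative.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sleepecg import extract_features, read_mesa

records = list(read_mesa(records_pattern="00*"))
features, stages, feature_ids = extract_features(records)
X, y, record_ids = prepare_data_sklearn(features, stages, feature_ids, "wake-sleep")
clf = LinearDiscriminantAnalysis().fit(X, y)
```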

def prepare_data_keras(
    features: list[np.ndarray],
13 changes: 5 additions & 8 deletions src/sleepecg/feature_extraction.py
@@ -11,6 +11,7 @@

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from scipy.integrate import trapezoid
from scipy.interpolate import interp1d
from scipy.signal import periodogram

@@ -56,7 +57,6 @@
"LF_HF_ratio",
),
"metadata": ("recording_start_time", "age", "gender", "weight"),
"actigraphy": ("activity_counts", "dummy_feature"),
}
_FEATURE_ID_TO_GROUP = {id: group for group, ids in _FEATURE_GROUPS.items() for id in ids}

@@ -368,10 +368,10 @@ def _hrv_frequencydomain_features(
    lf_mask = (0.04 < freq) & (freq <= 0.15)
    hf_mask = (0.15 < freq) & (freq <= 0.4)

    total_power = np.trapz(psd[:, total_power_mask], freq[total_power_mask])
    vlf = np.trapz(psd[:, vlf_mask], freq[vlf_mask])
    lf = np.trapz(psd[:, lf_mask], freq[lf_mask])
    hf = np.trapz(psd[:, hf_mask], freq[hf_mask])
    total_power = trapezoid(psd[:, total_power_mask], freq[total_power_mask])
    vlf = trapezoid(psd[:, vlf_mask], freq[vlf_mask])
    lf = trapezoid(psd[:, lf_mask], freq[lf_mask])
    hf = trapezoid(psd[:, hf_mask], freq[hf_mask])

    lf_norm = lf / (lf + hf) * 100
    hf_norm = hf / (lf + hf) * 100
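As a standalone illustration of the hunk above (not code from the module — the PSD here is 1D and synthetic, whereas the module integrates a 2D per-window PSD), `scipy.integrate.trapezoid` is a drop-in replacement for `np.trapz`, which is deprecated in NumPy 2.0:

```python
import numpy as np
from scipy.integrate import trapezoid
from scipy.signal import periodogram

rng = np.random.default_rng(0)
x = rng.standard_normal(1024)     # synthetic signal
freq, psd = periodogram(x, fs=4)  # PSD with an arbitrary 4 Hz sampling rate for illustration

lf_mask = (0.04 < freq) & (freq <= 0.15)
hf_mask = (0.15 < freq) & (freq <= 0.4)

lf = trapezoid(psd[lf_mask], freq[lf_mask])  # power in the LF band
hf = trapezoid(psd[hf_mask], freq[hf_mask])  # power in the HF band
print(lf / hf)                               # LF/HF ratio
```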
@@ -657,9 +657,6 @@ def _extract_features_single(
            )
        elif feature_group == "metadata":
            X.append(_metadata_features(record, num_stages))
        elif feature_group == "actigraphy":
            if record.activity_counts is not None:
                X.append(record.activity_counts.reshape(-1, 1))
    features = np.hstack(X)[:, col_indices]

    if record.sleep_stages is None or sleep_stage_duration == record.sleep_stage_duration:
25 changes: 15 additions & 10 deletions src/sleepecg/io/sleep_readers.py
@@ -8,6 +8,7 @@

import csv
import datetime
import os
from collections.abc import Iterator
from dataclasses import dataclass
from enum import IntEnum
@@ -442,6 +443,10 @@ def read_mesa(
    checksums[activity_filename],
)

if not os.path.exists(activity_filepath):
    print(f"Skipping {record_id} due to missing activity data.")
    continue

activity_data = []

with open(activity_filepath) as csv_file:
@@ -470,19 +475,19 @@

start_line = overlap_data[mesaid] + 1

end_line = (
    int(
        next(
            row["line"]
            for row in activity_data
            if row.get("linetime") == recording_end_time_str
        )
    )
    - 1
)
for item in activity_data:
    if item.get("linetime") == recording_end_time_str:
        end_line = int(item["line"]) - 1
        break
else:
    print(
        f"Skipping {record_id} due to missing line matching "
        f"{recording_end_time_str}."
    )
    continue

activity_counts = [
    row["activity"] for row in activity_data[start_line - 1 : end_line]
    item["activity"] for item in activity_data[start_line - 1 : end_line]
]

activity_counts = np.array(activity_counts)
2 changes: 2 additions & 0 deletions tests/test_sleep_readers.py
@@ -25,6 +25,7 @@ def _dummy_nsrr_overlap(filename: str, mesa_ids: list[int]):


def _dummy_nsrr_actigraphy(filename: str, mesa_id: str):
"""Create dummy actigraphy file with four usable activity counts."""
base_time = datetime.datetime(2024, 1, 1, 20, 30, 0)

linetimes = [
@@ -39,6 +40,7 @@ def _dummy_nsrr_actigraphy(filename: str, mesa_id: str):


def _dummy_nsrr_actigraphy_cached(filename: str):
"""Create dummy npy file that resembles cached activity counts."""
activity_counts = np.array([10, 10, 10, 10, 10, 10])
np.save(filename, activity_counts)

Expand Down