From 7d5509cb6742b95747137e5ee927ade48ba14be7 Mon Sep 17 00:00:00 2001
From: Soledad Galli <solegalli@protonmail.com>
Date: Tue, 11 Jul 2023 20:54:30 +0200
Subject: [PATCH] DOC improve instance hardness threshold user guide (#1029)

---
 doc/under_sampling.rst                        | 38 ++++++++++++++-----
 .../_instance_hardness_threshold.py           |  2 +-
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index ac16c2921..499b5a3d9 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -467,12 +467,32 @@ and the output a 3 nearest neighbors classifier. The class can be used as::
 
 .. _instance_hardness_threshold:
 
+Additional undersampling techniques
+-----------------------------------
+
 Instance hardness threshold
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-:class:`InstanceHardnessThreshold` is a specific algorithm in which a
-classifier is trained on the data and the samples with lower probabilities are
-removed :cite:`smith2014instance`. The class can be used as::
+**Instance Hardness** is a measure of how difficult it is to classify an instance or
+observation correctly. In other words, hard instances are observations that are hard to
+classify correctly.
+
+Fundamentally, instances that are hard to classify correctly are those for which the
+learning algorithm or classifier produces a low probability of predicting the correct
+class label.
+
+If we removed these hard instances from the dataset, the logic goes, we would help the
+classifier better identify the different classes :cite:`smith2014instance`.
+
+:class:`InstanceHardnessThreshold` trains a classifier on the data and then removes the
+samples with lower probabilities :cite:`smith2014instance`. In other words, it
+retains the observations with the higher class probabilities.
+
+In our implementation, :class:`InstanceHardnessThreshold` is (almost) a controlled
+under-sampling method: it will retain a specific number of observations of the target
+class(es), which is specified by the user (see caveat below).
+
+The class can be used as::
 
   >>> from sklearn.linear_model import LogisticRegression
   >>> from imblearn.under_sampling import InstanceHardnessThreshold
@@ -483,18 +503,18 @@ removed :cite:`smith2014instance`. The class can be used as::
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 64), (1, 64), (2, 64)]
 
-This class has 2 important parameters. ``estimator`` will accept any
-scikit-learn classifier which has a method ``predict_proba``. The classifier
-training is performed using a cross-validation and the parameter ``cv`` can set
-the number of folds to use.
+:class:`InstanceHardnessThreshold` has 2 important parameters. The parameter
+``estimator`` accepts any scikit-learn classifier with a method ``predict_proba``.
+This classifier will be used to identify the hard instances. The training is performed
+with cross-validation, which can be specified through the parameter ``cv``.
 
 .. note::
 
    :class:`InstanceHardnessThreshold` could almost be considered as a
    controlled under-sampling method. However, due to the probability outputs, it
-   is not always possible to get a specific number of samples.
+   is not always possible to get the specified number of samples.
 
-The figure below gives another examples on some toy data.
+The figure below shows examples of instance hardness undersampling on a toy dataset.
 
 .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_006.png
    :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html
diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
index 177b29784..52d9280b6 100644
--- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
+++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
@@ -51,7 +51,7 @@ class InstanceHardnessThreshold(BaseUnderSampler):
     ----------
     sampling_strategy_ : dict
         Dictionary containing the information to sample the dataset. The keys
-        corresponds to the class labels from which to sample and the values
+        correspond to the class labels from which to sample and the values
         are the number of samples to sample.
 
     estimator_ : estimator object