\n",
" \n",
"\n",
@@ -844,60 +786,75 @@
"text/plain": [
" name map\n",
"0 PL2 Baseline 0.206031\n",
- "1 LTR Baseline 0.144980"
+ "1 LTR Baseline 0.144662"
]
},
- "metadata": {
- "tags": []
- },
- "execution_count": 24
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "BaselineLTR = fbr3f >> pt.ltr.apply_learned_model(RandomForestRegressor(n_estimators=400))\n",
+ "BaselineLTR.fit(train_topics, qrels)\n",
+ "\n",
+ "results = pt.Experiment([PL2, BaselineLTR], test_topics, qrels, [\"map\"], names=[\"PL2 Baseline\", \"LTR Baseline\"])\n",
+ "results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, the RandomForest pipeline wasnt very good. LambdaMART is normally a bit better. Lets try that next..."
]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "iGw58PCuumuT",
- "colab_type": "text"
+ "id": "iGw58PCuumuT"
},
"source": [
"## XgBoost Pipeline\n",
"\n",
- "We now demonstrate the use of a LambdaMART implementation from [xgBoost](https://xgboost.readthedocs.io/en/latest/). Again, pyTerrier provides a transformer object, namely `XGBoostLTR_pipeline`, which takes in the constrcutor the actual xgBoost model that you want to train. We took the xgBoost configuration from [their example code](https://github.com/dmlc/xgboost/blob/master/demo/rank/rank.py).\n",
+ "We now demonstrate the use of a LambdaMART implementation from [xgBoost](https://xgboost.readthedocs.io/en/latest/). Again, PyTerrier provides a Transformer object from `pt.ltr.apply_learned_model()`, this time passing `form='ltr'` as kwarg.\n",
"\n",
- "Call the `fit()` method on the full pipeline with the training and validation topics.\n",
+ "This takes in the constrcutor the actual xgBoost model that you want to train. We took the xgBoost configuration from [their example code](https://github.com/dmlc/xgboost/blob/master/demo/rank/rank.py).\n",
"\n",
- "Evaluate the results with the Experiment function by using the test topics"
+ "Call the `fit()` method on the full pipeline with the training *and validation* topics.\n",
+ "\n",
+ "The same pipeline can also be used with [LightGBM](https://github.com/microsoft/LightGBM).\n",
+ "\n",
+ "Evaluate the results with the Experiment function by using the test topics."
]
},
{
"cell_type": "code",
+ "execution_count": 19,
"metadata": {
- "id": "nM0r8EgFuGtQ",
- "colab_type": "code",
- "colab": {}
+ "id": "nM0r8EgFuGtQ"
},
+ "outputs": [],
"source": [
"import xgboost as xgb\n",
- "params = {'objective': 'rank:ndcg', \n",
- " 'learning_rate': 0.1, \n",
- " 'gamma': 1.0, 'min_child_weight': 0.1,\n",
+ "params = {'objective': 'rank:ndcg',\n",
+ " 'learning_rate': 0.1,\n",
+ " 'gamma': 1.0, \n",
+ " 'min_child_weight': 0.1,\n",
" 'max_depth': 6,\n",
- " 'verbose': 2,\n",
- " 'random_state': 42 \n",
+ " 'random_state': 42\n",
" }\n",
"\n",
- "BaseLTR_LM = fbr >> pt.pipelines.XGBoostLTR_pipeline(xgb.sklearn.XGBRanker(**params))\n",
+ "BaseLTR_LM = fbr3f >> pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**params), form='ltr')\n",
"BaseLTR_LM.fit(train_topics, qrels, valid_topics, qrels)"
- ],
- "execution_count": 25,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
"metadata": {
- "id": "HVXoNhzSP-k2",
- "colab_type": "text"
+ "id": "HVXoNhzSP-k2"
},
"source": [
"And evaluate the results."
@@ -905,26 +862,17 @@
},
{
"cell_type": "code",
+ "execution_count": 20,
"metadata": {
- "id": "Dn56DKZMTQ_m",
- "colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 111
+ "height": 112
},
- "outputId": "6688d85e-8599-4f11-db18-231abd0d7aee"
+ "id": "Dn56DKZMTQ_m",
+ "outputId": "133260ca-e979-4006-9120-5339682331e0"
},
- "source": [
- "allresultsLM = pt.pipelines.Experiment([PL2, BaseLTR_LM],\n",
- " test_topics, \n",
- " qrels, [\"map\"], \n",
- " names=[\"PL2 Baseline\", \"LambdaMART\"])\n",
- "allresultsLM"
- ],
- "execution_count": 26,
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/html": [
"
\n",
@@ -958,7 +906,7 @@
"
\n",
" 1 | \n",
" LambdaMART | \n",
- " 0.204391 | \n",
+ " 0.210969 | \n",
"
\n",
" \n",
"\n",
@@ -967,15 +915,51 @@
"text/plain": [
" name map\n",
"0 PL2 Baseline 0.206031\n",
- "1 LambdaMART 0.204391"
+ "1 LambdaMART 0.210969"
]
},
- "metadata": {
- "tags": []
- },
- "execution_count": 26
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "allresultsLM = pt.Experiment([PL2, BaseLTR_LM],\n",
+ " test_topics,\n",
+ " qrels, [\"map\"],\n",
+ " names=[\"PL2 Baseline\", \"LambdaMART\"])\n",
+ "allresultsLM"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Excellent, event on this small dataset, adding a few more features and LambdaMART can enhance effectiveness!"
]
}
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/pyterrier/__init__.py b/pyterrier/__init__.py
index 98d5ff7d..8198b684 100644
--- a/pyterrier/__init__.py
+++ b/pyterrier/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.10.0"
+__version__ = "0.10.1"
import os
diff --git a/pyterrier/apply_base.py b/pyterrier/apply_base.py
index 0290fdaf..bdd69449 100644
--- a/pyterrier/apply_base.py
+++ b/pyterrier/apply_base.py
@@ -210,11 +210,18 @@ def transform(self, inputRes):
outputRes = push_queries(inputRes.copy(), inplace=True, keep_original=True)
else:
outputRes = inputRes.copy()
- if self.verbose:
- tqdm.pandas(desc="pt.apply.query", unit="d")
- outputRes["query"] = outputRes.progress_apply(fn, axis=1)
- else:
- outputRes["query"] = outputRes.apply(fn, axis=1)
+ try:
+ if self.verbose:
+ tqdm.pandas(desc="pt.apply.query", unit="d")
+ outputRes["query"] = outputRes.progress_apply(fn, axis=1)
+ else:
+ outputRes["query"] = outputRes.apply(fn, axis=1)
+ except ValueError as ve:
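+ # pandas raises "Columns must be same length as key" when the apply function returns a sequence rather than a single string per row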
+ msg = str(ve)
+ if "Columns must be same length as key" in msg:
+ raise TypeError("Could not coerce return from pt.apply.query function into a list of strings. Check your function returns a string.") from ve
+ else:
+ raise ve
return outputRes
class ApplyGenericTransformer(ApplyTransformerBase):
diff --git a/pyterrier/batchretrieve.py b/pyterrier/batchretrieve.py
index 3c59ca9d..357a3116 100644
--- a/pyterrier/batchretrieve.py
+++ b/pyterrier/batchretrieve.py
@@ -553,7 +553,10 @@ class TextScorer(TextIndexProcessor):
takes(str): configuration - what is needed as input: `"queries"`, or `"docs"`. Default is `"docs"` since v0.8.
returns(str): configuration - what is needed as output: `"queries"`, or `"docs"`. Default is `"docs"`.
body_attr(str): what dataframe input column contains the text of the document. Default is `"body"`.
- wmodel(str): example of configuration passed to BatchRetrieve.
+ wmodel(str): name of the weighting model to use for scoring.
+ background_index(index_like): An optional background index to use for term and collection statistics. If a weighting
+ model such as BM25 or TF_IDF or PL2 is used without setting the background_index, the background statistics
+ will be calculated from the dataframe, which is usually not the desired behaviour.
Example::
@@ -562,9 +565,21 @@ class TextScorer(TextIndexProcessor):
["q1", "chemical reactions", "d1", "professor protor poured the chemicals"],
["q1", "chemical reactions", "d2", "chemical brothers turned up the beats"],
], columns=["qid", "query", "text"])
- textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="TF_IDF")
+ textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="Tf")
rtr = textscorer.transform(df)
- #rtr will score each document for the query "chemical reactions" based on the provided document contents
+ #rtr will score each document by term frequency for the query "chemical reactions" based on the provided document contents
+
+ Example::
+
+ df = pd.DataFrame(
+ [
+ ["q1", "chemical reactions", "d1", "professor protor poured the chemicals"],
+ ["q1", "chemical reactions", "d2", "chemical brothers turned up the beats"],
+ ], columns=["qid", "query", "text"])
+ existing_index = pt.IndexFactory.of(...)
+ textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="TF_IDF", background_index=existing_index)
+ rtr = textscorer.transform(df)
+ #rtr will score each document by TF_IDF for the query "chemical reactions" based on the provided document contents
"""
def __init__(self, takes="docs", **kwargs):
@@ -606,6 +621,12 @@ def __init__(self, index_location, features, controls=None, properties=None, thr
self.wmodel = kwargs["wmodel"]
if "wmodel" in controls:
self.wmodel = controls["wmodel"]
+
+ # check for terrier-core#246 bug when using FatFull
+ if self.wmodel is not None:
+ from . import check_version
+ assert check_version(5.9), "Terrier 5.9 is required for this functionality, see https://github.com/terrier-org/terrier-core/pull/246"
+
if threads > 1:
raise ValueError("Multi-threaded retrieval not yet supported by FeaturesBatchRetrieve")
@@ -657,7 +678,7 @@ def transform(self, queries):
Performs the retrieval with multiple features
Args:
- queries: String for a single query, list of queries, or a pandas.Dataframe with columns=['qid', 'query']. For re-ranking,
+ queries: A pandas.Dataframe with columns=['qid', 'query']. For re-ranking,
the DataFrame may also have a 'docid' and or 'docno' column.
Returns:
@@ -846,4 +867,4 @@ def push_fbr_earlier(_br1, _fbr):
global rewrites_setup
rewrites_setup = True
-setup_rewrites()
\ No newline at end of file
+setup_rewrites()
diff --git a/pyterrier/bootstrap.py b/pyterrier/bootstrap.py
index a606ef71..99be0550 100644
--- a/pyterrier/bootstrap.py
+++ b/pyterrier/bootstrap.py
@@ -46,7 +46,7 @@ def _load_into_memory(index, structures=['lexicon', 'direct', 'inverted', 'meta'
},
'inverted' : {
'org.terrier.structures.bit.BitPostingIndex' : {
- 'index.direct.data-source' : 'fileinmem'}
+ 'index.inverted.data-source' : 'fileinmem'}
},
}
if "direct" in structures:
@@ -271,6 +271,60 @@ def _index_add(self, other):
raise ValueError("Cannot document-wise merge indices with and without positions (%r vs %r)" % (blocks_1, blocks_2))
multiindex_cls = autoclass("org.terrier.realtime.multi.MultiIndex")
return multiindex_cls([self, other], blocks_1, fields_1 > 0)
+
+ def _index_corpusiter(self, return_toks=True):
+ def _index_corpusiter_meta(self):
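+ # iterate over the metaindex input stream, yielding one dict of metadata values (e.g. docno) per document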
+ meta_inputstream = self.getIndexStructureInputStream("meta")
+ keys = self.getMetaIndex().getKeys()
+ keys_offset = { k: offset for offset, k in enumerate(keys) }
+ while meta_inputstream.hasNext():
+ item = meta_inputstream.next()
+ yield {k : item[keys_offset[k]] for k in keys_offset}
+
+ def _index_corpusiter_direct_pretok(self):
+ import sys
+ MIN_PYTHON = (3, 8)
+ if sys.version_info < MIN_PYTHON:
+ raise NotImplementedError("Sorry, Python 3.8+ is required for this functionality")
+
+ meta_inputstream = self.getIndexStructureInputStream("meta")
+ keys = self.getMetaIndex().getKeys()
+ keys_offset = { k: offset for offset, k in enumerate(keys) }
+ keys_offset = {'docno' : keys_offset['docno'] }
+ direct_inputstream = self.getIndexStructureInputStream("direct")
+ lex = self.getLexicon()
+
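+ # getEntriesSkipped() reports how many empty documents the direct index stream passed over, so empty token dicts can be emitted for them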
+ ip = None
+ while (ip := direct_inputstream.getNextPostings()) is not None: # this is the next() method
+
+ # yield empty toks dicts for empty documents
+ for skipped in range(0, direct_inputstream.getEntriesSkipped()):
+ meta = meta_inputstream.next()
+ rtr = {k : meta[keys_offset[k]] for k in keys_offset}
+ rtr['toks'] = {}
+ yield rtr
+
+ toks = {}
+ while ip.next() != ip.EOL:
+ t, _ = lex[ip.getId()]
+ toks[t] = ip.getFrequency()
+ meta = meta_inputstream.next()
+ rtr = {'toks' : toks}
+ rtr.update({k : meta[keys_offset[k]] for k in keys_offset})
+ yield rtr
+
+ # yield for trailing empty documents
+ for skipped in range(0, direct_inputstream.getEntriesSkipped()):
+ meta = meta_inputstream.next()
+ rtr = {k : meta[keys_offset[k]] for k in keys_offset}
+ rtr['toks'] = {}
+ yield rtr
+
+ if return_toks:
+ if not self.hasIndexStructureInputStream("direct"):
+ raise ValueError("No direct index input stream available, cannot use return_toks=True")
+ return _index_corpusiter_direct_pretok(self)
+ return _index_corpusiter_meta(self)
protocol_map["org.terrier.structures.Index"] = {
# this means that len(index) returns the number of documents in the index
@@ -278,7 +332,10 @@ def _index_add(self, other):
# document-wise composition of indices: adding more documents to an index, by merging two indices with
# different numbers of documents. This implemented by the overloading the `+` Python operator
- '__add__': _index_add
+ '__add__': _index_add,
+
+ # get_corpus_iter returns a generator that yields dicts such as {"docno": "d1", "toks" : {'a' : 1}}
+ 'get_corpus_iter' : _index_corpusiter
}
def setup_terrier(file_path, terrier_version=None, helper_version=None, boot_packages=[], force_download=True):
diff --git a/pyterrier/datasets.py b/pyterrier/datasets.py
index 87aa3aeb..247b1825 100644
--- a/pyterrier/datasets.py
+++ b/pyterrier/datasets.py
@@ -644,21 +644,21 @@ def msmarco_document_generate(dataset):
MSMARCO_DOC_FILES = {
"corpus" :
- [("msmarco-docs.trec.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")],
+ [("msmarco-docs.trec.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")],
"corpus-tsv":
- [("msmarco-docs.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")],
+ [("msmarco-docs.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")],
"topics" :
{
- "train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"),
- "dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"),
- "test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
- "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
- 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline")
+ "train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"),
+ "dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"),
+ "test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
+ "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
+ 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline")
},
"qrels" :
{
- "train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"),
- "dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"),
+ "train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"),
+ "dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"),
"test" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-docs.txt"),
"test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-docs.txt")
},
@@ -685,18 +685,18 @@ def msmarco_document_generate(dataset):
"dev.small" : ("queries.dev.small.tsv", "collectionandqueries.tar.gz#queries.dev.small.tsv", "singleline"),
"eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", "singleline"),
"eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"),
- "test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
- "test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline")
+ "test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
+ "test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline")
},
"tars" : {
- "queries.tar.gz" : ("queries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz"),
- "collection.tar.gz" : ("collection.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz"),
- "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz")
+ "queries.tar.gz" : ("queries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz"),
+ "collection.tar.gz" : ("collection.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz"),
+ "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz")
},
"qrels" :
{
- "train" : ("qrels.train.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv"),
- "dev" : ("qrels.dev.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv"),
+ "train" : ("qrels.train.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv"),
+ "dev" : ("qrels.dev.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv"),
"test-2019" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-pass.txt"),
"test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-pass.txt"),
"dev.small" : ("qrels.dev.small.tsv", "collectionandqueries.tar.gz#qrels.dev.small.tsv"),
@@ -709,19 +709,19 @@ def msmarco_document_generate(dataset):
MSMARCOv2_DOC_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
- "train" : ("docv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"),
- "dev1" :("docv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"),
- "dev2" :("docv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"),
- "valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
- "valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
- "trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
+ "train" : ("docv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"),
+ "dev1" :("docv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"),
+ "dev2" :("docv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"),
+ "valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"),
+ "valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"),
+ "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
},
"qrels" : {
- "train" : ("docv2_train_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"),
- "dev1" :("docv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"),
- "dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"),
- "valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"),
- "valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz")
+ "train" : ("docv2_train_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"),
+ "dev1" :("docv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"),
+ "dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"),
+ "valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"),
+ "valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz")
},
"index" : _datarepo_index,
}
@@ -729,15 +729,15 @@ def msmarco_document_generate(dataset):
MSMARCOv2_PASSAGE_FILES = {
"info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html",
"topics" : {
- "train" : ("passv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"),
- "dev1" : ("passv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"),
- "dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"),
- "trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
+ "train" : ("passv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"),
+ "dev1" : ("passv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"),
+ "dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"),
+ "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"),
},
"qrels" : {
- "train" : ("passv2_train_qrels.tsv" "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"),
- "dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"),
- "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"),
+ "train" : ("passv2_train_qrels.tsv" "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"),
+ "dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"),
+ "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"),
},
"index" : _datarepo_index,
}
diff --git a/pyterrier/io.py b/pyterrier/io.py
index 9673b56f..3d220e25 100644
--- a/pyterrier/io.py
+++ b/pyterrier/io.py
@@ -228,12 +228,8 @@ def _parse_line(l):
def _read_results_trec(filename):
results = []
- df = pd.read_csv(filename, sep=r'\s+', names=["qid", "iter", "docno", "rank", "score", "name"])
+ df = pd.read_csv(filename, sep=r'\s+', names=["qid", "iter", "docno", "rank", "score", "name"], dtype={'qid': str, 'docno': str, 'rank': int, 'score': float})
df = df.drop(columns="iter")
- df["qid"] = df["qid"].astype(str)
- df["docno"] = df["docno"].astype(str)
- df["rank"] = df["rank"].astype(int)
- df["score"] = df["score"].astype(float)
return df
def write_results(res, filename, format="trec", append=False, **kwargs):
@@ -294,13 +290,13 @@ def read_topics(filename, format="trec", **kwargs):
Supported Formats:
* "trec" -- an SGML-formatted TREC topics file. Delimited by TOP tags, each having NUM and TITLE tags; DESC and NARR tags are skipped by default. Control using whitelist and blacklist kwargs
- * "trecxml" -- a more modern XML formatted topics file. Delimited by topic tags, each having nunber tags. query, question and narrative tags are parsed by default. Control using tags kwarg.
+ * "trecxml" -- a more modern XML formatted topics file. Delimited by topic tags, each having number tags. query, question and narrative tags are parsed by default. Control using tags kwarg.
* "singeline" -- one query per line, preceeded by a space or colon. Tokenised by default, use tokenise=False kwargs to prevent tokenisation.
"""
if format is None:
format = "trec"
if not format in SUPPORTED_TOPICS_FORMATS:
- raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_RESULTS_FORMATS.keys())))
+ raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_TOPICS_FORMATS.keys())))
return SUPPORTED_TOPICS_FORMATS[format](filename, **kwargs)
def _read_topics_trec(file_path, doc_tag="TOP", id_tag="NUM", whitelist=["TITLE"], blacklist=["DESC","NARR"]):
@@ -339,7 +335,10 @@ def _read_topics_trecxml(filename, tags=["query", "question", "narrative"], toke
from jnius import autoclass
tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
for child in root.iter('topic'):
- qid = child.attrib["number"]
+ try:
+ qid = child.attrib["number"]
+ except KeyError:
+ qid = child.find("number").text
query = ""
for tag in child:
if tag.tag in tags:
@@ -347,7 +346,7 @@ def _read_topics_trecxml(filename, tags=["query", "question", "narrative"], toke
if tokenise:
query_text = " ".join(tokeniser.getTokens(query_text))
query += " " + query_text
- topics.append((str(qid), query))
+ topics.append((str(qid), query.strip()))
return pd.DataFrame(topics, columns=["qid", "query"])
def _read_topics_singleline(filepath, tokenise=True):
diff --git a/pyterrier/pipelines.py b/pyterrier/pipelines.py
index 345812c8..8188ad08 100644
--- a/pyterrier/pipelines.py
+++ b/pyterrier/pipelines.py
@@ -561,8 +561,11 @@ def _apply_round(measure, value):
for pcol in p_col_names:
pcol_reject = pcol.replace("p-value", "reject")
pcol_corrected = pcol + " corrected"
- reject, corrected, _, _ = statsmodels.stats.multitest.multipletests(df[pcol], alpha=correction_alpha, method=correction)
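+ # exclude the baseline row from the correction procedure, as its p-value (the baseline compared with itself) is not meaningful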
+ reject, corrected, _, _ = statsmodels.stats.multitest.multipletests(df[pcol].drop(df.index[baseline]), alpha=correction_alpha, method=correction)
insert_pos = df.columns.get_loc(pcol)
+ # add reject/corrected values for the baseline
+ reject = np.insert(reject, baseline, False)
+ corrected = np.insert(corrected, baseline, np.nan)
# add extra columns, put place directly after the p-value column
df.insert(insert_pos+1, pcol_reject, reject)
df.insert(insert_pos+2, pcol_corrected, corrected)
diff --git a/pyterrier/rewrite.py b/pyterrier/rewrite.py
index c2f65e44..46e313b7 100644
--- a/pyterrier/rewrite.py
+++ b/pyterrier/rewrite.py
@@ -205,6 +205,7 @@ def __init__(self, index_like, fb_terms=10, fb_docs=3, qeclass="org.terrier.quer
else:
self.qe = qeclass
self.indexref = _parse_index_like(index_like)
+ self.properties = properties
for k,v in properties.items():
pt.ApplicationSetup.setProperty(k, str(v))
self.applytp = pt.autoclass("org.terrier.querying.ApplyTermPipeline")()
@@ -212,6 +213,34 @@ def __init__(self, index_like, fb_terms=10, fb_docs=3, qeclass="org.terrier.quer
self.fb_docs = fb_docs
self.manager = pt.autoclass("org.terrier.querying.ManagerFactory")._from_(self.indexref)
+ def __reduce__(self):
+ return (
+ self.__class__,
+ (self.indexref,),
+ self.__getstate__()
+ )
+
+ def __getstate__(self):
+ if isinstance(self.qe, str):
+ qe = self.qe
+ else:
+ qe = self.qe.getClass().getName()
+ return {
+ 'fb_terms' : self.fb_terms,
+ 'fb_docs' : self.fb_docs,
+ 'qeclass' : qe,
+ 'properties' : self.properties
+ }
+
+ def __setstate__(self, d):
+ self.fb_terms = d["fb_terms"]
+ self.fb_docs = d["fb_docs"]
+ self.qe = pt.autoclass(d['qeclass'])()
+ self.properties.update(d["properties"])
+ for key,value in d["properties"].items():
+ self.appSetup.setProperty(key, str(value))
+ self.manager = pt.autoclass("org.terrier.querying.ManagerFactory")._from_(self.indexref)
+
def _populate_resultset(self, topics_and_res, qid, index):
docids=None
@@ -387,6 +416,15 @@ def __init__(self, *args, fb_terms=10, fb_docs=3, fb_lambda=0.6, **kwargs):
kwargs["qeclass"] = rm
super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs)
+ def __getstate__(self):
+ rtr = super().__getstate__()
+ rtr['fb_lambda'] = self.fb_lambda
+ return rtr
+
+ def __setstate__(self, d):
+ super().__setstate__(d)
+ self.fb_lambda = d["fb_lambda"]
+
def _configure_request(self, rq):
super()._configure_request(rq)
rq.setControl("rm3.lambda", str(self.fb_lambda))
diff --git a/pyterrier/text.py b/pyterrier/text.py
index a3ac6f87..51a2dbce 100644
--- a/pyterrier/text.py
+++ b/pyterrier/text.py
@@ -135,6 +135,13 @@ def scorer(*args, **kwargs) -> Transformer:
This is an alias to pt.TextScorer(). Internally, a Terrier memory index is created, before being
used for scoring.
+ Arguments:
+ body_attr(str): what dataframe input column contains the text of the document. Default is `"body"`.
+ wmodel(str): name of the weighting model to use for scoring.
+ background_index(index_like): An optional background index to use for collection statistics. If a weighting
+ model such as BM25 or TF_IDF or PL2 is used without setting the background_index, the background statistics
+ will be calculated from the dataframe, which is usually not the desired behaviour.
+
Example::
df = pd.DataFrame(
@@ -149,8 +156,9 @@ def scorer(*args, **kwargs) -> Transformer:
# ["q1", "chemical reactions", "d1", "professor protor poured the chemicals", 0, 1]
# ["q1", "chemical reactions", "d2", "chemical brothers turned up the beats", 0, 1]
- For calculating the scores of documents using any weighting model with the concept of IDF, it may be useful to make use of
- an existing Terrier index for background statistics::
+ For calculating the scores of documents using any weighting model with the concept of IDF, it is strongly advised to make use of
+ an existing Terrier index for background statistics. Without a background index, IDF will be calculated based on the supplied
+ dataframe (for models such as BM25, this can lead to negative scores)::
textscorerTfIdf = pt.text.scorer(body_attr="text", wmodel="TF_IDF", background_index=index)
@@ -512,8 +520,8 @@ def applyPassaging(self, df, labels=True):
newRows.append(newRow)
passageCount+=1
newDF = pd.DataFrame(newRows)
- newDF['query'].fillna('',inplace=True)
- newDF[self.text_attr].fillna('',inplace=True)
- newDF['qid'].fillna('',inplace=True)
+ newDF['query'] = newDF['query'].fillna('')
+ newDF[self.text_attr] = newDF[self.text_attr].fillna('')
+ newDF['qid'] = newDF['qid'].fillna('')
newDF.reset_index(inplace=True,drop=True)
return newDF
diff --git a/pyterrier/transformer.py b/pyterrier/transformer.py
index 38978227..7d1c55e3 100644
--- a/pyterrier/transformer.py
+++ b/pyterrier/transformer.py
@@ -39,7 +39,7 @@ def get_transformer(v, stacklevel=1):
if isinstance(v, pd.DataFrame):
warn('Coercion of a dataframe into a transformer is deprecated; use a pt.Transformer.from_df() instead', stacklevel=stacklevel, category=DeprecationWarning)
return SourceTransformer(v)
- raise ValueError("Passed parameter %s of type %s cannot be coerced into a transformer" % (str(v), type(v)), stacklevel=stacklevel, category=DeprecationWarning)
+ raise ValueError("Passed parameter %s of type %s cannot be coerced into a transformer" % (str(v), type(v)))
rewrite_rules = []
@@ -281,8 +281,10 @@ def __init__(self, *args, **kwargs):
class Indexer(Transformer):
def index(self, iter : Iterable[dict], **kwargs):
"""
- Takes an iterable of dictionaries ("iterdict"), and consumes them. There is no return;
- This method is typically used to implement indexers.
+ Takes an iterable of dictionaries ("iterdict"), and consumes them. The index method may return
+ an instance of the index or retriever. This method is typically used to implement indexers that
+ consume a corpus (or to consume the output of previous pipeline components that have
+ transformed the documents being consumed).
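+
+ Example (a minimal sketch; the indexer class and index location shown are illustrative)::
+
+     indexer = pt.IterDictIndexer("/path/to/index")
+     indexref = indexer.index([{"docno" : "d1", "text" : "some sample text"}])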
"""
pass
@@ -368,4 +370,4 @@ def __init__(self, rtr, **kwargs):
def transform(self, topics):
rtr = self.rtr.copy()
- return rtr
\ No newline at end of file
+ return rtr
diff --git a/requirements-test.txt b/requirements-test.txt
index a2cc5b7a..9e41c8e0 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -4,3 +4,4 @@ fastrank>=0.7.0
torch
lz4
transformers
+scikit-learn
diff --git a/requirements.txt b/requirements.txt
index 55baa50f..6106ceb7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ wget
tqdm
pyjnius>=1.4.2
matchpy
-scikit-learn
deprecated
chest
scipy
diff --git a/setup.py b/setup.py
index 07913e5c..be9968c1 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,12 @@ def get_version(rel_path):
author="Craig Macdonald",
author_email='craigm@dcs.gla.ac.uk',
description="Terrier IR platform Python API",
+ project_urls={
+ 'Documentation': 'https://pyterrier.readthedocs.io',
+ 'Changelog': 'https://github.com/terrier-org/pyterrier/releases',
+ 'Issue Tracker': 'https://github.com/terrier-org/pyterrier/issues',
+ 'CI': 'https://github.com/terrier-org/pyterrier/actions',
+ },
long_description=long_description,
long_description_content_type="text/markdown",
package_data={'': ['LICENSE.txt', 'requirements.txt', 'requirements-test.txt']},
@@ -65,5 +71,5 @@ def get_version(rel_path):
"Operating System :: OS Independent",
],
install_requires=requirements,
- python_requires='>=3.7',
+ python_requires='>=3.8',
)
diff --git a/terrier-python-helper/pom.xml b/terrier-python-helper/pom.xml
index c1281371..c829982a 100644
--- a/terrier-python-helper/pom.xml
+++ b/terrier-python-helper/pom.xml
@@ -110,14 +110,14 @@
ch.qos.logback
logback-classic
- 1.2.0
+ 1.2.13
provided
ch.qos.logback
logback-core
- 1.2.9
+ 1.2.13
provided
diff --git a/tests/base.py b/tests/base.py
index 93594a10..501cd234 100644
--- a/tests/base.py
+++ b/tests/base.py
@@ -10,14 +10,20 @@ class BaseTestCase(unittest.TestCase):
def __init__(self, *args, **kwargs):
super(BaseTestCase, self).__init__(*args, **kwargs)
terrier_version = os.environ.get("TERRIER_VERSION", None)
- if terrier_version is not None:
- print("Testing with Terrier version " + terrier_version)
terrier_helper_version = os.environ.get("TERRIER_HELPER_VERSION", None)
- if terrier_helper_version is not None:
- print("Testing with Terrier Helper version " + terrier_helper_version)
if not pt.started():
+
+ # display for debugging what is being used
+ if terrier_version is not None:
+ print("Testing with Terrier version " + terrier_version)
+ if terrier_helper_version is not None:
+ print("Testing with Terrier Helper version " + terrier_helper_version)
+
pt.init(version=terrier_version, logging="DEBUG", helper_version=terrier_helper_version)
+ # jvm_opts=['-ea'] can be added here to enable checking of Java assertions
self.here = os.path.dirname(os.path.realpath(__file__))
+
+ # check that pt.init() is saving its arguments
assert "version" in pt.init_args
assert pt.init_args["version"] == terrier_version
@@ -42,4 +48,4 @@ def tearDown(self):
except:
pass
-
\ No newline at end of file
+
diff --git a/tests/fixtures/topics.trecxml b/tests/fixtures/topics.trecxml
new file mode 100644
index 00000000..bea592cb
--- /dev/null
+++ b/tests/fixtures/topics.trecxml
@@ -0,0 +1,20 @@
+
+
+ 1
+ lights
+ Description lights
+ Documents are relevant if they describe lights.
+
+
+ 2
+ radiowaves
+ Description radiowaves
+ Documents are relevant if they describe radiowaves.
+
+
+
+ sounds
+ Description sound
+ Documents are relevant if they describe sounds.
+
+
\ No newline at end of file
diff --git a/tests/test_apply.py b/tests/test_apply.py
index 0f6faf8f..ad834eaf 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -59,6 +59,14 @@ def test_query_apply(self):
rtrDR2 = pt.apply.query(lambda row : row["qid"] )(testDF2)
self.assertEqual(rtrDR2.iloc[0]["query"], "q1")
+ def test_query_apply_error(self):
+ origquery="the bear and the wolf"
+ testDF = pd.DataFrame([["q1", origquery]], columns=["qid", "query"])
+ p = pt.apply.query(lambda q : q) # should throw an error, as pt.apply.query should return a string, not a row
+ with self.assertRaises(TypeError) as te:
+ p(testDF)
+ self.assertTrue("Could not coerce return from pt.apply.query function into a list of strings" in str(te.exception))
+
def test_by_query_apply(self):
inputDf = pt.new.ranked_documents([[1], [2]], qid=["1", "2"])
def _inc_score(res):
diff --git a/tests/test_experiment.py b/tests/test_experiment.py
index a4f380f9..5498ba30 100644
--- a/tests/test_experiment.py
+++ b/tests/test_experiment.py
@@ -321,7 +321,7 @@ def test_baseline_and_tests(self):
# user-specified TOST
# TOST will omit warnings here, due to low numbers of topics
import statsmodels.stats.weightstats
- fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_ind(X, Y, -0.01, 0.01)[0])
+ fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_paired(X, Y, -0.01, 0.01)[0])
#This filter doesnt work
with warnings.catch_warnings(record=True) as w:
@@ -363,15 +363,17 @@ def test_baseline_corrected(self):
dataset = pt.get_dataset("vaswani")
res1 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")(dataset.get_topics().head(10))
res2 = pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")(dataset.get_topics().head(10))
- for corr in ['hs', 'bonferroni', 'holm-sidak']:
+ baseline = 0
+ for corr in ['hs', 'bonferroni', 'hommel']:
df = pt.Experiment(
[res1, res2],
dataset.get_topics().head(10),
dataset.get_qrels(),
eval_metrics=["map", "ndcg"],
- baseline=0, correction='hs')
+ baseline=baseline, correction=corr)
self.assertTrue("map +" in df.columns)
self.assertTrue("map -" in df.columns)
self.assertTrue("map p-value" in df.columns)
self.assertTrue("map p-value corrected" in df.columns)
self.assertTrue("map reject" in df.columns)
+ self.assertFalse(any(df["map p-value corrected"].drop(df.index[baseline]).isna()))
diff --git a/tests/test_fbr.py b/tests/test_fbr.py
index b25c58ee..d7c49e25 100644
--- a/tests/test_fbr.py
+++ b/tests/test_fbr.py
@@ -137,6 +137,35 @@ def test_fbr(self):
if "matching" in retrBasic.controls:
self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull")
+ def test_fbr_example(self):
+ JIR = pt.autoclass('org.terrier.querying.IndexRef')
+ indexref = JIR.of(self.here + "/fixtures/index/data.properties")
+ index = pt.IndexFactory.of(indexref)
+ # this ranker will make the candidate set of documents for each query
+ BM25 = pt.BatchRetrieve(index, wmodel="BM25")
+
+ # these rankers we will use to re-rank the BM25 results
+ TF_IDF = pt.BatchRetrieve(index, wmodel="Dl")
+ PL2 = pt.BatchRetrieve(index, wmodel="PL2")
+
+ pipe = (BM25 %2) >> (TF_IDF ** PL2)
+ fbr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:Dl", "WMODEL:PL2"], wmodel="BM25") % 2
+ resultP = pipe.search("chemical")
+ resultF = fbr.search("chemical")
+ pd.set_option('display.max_columns', None)
+
+ self.assertEqual(resultP.iloc[0].docno, resultF.iloc[0].docno)
+ self.assertEqual(resultP.iloc[0].score, resultF.iloc[0].score)
+ self.assertEqual(resultP.iloc[0].features[0], resultF.iloc[0].features[0])
+ self.assertEqual(resultP.iloc[0].features[1], resultF.iloc[0].features[1])
+
+ pipeCompiled = pipe.compile()
+ resultC = pipeCompiled.search("chemical")
+ self.assertEqual(resultP.iloc[0].docno, resultC.iloc[0].docno)
+ self.assertEqual(resultP.iloc[0].score, resultC.iloc[0].score)
+ self.assertEqual(resultP.iloc[0].features[0], resultC.iloc[0].features[0])
+ self.assertEqual(resultP.iloc[0].features[1], resultC.iloc[0].features[1])
+
def test_fbr_empty(self):
JIR = pt.autoclass('org.terrier.querying.IndexRef')
indexref = JIR.of(self.here + "/fixtures/index/data.properties")
diff --git a/tests/test_index_op.py b/tests/test_index_op.py
index 25b0ae3e..a6f27538 100644
--- a/tests/test_index_op.py
+++ b/tests/test_index_op.py
@@ -10,6 +10,98 @@
class TestIndexOp(TempDirTestCase):
+ def test_index_corpus_iter(self):
+ import sys
+ MIN_PYTHON = (3, 8)
+ if sys.version_info < MIN_PYTHON:
+ self.skipTest("Not minimum Python requirements")
+
+ documents = [
+ {'docno' : 'd1', 'text': 'stemming stopwords stopwords'},
+ ]
+ index = pt.IndexFactory.of( pt.IterDictIndexer(tempfile.mkdtemp(), stopwords=None, stemmer=None).index(documents) )
+ self.assertEqual(1, len(index))
+ self.assertEqual(2, index.getCollectionStatistics().getNumberOfUniqueTerms())
+ self.assertEqual(3, index.getCollectionStatistics().getNumberOfTokens())
+
+ # check that get_corpus_iter() contains the correct information
+ iter = index.get_corpus_iter()
+ first_doc = next(iter)
+ self.assertTrue(first_doc is not None)
+ self.assertIn('docno', first_doc)
+ self.assertIn('toks', first_doc)
+ self.assertIn('stemming', first_doc['toks'])
+ self.assertIn('stopwords', first_doc['toks'])
+ self.assertEqual(1, first_doc['toks']['stemming'])
+ self.assertEqual(2, first_doc['toks']['stopwords'])
+ with(self.assertRaises(StopIteration)):
+ next(iter)
+
+ # now check that a static pruning pipe can operate as expected. this example comes from terrier-index-api.rst
+ index_pipe = (
+ # update the toks column for each document, keeping only terms with frequency > 1
+ pt.apply.toks(lambda row: { t : row['toks'][t] for t in row['toks'] if row['toks'][t] > 1 } )
+ >> pt.IterDictIndexer(tempfile.mkdtemp(), pretokenised=True)
+ )
+ new_index_ref = index_pipe.index( index.get_corpus_iter())
+ pruned_index = pt.IndexFactory.of(new_index_ref)
+ self.assertEqual(1, len(pruned_index))
+ self.assertEqual(1, pruned_index.getCollectionStatistics().getNumberOfUniqueTerms())
+ self.assertEqual(2, pruned_index.getCollectionStatistics().getNumberOfTokens())
+
+ def test_index_corpus_iter_empty(self):
+ import sys
+ MIN_PYTHON = (3, 8)
+ if sys.version_info < MIN_PYTHON:
+ self.skipTest("Not minimum Python requirements")
+
+ # compared to test_index_corpus_iter, this tests empty documents are handled correctly.
+ documents = [
+ {'docno' : 'd0', 'text':''},
+ {'docno' : 'd1', 'text':''},
+ {'docno' : 'd2', 'text': 'stemming stopwords stopwords'},
+ {'docno' : 'd3', 'text':''},
+ {'docno' : 'd4', 'text': 'stemming stopwords stopwords'},
+ {'docno' : 'd5', 'text': ''}
+ ]
+ index = pt.IndexFactory.of( pt.IterDictIndexer(tempfile.mkdtemp(), stopwords=None, stemmer=None).index(documents) )
+ self.assertEqual(6, len(index))
+ self.assertEqual(2, index.getCollectionStatistics().getNumberOfUniqueTerms())
+ self.assertEqual(6, index.getCollectionStatistics().getNumberOfTokens())
+
+ iter = index.get_corpus_iter()
+
+ counter = 0
+ for doc in documents:
+ next_doc = next(iter)
+ counter += 1
+ self.assertTrue(next_doc is not None)
+ self.assertIn('docno', next_doc)
+ self.assertIn('toks', next_doc)
+ if doc['text'] == '':
+ self.assertEqual(0, len(next_doc['toks']))
+ else:
+ self.assertIn('stemming', next_doc['toks'])
+ self.assertIn('stopwords', next_doc['toks'])
+ self.assertEqual(1, next_doc['toks']['stemming'])
+ self.assertEqual(2, next_doc['toks']['stopwords'])
+
+ with(self.assertRaises(StopIteration)):
+ next(iter)
+ self.assertEqual(counter, len(documents))
+
+ # now check that a static pruning pipe can operate as expected. this example comes from terrier-index-api.rst
+ index_pipe = (
+ # update the toks column for each document, keeping only terms with frequency > 1
+ pt.apply.toks(lambda row: { t : row['toks'][t] for t in row['toks'] if row['toks'][t] > 1 } )
+ >> pt.IterDictIndexer(tempfile.mkdtemp(), pretokenised=True)
+ )
+ new_index_ref = index_pipe.index( index.get_corpus_iter())
+ pruned_index = pt.IndexFactory.of(new_index_ref)
+ self.assertEqual(6, len(pruned_index))
+ self.assertEqual(1, pruned_index.getCollectionStatistics().getNumberOfUniqueTerms())
+ self.assertEqual(4, pruned_index.getCollectionStatistics().getNumberOfTokens())
+
def test_index_add_write(self):
# inspired by https://github.com/terrier-org/pyterrier/issues/390
documents = [
diff --git a/tests/test_ltr_pipelines.py b/tests/test_ltr_pipelines.py
index 9824d8bf..fdc967c7 100644
--- a/tests/test_ltr_pipelines.py
+++ b/tests/test_ltr_pipelines.py
@@ -39,7 +39,6 @@ def test_xgltr_pipeline(self):
'learning_rate': 0.1,
'gamma': 1.0, 'min_child_weight': 0.1,
'max_depth': 6,
- 'verbose': 2,
'random_state': 42
}
diff --git a/tests/test_pickle.py b/tests/test_pickle.py
index 8d1a7064..769b228d 100644
--- a/tests/test_pickle.py
+++ b/tests/test_pickle.py
@@ -98,6 +98,22 @@ def test_fbr_joblib(self):
self._fix_joblib()
self._fbr(joblib)
+ def test_qe_pickle(self):
+ self._qe(pickle)
+
+ def _qe(self, pickler):
+ vaswani = pt.datasets.get_dataset("vaswani")
+ index = vaswani.get_index()
+ bm25 = pt.BatchRetrieve(index, wmodel='BM25', controls={"c" : 0.75}, num_results=15)
+ br = bm25 >> pt.rewrite.Bo1QueryExpansion(index) >> bm25
+ q = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
+ res1 = br(q)
+ byterep = pickler.dumps(br)
+ br2 = pickler.loads(byterep)
+
+ res2 = br2(q)
+ pd.testing.assert_frame_equal(res1, res2)
+
def _br(self, pickler, wmodel='BM25'):
vaswani = pt.datasets.get_dataset("vaswani")
br = pt.BatchRetrieve(vaswani.get_index(), wmodel=wmodel, controls={"c" : 0.75}, num_results=15)
diff --git a/tests/test_text.py b/tests/test_text.py
index f24dd6db..12e248c0 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -41,7 +41,8 @@ def test_scorer_rerank(self):
self.assertEqual(1, dfOut.iloc[0]["rank"])
def test_snippets(self):
- br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed_text", metadata=["docno", "text"])
+ br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed") >> pt.text.get_text(pt.get_dataset('irds:vaswani'), "text")
+ #br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed_text", metadata=["docno", "text"])
psg_scorer = (
pt.text.sliding(text_attr='text', length=25, stride=12, prepend_attr=None)
>> pt.text.scorer(body_attr="text", wmodel='Tf', takes='docs')
diff --git a/tests/test_topicsparsing.py b/tests/test_topicsparsing.py
index cedcebbb..20ecde94 100644
--- a/tests/test_topicsparsing.py
+++ b/tests/test_topicsparsing.py
@@ -1,14 +1,19 @@
-import pyterrier as pt
-import unittest
-from .base import BaseTestCase
import os
+import unittest
+
import pandas as pd
-class TestTopicsParsing(BaseTestCase):
+import pyterrier as pt
+from .base import BaseTestCase
+
+
+class TestTopicsParsing(BaseTestCase):
def testSingleLine(self):
topics = pt.io.read_topics(
- os.path.dirname(os.path.realpath(__file__)) + "/fixtures/singleline.topics", format="singleline")
+ os.path.dirname(os.path.realpath(__file__)) + "/fixtures/singleline.topics",
+ format="singleline",
+ )
self.assertEqual(2, len(topics))
self.assertTrue("qid" in topics.columns)
self.assertTrue("query" in topics.columns)
@@ -19,12 +24,29 @@ def testSingleLine(self):
def test_parse_trec_topics_file_T(self):
input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trec"
- exp_result = pd.DataFrame([["1", "light"], ["2", "radiowave"], ["3", "sound"]], columns=['qid', 'query'])
+ exp_result = pd.DataFrame(
+ [["1", "light"], ["2", "radiowave"], ["3", "sound"]],
+ columns=["qid", "query"],
+ )
result = pt.io.read_topics(input)
self.assertTrue(exp_result.equals(result))
def test_parse_trec_topics_file_D(self):
input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trec"
- exp_result = pd.DataFrame([["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]], columns=['qid', 'query'])
- result = pt.io.read_topics(input, format="trec", whitelist=["DESC"], blacklist=["TITLE"])
- self.assertTrue(exp_result.equals(result))
\ No newline at end of file
+ exp_result = pd.DataFrame(
+ [["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]],
+ columns=["qid", "query"],
+ )
+ result = pt.io.read_topics(
+ input, format="trec", whitelist=["DESC"], blacklist=["TITLE"]
+ )
+ self.assertTrue(exp_result.equals(result))
+
+ def test_parse_trecxml_topics_file(self):
+ input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trecxml"
+ result = pt.io.read_topics(input, format="trecxml", tags=["title"])
+ exp_result = pd.DataFrame(
+ [["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]],
+ columns=["qid", "query"],
+ )
+ self.assertTrue(exp_result.equals(result))