diff --git a/.github/workflows/anserini.yml b/.github/workflows/anserini.yml index 72e94e36..e72d9d98 100644 --- a/.github/workflows/anserini.yml +++ b/.github/workflows/anserini.yml @@ -17,7 +17,6 @@ jobs: python-version: ['3.10'] java: [13] os: ['ubuntu-latest'] - architecture: ['x64'] terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT', runs-on: ${{ matrix.os }} @@ -28,29 +27,28 @@ jobs: run: | brew install libomp - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} if: matrix.os != 'self-hosted' - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Setup java if: matrix.os != 'self-hosted' - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: java-version: ${{ matrix.java }} - architecture: ${{ matrix.architecture }} distribution: 'zulu' # follows https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d - name: Loading Python & dependencies from cache if: matrix.os != 'self-hosted' - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-test.txt') }} + key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-test.txt') }} - name: Install Python dependencies run: | diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 188a3038..5c14416a 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -18,12 +18,12 @@ jobs: - name: Display phase run: echo "Deploying "${{github.event.inputs.releasetype}}" $INPUT_RELEASETYPE to PyPI" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python 3.7 - uses: actions/setup-python@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 with: - python-version: 3.7 + python-version: 3.8 - name: Build a test source tarball if: github.event.inputs.releasetype == 'test' diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 492ab0e8..c0b93066 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -14,17 +14,15 @@ jobs: strategy: matrix: - python-version: ['3.7', '3.11'] + python-version: ['3.8', '3.11'] java: [11, 13] - os: ['ubuntu-latest', 'macOs-latest', 'windows-latest'] - architecture: ['x64'] + os: ['ubuntu-latest', 'macos-13', 'windows-latest'] terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT', - # include: - # - os: 'self-hosted' # our m1 runner, only one setting - # python-version: '3.9' - # java: 11 - # architecture: 'arm' - # terrier: 'snapshot' + include: + - os: 'macos-latest' + python-version: '3.9' + java: 11 + terrier: 'snapshot' runs-on: ${{ matrix.os }} steps: @@ -34,20 +32,19 @@ jobs: run: | brew install libomp - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} if: matrix.os != 'self-hosted' - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Setup java if: matrix.os != 'self-hosted' - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: java-version: ${{ matrix.java }} - architecture: ${{ matrix.architecture }} distribution: 'zulu' - name: Install Terrier snapshot @@ -60,10 +57,10 @@ jobs: # follows https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d - name: Loading Python & dependencies from cache 
if: matrix.os != 'self-hosted' - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-test.txt') }} + key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-test.txt') }} - name: Install Python dependencies run: | @@ -93,4 +90,4 @@ jobs: env: TERRIER_VERSION: ${{ matrix.terrier }} run: | - pytest --durations=20 -p no:faulthandler \ No newline at end of file + pytest --durations=20 -p no:faulthandler diff --git a/README.md b/README.md index 9819c6f2..27719e13 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Python package](https://github.com/terrier-org/pyterrier/workflows/Python%20package/badge.svg) +[![Continuous Testing](https://github.com/terrier-org/pyterrier/actions/workflows/push.yml/badge.svg)](https://github.com/terrier-org/pyterrier/actions/workflows/push.yml) [![PyPI version](https://badge.fury.io/py/python-terrier.svg)](https://badge.fury.io/py/python-terrier) [![Documentation Status](https://readthedocs.org/projects/pyterrier/badge/?version=latest)](https://pyterrier.readthedocs.io/en/latest/) @@ -162,3 +162,4 @@ By downloading and using PyTerrier, you agree to cite at the undernoted paper de - Sarawoot Kongyoung, University of Glasgow - Zhan Su, Copenhagen University - Marcus Schutte, TU Delft + - Lukas Zeit-Altpeter, Friedrich Schiller University Jena diff --git a/docs/extras/generate_includes.py b/docs/extras/generate_includes.py index 9e982303..27e6ea56 100644 --- a/docs/extras/generate_includes.py +++ b/docs/extras/generate_includes.py @@ -40,8 +40,17 @@ def experiment_includes(): # vaswani dataset provides an index, topics and qrels # lets generate two BRs to compare - tfidf = pt.BatchRetrieve(dataset.get_index(), wmodel="TF_IDF") - bm25 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25") + try: + indexref = dataset.get_index() + except ValueError: + import os, tempfile + # if data.terrier.org is down, build the index + indexref = pt.IterDictIndexer( + os.path.join(tempfile.gettempdir(), "vaswani_index") + ).index(pt.get_dataset('vaswani').get_corpus_iter()) + + tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF") + bm25 = pt.BatchRetrieve(indexref, wmodel="BM25") table = pt.Experiment( [tfidf, bm25], diff --git a/docs/installation.rst b/docs/installation.rst index 82698bd4..69abe482 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -6,7 +6,7 @@ PyTerrier is a declarative platform for information retrieval experiemnts in Pyt Pre-requisites ============== -PyTerrier requires Python 3.7 or newer, and Java 11 or newer. PyTerrier is natively supported on Linux, Mac OS X and Windows. +PyTerrier requires Python 3.8 or newer, and Java 11 or newer. PyTerrier is natively supported on Linux, Mac OS X and Windows. Installation ============ @@ -23,6 +23,24 @@ If you want the latest version of PyTerrier, you can install direct from the Git NB: There is no need to have a local installation of the Java component, Terrier. PyTerrier will download the latest release on startup. +Installation Troubleshooting +============ + +We aim to ensure that there are pre-compiled binaries available for any dependencies with native components, for all supported Python versions and for all major platforms (Linux, macOS, Windows). +One notable exception is Mac M1 etc., as there are no freely available GitHub Actions runners for M1. Mac M1 installs may require to compile some dependencies. 
+ +If the installation failed due to `pyautocorpus` did not run successfully, you may need to install `pcre` to your machine. + +macOS:: + + brew install pcre + +Linux:: + + apt-get update -y + apt-get install libpcre3-dev -y + + Configuration ============== @@ -41,6 +59,9 @@ the usual places on your machine for a Java installation. If you have problems, `pt.init()` has a multitude of options, for instance that can make PyTerrier more notebook friendly, or to change the underlying version of Terrier, as described below. +For users with an M1 Mac or later models, it is necessary to install the SSL certificates to avoid certificate errors. +To do this, locate the `Install Certificates.command` file within the `Application/Python[version]` directory. Once found, double-click on it to run the installation process. + API Reference ============= @@ -79,4 +100,4 @@ Methods to change PyTerrier configuration .. autofunction:: pyterrier.set_properties() -.. autofunction:: pyterrier.set_tqdm() \ No newline at end of file +.. autofunction:: pyterrier.set_tqdm() diff --git a/docs/operators.rst b/docs/operators.rst index b8e53631..164aefde 100644 --- a/docs/operators.rst +++ b/docs/operators.rst @@ -5,19 +5,19 @@ Part of the power of PyTerrier comes from the ease in which researchers can form This is made possible by the operators available on Pyterrier's transformer objects. The following table summarises the available operators: -============ ================================= +============ ======================================================= Operator Meaning -============ ================================= +============ ======================================================= `>>` Then - chaining pipes `+` Linear combination of scores `*` Scalar factoring of scores `&` Document Set Intersection `|` Document Set Union `%` Apply rank cutoff -`^` Concatenate run with another +`^` Concatenate the output of one transformer with another `**` Feature Union `~` Cache transformer result -============ ================================= +============ ======================================================= NB: These operators retain their default Python operator precedence - that may not be aligned with your expectations in a PyTerrier context (e.g. `&` is higher than `>>`). diff --git a/docs/rewrite.rst b/docs/rewrite.rst index 6de11465..450b6950 100644 --- a/docs/rewrite.rst +++ b/docs/rewrite.rst @@ -64,6 +64,14 @@ Example:: dph = pt.BatchRetrieve(index, wmodel="DPH") pipelineQE = dph >> bo1 >> dph +View the expansion terms:: + + pipelineDisplay = dph >> bo1 + pipelineDisplay.search("chemical reactions") + # will return a dataframe with ['qid', 'query', 'query_0'] columns + # the reformulated query can be found in the 'query' column, + # while the original query is in the 'query_0' columns + **Alternative Formulations** Note that it is also possible to configure BatchRetrieve to perform QE directly using controls, diff --git a/docs/terrier-index-api.rst b/docs/terrier-index-api.rst index 4e389bca..438053ab 100644 --- a/docs/terrier-index-api.rst +++ b/docs/terrier-index-api.rst @@ -171,3 +171,32 @@ of the term (obtained from the Lexicon, in the form of the LexiconEntry), as wel print("%s with score %0.4f" % (docno, score)) Note that using BatchRetrieve or similar is probably an easier prospect for such a use case. + +Can I get the index as a corpus_iter()? 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A corpus_iter can be obtained from an Index object, which allows for instance: + - indexing the pre-tokenised Terrier index directly in another indexing pipeline + - extracting document metadata for ingestion into another indexing pipeline + +Metadata Example:: + + iter = index.get_corpus_iter(return_toks=False) + next(iter) + # would display {'docno' : 'd1', 'text' : 'This document contains ...' } + # assuming that index has been built with metadata=['docno', 'text'] + +Pre-tokenised Example:: + + iter = index.get_corpus_iter() + next(iter) + # would display {'docno' : 'd1', 'toks' : {'a' : 1, 'the' : 2}} + +Document Pruning Example:: + + index_pipe = ( + # update the toks column for each document, keeping only terms with frequency > 1 + pt.apply.toks(lambda row: { t : row['toks'][t] for t in row['toks'] if row['toks'][t] > 1 } ) + >> pt.IterDictIndexer("./pruned_index", pretokenised=True) + ) + new_index_ref = index_pipe.index( index.get_corpus_iter()) diff --git a/examples/notebooks/ltr.ipynb b/examples/notebooks/ltr.ipynb index a0521136..5cc117b7 100644 --- a/examples/notebooks/ltr.ipynb +++ b/examples/notebooks/ltr.ipynb @@ -1,23 +1,9 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Learning to Rank Examples.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { "cell_type": "markdown", "metadata": { - "id": "F5Ng-_HyW5LP", - "colab_type": "text" + "id": "F5Ng-_HyW5LP" }, "source": [ "# Terrier Learning to Rank Examples\n", @@ -31,146 +17,128 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "eypl7XPrkifV", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 718 - }, - "outputId": "e042ffb0-8ee5-4d95-c8bc-7e2895df541f" + "id": "eypl7XPrkifV" }, + "outputs": [], "source": [ - "!pip install python-terrier\n", - "#!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting python-terrier\n", - " Cloning https://github.com/terrier-org/pyterrier.git to /tmp/pip-install-e2c_k3ze/python-terrier\n", - " Running command git clone -q https://github.com/terrier-org/pyterrier.git /tmp/pip-install-e2c_k3ze/python-terrier\n", - "Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.6/dist-packages (from python-terrier) (1.18.5)\n", - "Requirement already satisfied, skipping upgrade: pandas in /usr/local/lib/python3.6/dist-packages (from python-terrier) (1.0.5)\n", - "Requirement already satisfied, skipping upgrade: wget in /usr/local/lib/python3.6/dist-packages (from python-terrier) (3.2)\n", - "Requirement already satisfied, skipping upgrade: pytrec_eval in /usr/local/lib/python3.6/dist-packages (from python-terrier) (0.4)\n", - "Requirement already satisfied, skipping upgrade: tqdm in /usr/local/lib/python3.6/dist-packages (from python-terrier) (4.41.1)\n", - "Requirement already satisfied, skipping upgrade: pyjnius~=1.3.0 in /usr/local/lib/python3.6/dist-packages (from python-terrier) (1.3.0)\n", - "Requirement already satisfied, skipping upgrade: matchpy in /usr/local/lib/python3.6/dist-packages (from python-terrier) (0.5.1)\n", - "Requirement already satisfied, skipping upgrade: sklearn in /usr/local/lib/python3.6/dist-packages (from python-terrier) (0.0)\n", - "Requirement already satisfied, 
skipping upgrade: deprecation in /usr/local/lib/python3.6/dist-packages (from python-terrier) (2.1.0)\n", - "Requirement already satisfied, skipping upgrade: chest in /usr/local/lib/python3.6/dist-packages (from python-terrier) (0.2.3)\n", - "Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.6/dist-packages (from python-terrier) (1.4.1)\n", - "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->python-terrier) (2.8.1)\n", - "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->python-terrier) (2018.9)\n", - "Requirement already satisfied, skipping upgrade: cython in /usr/local/lib/python3.6/dist-packages (from pyjnius~=1.3.0->python-terrier) (0.29.20)\n", - "Requirement already satisfied, skipping upgrade: six>=1.7.0 in /usr/local/lib/python3.6/dist-packages (from pyjnius~=1.3.0->python-terrier) (1.12.0)\n", - "Requirement already satisfied, skipping upgrade: hopcroftkarp<2.0,>=1.2 in /usr/local/lib/python3.6/dist-packages (from matchpy->python-terrier) (1.2.5)\n", - "Requirement already satisfied, skipping upgrade: multiset<3.0,>=2.0 in /usr/local/lib/python3.6/dist-packages (from matchpy->python-terrier) (2.1.1)\n", - "Requirement already satisfied, skipping upgrade: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn->python-terrier) (0.22.2.post1)\n", - "Requirement already satisfied, skipping upgrade: packaging in /usr/local/lib/python3.6/dist-packages (from deprecation->python-terrier) (20.4)\n", - "Requirement already satisfied, skipping upgrade: heapdict in /usr/local/lib/python3.6/dist-packages (from chest->python-terrier) (1.0.1)\n", - "Requirement already satisfied, skipping upgrade: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn->python-terrier) (0.15.1)\n", - "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->deprecation->python-terrier) (2.4.7)\n", - "Building wheels for collected packages: python-terrier\n", - " Building wheel for python-terrier (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-terrier: filename=python_terrier-0.3.0.dev0-cp36-none-any.whl size=37418 sha256=9fcedd75b4d85b9c026e34c93cd5c20167ed10636d545e3da9de41a545d003ba\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-75s589ez/wheels/cc/bb/69/836d846a92c787b35ca6478119c0033762ab2b95d866eeb288\n", - "Successfully built python-terrier\n", - "Installing collected packages: python-terrier\n", - " Found existing installation: python-terrier 0.3.0.dev0\n", - " Uninstalling python-terrier-0.3.0.dev0:\n", - " Successfully uninstalled python-terrier-0.3.0.dev0\n", - "Successfully installed python-terrier-0.3.0.dev0\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "pyterrier" - ] - } - } - }, - "metadata": { - "tags": [] - } - } + "%pip install -q python-terrier" ] }, { "cell_type": "markdown", "metadata": { - "id": "5thmTselkuBv", - "colab_type": "text" + "id": "5thmTselkuBv" }, "source": [ - "## Init \n", + "## Init\n", + "\n", + "You must run `pt.init()` before other PyTerrier functions and classes.\n", "\n", - "You must run pt.init() before other pyterrier functions and classes\n", + "`pt.init()` takes arguments such as: \n", + "- `version` - Terrier version e.g. 
\"5.2\" \n", + "- `mem` - megabytes allocated to JVM e.g. 4096\n", "\n", - "Arguments: \n", - "- `version` - terrier IR version e.g. \"5.2\" \n", - "- `mem` - megabytes allocated to java e.g. 4096" + "See also: https://pyterrier.readthedocs.io/en/latest/installation.html" ] }, { "cell_type": "code", + "execution_count": 2, "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, "id": "hPK5k4g2kkKo", - "colab_type": "code", - "colab": {} + "outputId": "67632f17-c2e4-4229-b7dc-d5671ec19ea6" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading terrier-assemblies 5.x-SNAPSHOT jar-with-dependencies to /Users/craigm/.pyterrier...\n", + "Done\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PyTerrier 0.10.0 has loaded Terrier 5.9-SNAPSHOT (built by jitpack on 2024-04-22 17:11) and terrier-helper 0.0.8\n", + "\n", + "No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n" + ] + } + ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import pyterrier as pt\n", "if not pt.started():\n", - " pt.init()" - ], - "execution_count": 15, - "outputs": [] + " pt.init(version='snapshot')" + ] }, { "cell_type": "markdown", "metadata": { - "id": "M5BmNjqoXGow", - "colab_type": "text" + "id": "M5BmNjqoXGow" }, "source": [ "## Load Files and Index\n", "\n", - "Again, lets focus on the small Vaswani test collection. Its easily accessible via the dataset API. " + "Again, lets focus on the small Vaswani test collection. Its easily accessible via the dataset API." ] }, { "cell_type": "code", + "execution_count": 3, "metadata": { - "id": "1MCH20mGB8EG", - "colab_type": "code", - "colab": {} + "id": "1MCH20mGB8EG" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading vaswani topics to /Users/craigm/.pyterrier/corpora/vaswani/query-text.trec\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "query-text.trec: 10.7kiB [00:00, 2.76MiB/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading vaswani qrels to /Users/craigm/.pyterrier/corpora/vaswani/qrels\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "qrels: 24.3kiB [00:00, 9.58MiB/s] \n" + ] + } + ], "source": [ "dataset = pt.datasets.get_dataset(\"vaswani\")\n", "\n", "indexref = dataset.get_index()\n", "topics = dataset.get_topics()\n", "qrels = dataset.get_qrels()" - ], - "execution_count": 16, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "c8hUuA_KKPUH", - "colab_type": "text" + "id": "c8hUuA_KKPUH" }, "source": [ "## Multi-stage Retrieval\n", @@ -186,27 +154,24 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { - "id": "QEjmsD3ya8Pc", - "colab_type": "code", - "colab": {} + "id": "QEjmsD3ya8Pc" }, + "outputs": [], "source": [ "#this ranker will make the candidate set of documents for each query\n", - "BM25 = pt.BatchRetrieve(indexref, controls = {\"wmodel\": \"BM25\"})\n", + "BM25 = pt.BatchRetrieve(indexref, wmodel=\"BM25\")\n", "\n", "#these rankers we will use to re-rank the BM25 results\n", - "TF_IDF = pt.BatchRetrieve(indexref, controls = {\"wmodel\": \"TF_IDF\"})\n", - "PL2 = pt.BatchRetrieve(indexref, controls = {\"wmodel\": \"PL2\"})" - ], - "execution_count": 17, - "outputs": [] + "TF_IDF = pt.BatchRetrieve(indexref, wmodel=\"TF_IDF\")\n", + "PL2 = pt.BatchRetrieve(indexref, wmodel=\"PL2\")" + ] }, { "cell_type": "markdown", "metadata": { - "id": 
"T07YF3-ULGsG", - "colab_type": "text" + "id": "T07YF3-ULGsG" }, "source": [ "OK, so how do we combine these?" @@ -214,22 +179,19 @@ }, { "cell_type": "code", + "execution_count": 5, "metadata": { - "id": "vTLh6SrCLGM0", - "colab_type": "code", - "colab": {} + "id": "vTLh6SrCLGM0" }, + "outputs": [], "source": [ "pipe = BM25 >> (TF_IDF ** PL2)" - ], - "execution_count": 18, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "l7M4cUxCLMTo", - "colab_type": "text" + "id": "l7M4cUxCLMTo" }, "source": [ "Here, we are using two Pyterrer operators:\n", @@ -241,22 +203,17 @@ }, { "cell_type": "code", + "execution_count": 6, "metadata": { - "id": "DYNOf_TwLp0Z", - "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", - "height": 142 + "height": 112 }, - "outputId": "d9779320-58f8-4197-aa91-f05f7d05a8c5" + "id": "DYNOf_TwLp0Z", + "outputId": "66f01ca8-43da-4c89-a11e-f36a73d2ac44" }, - "source": [ - "pipe.transform(\"chemical end:2\")" - ], - "execution_count": 19, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -283,12 +240,6 @@ " rank\n", " score\n", " query\n", - " docid_x\n", - " rank_x\n", - " query_x\n", - " docid_y\n", - " rank_y\n", - " query_y\n", " features\n", " \n", " \n", @@ -300,13 +251,7 @@ " 10703\n", " 0\n", " 13.472012\n", - " chemical end:2\n", - " 10702\n", - " 0\n", - " chemical end:2\n", - " 10702\n", - " 0\n", - " chemical end:2\n", + " chemical\n", " [7.38109017620895, 6.9992254918907575]\n", " \n", " \n", @@ -316,56 +261,36 @@ " 1056\n", " 1\n", " 12.517082\n", - " chemical end:2\n", - " 1055\n", - " 1\n", - " chemical end:2\n", - " 1055\n", - " 1\n", - " chemical end:2\n", + " chemical\n", " [6.857899681644975, 6.358419229871986]\n", " \n", - " \n", - " 2\n", - " 1\n", - " 4885\n", - " 4886\n", - " 2\n", - " 12.228161\n", - " chemical end:2\n", - " 4885\n", - " 2\n", - " chemical end:2\n", - " 4885\n", - " 2\n", - " chemical end:2\n", - " [6.69960466053696, 6.181368165774688]\n", - " \n", " \n", "\n", "
" ], "text/plain": [ - " qid docid ... query_y features\n", - "0 1 10702 ... chemical end:2 [7.38109017620895, 6.9992254918907575]\n", - "1 1 1055 ... chemical end:2 [6.857899681644975, 6.358419229871986]\n", - "2 1 4885 ... chemical end:2 [6.69960466053696, 6.181368165774688]\n", + " qid docid docno rank score query \\\n", + "0 1 10702 10703 0 13.472012 chemical \n", + "1 1 1055 1056 1 12.517082 chemical \n", "\n", - "[3 rows x 13 columns]" + " features \n", + "0 [7.38109017620895, 6.9992254918907575] \n", + "1 [6.857899681644975, 6.358419229871986] " ] }, - "metadata": { - "tags": [] - }, - "execution_count": 19 + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "pipe.search(\"chemical\").head(2)" ] }, { "cell_type": "markdown", "metadata": { - "id": "-ZMvd3qjLkrs", - "colab_type": "text" + "id": "-ZMvd3qjLkrs" }, "source": [ "See, we now have a \"features\" column with numbers representing the TF_IDF and PL2 feature scores." @@ -374,8 +299,7 @@ { "cell_type": "markdown", "metadata": { - "id": "Ye6ZpcZaMBjT", - "colab_type": "text" + "id": "Ye6ZpcZaMBjT" }, "source": [ "*A note about efficiency*: doing retrieval, then re-ranking the documents again can be slow. For this reason, Terrier has a FeaturesBatchRetrieve. Lets try this:" @@ -383,24 +307,17 @@ }, { "cell_type": "code", + "execution_count": 7, "metadata": { - "id": "5gCHuDiJMNJZ", - "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", - "height": 142 + "height": 112 }, - "outputId": "cd0d5320-4d08-417a-d2dd-08e07f500793" + "id": "5gCHuDiJMNJZ", + "outputId": "4e6ec41d-1c1b-4b7e-d318-de6ed0ef0883" }, - "source": [ - "fbr = pt.FeaturesBatchRetrieve(indexref, controls = {\"wmodel\": \"BM25\"}, features=[\"WMODEL:TF_IDF\", \"WMODEL:PL2\"]) \n", - "#lets look at the top 2 results\n", - "(fbr %2).search(\"chemical\")" - ], - "execution_count": 20, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -422,64 +339,64 @@ " \n", " \n", " qid\n", + " query\n", " docid\n", " rank\n", + " features\n", " docno\n", " score\n", - " features\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", + " chemical\n", " 10702\n", " 0\n", + " [7.38109017620895, 6.9992254918907575]\n", " 10703\n", " 13.472012\n", - " [7.38109017620895, 6.9992254918907575]\n", " \n", " \n", " 1\n", " 1\n", + " chemical\n", " 1055\n", " 1\n", + " [6.857899681644975, 6.358419229871986]\n", " 1056\n", " 12.517082\n", - " [6.857899681644975, 6.358419229871986]\n", - " \n", - " \n", - " 2\n", - " 1\n", - " 4885\n", - " 2\n", - " 4886\n", - " 12.228161\n", - " [6.69960466053696, 6.181368165774688]\n", " \n", " \n", "\n", "
" ], "text/plain": [ - " qid docid rank docno score features\n", - "0 1 10702 0 10703 13.472012 [7.38109017620895, 6.9992254918907575]\n", - "1 1 1055 1 1056 12.517082 [6.857899681644975, 6.358419229871986]\n", - "2 1 4885 2 4886 12.228161 [6.69960466053696, 6.181368165774688]" + " qid query docid rank features docno \\\n", + "0 1 chemical 10702 0 [7.38109017620895, 6.9992254918907575] 10703 \n", + "1 1 chemical 1055 1 [6.857899681644975, 6.358419229871986] 1056 \n", + "\n", + " score \n", + "0 13.472012 \n", + "1 12.517082 " ] }, - "metadata": { - "tags": [] - }, - "execution_count": 20 + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "fbr = pt.FeaturesBatchRetrieve(indexref, wmodel=\"BM25\", features=[\"WMODEL:TF_IDF\", \"WMODEL:PL2\"])\n", + "#lets look at the top 2 results\n", + "(fbr %2).search(\"chemical\")" ] }, { "cell_type": "markdown", "metadata": { - "id": "fo567qmCMZ41", - "colab_type": "text" + "id": "fo567qmCMZ41" }, "source": [ "However, this kind of optimisation is common in Pyterrier, so Pyterrier actually supports automatic pipeline optimisation, using the `.compile()` function." @@ -487,30 +404,24 @@ }, { "cell_type": "code", + "execution_count": 14, "metadata": { - "id": "jmrnqg9YMpl2", - "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", - "height": 159 + "height": 130 }, - "outputId": "70882fb4-8057-4014-ece8-899a593e4cd0" + "id": "jmrnqg9YMpl2", + "outputId": "a1fd9210-5cb2-4d3f-9a4a-8045502092b4" }, - "source": [ - "pipe_fast = pipe.compile()\n", - "(pipe_fast %2).search(\"chemical\")" - ], - "execution_count": 21, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Applying 8 rules\n" - ], - "name": "stdout" + ] }, { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -533,9 +444,10 @@ " \n", " qid\n", " docid\n", - " rank\n", " docno\n", + " rank\n", " score\n", + " query\n", " features\n", " \n", " \n", @@ -544,80 +456,72 @@ " 0\n", " 1\n", " 10702\n", - " 0\n", " 10703\n", + " 0\n", " 13.472012\n", + " chemical\n", " [7.38109017620895, 6.9992254918907575]\n", " \n", " \n", " 1\n", " 1\n", " 1055\n", - " 1\n", " 1056\n", + " 1\n", " 12.517082\n", + " chemical\n", " [6.857899681644975, 6.358419229871986]\n", " \n", - " \n", - " 2\n", - " 1\n", - " 4885\n", - " 2\n", - " 4886\n", - " 12.228161\n", - " [6.69960466053696, 6.181368165774688]\n", - " \n", " \n", "\n", "
" ], "text/plain": [ - " qid docid rank docno score features\n", - "0 1 10702 0 10703 13.472012 [7.38109017620895, 6.9992254918907575]\n", - "1 1 1055 1 1056 12.517082 [6.857899681644975, 6.358419229871986]\n", - "2 1 4885 2 4886 12.228161 [6.69960466053696, 6.181368165774688]" + " qid docid docno rank score query \\\n", + "0 1 10702 10703 0 13.472012 chemical \n", + "1 1 1055 1056 1 12.517082 chemical \n", + "\n", + " features \n", + "0 [7.38109017620895, 6.9992254918907575] \n", + "1 [6.857899681644975, 6.358419229871986] " ] }, - "metadata": { - "tags": [] - }, - "execution_count": 21 + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "pipe_fast = pipe.compile()\n", + "(pipe_fast %2).search(\"chemical\")" ] }, { "cell_type": "markdown", "metadata": { - "id": "siS6M5t_hugs", - "colab_type": "text" + "id": "siS6M5t_hugs" }, "source": [ "Finally, often we want our initial retrieval score to be a feature also. We can do this in one of two ways:\n", " - by adding a `SAMPLE` feature to FeaturesBatchRetrieve\n", - " - or in the original feature-union definition, including an IdentityTransformer " + " - or in the original feature-union definition, including an identity Transformer\n", + "\n", + "In doing so, the BM25 score (13.47 andf 12.51) are now copied in as the first position of the features column." ] }, { "cell_type": "code", + "execution_count": 15, "metadata": { - "id": "iXxeKfPXhuPA", - "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", - "height": 142 + "height": 112 }, - "outputId": "25a8c80c-c277-476e-c243-7c6f9d5989cb" + "id": "iXxeKfPXhuPA", + "outputId": "1fd3bf99-ec04-4572-cc6d-625d50cd1529" }, - "source": [ - "fbr = pt.FeaturesBatchRetrieve(indexref, controls = {\"wmodel\": \"BM25\"}, features=[\"SAMPLE\", \"WMODEL:TF_IDF\", \"WMODEL:PL2\"]) \n", - "pipe = BM25 >> (pt.transformer.IdentityTransformer() ** TF_IDF ** PL2)\n", - "\n", - "(pipe %2).search(\"chemical\")" - ], - "execution_count": 22, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -639,34 +543,100 @@ " \n", " \n", " qid\n", + " query\n", " docid\n", - " docno\n", " rank\n", - " score_x\n", - " query\n", - " docid_x\n", - " rank_x\n", - " query_x\n", - " docid_y\n", - " rank_y\n", - " score_y\n", - " query_y\n", " features\n", + " docno\n", + " score\n", " \n", " \n", " \n", " \n", " 0\n", " 1\n", - " 10702\n", - " 10703\n", - " 0\n", - " 13.472012\n", " chemical\n", " 10702\n", " 0\n", + " [13.472012496423268, 7.38109017620895, 6.99922...\n", + " 10703\n", + " 13.472012\n", + " \n", + " \n", + " 1\n", + " 1\n", " chemical\n", + " 1055\n", + " 1\n", + " [12.517081895047532, 6.857899681644975, 6.3584...\n", + " 1056\n", + " 12.517082\n", + " \n", + " \n", + "\n", + "
" + ], + "text/plain": [ + " qid query docid rank \\\n", + "0 1 chemical 10702 0 \n", + "1 1 chemical 1055 1 \n", + "\n", + " features docno score \n", + "0 [13.472012496423268, 7.38109017620895, 6.99922... 10703 13.472012 \n", + "1 [12.517081895047532, 6.857899681644975, 6.3584... 1056 12.517082 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fbr3f = pt.FeaturesBatchRetrieve(indexref, wmodel=\"BM25\", features=[\"SAMPLE\", \"WMODEL:TF_IDF\", \"WMODEL:PL2\"])\n", + "(fbr3f %2).search(\"chemical\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", " \n", @@ -680,57 +650,43 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", "
qiddociddocnorankscorequeryfeatures
011070210703013.472012chemical112.517082chemical10551chemical1055112.517082chemical[12.517081895047532, 6.857899681644975, 6.3584...
2148854886212.228161chemical48852chemical4885212.228161chemical[12.22816082084599, 6.69960466053696, 6.181368...
\n", "
" ], "text/plain": [ - " qid docid ... query_y features\n", - "0 1 10702 ... chemical [13.472012496423268, 7.38109017620895, 6.99922...\n", - "1 1 1055 ... chemical [12.517081895047532, 6.857899681644975, 6.3584...\n", - "2 1 4885 ... chemical [12.22816082084599, 6.69960466053696, 6.181368...\n", + " qid docid docno rank score query \\\n", + "0 1 10702 10703 0 13.472012 chemical \n", + "1 1 1055 1056 1 12.517082 chemical \n", "\n", - "[3 rows x 14 columns]" + " features \n", + "0 [13.472012496423268, 7.38109017620895, 6.99922... \n", + "1 [12.517081895047532, 6.857899681644975, 6.3584... " ] }, - "metadata": { - "tags": [] - }, - "execution_count": 22 + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "pipe3f = BM25 >> (pt.Transformer.identity() ** TF_IDF ** PL2)\n", + "(pipe3f %2).search(\"chemical\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the results of both pipelines are identical." ] }, { "cell_type": "markdown", "metadata": { - "id": "R47HlFoMYAhi", - "colab_type": "text" + "id": "R47HlFoMYAhi" }, "source": [ "# Learning models and re-ranking\n", @@ -739,7 +695,7 @@ "\n", "In each case, the pattern is the same:\n", " - Create a transformer that does the re-ranking\n", - " - Call the fit() method on the created object with the training topics (and validation topics as necessary)\n", + " - Call the `fit()` method on the created object with the training topics (and validation topics as necessary)\n", " - Evaluate the results with the Experiment function by using the test topics\n", "\n", " Firstly, lets separate our topics into train/validation/test." @@ -747,29 +703,26 @@ }, { "cell_type": "code", + "execution_count": 17, "metadata": { - "id": "e7r10lR3DvzM", - "colab_type": "code", - "colab": {} + "id": "e7r10lR3DvzM" }, + "outputs": [], "source": [ "train_topics, valid_topics, test_topics = np.split(topics, [int(.6*len(topics)), int(.8*len(topics))])" - ], - "execution_count": 23, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "3PYw_jasN6Vk", - "colab_type": "text" + "id": "3PYw_jasN6Vk" }, "source": [ "## sci-kit learn RandomForestRegressor\n", "\n", - "Our first learning-to-rank will be done using sci-kit learn's [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html). \n", + "Our first learning-to-rank will be done using sci-kit learn's [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html).\n", "\n", - "We use `pt.piptlines.LTR_pipeline`, which is a pyterrier transformer that passes the document featuresĀ as \"X\" features to RandomForest. To learn the model (called fitting) the RandomForest, we invoke the `fit()` method - on the entire pipeline, specifying the queries (topics) and relevance assessment (qrels). The latter for the \"Y\" labels for the RandomForest fitting.\n", + "We use `pt.ltr.apply_learned_model()`, which returns a PyTerrier Transformer that passes the document featuresĀ as \"X\" features to RandomForest. To learn the model (called fitting) the RandomForest, we invoke the `fit()` method - on the entire pipeline, specifying the queries (topics) and relevance assessment (qrels). 
The latter are used for the \"Y\" labels for the RandomForest fitting.\n", "\n", "NB: due to their bootstrap nature, Random Forests do not overfit, so we do not provide validation data to `fit()`.\n", "\n", @@ -780,28 +733,17 @@ }, { "cell_type": "code", + "execution_count": 18, "metadata": { - "colab_type": "code", - "id": "YTI_ax4K19nl", "colab": { "base_uri": "https://localhost:8080/", - "height": 111 + "height": 112 }, - "outputId": "186b0de0-4793-4afc-c463-c9082a2129ec" + "id": "YTI_ax4K19nl", + "outputId": "4973c7b5-14fd-4034-b5cc-6557a5156485" }, - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "BaselineLTR = fbr >> pt.pipelines.LTR_pipeline(RandomForestRegressor(n_estimators=400))\n", - "BaselineLTR.fit(train_topics, qrels)\n", - "\n", - "results = pt.pipelines.Experiment([PL2, BaselineLTR], test_topics, qrels, [\"map\"], names=[\"PL2 Baseline\", \"LTR Baseline\"])\n", - "results" - ], - "execution_count": 24, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -835,7 +777,7 @@ " \n", " 1\n", " LTR Baseline\n", - " 0.144980\n", + " 0.144662\n", " \n", " \n", "\n", @@ -844,60 +786,75 @@ "text/plain": [ " name map\n", "0 PL2 Baseline 0.206031\n", - "1 LTR Baseline 0.144980" + "1 LTR Baseline 0.144662" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 24 + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "BaselineLTR = fbr3f >> pt.ltr.apply_learned_model(RandomForestRegressor(n_estimators=400))\n", + "BaselineLTR.fit(train_topics, qrels)\n", + "\n", + "results = pt.Experiment([PL2, BaselineLTR], test_topics, qrels, [\"map\"], names=[\"PL2 Baseline\", \"LTR Baseline\"])\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, the RandomForest pipeline wasnt very good. LambdaMART is normally a bit better. Lets try that next..." ] }, { "cell_type": "markdown", "metadata": { - "id": "iGw58PCuumuT", - "colab_type": "text" + "id": "iGw58PCuumuT" }, "source": [ "## XgBoost Pipeline\n", "\n", - "We now demonstrate the use of a LambdaMART implementation from [xgBoost](https://xgboost.readthedocs.io/en/latest/). Again, pyTerrier provides a transformer object, namely `XGBoostLTR_pipeline`, which takes in the constrcutor the actual xgBoost model that you want to train. We took the xgBoost configuration from [their example code](https://github.com/dmlc/xgboost/blob/master/demo/rank/rank.py).\n", + "We now demonstrate the use of a LambdaMART implementation from [xgBoost](https://xgboost.readthedocs.io/en/latest/). Again, PyTerrier provides a Transformer object from `pt.ltr.apply_learned_model()`, this time passing `form='ltr'` as kwarg.\n", "\n", - "Call the `fit()` method on the full pipeline with the training and validation topics.\n", + "This takes in the constrcutor the actual xgBoost model that you want to train. We took the xgBoost configuration from [their example code](https://github.com/dmlc/xgboost/blob/master/demo/rank/rank.py).\n", "\n", - "Evaluate the results with the Experiment function by using the test topics" + "Call the `fit()` method on the full pipeline with the training *and validation* topics.\n", + "\n", + "The same pipeline can also be used with [LightGBM](https://github.com/microsoft/LightGBM).\n", + "\n", + "Evaluate the results with the Experiment function by using the test topics." ] }, { "cell_type": "code", + "execution_count": 19, "metadata": { - "id": "nM0r8EgFuGtQ", - "colab_type": "code", - "colab": {} + "id": "nM0r8EgFuGtQ" }, + "outputs": [], "source": [ "import xgboost as xgb\n", - "params = {'objective': 'rank:ndcg', \n", - " 'learning_rate': 0.1, \n", - " 'gamma': 1.0, 'min_child_weight': 0.1,\n", + "params = {'objective': 'rank:ndcg',\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 1.0, \n", + " 'min_child_weight': 0.1,\n", " 'max_depth': 6,\n", - " 'verbose': 2,\n", - " 'random_state': 42 \n", + " 'random_state': 42\n", " }\n", "\n", - "BaseLTR_LM = fbr >> pt.pipelines.XGBoostLTR_pipeline(xgb.sklearn.XGBRanker(**params))\n", + "BaseLTR_LM = fbr3f >> pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**params), form='ltr')\n", "BaseLTR_LM.fit(train_topics, qrels, valid_topics, qrels)" - ], - "execution_count": 25, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { - "id": "HVXoNhzSP-k2", - "colab_type": "text" + "id": "HVXoNhzSP-k2" }, "source": [ "And evaluate the results." 
@@ -905,26 +862,17 @@ }, { "cell_type": "code", + "execution_count": 20, "metadata": { - "id": "Dn56DKZMTQ_m", - "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", - "height": 111 + "height": 112 }, - "outputId": "6688d85e-8599-4f11-db18-231abd0d7aee" + "id": "Dn56DKZMTQ_m", + "outputId": "133260ca-e979-4006-9120-5339682331e0" }, - "source": [ - "allresultsLM = pt.pipelines.Experiment([PL2, BaseLTR_LM],\n", - " test_topics, \n", - " qrels, [\"map\"], \n", - " names=[\"PL2 Baseline\", \"LambdaMART\"])\n", - "allresultsLM" - ], - "execution_count": 26, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -958,7 +906,7 @@ " \n", " 1\n", " LambdaMART\n", - " 0.204391\n", + " 0.210969\n", " \n", " \n", "\n", @@ -967,15 +915,51 @@ "text/plain": [ " name map\n", "0 PL2 Baseline 0.206031\n", - "1 LambdaMART 0.204391" + "1 LambdaMART 0.210969" ] }, - "metadata": { - "tags": [] - }, - "execution_count": 26 + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "allresultsLM = pt.Experiment([PL2, BaseLTR_LM],\n", + " test_topics,\n", + " qrels, [\"map\"],\n", + " names=[\"PL2 Baseline\", \"LambdaMART\"])\n", + "allresultsLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Excellent, event on this small dataset, adding a few more features and LambdaMART can enhance effectiveness!" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pyterrier/__init__.py b/pyterrier/__init__.py index 98d5ff7d..8198b684 100644 --- a/pyterrier/__init__.py +++ b/pyterrier/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.10.0" +__version__ = "0.10.1" import os diff --git a/pyterrier/apply_base.py b/pyterrier/apply_base.py index 0290fdaf..bdd69449 100644 --- a/pyterrier/apply_base.py +++ b/pyterrier/apply_base.py @@ -210,11 +210,18 @@ def transform(self, inputRes): outputRes = push_queries(inputRes.copy(), inplace=True, keep_original=True) else: outputRes = inputRes.copy() - if self.verbose: - tqdm.pandas(desc="pt.apply.query", unit="d") - outputRes["query"] = outputRes.progress_apply(fn, axis=1) - else: - outputRes["query"] = outputRes.apply(fn, axis=1) + try: + if self.verbose: + tqdm.pandas(desc="pt.apply.query", unit="d") + outputRes["query"] = outputRes.progress_apply(fn, axis=1) + else: + outputRes["query"] = outputRes.apply(fn, axis=1) + except ValueError as ve: + msg = str(ve) + if "Columns must be same length as key" in msg: + raise TypeError("Could not coerce return from pt.apply.query function into a list of strings. Check your function returns a string.") from ve + else: + raise ve return outputRes class ApplyGenericTransformer(ApplyTransformerBase): diff --git a/pyterrier/batchretrieve.py b/pyterrier/batchretrieve.py index 3c59ca9d..357a3116 100644 --- a/pyterrier/batchretrieve.py +++ b/pyterrier/batchretrieve.py @@ -553,7 +553,10 @@ class TextScorer(TextIndexProcessor): takes(str): configuration - what is needed as input: `"queries"`, or `"docs"`. Default is `"docs"` since v0.8. returns(str): configuration - what is needed as output: `"queries"`, or `"docs"`. Default is `"docs"`. body_attr(str): what dataframe input column contains the text of the document. Default is `"body"`. - wmodel(str): example of configuration passed to BatchRetrieve. + wmodel(str): name of the weighting model to use for scoring. + background_index(index_like): An optional background index to use for term and collection statistics. If a weighting + model such as BM25 or TF_IDF or PL2 is used without setting the background_index, the background statistics + will be calculated from the dataframe, which is ususally not the desired behaviour. 
Example:: @@ -562,9 +565,21 @@ class TextScorer(TextIndexProcessor): ["q1", "chemical reactions", "d1", "professor protor poured the chemicals"], ["q1", "chemical reactions", "d2", "chemical brothers turned up the beats"], ], columns=["qid", "query", "text"]) - textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="TF_IDF") + textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="Tf") rtr = textscorer.transform(df) - #rtr will score each document for the query "chemical reactions" based on the provided document contents + #rtr will score each document by term frequency for the query "chemical reactions" based on the provided document contents + + Example:: + + df = pd.DataFrame( + [ + ["q1", "chemical reactions", "d1", "professor protor poured the chemicals"], + ["q1", "chemical reactions", "d2", "chemical brothers turned up the beats"], + ], columns=["qid", "query", "text"]) + existing_index = pt.IndexFactory.of(...) + textscorer = pt.TextScorer(takes="docs", body_attr="text", wmodel="TF_IDF", background_index=existing_index) + rtr = textscorer.transform(df) + #rtr will score each document by TF_IDF for the query "chemical reactions" based on the provided document contents """ def __init__(self, takes="docs", **kwargs): @@ -606,6 +621,12 @@ def __init__(self, index_location, features, controls=None, properties=None, thr self.wmodel = kwargs["wmodel"] if "wmodel" in controls: self.wmodel = controls["wmodel"] + + # check for terrier-core#246 bug usiung FatFull + if self.wmodel is not None: + from . import check_version + assert check_version(5.9), "Terrier 5.9 is required for this functionality, see https://github.com/terrier-org/terrier-core/pull/246" + if threads > 1: raise ValueError("Multi-threaded retrieval not yet supported by FeaturesBatchRetrieve") @@ -657,7 +678,7 @@ def transform(self, queries): Performs the retrieval with multiple features Args: - queries: String for a single query, list of queries, or a pandas.Dataframe with columns=['qid', 'query']. For re-ranking, + queries: A pandas.Dataframe with columns=['qid', 'query']. For re-ranking, the DataFrame may also have a 'docid' and or 'docno' column. 
Returns: @@ -846,4 +867,4 @@ def push_fbr_earlier(_br1, _fbr): global rewrites_setup rewrites_setup = True -setup_rewrites() \ No newline at end of file +setup_rewrites() diff --git a/pyterrier/bootstrap.py b/pyterrier/bootstrap.py index a606ef71..99be0550 100644 --- a/pyterrier/bootstrap.py +++ b/pyterrier/bootstrap.py @@ -46,7 +46,7 @@ def _load_into_memory(index, structures=['lexicon', 'direct', 'inverted', 'meta' }, 'inverted' : { 'org.terrier.structures.bit.BitPostingIndex' : { - 'index.direct.data-source' : 'fileinmem'} + 'index.inverted.data-source' : 'fileinmem'} }, } if "direct" in structures: @@ -271,6 +271,60 @@ def _index_add(self, other): raise ValueError("Cannot document-wise merge indices with and without positions (%r vs %r)" % (blocks_1, blocks_2)) multiindex_cls = autoclass("org.terrier.realtime.multi.MultiIndex") return multiindex_cls([self, other], blocks_1, fields_1 > 0) + + def _index_corpusiter(self, return_toks=True): + def _index_corpusiter_meta(self): + meta_inputstream = self.getIndexStructureInputStream("meta") + keys = self.getMetaIndex().getKeys() + keys_offset = { k: offset for offset, k in enumerate(keys) } + while meta_inputstream.hasNext(): + item = meta_inputstream.next() + yield {k : item[keys_offset[k]] for k in keys_offset} + + def _index_corpusiter_direct_pretok(self): + import sys + MIN_PYTHON = (3, 8) + if sys.version_info < MIN_PYTHON: + raise NotImplementedError("Sorry, Python 3.8+ is required for this functionality") + + meta_inputstream = self.getIndexStructureInputStream("meta") + keys = self.getMetaIndex().getKeys() + keys_offset = { k: offset for offset, k in enumerate(keys) } + keys_offset = {'docno' : keys_offset['docno'] } + direct_inputstream = self.getIndexStructureInputStream("direct") + lex = self.getLexicon() + + ip = None + while (ip := direct_inputstream.getNextPostings()) is not None: # this is the next() method + + # yield empty toks dicts for empty documents + for skipped in range(0, direct_inputstream.getEntriesSkipped()): + meta = meta_inputstream.next() + rtr = {k : meta[keys_offset[k]] for k in keys_offset} + rtr['toks'] = {} + yield rtr + + toks = {} + while ip.next() != ip.EOL: + t, _ = lex[ip.getId()] + toks[t] = ip.getFrequency() + meta = meta_inputstream.next() + rtr = {'toks' : toks} + rtr.update({k : meta[keys_offset[k]] for k in keys_offset}) + yield rtr + + # yield for trailing empty documents + for skipped in range(0, direct_inputstream.getEntriesSkipped()): + meta = meta_inputstream.next() + rtr = {k : meta[keys_offset[k]] for k in keys_offset} + rtr['toks'] = {} + yield rtr + + if return_toks: + if not self.hasIndexStructureInputStream("direct"): + raise ValueError("No direct index input stream available, cannot use return_toks=True") + return _index_corpusiter_direct_pretok(self) + return _index_corpusiter_meta(self) protocol_map["org.terrier.structures.Index"] = { # this means that len(index) returns the number of documents in the index @@ -278,7 +332,10 @@ def _index_add(self, other): # document-wise composition of indices: adding more documents to an index, by merging two indices with # different numbers of documents. 
This implemented by the overloading the `+` Python operator - '__add__': _index_add + '__add__': _index_add, + + # get_corpus_iter returns a yield generator that return {"docno": "d1", "toks" : {'a' : 1}} + 'get_corpus_iter' : _index_corpusiter } def setup_terrier(file_path, terrier_version=None, helper_version=None, boot_packages=[], force_download=True): diff --git a/pyterrier/datasets.py b/pyterrier/datasets.py index 87aa3aeb..247b1825 100644 --- a/pyterrier/datasets.py +++ b/pyterrier/datasets.py @@ -644,21 +644,21 @@ def msmarco_document_generate(dataset): MSMARCO_DOC_FILES = { "corpus" : - [("msmarco-docs.trec.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")], + [("msmarco-docs.trec.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.trec.gz")], "corpus-tsv": - [("msmarco-docs.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")], + [("msmarco-docs.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz")], "topics" : { - "train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"), - "dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"), - "test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), - "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), - 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline") + "train" : ("msmarco-doctrain-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz", "singleline"), + "dev" : ("msmarco-docdev-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz", "singleline"), + "test" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), + "test-2020" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), + 'leaderboard-2020' : ("docleaderboard-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz", "singleline") }, "qrels" : { - "train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"), - "dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"), + "train" : ("msmarco-doctrain-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz"), + "dev" : ("msmarco-docdev-qrels.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz"), "test" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-docs.txt"), "test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-docs.txt") }, @@ -685,18 +685,18 @@ def msmarco_document_generate(dataset): "dev.small" : ("queries.dev.small.tsv", "collectionandqueries.tar.gz#queries.dev.small.tsv", "singleline"), "eval" : ("queries.eval.tsv", "queries.tar.gz#queries.eval.tsv", 
"singleline"), "eval.small" : ("queries.eval.small.tsv", "collectionandqueries.tar.gz#queries.eval.small.tsv", "singleline"), - "test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), - "test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline") + "test-2019" : ("msmarco-test2019-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), + "test-2020" : ("msmarco-test2020-queries.tsv.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline") }, "tars" : { - "queries.tar.gz" : ("queries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz"), - "collection.tar.gz" : ("collection.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz"), - "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz") + "queries.tar.gz" : ("queries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz"), + "collection.tar.gz" : ("collection.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz"), + "collectionandqueries.tar.gz" : ("collectionandqueries.tar.gz", "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz") }, "qrels" : { - "train" : ("qrels.train.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv"), - "dev" : ("qrels.dev.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv"), + "train" : ("qrels.train.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv"), + "dev" : ("qrels.dev.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv"), "test-2019" : ("2019qrels-docs.txt", "https://trec.nist.gov/data/deep/2019qrels-pass.txt"), "test-2020" : ("2020qrels-docs.txt", "https://trec.nist.gov/data/deep/2020qrels-pass.txt"), "dev.small" : ("qrels.dev.small.tsv", "collectionandqueries.tar.gz#qrels.dev.small.tsv"), @@ -709,19 +709,19 @@ def msmarco_document_generate(dataset): MSMARCOv2_DOC_FILES = { "info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html", "topics" : { - "train" : ("docv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"), - "dev1" :("docv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"), - "dev2" :("docv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"), - "valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), - "valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), - "trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), + "train" : ("docv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_queries.tsv", "singleline"), + "dev1" :("docv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_queries.tsv", "singleline"), + "dev2" :("docv2_dev2_queries.tsv", 
"https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_queries.tsv", "singleline"), + "valid1" : ("msmarco-test2019-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", "singleline"), + "valid2" : ("msmarco-test2020-queries.tsv.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz", "singleline"), + "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), }, "qrels" : { - "train" : ("docv2_train_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"), - "dev1" :("docv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"), - "dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"), - "valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"), - "valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.blob.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz") + "train" : ("docv2_train_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_train_qrels.tsv"), + "dev1" :("docv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev_qrels.tsv"), + "dev2" :("docv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_dev2_qrels.tsv"), + "valid1" : ("docv2_trec2019_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2019_qrels.txt.gz"), + "valid2" : ("docv2_trec2020_qrels.txt.gz" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/docv2_trec2020_qrels.txt.gz") }, "index" : _datarepo_index, } @@ -729,15 +729,15 @@ def msmarco_document_generate(dataset): MSMARCOv2_PASSAGE_FILES = { "info_url" : "https://microsoft.github.io/msmarco/TREC-Deep-Learning.html", "topics" : { - "train" : ("passv2_train_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"), - "dev1" : ("passv2_dev_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"), - "dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"), - "trec_2021" : ("2021_queries.tsv" , "https://msmarco.blob.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), + "train" : ("passv2_train_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_queries.tsv", "singleline"), + "dev1" : ("passv2_dev_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_queries.tsv", "singleline"), + "dev2" : ("passv2_dev2_queries.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_queries.tsv", "singleline"), + "trec_2021" : ("2021_queries.tsv" , "https://msmarco.z22.web.core.windows.net/msmarcoranking/2021_queries.tsv", "singleline"), }, "qrels" : { - "train" : ("passv2_train_qrels.tsv" "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"), - "dev1" : ("passv2_dev_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"), - "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.blob.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"), + "train" : ("passv2_train_qrels.tsv" "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_train_qrels.tsv"), + "dev1" : 
("passv2_dev_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev_qrels.tsv"), + "dev2" : ("passv2_dev2_qrels.tsv", "https://msmarco.z22.web.core.windows.net/msmarcoranking/passv2_dev2_qrels.tsv"), }, "index" : _datarepo_index, } diff --git a/pyterrier/io.py b/pyterrier/io.py index 9673b56f..3d220e25 100644 --- a/pyterrier/io.py +++ b/pyterrier/io.py @@ -228,12 +228,8 @@ def _parse_line(l): def _read_results_trec(filename): results = [] - df = pd.read_csv(filename, sep=r'\s+', names=["qid", "iter", "docno", "rank", "score", "name"]) + df = pd.read_csv(filename, sep=r'\s+', names=["qid", "iter", "docno", "rank", "score", "name"], dtype={'qid': str, 'docno': str, 'rank': int, 'score': float}) df = df.drop(columns="iter") - df["qid"] = df["qid"].astype(str) - df["docno"] = df["docno"].astype(str) - df["rank"] = df["rank"].astype(int) - df["score"] = df["score"].astype(float) return df def write_results(res, filename, format="trec", append=False, **kwargs): @@ -294,13 +290,13 @@ def read_topics(filename, format="trec", **kwargs): Supported Formats: * "trec" -- an SGML-formatted TREC topics file. Delimited by TOP tags, each having NUM and TITLE tags; DESC and NARR tags are skipped by default. Control using whitelist and blacklist kwargs - * "trecxml" -- a more modern XML formatted topics file. Delimited by topic tags, each having nunber tags. query, question and narrative tags are parsed by default. Control using tags kwarg. + * "trecxml" -- a more modern XML formatted topics file. Delimited by topic tags, each having number tags. query, question and narrative tags are parsed by default. Control using tags kwarg. * "singeline" -- one query per line, preceeded by a space or colon. Tokenised by default, use tokenise=False kwargs to prevent tokenisation. 
""" if format is None: format = "trec" if not format in SUPPORTED_TOPICS_FORMATS: - raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_RESULTS_FORMATS.keys()))) + raise ValueError("Format %s not known, supported types are %s" % (format, str(SUPPORTED_TOPICS_FORMATS.keys()))) return SUPPORTED_TOPICS_FORMATS[format](filename, **kwargs) def _read_topics_trec(file_path, doc_tag="TOP", id_tag="NUM", whitelist=["TITLE"], blacklist=["DESC","NARR"]): @@ -339,7 +335,10 @@ def _read_topics_trecxml(filename, tags=["query", "question", "narrative"], toke from jnius import autoclass tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser() for child in root.iter('topic'): - qid = child.attrib["number"] + try: + qid = child.attrib["number"] + except KeyError: + qid = child.find("number").text query = "" for tag in child: if tag.tag in tags: @@ -347,7 +346,7 @@ def _read_topics_trecxml(filename, tags=["query", "question", "narrative"], toke if tokenise: query_text = " ".join(tokeniser.getTokens(query_text)) query += " " + query_text - topics.append((str(qid), query)) + topics.append((str(qid), query.strip())) return pd.DataFrame(topics, columns=["qid", "query"]) def _read_topics_singleline(filepath, tokenise=True): diff --git a/pyterrier/pipelines.py b/pyterrier/pipelines.py index 345812c8..8188ad08 100644 --- a/pyterrier/pipelines.py +++ b/pyterrier/pipelines.py @@ -561,8 +561,11 @@ def _apply_round(measure, value): for pcol in p_col_names: pcol_reject = pcol.replace("p-value", "reject") pcol_corrected = pcol + " corrected" - reject, corrected, _, _ = statsmodels.stats.multitest.multipletests(df[pcol], alpha=correction_alpha, method=correction) + reject, corrected, _, _ = statsmodels.stats.multitest.multipletests(df[pcol].drop(df.index[baseline]), alpha=correction_alpha, method=correction) insert_pos = df.columns.get_loc(pcol) + # add reject/corrected values for the baseline + reject = np.insert(reject, baseline, False) + corrected = np.insert(corrected, baseline, np.nan) # add extra columns, put place directly after the p-value column df.insert(insert_pos+1, pcol_reject, reject) df.insert(insert_pos+2, pcol_corrected, corrected) diff --git a/pyterrier/rewrite.py b/pyterrier/rewrite.py index c2f65e44..46e313b7 100644 --- a/pyterrier/rewrite.py +++ b/pyterrier/rewrite.py @@ -205,6 +205,7 @@ def __init__(self, index_like, fb_terms=10, fb_docs=3, qeclass="org.terrier.quer else: self.qe = qeclass self.indexref = _parse_index_like(index_like) + self.properties = properties for k,v in properties.items(): pt.ApplicationSetup.setProperty(k, str(v)) self.applytp = pt.autoclass("org.terrier.querying.ApplyTermPipeline")() @@ -212,6 +213,34 @@ def __init__(self, index_like, fb_terms=10, fb_docs=3, qeclass="org.terrier.quer self.fb_docs = fb_docs self.manager = pt.autoclass("org.terrier.querying.ManagerFactory")._from_(self.indexref) + def __reduce__(self): + return ( + self.__class__, + (self.indexref,), + self.__getstate__() + ) + + def __getstate__(self): + if isinstance(self.qe, str): + qe = self.qe + else: + qe = self.qe.getClass().getName() + return { + 'fb_terms' : self.fb_terms, + 'fb_docs' : self.fb_docs, + 'qeclass' : qe, + 'properties' : self.properties + } + + def __setstate__(self, d): + self.fb_terms = d["fb_terms"] + self.fb_docs = d["fb_docs"] + self.qe = pt.autoclass(d['qeclass'])() + self.properties.update(d["properties"]) + for key,value in d["properties"].items(): + self.appSetup.setProperty(key, str(value)) + self.manager = 
pt.autoclass("org.terrier.querying.ManagerFactory")._from_(self.indexref) + def _populate_resultset(self, topics_and_res, qid, index): docids=None @@ -387,6 +416,15 @@ def __init__(self, *args, fb_terms=10, fb_docs=3, fb_lambda=0.6, **kwargs): kwargs["qeclass"] = rm super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs) + def __getstate__(self): + rtr = super().__getstate__() + rtr['fb_lambda'] = self.fb_lambda + return rtr + + def __setstate__(self, d): + super().__setstate__(d) + self.fb_lambda = d["fb_lambda"] + def _configure_request(self, rq): super()._configure_request(rq) rq.setControl("rm3.lambda", str(self.fb_lambda)) diff --git a/pyterrier/text.py b/pyterrier/text.py index a3ac6f87..51a2dbce 100644 --- a/pyterrier/text.py +++ b/pyterrier/text.py @@ -135,6 +135,13 @@ def scorer(*args, **kwargs) -> Transformer: This is an alias to pt.TextScorer(). Internally, a Terrier memory index is created, before being used for scoring. + Arguments: + body_attr(str): what dataframe input column contains the text of the document. Default is `"body"`. + wmodel(str): name of the weighting model to use for scoring. + background_index(index_like): An optional background index to use for collection statistics. If a weighting + model such as BM25 or TF_IDF or PL2 is used without setting the background_index, the background statistics + will be calculated from the dataframe, which is ususally not the desired behaviour. + Example:: df = pd.DataFrame( @@ -149,8 +156,9 @@ def scorer(*args, **kwargs) -> Transformer: # ["q1", "chemical reactions", "d1", "professor protor poured the chemicals", 0, 1] # ["q1", "chemical reactions", "d2", "chemical brothers turned up the beats", 0, 1] - For calculating the scores of documents using any weighting model with the concept of IDF, it may be useful to make use of - an existing Terrier index for background statistics:: + For calculating the scores of documents using any weighting model with the concept of IDF, it is strongly advised to make use of + an existing Terrier index for background statistics. 
Without a background index, IDF will be calculated based on the supplied + dataframe (for models such as BM25, this can lead to negative scores):: textscorerTfIdf = pt.text.scorer(body_attr="text", wmodel="TF_IDF", background_index=index) @@ -512,8 +520,8 @@ def applyPassaging(self, df, labels=True): newRows.append(newRow) passageCount+=1 newDF = pd.DataFrame(newRows) - newDF['query'].fillna('',inplace=True) - newDF[self.text_attr].fillna('',inplace=True) - newDF['qid'].fillna('',inplace=True) + newDF['query'] = newDF['query'].fillna('') + newDF[self.text_attr] = newDF[self.text_attr].fillna('') + newDF['qid'] = newDF['qid'].fillna('') newDF.reset_index(inplace=True,drop=True) return newDF diff --git a/pyterrier/transformer.py b/pyterrier/transformer.py index 38978227..7d1c55e3 100644 --- a/pyterrier/transformer.py +++ b/pyterrier/transformer.py @@ -39,7 +39,7 @@ def get_transformer(v, stacklevel=1): if isinstance(v, pd.DataFrame): warn('Coercion of a dataframe into a transformer is deprecated; use a pt.Transformer.from_df() instead', stacklevel=stacklevel, category=DeprecationWarning) return SourceTransformer(v) - raise ValueError("Passed parameter %s of type %s cannot be coerced into a transformer" % (str(v), type(v)), stacklevel=stacklevel, category=DeprecationWarning) + raise ValueError("Passed parameter %s of type %s cannot be coerced into a transformer" % (str(v), type(v))) rewrite_rules = [] @@ -281,8 +281,10 @@ def __init__(self, *args, **kwargs): class Indexer(Transformer): def index(self, iter : Iterable[dict], **kwargs): """ - Takes an iterable of dictionaries ("iterdict"), and consumes them. There is no return; - This method is typically used to implement indexers. + Takes an iterable of dictionaries ("iterdict"), and consumes them. The index method may return + an instance of the index or retriever. This method is typically used to implement indexers that + consume a corpus (or to consume the output of previous pipeline components that have + transformer the documents being consumed). 
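        As a minimal sketch, a concrete Indexer such as pt.IterDictIndexer can consume an iterdict and return a reference to the new index (the index location below is illustrative)::

            import os, tempfile
            import pyterrier as pt
            # assumes pt.init() has already been called
            indexer = pt.IterDictIndexer(os.path.join(tempfile.gettempdir(), "toy_index"))
            # index() consumes the iterable of dicts and returns a reference to the built index
            indexref = indexer.index([{"docno": "d1", "text": "some example document text"}])
            bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")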
""" pass @@ -368,4 +370,4 @@ def __init__(self, rtr, **kwargs): def transform(self, topics): rtr = self.rtr.copy() - return rtr \ No newline at end of file + return rtr diff --git a/requirements-test.txt b/requirements-test.txt index a2cc5b7a..9e41c8e0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,3 +4,4 @@ fastrank>=0.7.0 torch lz4 transformers +scikit-learn diff --git a/requirements.txt b/requirements.txt index 55baa50f..6106ceb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ wget tqdm pyjnius>=1.4.2 matchpy -scikit-learn deprecated chest scipy diff --git a/setup.py b/setup.py index 07913e5c..be9968c1 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,12 @@ def get_version(rel_path): author="Craig Macdonald", author_email='craigm@dcs.gla.ac.uk', description="Terrier IR platform Python API", + project_urls={ + 'Documentation': 'https://pyterrier.readthedocs.io', + 'Changelog': 'https://github.com/terrier-org/pyterrier/releases', + 'Issue Tracker': 'https://github.com/terrier-org/pyterrier/issues', + 'CI': 'https://github.com/terrier-org/pyterrier/actions', + }, long_description=long_description, long_description_content_type="text/markdown", package_data={'': ['LICENSE.txt', 'requirements.txt', 'requirements-test.txt']}, @@ -65,5 +71,5 @@ def get_version(rel_path): "Operating System :: OS Independent", ], install_requires=requirements, - python_requires='>=3.7', + python_requires='>=3.8', ) diff --git a/terrier-python-helper/pom.xml b/terrier-python-helper/pom.xml index c1281371..c829982a 100644 --- a/terrier-python-helper/pom.xml +++ b/terrier-python-helper/pom.xml @@ -110,14 +110,14 @@ ch.qos.logback logback-classic - 1.2.0 + 1.2.13 provided ch.qos.logback logback-core - 1.2.9 + 1.2.13 provided diff --git a/tests/base.py b/tests/base.py index 93594a10..501cd234 100644 --- a/tests/base.py +++ b/tests/base.py @@ -10,14 +10,20 @@ class BaseTestCase(unittest.TestCase): def __init__(self, *args, **kwargs): super(BaseTestCase, self).__init__(*args, **kwargs) terrier_version = os.environ.get("TERRIER_VERSION", None) - if terrier_version is not None: - print("Testing with Terrier version " + terrier_version) terrier_helper_version = os.environ.get("TERRIER_HELPER_VERSION", None) - if terrier_helper_version is not None: - print("Testing with Terrier Helper version " + terrier_helper_version) if not pt.started(): + + # display for debugging what is being used + if terrier_version is not None: + print("Testing with Terrier version " + terrier_version) + if terrier_helper_version is not None: + print("Testing with Terrier Helper version " + terrier_helper_version) + pt.init(version=terrier_version, logging="DEBUG", helper_version=terrier_helper_version) + # jvm_opts=['-ea'] can be added here to ensure that all Java assertions are met self.here = os.path.dirname(os.path.realpath(__file__)) + + # check that pt.init() is saving its arguments assert "version" in pt.init_args assert pt.init_args["version"] == terrier_version @@ -42,4 +48,4 @@ def tearDown(self): except: pass - \ No newline at end of file + diff --git a/tests/fixtures/topics.trecxml b/tests/fixtures/topics.trecxml new file mode 100644 index 00000000..bea592cb --- /dev/null +++ b/tests/fixtures/topics.trecxml @@ -0,0 +1,20 @@ + + + 1 + lights + Description lights + Documents are relevant if they describe lights. + + + 2 + radiowaves + Description radiowaves + Documents are relevant if they describe radiowaves. + + + + sounds + Description sound + Documents are relevant if they describe sounds. 
+ + \ No newline at end of file diff --git a/tests/test_apply.py b/tests/test_apply.py index 0f6faf8f..ad834eaf 100644 --- a/tests/test_apply.py +++ b/tests/test_apply.py @@ -59,6 +59,14 @@ def test_query_apply(self): rtrDR2 = pt.apply.query(lambda row : row["qid"] )(testDF2) self.assertEqual(rtrDR2.iloc[0]["query"], "q1") + def test_query_apply_error(self): + origquery="the bear and the wolf" + testDF = pd.DataFrame([["q1", origquery]], columns=["qid", "query"]) + p = pt.apply.query(lambda q : q) # should thrown an error, as pt.apply.query should return a string, not a row + with self.assertRaises(TypeError) as te: + p(testDF) + self.assertTrue("Could not coerce return from pt.apply.query function into a list of strings" in str(te.exception)) + def test_by_query_apply(self): inputDf = pt.new.ranked_documents([[1], [2]], qid=["1", "2"]) def _inc_score(res): diff --git a/tests/test_experiment.py b/tests/test_experiment.py index a4f380f9..5498ba30 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -321,7 +321,7 @@ def test_baseline_and_tests(self): # user-specified TOST # TOST will omit warnings here, due to low numbers of topics import statsmodels.stats.weightstats - fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_ind(X, Y, -0.01, 0.01)[0]) + fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_paired(X, Y, -0.01, 0.01)[0]) #This filter doesnt work with warnings.catch_warnings(record=True) as w: @@ -363,15 +363,17 @@ def test_baseline_corrected(self): dataset = pt.get_dataset("vaswani") res1 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")(dataset.get_topics().head(10)) res2 = pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")(dataset.get_topics().head(10)) - for corr in ['hs', 'bonferroni', 'holm-sidak']: + baseline = 0 + for corr in ['hs', 'bonferroni', 'hommel']: df = pt.Experiment( [res1, res2], dataset.get_topics().head(10), dataset.get_qrels(), eval_metrics=["map", "ndcg"], - baseline=0, correction='hs') + baseline=baseline, correction=corr) self.assertTrue("map +" in df.columns) self.assertTrue("map -" in df.columns) self.assertTrue("map p-value" in df.columns) self.assertTrue("map p-value corrected" in df.columns) self.assertTrue("map reject" in df.columns) + self.assertFalse(any(df["map p-value corrected"].drop(df.index[baseline]).isna())) diff --git a/tests/test_fbr.py b/tests/test_fbr.py index b25c58ee..d7c49e25 100644 --- a/tests/test_fbr.py +++ b/tests/test_fbr.py @@ -137,6 +137,35 @@ def test_fbr(self): if "matching" in retrBasic.controls: self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull") + def test_fbr_example(self): + JIR = pt.autoclass('org.terrier.querying.IndexRef') + indexref = JIR.of(self.here + "/fixtures/index/data.properties") + index = pt.IndexFactory.of(indexref) + # this ranker will make the candidate set of documents for each query + BM25 = pt.BatchRetrieve(index, wmodel="BM25") + + # these rankers we will use to re-rank the BM25 results + TF_IDF = pt.BatchRetrieve(index, wmodel="Dl") + PL2 = pt.BatchRetrieve(index, wmodel="PL2") + + pipe = (BM25 %2) >> (TF_IDF ** PL2) + fbr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:Dl", "WMODEL:PL2"], wmodel="BM25") % 2 + resultP = pipe.search("chemical") + resultF = fbr.search("chemical") + pd.set_option('display.max_columns', None) + + self.assertEqual(resultP.iloc[0].docno, resultF.iloc[0].docno) + self.assertEqual(resultP.iloc[0].score, resultF.iloc[0].score) + self.assertEqual(resultP.iloc[0].features[0], 
resultF.iloc[0].features[0]) + self.assertEqual(resultP.iloc[0].features[1], resultF.iloc[0].features[1]) + + pipeCompiled = pipe.compile() + resultC = pipeCompiled.search("chemical") + self.assertEqual(resultP.iloc[0].docno, resultC.iloc[0].docno) + self.assertEqual(resultP.iloc[0].score, resultC.iloc[0].score) + self.assertEqual(resultP.iloc[0].features[0], resultC.iloc[0].features[0]) + self.assertEqual(resultP.iloc[0].features[1], resultC.iloc[0].features[1]) + def test_fbr_empty(self): JIR = pt.autoclass('org.terrier.querying.IndexRef') indexref = JIR.of(self.here + "/fixtures/index/data.properties") diff --git a/tests/test_index_op.py b/tests/test_index_op.py index 25b0ae3e..a6f27538 100644 --- a/tests/test_index_op.py +++ b/tests/test_index_op.py @@ -10,6 +10,98 @@ class TestIndexOp(TempDirTestCase): + def test_index_corpus_iter(self): + import sys + MIN_PYTHON = (3, 8) + if sys.version_info < MIN_PYTHON: + self.skipTest("Not minimum Python requirements") + + documents = [ + {'docno' : 'd1', 'text': 'stemming stopwords stopwords'}, + ] + index = pt.IndexFactory.of( pt.IterDictIndexer(tempfile.mkdtemp(), stopwords=None, stemmer=None).index(documents) ) + self.assertEqual(1, len(index)) + self.assertEqual(2, index.getCollectionStatistics().getNumberOfUniqueTerms()) + self.assertEqual(3, index.getCollectionStatistics().getNumberOfTokens()) + + # check that get_corpus_iter() contains the correct information + iter = index.get_corpus_iter() + first_doc = next(iter) + self.assertTrue(first_doc is not None) + self.assertIn('docno', first_doc) + self.assertIn('toks', first_doc) + self.assertIn('stemming', first_doc['toks']) + self.assertIn('stopwords', first_doc['toks']) + self.assertEqual(1, first_doc['toks']['stemming']) + self.assertEqual(2, first_doc['toks']['stopwords']) + with(self.assertRaises(StopIteration)): + next(iter) + + # now check that a static pruning pipe can operate as expected. this example comes from terrier-index-api.rst + index_pipe = ( + # update the toks column for each document, keeping only terms with frequency > 1 + pt.apply.toks(lambda row: { t : row['toks'][t] for t in row['toks'] if row['toks'][t] > 1 } ) + >> pt.IterDictIndexer(tempfile.mkdtemp(), pretokenised=True) + ) + new_index_ref = index_pipe.index( index.get_corpus_iter()) + pruned_index = pt.IndexFactory.of(new_index_ref) + self.assertEqual(1, len(pruned_index)) + self.assertEqual(1, pruned_index.getCollectionStatistics().getNumberOfUniqueTerms()) + self.assertEqual(2, pruned_index.getCollectionStatistics().getNumberOfTokens()) + + def test_index_corpus_iter_empty(self): + import sys + MIN_PYTHON = (3, 8) + if sys.version_info < MIN_PYTHON: + self.skipTest("Not minimum Python requirements") + + # compared to test_index_corpus_iter, this tests empty documents are handled correctly. 
+ documents = [ + {'docno' : 'd0', 'text':''}, + {'docno' : 'd1', 'text':''}, + {'docno' : 'd2', 'text': 'stemming stopwords stopwords'}, + {'docno' : 'd3', 'text':''}, + {'docno' : 'd4', 'text': 'stemming stopwords stopwords'}, + {'docno' : 'd5', 'text': ''} + ] + index = pt.IndexFactory.of( pt.IterDictIndexer(tempfile.mkdtemp(), stopwords=None, stemmer=None).index(documents) ) + self.assertEqual(6, len(index)) + self.assertEqual(2, index.getCollectionStatistics().getNumberOfUniqueTerms()) + self.assertEqual(6, index.getCollectionStatistics().getNumberOfTokens()) + + iter = index.get_corpus_iter() + + counter = 0 + for doc in documents: + next_doc = next(iter) + counter += 1 + self.assertTrue(next_doc is not None) + self.assertIn('docno', next_doc) + self.assertIn('toks', next_doc) + if doc['text'] == '': + self.assertEqual(0, len(next_doc['toks'])) + else: + self.assertIn('stemming', next_doc['toks']) + self.assertIn('stopwords', next_doc['toks']) + self.assertEqual(1, next_doc['toks']['stemming']) + self.assertEqual(2, next_doc['toks']['stopwords']) + + with(self.assertRaises(StopIteration)): + next(iter) + self.assertEqual(counter, len(documents)) + + # now check that a static pruning pipe can operate as expected. this example comes from terrier-index-api.rst + index_pipe = ( + # update the toks column for each document, keeping only terms with frequency > 1 + pt.apply.toks(lambda row: { t : row['toks'][t] for t in row['toks'] if row['toks'][t] > 1 } ) + >> pt.IterDictIndexer(tempfile.mkdtemp(), pretokenised=True) + ) + new_index_ref = index_pipe.index( index.get_corpus_iter()) + pruned_index = pt.IndexFactory.of(new_index_ref) + self.assertEqual(6, len(pruned_index)) + self.assertEqual(1, pruned_index.getCollectionStatistics().getNumberOfUniqueTerms()) + self.assertEqual(4, pruned_index.getCollectionStatistics().getNumberOfTokens()) + def test_index_add_write(self): # inspired by https://github.com/terrier-org/pyterrier/issues/390 documents = [ diff --git a/tests/test_ltr_pipelines.py b/tests/test_ltr_pipelines.py index 9824d8bf..fdc967c7 100644 --- a/tests/test_ltr_pipelines.py +++ b/tests/test_ltr_pipelines.py @@ -39,7 +39,6 @@ def test_xgltr_pipeline(self): 'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6, - 'verbose': 2, 'random_state': 42 } diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 8d1a7064..769b228d 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -98,6 +98,22 @@ def test_fbr_joblib(self): self._fix_joblib() self._fbr(joblib) + def test_qe_pickle(self): + self._qe(pickle) + + def _qe(self, pickler): + vaswani = pt.datasets.get_dataset("vaswani") + index = vaswani.get_index() + bm25 = pt.BatchRetrieve(index, wmodel='BM25', controls={"c" : 0.75}, num_results=15) + br = bm25 >> pt.rewrite.Bo1QueryExpansion(index) >> bm25 + q = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"]) + res1 = br(q) + byterep = pickler.dumps(br) + br2 = pickler.loads(byterep) + + res2 = br2(q) + pd.testing.assert_frame_equal(res1, res2) + def _br(self, pickler, wmodel='BM25'): vaswani = pt.datasets.get_dataset("vaswani") br = pt.BatchRetrieve(vaswani.get_index(), wmodel=wmodel, controls={"c" : 0.75}, num_results=15) diff --git a/tests/test_text.py b/tests/test_text.py index f24dd6db..12e248c0 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -41,7 +41,8 @@ def test_scorer_rerank(self): self.assertEqual(1, dfOut.iloc[0]["rank"]) def test_snippets(self): - br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed_text", 
metadata=["docno", "text"]) + br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed") >> pt.text.get_text(pt.get_dataset('irds:vaswani'), "text") + #br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed_text", metadata=["docno", "text"]) psg_scorer = ( pt.text.sliding(text_attr='text', length=25, stride=12, prepend_attr=None) >> pt.text.scorer(body_attr="text", wmodel='Tf', takes='docs') diff --git a/tests/test_topicsparsing.py b/tests/test_topicsparsing.py index cedcebbb..20ecde94 100644 --- a/tests/test_topicsparsing.py +++ b/tests/test_topicsparsing.py @@ -1,14 +1,19 @@ -import pyterrier as pt -import unittest -from .base import BaseTestCase import os +import unittest + import pandas as pd -class TestTopicsParsing(BaseTestCase): +import pyterrier as pt +from .base import BaseTestCase + + +class TestTopicsParsing(BaseTestCase): def testSingleLine(self): topics = pt.io.read_topics( - os.path.dirname(os.path.realpath(__file__)) + "/fixtures/singleline.topics", format="singleline") + os.path.dirname(os.path.realpath(__file__)) + "/fixtures/singleline.topics", + format="singleline", + ) self.assertEqual(2, len(topics)) self.assertTrue("qid" in topics.columns) self.assertTrue("query" in topics.columns) @@ -19,12 +24,29 @@ def testSingleLine(self): def test_parse_trec_topics_file_T(self): input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trec" - exp_result = pd.DataFrame([["1", "light"], ["2", "radiowave"], ["3", "sound"]], columns=['qid', 'query']) + exp_result = pd.DataFrame( + [["1", "light"], ["2", "radiowave"], ["3", "sound"]], + columns=["qid", "query"], + ) result = pt.io.read_topics(input) self.assertTrue(exp_result.equals(result)) def test_parse_trec_topics_file_D(self): input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trec" - exp_result = pd.DataFrame([["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]], columns=['qid', 'query']) - result = pt.io.read_topics(input, format="trec", whitelist=["DESC"], blacklist=["TITLE"]) - self.assertTrue(exp_result.equals(result)) \ No newline at end of file + exp_result = pd.DataFrame( + [["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]], + columns=["qid", "query"], + ) + result = pt.io.read_topics( + input, format="trec", whitelist=["DESC"], blacklist=["TITLE"] + ) + self.assertTrue(exp_result.equals(result)) + + def test_parse_trecxml_topics_file(self): + input = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/topics.trecxml" + result = pt.io.read_topics(input, format="trecxml", tags=["title"]) + exp_result = pd.DataFrame( + [["1", "lights"], ["2", "radiowaves"], ["3", "sounds"]], + columns=["qid", "query"], + ) + self.assertTrue(exp_result.equals(result))