Skip to content

Commit

Permalink
First version of the wikipedia dataset + creating the track. (#429)
Browse files Browse the repository at this point in the history
  • Loading branch information
afoucret authored Sep 18, 2023
1 parent 0b0b0b4 commit a40f531
Show file tree
Hide file tree
Showing 11 changed files with 10,657 additions and 0 deletions.
58 changes: 58 additions & 0 deletions wikipedia/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## Wikipedia Search track

This track benchmarks indexing and search performance against Wikipedia data.

The dataset is derived from a dump of wikipedia available here:
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2.

Each page is formatted into a JSON document with the following fields:

title: Page title
namespace: Optional namespace for the page. [Namespaces](https://en.wikipedia.org/wiki/Wikipedia:Namespace) allow for the organization and separation of content pages from administration pages.
content: Page content.
redirect: If the page is a redirect, the target of the redirection. In this case content is empty.

Fields that do not have values have been left out.

### Generating the documents dataset

To regenerate the dataset from scratch, first download and unzip an archive
of Wikipedia dumps from [this link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2) (~21GB).

Then run this command:

```bash
python _tools/parse_documents.py <path_to_xml_file> | pbzip2 -9 -k -m2000 > documents.json.bz2
```

### Generating the clickstream probability distribution

To generate the probability distribution of the most frequent queries in a specific month from the Wikimedia clickstream, please execute the following command:

```bash
python3 _tools/parse_clicks.py --year 2023 --month 6 --lang en > queries.csv
```

### Example Document

```json
{
"title": "Anarchism",
"content": "{{short description|Political philosophy and movement}}\n{{other uses}}\n{{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}\n{{distinguish|Anarchy}}\n{{good article}}\n{{pp-semi-indef}}\n{{use British English|date=August 2021}}\n{{use dmy dates|date=August 2021}}\n{{Use shortened footnotes|date=May 2023}}\n{{anarchism sidebar}}\n{{basic forms of government}}\n\n'''Anarchism''' is a [[political philosophy]] and [[Political movement|movement]] that is skeptical of all justifications for [[authority]] and seeks to abolish the [[institutions]] it claims maintain unnecessary [[coercion]] and [[Social hierarchy|hierarchy]], typically including [[government]]s,<ref name=\":0\">{{Cite book |title=The Desk Encyclopedia of World History |publisher=[[Oxford University Press]] |year=2006 |isbn=978-0-7394-7809-7 |editor-last=Wright |editor-first=Edmund |location=New York |pages=20\u201321}}</ref> [[State (polity)|nation states]],{{sfn|Suissa|2019b|ps=: \"...as many anarchists have stressed, it is not government as such that they find objectionable, but the hierarchical forms of government associated with the nation state.\"}} [[law]] and [[law enforcement]],<ref name=\":0\" /> and [[capitalism]]. Anarchism advocates for the replacement of the state with [[Stateless society|stateless societies]] or other forms of [[Free association (communism and anarchism)|free associations]]. As a historically [[left-wing]] movement, this reading of anarchism is placed on the [[Far-left politics|farthest left]] of the [[political spectrum]], usually described as the [[libertarian]] wing of the [[socialist movement]] ([ ..."
}
```

### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params`:

- `bulk_size` (default: 500)
- `bulk_indexing_clients` (default: 5)
- `ingest_percentage` (default: 100)
- `number_of_replicas` (default: 0)
- `number_of_shards` (default: 1)

### License

We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/).
More details can be found on [this page](https://en.wikipedia.org/wiki/Wikipedia:Copyrights).
101 changes: 101 additions & 0 deletions wikipedia/_tools/parse_clicks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import argparse
import csv
import gzip
import logging
import os
import pickle
import sys
from collections import Counter

import requests

# Module-level logger: emits INFO-and-above records to the console with a
# "timestamp - level - message" format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

_console_handler = logging.StreamHandler()
_console_handler.setLevel(logging.INFO)
_console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
logger.addHandler(_console_handler)


class ClickStreamDist:
    """Builds a query probability distribution from a Wikimedia clickstream dump.

    The clickstream TSV for the requested year/month/language is downloaded
    (and cached) under ``~/.rally/benchmarks/data/wikipedia/``, then the most
    clicked target pages are turned into a ``(query, probability)`` CSV
    written to stdout.
    """

    # Number of most-frequent queries kept in the distribution
    # (was a hard-coded 10000 in calculate_word_frequency).
    TOP_QUERIES = 10000

    def __init__(self, year, month, lang):
        # e.g. clickstream-enwiki-2023-06.tsv.gz
        self.filename = f"clickstream-{lang}wiki-{year}-{month:02d}.tsv.gz"
        self.url = f"https://dumps.wikimedia.org/other/clickstream/{year:d}-{month:02d}/{self.filename}"
        self.clickstream_output_file = os.path.expanduser(f"~/.rally/benchmarks/data/wikipedia/{self.filename}")

    def download(self):
        """Download the clickstream dump to the local cache, skipping if cached."""
        if os.path.exists(self.clickstream_output_file):
            logger.info("File already exists. Skipping download.")
            return

        logger.info("Downloading the clickstream file...")
        # Stream the response: the dump is large, so avoid buffering the
        # whole body in memory (the first version used response.content).
        # The timeout prevents hanging forever on a stalled connection.
        response = requests.get(self.url, stream=True, timeout=60)

        if response.status_code == 200:
            # Create the target directory if it is missing
            os.makedirs(os.path.dirname(self.clickstream_output_file), exist_ok=True)

            # Write the content to the file in 1 MiB chunks
            with open(self.clickstream_output_file, "wb") as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
            logger.info("File downloaded successfully.")
        else:
            # A failed download is an error, not an informational event.
            logger.error("Failed to download the file (HTTP status %s).", response.status_code)

    def analyze(self):
        """Compute the query probability distribution and write it to stdout."""
        logger.info("Analyzing...")

        word_freq = self.calculate_word_frequency()
        word_prob = self.calculate_word_probability(word_freq)

        self.dump_probability_distribution(word_prob)

        logger.info("Analysis completed.")

    def calculate_word_frequency(self):
        """Count clicks per target page title.

        Returns a list of ``(title, count)`` tuples sorted by descending
        count, truncated to ``TOP_QUERIES`` entries.
        """
        logger.info("Calculating word frequency...")

        word_freq = Counter()
        with gzip.open(self.clickstream_output_file, "rt", encoding="utf-8") as file:
            # Row layout is (prev, curr, type, count), tab separated.
            # Documentation for clickstream format: https://meta.wikimedia.org/wiki/Research:Wikipedia_clickstream
            for row in csv.reader(file, delimiter="\t"):
                if len(row) < 4:
                    # Defensive: skip truncated or malformed lines.
                    continue
                prev = row[0]
                curr = row[1].replace("_", " ").strip().replace('"', "")
                if prev != "other-search" and curr != "Main Page":
                    word_freq[curr] += int(row[3])

        # most_common() returns the pairs sorted by descending count.
        return word_freq.most_common(self.TOP_QUERIES)

    def calculate_word_probability(self, word_freq):
        """Normalize ``(title, count)`` pairs into ``(title, probability)`` pairs."""
        logger.info("Calculating word probability...")

        total_words = sum(count for _, count in word_freq)
        if total_words == 0:
            # Avoid ZeroDivisionError on an empty frequency list.
            return []

        return [(word, count / total_words) for word, count in word_freq]

    def dump_probability_distribution(self, prob_dist):
        """Write the distribution to stdout as CSV with a header row."""
        logger.info("Dumping probability distribution...")

        writer = csv.writer(sys.stdout)
        writer.writerow(["query", "probability"])
        writer.writerows(prob_dist)


parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int, default=2023, help="Year")
parser.add_argument("--month", type=int, default=6, help="Month")
parser.add_argument("--lang", type=str, default="en", help="Language")
args = parser.parse_args()

click_stream = ClickStreamDist(year=args.year, month=args.month, lang=args.lang)
click_stream.download()
click_stream.analyze()
54 changes: 54 additions & 0 deletions wikipedia/_tools/parse_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import bz2
import json
import sys
from xml.etree import cElementTree

# Local names of the XML tags handled while streaming the dump.
PAGE_TAG = "page"
SITEINFO_TAG = "siteinfo"
# Default XML namespace of MediaWiki export files; passed to ElementTree
# find()/findall() so unprefixed path expressions resolve correctly.
XML_NAMESPACES = {"": "http://www.mediawiki.org/xml/export-0.10/"}


def doc_generator(f):
    """Yield one document dict per ``<page>`` element in a MediaWiki dump.

    The XML is parsed incrementally so arbitrarily large dumps can be
    processed with bounded memory. ``<siteinfo>`` (which precedes the pages
    in a dump) supplies the namespace key -> name mapping used by
    ``parse_page``; before it is seen, an empty mapping is used.
    """
    # NOTE(review): xml.etree.cElementTree was removed in Python 3.9; the
    # plain ElementTree module (C-accelerated by default since 3.3) is the
    # supported replacement. The module-level cElementTree import above
    # should be updated accordingly as well.
    from xml.etree import ElementTree

    namespaces = dict()
    for _, element in ElementTree.iterparse(f):
        # Tags arrive fully qualified, e.g. "{uri}page"; keep the local part.
        _, tag = element.tag.split("}")
        if tag == PAGE_TAG:
            yield parse_page(element, namespaces)
            # Release the parsed subtree so memory stays flat across pages.
            element.clear()
        if tag == SITEINFO_TAG:
            namespaces = parse_namespaces(element)


def to_json(f):
    """Stream the bz2-compressed dump at path *f* to stdout as JSON lines."""
    with bz2.BZ2File(f, "r") as dump_file:
        for page in doc_generator(dump_file):
            sys.stdout.write(json.dumps(page) + "\n")


def parse_namespaces(element) -> dict:
    """Return the namespace key -> name mapping declared under ``<siteinfo>``."""
    return {
        ns.get("key"): ns.text
        for ns in element.findall("namespaces/namespace", XML_NAMESPACES)
    }


def parse_page(element, namespaces):
    """Convert a ``<page>`` element into a document dict.

    The result always carries ``title``. A redirect page gets ``redirect``
    (the target title) and no ``content``; any other page gets ``content``.
    ``namespace`` is added only when the page's namespace has a name (the
    main article namespace has none in the siteinfo mapping).
    """
    doc = {"title": element.find("title", XML_NAMESPACES).text}

    redirect_element = element.find("redirect", XML_NAMESPACES)
    if redirect_element is None:
        doc["content"] = element.find("revision/text", XML_NAMESPACES).text
    else:
        doc["redirect"] = redirect_element.get("title")

    namespace_name = namespaces[element.find("ns", XML_NAMESPACES).text]
    if namespace_name is not None:
        doc["namespace"] = namespace_name

    return doc


# Entry point: each CLI argument is a path to a bz2-compressed dump file.
# Guarded so importing this module does not start a conversion.
if __name__ == "__main__":
    for file_name in sys.argv[1:]:
        to_json(file_name)
49 changes: 49 additions & 0 deletions wikipedia/challenges/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"name": "index-and-search",
"description": "Indexes wikipedia data, then executes searches.",
"default": true,
"schedule": [
{
"name": "delete-index",
"operation": "delete-index"
},
{
"name": "create-index",
"operation": "create-index"
},
{
"name": "check-cluster-health",
"operation": "check-cluster-health"
},
{
"name": "index-documents",
"operation": "index-documents",
"warmup-time-period": {{ bulk_warmup | default(40) | int }},
"clients": {{bulk_indexing_clients | default(5)}}
},
{
"name": "refresh-after-index",
"operation": "refresh-after-index"
},
{
"name": "query-string-search",
"operation": "query-string-search",
"clients": {{search_clients | default(20)}},
"warmup-iterations": 100
},
{
"name": "clear-cache",
"operation": "clear-cache"
},
{
"name": "create-default-search-application",
"operation": "create-default-search-application"
},
{
"name": "default-search-application-search",
"operation": "default-search-application-search",
"clients": {{search_clients | default(20)}},
"warmup-iterations": 100
}
]
}
2 changes: 2 additions & 0 deletions wikipedia/files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
documents.json
documents-1k.json
53 changes: 53 additions & 0 deletions wikipedia/operations/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"name": "delete-index",
"operation-type": "delete-index"
},
{
"name": "create-index",
"operation-type": "create-index"
},
{
"name": "check-cluster-health",
"operation-type": "cluster-health",
"request-params": {
"wait_for_status": "green"
},
"retry-until-success": true
},
{
"name": "index-documents",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(500)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
{
"name": "refresh-after-index",
"operation-type": "refresh",
"request-timeout": 1000,
"include-in-reporting": true
},
{
"name": "create-default-search-application",
"operation-type": "raw-request",
"param-source": "create-search-application-param-source"
},
{
"name": "clear-cache",
"operation-type": "raw-request",
"path": "/_cache/clear",
"method": "POST"
},
{
"name": "default-search-application-search",
"operation-type": "raw-request",
"param-source": "search-application-search-param-source",
"iterations": {{search_iterations | default(100000)}}
},
{
"name": "query-string-search",
"operation-type": "search",
"param-source": "query-string-search",
"size" : {{search_size | default(20)}},
"search-fields" : "{{search_fields | default("*")}}",
"iterations": {{search_iterations | default(100000)}}
}
Loading

0 comments on commit a40f531

Please sign in to comment.