Skip to content

Commit

Permalink
First version of the wikipedia dataset + creating the track. (#429)
Browse files Browse the repository at this point in the history
  • Loading branch information
afoucret authored Sep 18, 2023
1 parent 0b0b0b4 commit a40f531
Show file tree
Hide file tree
Showing 11 changed files with 10,657 additions and 0 deletions.
58 changes: 58 additions & 0 deletions wikipedia/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## Wikipedia Search track

This track benchmarks indexing and search performance against Wikipedia data.

The dataset is derived from a dump of wikipedia available here:
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2.

Each page is formatted into a JSON document with the following fields:

title: Page title
namespace: Optional namespace for the page. [Namespaces](https://en.wikipedia.org/wiki/Wikipedia:Namespace) allow for the organization and separation of content pages from administration pages.
content: Page content.
redirect: If the page is a redirect, the target of the redirection. In this case content is empty.

Fields that do not have values have been left out.

### Generating the documents dataset

To regenerate the dataset from scratch, first download and unzip an archive
of Wikipedia dumps from [this link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2) (~21GB).

Then run this command:

```bash
python _tools/parse_documents.py <path_to_xml_file> | pbzip2 -9 -k -m2000 > documents.json.bz2
```

### Generating the clickstream probability distribution

To generate the probability distribution of the most frequent queries in a specific month from the Wikimedia clickstream, please execute the following command:

```bash
python3 _tools/parse_clicks.py --year 2023 --month 6 --lang en > queries.csv
```

### Example Document

```json
{
"title": "Anarchism",
"content": "{{short description|Political philosophy and movement}}\n{{other uses}}\n{{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}\n{{distinguish|Anarchy}}\n{{good article}}\n{{pp-semi-indef}}\n{{use British English|date=August 2021}}\n{{use dmy dates|date=August 2021}}\n{{Use shortened footnotes|date=May 2023}}\n{{anarchism sidebar}}\n{{basic forms of government}}\n\n'''Anarchism''' is a [[political philosophy]] and [[Political movement|movement]] that is skeptical of all justifications for [[authority]] and seeks to abolish the [[institutions]] it claims maintain unnecessary [[coercion]] and [[Social hierarchy|hierarchy]], typically including [[government]]s,<ref name=\":0\">{{Cite book |title=The Desk Encyclopedia of World History |publisher=[[Oxford University Press]] |year=2006 |isbn=978-0-7394-7809-7 |editor-last=Wright |editor-first=Edmund |location=New York |pages=20\u201321}}</ref> [[State (polity)|nation states]],{{sfn|Suissa|2019b|ps=: \"...as many anarchists have stressed, it is not government as such that they find objectionable, but the hierarchical forms of government associated with the nation state.\"}} [[law]] and [[law enforcement]],<ref name=\":0\" /> and [[capitalism]]. Anarchism advocates for the replacement of the state with [[Stateless society|stateless societies]] or other forms of [[Free association (communism and anarchism)|free associations]]. As a historically [[left-wing]] movement, this reading of anarchism is placed on the [[Far-left politics|farthest left]] of the [[political spectrum]], usually described as the [[libertarian]] wing of the [[socialist movement]] ([ ..."
}
```

### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params`:

- `bulk_size` (default: 500)
- `bulk_indexing_clients` (default: 5)
- `ingest_percentage` (default: 100)
- `number_of_replicas` (default: 0)
- `number_of_shards` (default: 1)

### License

We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/).
More details can be found on [this page](https://en.wikipedia.org/wiki/Wikipedia:Copyrights).
101 changes: 101 additions & 0 deletions wikipedia/_tools/parse_clicks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import argparse
import csv
import gzip
import logging
import os
import pickle
import sys
from collections import Counter

import requests

# Module-level logger: emits INFO-and-above records to the console with a
# "timestamp - level - message" format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

_console_handler = logging.StreamHandler()
_console_handler.setLevel(logging.INFO)
_console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
logger.addHandler(_console_handler)


class ClickStreamDist:
    """Builds a query probability distribution from a Wikimedia clickstream dump.

    The clickstream TSV for the requested year/month/language is downloaded
    (and cached) under ``~/.rally/benchmarks/data/wikipedia/``, then the most
    clicked target pages are turned into a ``(query, probability)`` CSV
    written to stdout.
    """

    # Number of most-frequent queries kept in the distribution
    # (was a hard-coded 10000 in calculate_word_frequency).
    TOP_QUERIES = 10000

    def __init__(self, year, month, lang):
        # e.g. clickstream-enwiki-2023-06.tsv.gz
        self.filename = f"clickstream-{lang}wiki-{year}-{month:02d}.tsv.gz"
        self.url = f"https://dumps.wikimedia.org/other/clickstream/{year:d}-{month:02d}/{self.filename}"
        self.clickstream_output_file = os.path.expanduser(f"~/.rally/benchmarks/data/wikipedia/{self.filename}")

    def download(self):
        """Download the clickstream dump to the local cache, skipping if cached."""
        if os.path.exists(self.clickstream_output_file):
            logger.info("File already exists. Skipping download.")
            return

        logger.info("Downloading the clickstream file...")
        # Stream the response: the dump is large, so avoid buffering the
        # whole body in memory (the first version used response.content).
        # The timeout prevents hanging forever on a stalled connection.
        response = requests.get(self.url, stream=True, timeout=60)

        if response.status_code == 200:
            # Create the target directory if it is missing
            os.makedirs(os.path.dirname(self.clickstream_output_file), exist_ok=True)

            # Write the content to the file in 1 MiB chunks
            with open(self.clickstream_output_file, "wb") as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
            logger.info("File downloaded successfully.")
        else:
            # A failed download is an error, not an informational event.
            logger.error("Failed to download the file (HTTP status %s).", response.status_code)

    def analyze(self):
        """Compute the query probability distribution and write it to stdout."""
        logger.info("Analyzing...")

        word_freq = self.calculate_word_frequency()
        word_prob = self.calculate_word_probability(word_freq)

        self.dump_probability_distribution(word_prob)

        logger.info("Analysis completed.")

    def calculate_word_frequency(self):
        """Count clicks per target page title.

        Returns a list of ``(title, count)`` tuples sorted by descending
        count, truncated to ``TOP_QUERIES`` entries.
        """
        logger.info("Calculating word frequency...")

        word_freq = Counter()
        with gzip.open(self.clickstream_output_file, "rt", encoding="utf-8") as file:
            # Row layout is (prev, curr, type, count), tab separated.
            # Documentation for clickstream format: https://meta.wikimedia.org/wiki/Research:Wikipedia_clickstream
            for row in csv.reader(file, delimiter="\t"):
                if len(row) < 4:
                    # Defensive: skip truncated or malformed lines.
                    continue
                prev = row[0]
                curr = row[1].replace("_", " ").strip().replace('"', "")
                if prev != "other-search" and curr != "Main Page":
                    word_freq[curr] += int(row[3])

        # most_common() returns the pairs sorted by descending count.
        return word_freq.most_common(self.TOP_QUERIES)

    def calculate_word_probability(self, word_freq):
        """Normalize ``(title, count)`` pairs into ``(title, probability)`` pairs."""
        logger.info("Calculating word probability...")

        total_words = sum(count for _, count in word_freq)
        if total_words == 0:
            # Avoid ZeroDivisionError on an empty frequency list.
            return []

        return [(word, count / total_words) for word, count in word_freq]

    def dump_probability_distribution(self, prob_dist):
        """Write the distribution to stdout as CSV with a header row."""
        logger.info("Dumping probability distribution...")

        writer = csv.writer(sys.stdout)
        writer.writerow(["query", "probability"])
        writer.writerows(prob_dist)


parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int, default=2023, help="Year")
parser.add_argument("--month", type=int, default=6, help="Month")
parser.add_argument("--lang", type=str, default="en", help="Language")
args = parser.parse_args()

click_stream = ClickStreamDist(year=args.year, month=args.month, lang=args.lang)
click_stream.download()
click_stream.analyze()
54 changes: 54 additions & 0 deletions wikipedia/_tools/parse_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import bz2
import json
import sys
from xml.etree import cElementTree

# Local names of the XML tags handled while streaming the dump.
PAGE_TAG = "page"
SITEINFO_TAG = "siteinfo"
# Default XML namespace of MediaWiki export files; passed to ElementTree
# find()/findall() so unprefixed path expressions resolve correctly.
XML_NAMESPACES = {"": "http://www.mediawiki.org/xml/export-0.10/"}


def doc_generator(f):
    """Yield one document dict per ``<page>`` element in a MediaWiki dump.

    The XML is parsed incrementally so arbitrarily large dumps can be
    processed with bounded memory. ``<siteinfo>`` (which precedes the pages
    in a dump) supplies the namespace key -> name mapping used by
    ``parse_page``; before it is seen, an empty mapping is used.
    """
    # NOTE(review): xml.etree.cElementTree was removed in Python 3.9; the
    # plain ElementTree module (C-accelerated by default since 3.3) is the
    # supported replacement. The module-level cElementTree import above
    # should be updated accordingly as well.
    from xml.etree import ElementTree

    namespaces = dict()
    for _, element in ElementTree.iterparse(f):
        # Tags arrive fully qualified, e.g. "{uri}page"; keep the local part.
        _, tag = element.tag.split("}")
        if tag == PAGE_TAG:
            yield parse_page(element, namespaces)
            # Release the parsed subtree so memory stays flat across pages.
            element.clear()
        if tag == SITEINFO_TAG:
            namespaces = parse_namespaces(element)


def to_json(f):
    """Stream the bz2-compressed dump at path *f* to stdout as JSON lines."""
    with bz2.BZ2File(f, "r") as dump_file:
        for page in doc_generator(dump_file):
            sys.stdout.write(json.dumps(page) + "\n")


def parse_namespaces(element) -> dict:
    """Return the namespace key -> name mapping declared under ``<siteinfo>``."""
    return {
        ns.get("key"): ns.text
        for ns in element.findall("namespaces/namespace", XML_NAMESPACES)
    }


def parse_page(element, namespaces):
    """Convert a ``<page>`` element into a document dict.

    The result always carries ``title``. A redirect page gets ``redirect``
    (the target title) and no ``content``; any other page gets ``content``.
    ``namespace`` is added only when the page's namespace has a name (the
    main article namespace has none in the siteinfo mapping).
    """
    doc = {"title": element.find("title", XML_NAMESPACES).text}

    redirect_element = element.find("redirect", XML_NAMESPACES)
    if redirect_element is None:
        doc["content"] = element.find("revision/text", XML_NAMESPACES).text
    else:
        doc["redirect"] = redirect_element.get("title")

    namespace_name = namespaces[element.find("ns", XML_NAMESPACES).text]
    if namespace_name is not None:
        doc["namespace"] = namespace_name

    return doc


# Entry point: each CLI argument is a path to a bz2-compressed dump file.
# Guarded so importing this module does not start a conversion.
if __name__ == "__main__":
    for file_name in sys.argv[1:]:
        to_json(file_name)
49 changes: 49 additions & 0 deletions wikipedia/challenges/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"name": "index-and-search",
"description": "Indexes wikipedia data, then executes searches.",
"default": true,
"schedule": [
{
"name": "delete-index",
"operation": "delete-index"
},
{
"name": "create-index",
"operation": "create-index"
},
{
"name": "check-cluster-health",
"operation": "check-cluster-health"
},
{
"name": "index-documents",
"operation": "index-documents",
"warmup-time-period": {{ bulk_warmup | default(40) | int }},
"clients": {{bulk_indexing_clients | default(5)}}
},
{
"name": "refresh-after-index",
"operation": "refresh-after-index"
},
{
"name": "query-string-search",
"operation": "query-string-search",
"clients": {{search_clients | default(20)}},
"warmup-iterations": 100
},
{
"name": "clear-cache",
"operation": "clear-cache"
},
{
"name": "create-default-search-application",
"operation": "create-default-search-application"
},
{
"name": "default-search-application-search",
"operation": "default-search-application-search",
"clients": {{search_clients | default(20)}},
"warmup-iterations": 100
}
]
}
2 changes: 2 additions & 0 deletions wikipedia/files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
documents.json
documents-1k.json
53 changes: 53 additions & 0 deletions wikipedia/operations/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"name": "delete-index",
"operation-type": "delete-index"
},
{
"name": "create-index",
"operation-type": "create-index"
},
{
"name": "check-cluster-health",
"operation-type": "cluster-health",
"request-params": {
"wait_for_status": "green"
},
"retry-until-success": true
},
{
"name": "index-documents",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(500)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
{
"name": "refresh-after-index",
"operation-type": "refresh",
"request-timeout": 1000,
"include-in-reporting": true
},
{
"name": "create-default-search-application",
"operation-type": "raw-request",
"param-source": "create-search-application-param-source"
},
{
"name": "clear-cache",
"operation-type": "raw-request",
"path": "/_cache/clear",
"method": "POST"
},
{
"name": "default-search-application-search",
"operation-type": "raw-request",
"param-source": "search-application-search-param-source",
"iterations": {{search_iterations | default(100000)}}
},
{
"name": "query-string-search",
"operation-type": "search",
"param-source": "query-string-search",
"size" : {{search_size | default(20)}},
"search-fields" : "{{search_fields | default("*")}}",
"iterations": {{search_iterations | default(100000)}}
}
Loading

0 comments on commit a40f531

Please sign in to comment.