bigcode-project · loubnabnl · Sep 20, 2023 · Sep 20, 2023 · Sep 20, 2023 · Sep 21, 2023
diff --git a/data_analysis/pull-requests/reconstruct_prs.ipynb b/data_analysis/pull-requests/reconstruct_prs.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: python-dateutil in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (2.8.2)\n",
+      "Requirement already satisfied: six>=1.5 in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (from python-dateutil) (1.16.0)\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install python-dateutil"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 330,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import pandas as pd\n",
+    "from dateutil.parser import parse\n",
+    "from datasets import load_dataset, Dataset\n",
+    "\n",
+    "# Open the source dataset in streaming mode (streaming=True) rather than loading it eagerly.\n",
+    "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n",
+    "\n",
+    "# Number of pull requests to sample for the analysis.\n",
+    "size = 500_000\n",
+    "\n",
+    "# Shuffle the stream with a fixed seed and a 1M-row shuffle buffer.\n",
+    "shuffled_stream = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n",
+    "\n",
+    "# Materialise the first `size` rows of the shuffled stream as the working subset.\n",
+    "sampled_rows = list(shuffled_stream.take(size))\n",
+    "# The pandas round-trip is kept on purpose: later cells drop the '__index_level_0__' column present in the resulting dataset.\n",
+    "ds = Dataset.from_pandas(pd.DataFrame(data=sampled_rows))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 365,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['pull_request.guid', 'pull_request.code_review_events', 'pull_request.events', 'pull_request.issue_events', 'bucket', '__index_level_0__'],\n",
+       "    num_rows: 500000\n",
+       "})"
+      ]
+     },
+     "execution_count": 365,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 444,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# merge all three instances\n",
+    "\n",
+    "pull_request_info_cols = [\n",
+    "    \"repo.name\",\n",
+    "    \"repo.id\",\n",
+    "    \"org.id\",\n",
+    "    \"public\",\n",
+    "    \"pull_request.id\",\n",
+    "    \"pull_request.guid\",\n",
+    "    \"pull_request.number\",\n",
+    "    \"pull_request.title\",\n",
+    "    \"pull_request.body\",\n",
+    "    \"pull_request.state\",\n",
+    "    \"pull_request.user.login\",\n",
+    "    \"pull_request.user.id\",\n",
+    "    # add user type\n",
+    "    \"pull_request.head.user.type\",\n",
+    "    \"pull_request.base.user.type\",\n",
+    "    \"pull_request.created_at\",\n",
+    "    \"pull_request.closed_at\",\n",
+    "    \"pull_request.merged_at\",\n",
+    "    \"pull_request.merged_by.login\",\n",
+    "    \"pull_request.milestone.title\",\n",
+    "    \"pull_request.milestone.description\",\n",
+    "    \"pull_request.milestone.number\",\n",
+    "    # commits\n",
+    "    'pull_request.commits',\n",
+    "    'pull_request.additions',\n",
+    "    'pull_request.deletions',\n",
+    "    # changed files\n",
+    "    'pull_request.changed_files',\n",
+    "    \"pull_request.comments\",\n",
+    "    \"pull_request.review_comments\",\n",
+    "]\n",
+    "\n",
+    "head_info_cols = [\n",
+    "    \"pull_request.head.label\",\n",
+    "    \"pull_request.head.ref\",\n",
+    "    \"pull_request.head.user.login\",\n",
+    "    \"pull_request.head.user.type\",\n",
+    "    \"pull_request.head.repo.owner.login\",\n",
+    "    \"pull_request.head.repo.owner.type\",\n",
+    "    \"pull_request.head.repo.license.name\",\n",
+    "    \"pull_request.head.sha\",\n",
+    "    'pull_request.head.repo.name',\n",
+    "    'pull_request.head.repo.owner.login',\n",
+    "    'pull_request.head.repo.homepage',\n",
+    "    'pull_request.head.repo.description',\n",
+    "    'pull_request.head.repo.language',\n",
+    "    'pull_request.head.repo.stargazers_count',\n",
+    "    'pull_request.head.repo.license.name',\n",
+    "    'pull_request.head.repo.default_branch',\n",
+    "    'pull_request.head.repo.private'\n",
+    "]\n",
+    "base_info_cols = [\n",
+    "    \"pull_request.base.label\",\n",
+    "    \"pull_request.base.ref\",\n",
+    "    \"pull_request.base.sha\",\n",
+    "    \"pull_request.base.user.login\",\n",
+    "    \"pull_request.base.user.type\",\n",
+    "    \"pull_request.base.repo.owner.login\",\n",
+    "    \"pull_request.base.repo.owner.type\",\n",
+    "    \"pull_request.base.repo.license.name\",\n",
+    "    \"pull_request.base.repo.default_branch\",\n",
+    "    \"pull_request.base.repo.description\",\n",
+    "    \"pull_request.base.repo.language\",\n",
+    "    \"pull_request.base.repo.watchers_count\",\n",
+    "    \"pull_request.base.repo.open_issues_count\",\n",
+    "    \"pull_request.base.repo.forks_count\",\n",
+    "    'pull_request.base.repo.name',\n",
+    "    'pull_request.base.repo.owner.login',\n",
+    "    'pull_request.base.repo.homepage',\n",
+    "    'pull_request.base.repo.description',\n",
+    "    'pull_request.base.repo.language',\n",
+    "    'pull_request.base.repo.stargazers_count',\n",
+    "    'pull_request.base.repo.private',\n",
+    "    'pull_request.comments',\n",
+    "    'pull_request.review_comments',\n",
+    "    'pull_request.label.name',\n",
+    "]\n",
+    "\n",
+    "reviews_info = [# review events only\n",
+    "    'actor.login',\n",
+    "    'actor.id',\n",
+    "    'user.login',\n",
+    "    'user.type',\n",
+    "    'review.state',\n",
+    "    'review.id', \n",
+    "    'review.body', \n",
+    "    'review.commit_id', \n",
+    "    'review.submitted_at', \n",
+    "    'review.author_association',\n",
+    "    \"pull_request.state\",\n",
+    "    \"pull_request.merged\",\n",
+    "    \"pull_request.merged_by.login\",\n",
+    "    \"pull_request.merged_by.type\",\n",
+    "    # comments\n",
+    "    'comment.id',\n",
+    "    'comment.diff_hunk',\n",
+    "    'comment.body',\n",
+    "    'comment.path',\n",
+    "    'comment.position',\n",
+    "    'comment.original_position',\n",
+    "    'comment.commit_id',\n",
+    "    'comment.original_commit_id',\n",
+    "    'comment.created_at',\n",
+    "    'comment.updated_at',\n",
+    "    'comment.author_association',\n",
+    "    'comment.start_line',\n",
+    "    'comment.original_start_line',\n",
+    "    'comment.start_side',\n",
+    "    'comment.line',\n",
+    "    'comment.original_line',\n",
+    "    'comment.side',\n",
+    "    'comment.in_reply_to_id',]\n",
+    "\n",
+    "\n",
+    "# Per-event keys read from issue events; get_event_info renames them with an \"issue.\" prefix.\n",
+    "issues_info = [\"author\", \"comment\", \"comment_id\"]\n",
+    "\n",
+    "# Full set of per-event keys: review keys first, then issue keys.\n",
+    "event_info = reviews_info + issues_info\n",
+    "\n",
+    "def get_event_info(review):\n",
+    "    \"\"\"Project an event onto the fixed `event_info` key set.\n",
+    "\n",
+    "    Keys missing from the event map to None, so every event gets the same\n",
+    "    schema. Keys listed in `issues_info` are emitted with an \"issue.\" prefix.\n",
+    "\n",
+    "    Args:\n",
+    "        review: event dict; merge_events passes every merged event here,\n",
+    "            not only review events.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with one entry per key of `event_info`.\n",
+    "    \"\"\"\n",
+    "    prefixed = set(issues_info)\n",
+    "    return {\n",
+    "        (\"issue.\" + key if key in prefixed else key): review.get(key)\n",
+    "        for key in event_info\n",
+    "    }\n",
+    "\n",
+    "def load_json(data):\n",
+    "    try:\n",
+    "        data = json.loads(data)\n",
+    "        if isinstance(data, dict):\n",
+    "            data = [data]\n",
+    "        return data\n",
+    "    except TypeError:\n",
+    "        return []\n",
+    "\n",
+    "def update_datetime(e):\n",
+    "    \"\"\"Replace the event's \"created_at\" string with its parsed value, in place.\n",
+    "\n",
+    "    Args:\n",
+    "        e: event dict with a \"created_at\" entry accepted by dateutil's `parse`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The same dict object, mutated.\n",
+    "    \"\"\"\n",
+    "    e.update(created_at=parse(e[\"created_at\"]))\n",
+    "    return e\n",
+    "\n",
+    "def merge_events(row):\n",
+    "    \"\"\"Merge the three event streams of one pull request into a single record.\n",
+    "\n",
+    "    Args:\n",
+    "        row: dataset row holding JSON strings (or None) under\n",
+    "            \"pull_request.events\", \"pull_request.code_review_events\" and\n",
+    "            \"pull_request.issue_events\".\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with keys \"pull_request_info\", \"head_repo_info\" and\n",
+    "        \"base_repo_info\" (fields copied from the first available event, None\n",
+    "        when absent) and \"events\" (all events sorted by \"created_at\").\n",
+    "\n",
+    "    Raises:\n",
+    "        IndexError: if the row contains no event of any kind.\n",
+    "    \"\"\"\n",
+    "    events = load_json(row[\"pull_request.events\"])\n",
+    "    reviews = load_json(row[\"pull_request.code_review_events\"])\n",
+    "    issues = load_json(row[\"pull_request.issue_events\"])\n",
+    "\n",
+    "    # At most one issue record is expected per pull request.\n",
+    "    assert len(issues) <= 1\n",
+    "    issues_events = issues[0][\"events\"] if issues else []\n",
+    "    for e in issues_events:\n",
+    "        # Issue events carry their timestamp under \"datetime\"; move it to \"created_at\"\n",
+    "        # so all three streams can be sorted on the same key.\n",
+    "        e[\"created_at\"] = parse(e.pop(\"datetime\"))\n",
+    "    events = [update_datetime(e) for e in events]\n",
+    "    reviews = [update_datetime(e) for e in reviews]\n",
+    "    all_events = sorted(\n",
+    "        events + reviews + issues_events,\n",
+    "        key=lambda x: x[\"created_at\"]\n",
+    "    )\n",
+    "\n",
+    "    # Source of the PR-level fields: first entry of the PR events, else first entry of\n",
+    "    # the review events, else a minimal record rebuilt from the issue record.\n",
+    "    if events:\n",
+    "        base_data = events[0]\n",
+    "    elif reviews:\n",
+    "        base_data = reviews[0]\n",
+    "    elif issues_events:\n",
+    "        issue = issues[0]\n",
+    "        base_data = {\n",
+    "            \"pull_request.title\": issues_events[0][\"title\"],\n",
+    "            \"repo.name\": issue[\"repo\"],\n",
+    "            \"pull_request.number\": issue[\"pull_request\"][\"number\"],\n",
+    "            \"pull_request.user.login\": issue[\"pull_request\"][\"user_login\"],\n",
+    "        }\n",
+    "        print(\"filling PR data from issue event\")\n",
+    "    else:\n",
+    "        # Also covers an issue record whose own event list is empty.\n",
+    "        raise IndexError(\"No events for PR\")\n",
+    "\n",
+    "    # Copy the known columns; fields missing from base_data become None.\n",
+    "    pr_info = {k: base_data.get(k) for k in pull_request_info_cols}\n",
+    "    head_info = {k: base_data.get(k) for k in head_info_cols}\n",
+    "    base_info = {k: base_data.get(k) for k in base_info_cols}\n",
+    "\n",
+    "    # Every merged event gets the same flat schema: type, action, timestamp and the\n",
+    "    # keys produced by get_event_info.\n",
+    "    comments = [{\"type\": e[\"type\"],\n",
+    "                \"action\": e[\"action\"],\n",
+    "                \"created_at\": e[\"created_at\"],\n",
+    "                **get_event_info(e)} for e in all_events]\n",
+    "    return {\n",
+    "        \"pull_request_info\": pr_info,\n",
+    "        \"head_repo_info\": head_info,\n",
+    "        \"base_repo_info\": base_info,\n",
+    "        \"events\": comments,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop the raw per-stream columns (plus the pandas index and guid) once merge_events has built the merged record.\n",
+    "merged_ds = ds.map(\n",
+    "    merge_events,\n",
+    "    remove_columns=[\n",
+    "        \"pull_request.events\",\n",
+    "        \"pull_request.code_review_events\",\n",
+    "        \"pull_request.issue_events\",\n",
+    "        \"__index_level_0__\",\n",
+    "        \"pull_request.guid\",\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 449,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.20ba/s]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.10s/it]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 33.12ba/s]s/it]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00,  9.55s/it]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 39.47ba/s]s/it]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00,  9.99s/it]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.45ba/s]s/it]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:23<00:00, 23.74s/it]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 34.84ba/s]s/it]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.48s/it]\n",
+      "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:03<00:00, 26.04ba/s]s/it]\n",
+      "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.62s/it]\n",
+      "Pushing dataset shards to the dataset hub: 100%|██████████| 6/6 [02:10<00:00, 21.69s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "merged_ds.push_to_hub(\"loubnabnl/code_reviews_500k\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.11.0 ('.venv': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "0cc3054246fa39b40b564a97820c10836c9fb6acdf94e9196ea3a787cac26526"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data_analysis/stackoverflow/README.md b/data_analysis/stackoverflow/README.md
@@ -0,0 +1,3 @@
+## Code for processing StackExchange data
+
+Code for processing the StackExchange data dump is available in `h4_code` (used to build https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) and `other`. A notebook for further processing (e.g., converting all HTML to Markdown) is available in `StackExchangeProcessing.ipynb` (used to build https://huggingface.co/datasets/lvwerra/stack-exchange-paired).
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		## Code for processing StackExchange data

		Code for processing the StackExchange data dump is available in `h4_code` (used to build https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) and `other`. A notebook for further processing (e.g., converting all HTML to Markdown) is available in `StackExchangeProcessing.ipynb` (used to build https://huggingface.co/datasets/lvwerra/stack-exchange-paired).