Commit ed62a47

Merge remote-tracking branch 'upstream/master' into load-only-splits
2 parents 100851f + 16c2e31 commit ed62a47

28 files changed: +1130 −332 lines

.circleci/deploy.sh

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ deploy_doc "master" master
 # Example of how to deploy a doc on a certain commit (the commit doesn't have to be on the master branch).
 # The following commit would live on huggingface.co/docs/datasets/v1.0.0
+deploy_doc "b0d7ae1" v1.6.2
 deploy_doc "e8fc41f" v1.6.1
 deploy_doc "40bb9e6" v1.6.0
 deploy_doc "f256b77" v1.5.0

.github/workflows/self-assign.yaml

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+name: Self-assign
+on:
+  issue_comment:
+    types: created
+jobs:
+  one:
+    runs-on: ubuntu-latest
+    if: >-
+      (github.event.comment.body == '#take' ||
+      github.event.comment.body == '#self-assign')
+      && !github.event.issue.assignee
+    steps:
+      - run: |
+          echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
+          curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
+          curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted
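For reference, the two `curl` calls in the workflow map to a plain pair of GitHub REST API requests. A minimal Python sketch that only builds the requests without sending them (the repository, issue number, and user name here are made-up placeholders, and `self_assign_requests` is a hypothetical helper, not part of the workflow):

```python
import json
from urllib.parse import quote


def self_assign_requests(repo, issue_number, user):
    """Return (method, url, body) tuples matching the workflow's two curl calls."""
    base = f"https://api.github.com/repos/{repo}/issues/{issue_number}"
    # 1) POST the commenter as assignee.
    assign = ("POST", f"{base}/assignees", json.dumps({"assignees": [user]}))
    # 2) DELETE the "help wanted" label; the space is why the workflow
    #    URL-escapes it as help%20wanted.
    unlabel = ("DELETE", f"{base}/labels/{quote('help wanted')}", None)
    return [assign, unlabel]


for method, url, body in self_assign_requests("huggingface/datasets", 123, "octocat"):
    print(method, url, body)
```

A real call would also need the `Authorization: token …` header that the workflow takes from `secrets.GITHUB_TOKEN`.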

CONTRIBUTING.md

Lines changed: 13 additions & 0 deletions

@@ -11,6 +11,19 @@ Many thanks in advance to every contributor.
 In order to facilitate healthy, constructive behavior in an open and inclusive community, we all respect and abide by
 our [code of conduct](CODE_OF_CONDUCT.md).
 
+## How to work on an open Issue?
+You have the list of open Issues at: https://github.com/huggingface/datasets/issues
+
+Some of them may have the label `help wanted`: that means that any contributor is welcome!
+
+If you would like to work on any of the open Issues:
+
+1. Make sure it is not already assigned to someone else. The assignee (if any) is shown at the top of the right column of the Issue page.
+
+2. You can self-assign it by commenting on the Issue page with one of the keywords: `#take` or `#self-assign`.
+
+3. Work on your self-assigned issue and eventually create a Pull Request.
+
 ## How to create a Pull Request?
 1. Fork the [repository](https://github.com/huggingface/datasets) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account.
1629

datasets/multi_woz_v22/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

datasets/newsph_nli/dataset_infos.json

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
+{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}

(0-byte binary file not shown.)
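As a sanity check on the corrected newsph_nli metadata, the new `dataset_size` is exactly the sum of the per-split `num_bytes`. A quick sketch, with the values copied from the corrected JSON above:

```python
# Per-split byte counts copied from the corrected newsph_nli dataset_infos.json.
split_num_bytes = {"train": 154510599, "test": 3283665, "validation": 33015530}
dataset_size = 190809794  # also from the corrected JSON

# dataset_size should equal the sum of the split sizes.
assert sum(split_num_bytes.values()) == dataset_size
print(sum(split_num_bytes.values()))  # 190809794
```

The old metadata failed this check because the test split had duplicated the train split's numbers (420,000 examples instead of 9,000).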

datasets/newsph_nli/newsph_nli.py

Lines changed: 11 additions & 9 deletions

@@ -21,22 +21,24 @@
 _DESCRIPTION = """\
-First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.
+First benchmark dataset for sentence entailment in the low-resource Filipino language.
+Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,
+in 70-15-15 split for training, validation, and testing.
 """
 
 _CITATION = """\
-@article{cruz2020investigating,
-title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
-author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
-journal={arXiv preprint arXiv:2010.11574},
-year={2020}
-}
+@article{cruz2020investigating,
+title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
+author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
+journal={arXiv preprint arXiv:2010.11574},
+year={2020}
+}
 """
 
 _HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks"
 
 # TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
+_LICENSE = "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0"
 
 _URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip"
 
@@ -68,7 +70,7 @@ def _split_generators(self, dl_manager):
         data_dir = dl_manager.download_and_extract(_URL)
         download_path = os.path.join(data_dir, "newsph-nli")
         train_path = os.path.join(download_path, "train.csv")
-        test_path = os.path.join(download_path, "train.csv")
+        test_path = os.path.join(download_path, "test.csv")
         validation_path = os.path.join(download_path, "valid.csv")
 
         return [
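The one-character fix in the second hunk matters: before it, the `test` split re-read `train.csv`, so train and test were the same data. A minimal sketch of the corrected path logic (the `split_paths` helper and the `"extracted"` directory are illustrative, not part of the actual loading script):

```python
import os


def split_paths(data_dir):
    # Mirrors the corrected _split_generators logic: each split now reads
    # its own CSV file inside the extracted "newsph-nli" folder.
    download_path = os.path.join(data_dir, "newsph-nli")
    return {
        "train": os.path.join(download_path, "train.csv"),
        "test": os.path.join(download_path, "test.csv"),  # previously train.csv
        "validation": os.path.join(download_path, "valid.csv"),
    }


paths = split_paths("extracted")
# All three splits now point at distinct files.
assert len(set(paths.values())) == 3
```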

datasets/wiki_auto/README.md

Lines changed: 2 additions & 2 deletions

@@ -132,7 +132,7 @@ The `auto` config shows a pair of an English and corresponding Simple English Wi
  'simple_article_url': 'https://simple.wikipedia.org/wiki?curid=702227'}}
 ```
 
-Finally, the `auto_acl`, the `auto_full_no_split`, and the `auto_full_with_split` configs were obtained by selecting the aligned pairs of sentences from `auto` to provide a ready-to-go aligned dataset to train a sequence-to-sequence system. While `auto_acl` corresponds to the filtered version of the data used to train the systems in the paper, `auto_full_no_split` and `auto_full_with_split` correspond to the unfiltered versions with and without sentence splits respectively. In the `auto_full_with_split` config, we join the sentences in the simple article mapped to the same sentence in the complex article to capture sentence splitting. Split sentences are seperated by a `<SEP>` token. In the `auto_full_no_split` config, we do not join the splits and treat them as seperate pairs. An instance is a single pair of sentences:
+Finally, the `auto_acl`, the `auto_full_no_split`, and the `auto_full_with_split` configs were obtained by selecting the aligned pairs of sentences from `auto` to provide a ready-to-go aligned dataset to train a sequence-to-sequence system. While `auto_acl` corresponds to the filtered version of the data used to train the systems in the paper, `auto_full_no_split` and `auto_full_with_split` correspond to the unfiltered versions with and without sentence splits respectively. In the `auto_full_with_split` config, we join the sentences in the simple article mapped to the same sentence in the complex article to capture sentence splitting. Split sentences are separated by a `<SEP>` token. In the `auto_full_no_split` config, we do not join the splits and treat them as separate pairs. An instance is a single pair of sentences:
 ```
 {'normal_sentence': 'In early work , Rutherford discovered the concept of radioactive half-life , the radioactive element radon , and differentiated and named alpha and beta radiation .\n',
  'simple_sentence': 'Rutherford discovered the radioactive half-life , and the three parts of radiation which he named Alpha , Beta , and Gamma .\n'}
@@ -240,4 +240,4 @@ You can cite the paper presenting the dataset as:
 
 ### Contributions
 
-Thanks to [@yjernite](https://github.com/yjernite), [@mounicam](https://github.com/mounicam) for adding this dataset.
+Thanks to [@yjernite](https://github.com/yjernite), [@mounicam](https://github.com/mounicam) for adding this dataset.

docs/source/_static/js/custom.js

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ const versionMapping = {
     "v1.5.0": "v1.5.0",
     "v1.6.0": "v1.6.0",
     "v1.6.1": "v1.6.1",
+    "v1.6.2": "v1.6.2",
 }
 
 function addIcon() {

docs/source/conf.py

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@
 # The short X.Y version
 version = ""
 # The full version, including alpha/beta/rc tags
-release = "1.6.1"
+release = "1.6.2"
 
 
 # -- General configuration ---------------------------------------------------
