Merge branch 'main' of https://github.com/bigcode-project/open-eval i…

…nto main
bigcode-project · Apr 25, 2024 · 7aab8ee · 7aab8ee
2 parents 1a30bc3 + 4d74c59
commit 7aab8ee
Show file tree

Hide file tree

Showing 125 changed files with 5,252 additions and 8,909 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -219,11 +219,11 @@ pip install -r requirements.txt
 If you notice any third-party libraries that are not included in the `requirements.txt` file but used in the `data/process.py` file, please add them with the compatible versions to the `requirements.txt` file.
 
 ### How to Validate Data?
-We build a GitHub action to validate the data. The action is based on the `script/bash.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separate into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
+We built a GitHub action to validate the data. The action is based on `script/run.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separated into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
 
 If you want to validate the data locally, you can run the following command:
 ```bash
-sh script/bash.sh
+sh script/run.sh
 ```
 
 If you find any failed test cases, please fix the data in the `data/raw` folder based on the failed problem IDs. The refinement should be based on the [How to Refine Data?](#how-to-refine-data) section.
diff --git a/data/clean/f_657_simon_chien_edit.py b/data/clean/f_657_simon_chien_edit.py
@@ -0,0 +1,141 @@
+import re
+import os
+import glob
+
+
+def f_657(dir_path):
+    """
+    Search for occurrences of the word "error" in all text files within a
+    specified directory and its subdirectories.
+
+    Parameters:
+    dir_path (str): The path of the directory.
+
+    Returns:
+    dict: A dictionary with relative file paths as keys and the count of
+            occurrences of the word "error" as values. Files without any
+            occurrence are included with a count of 0.
+
+    Raises:
+    - ValueError: If directory in dir_path does not exist.
+
+    Requirements:
+    - re: For regex pattern matching.
+    - os: For retrieving relative file paths.
+    - glob: For fetching all text file paths in the directory.
+
+    The function specifically searches for the word "error" in text files
+    (with the extension ".txt").
+    This function is NOT case sensitive, e.g. also "ERROr" will be counted.
+    Only the whole word is counted, so "errors" does not match.
+    Directories whose names end in ".txt" are ignored.
+
+    Example:
+    >>> f_657("/path/to/directory")
+    {'file1.txt': 2, 'subdir/file2.txt': 1}
+    """
+
+    if not os.path.isdir(dir_path):
+        raise ValueError("Specified directory does not exist.")
+
+    # Escape glob metacharacters ('[', ']', '*', '?') in the directory name so
+    # that only the '**/*.txt' suffix is interpreted as a pattern.
+    safe_root = glob.escape(os.fspath(dir_path))
+
+    result = {}
+    for file_path in glob.glob(f'{safe_root}/**/*.txt', recursive=True):
+        # The pattern also matches directories named like '*.txt'; opening
+        # one of those would raise, so only regular files are read.
+        if not os.path.isfile(file_path):
+            continue
+        with open(file_path, 'r') as file:
+            content = file.read()
+        matches = re.findall(r'\berror\b', content, re.IGNORECASE)
+        # Always set the file's count in the result dictionary, even if it's 0
+        result[os.path.relpath(file_path, dir_path)] = len(matches)
+
+    return result
+
+
+import unittest
+import os
+import shutil
+import tempfile
+
+
+def run_tests():
+    """Collect every test in ``TestCases`` and run it with a text runner."""
+    suite = unittest.TestSuite()
+    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
+    # the default loader provides the supported equivalent.
+    suite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestCases))
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
+
+
+class TestCases(unittest.TestCase):
+    """Exercise f_657 against directory trees built in a scratch folder."""
+
+    def setUp(self):
+        # Every test gets its own scratch directory
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        # Discard the scratch directory and everything created inside it
+        shutil.rmtree(self.test_dir)
+
+    def create_file(self, sub_path, content=""):
+        # Write `content` to `sub_path` below the scratch directory,
+        # creating any missing parent folders on the way
+        target = os.path.join(self.test_dir, sub_path)
+        parent = os.path.dirname(target)
+        os.makedirs(parent, exist_ok=True)
+        with open(target, 'w') as handle:
+            handle.write(content)
+        # The normalized relative path is what f_657 is compared against
+        return os.path.normpath(sub_path)
+
+    def test_non_existent(self):
+        # A missing directory must be rejected with ValueError
+        missing = os.path.join(self.test_dir, "non_existent")
+        self.assertRaises(ValueError, f_657, missing)
+
+    def test_empty_folder(self):
+        # No text files at all yields an empty mapping
+        self.assertEqual(f_657(self.test_dir), {})
+
+    def test_files_with_errors(self):
+        # (relative path, file content, expected count of 'error')
+        cases = [
+            ("1.txt", "error\nERROR\nErrOr", 3),
+            ("subfolder1/2.txt", "", 0),
+            ("subfolder2/3.txt", "error\nerror error", 3),
+        ]
+        expected = {}
+        for rel_path, text, count in cases:
+            expected[self.create_file(rel_path, text)] = count
+
+        self.assertEqual(f_657(self.test_dir), expected)
+
+    def test_case_sensitive_and_realistic_text(self):
+        # Deeply nested file containing mixed-case spellings of the word
+        rel_path = self.create_file(
+            'nested/folder1/folder2/error_log.txt',
+            'Error\nerror\nERROR',
+        )
+        self.assertEqual(f_657(self.test_dir), {rel_path: 3})
+
+    def test_exact_word_matching(self):
+        # Only the whole word 'error' counts; near misses such as 'errors'
+        # or 'erro' must be ignored
+        cases = [
+            ("file1.txt", "error error error", 3),
+            ("subdir/file2.txt", "errors error erro errors", 1),
+            ("subdir2/nested/file3.txt", "an error occurred", 1),
+            ("subdir3/file4.txt", "no errors here", 0),
+            # Case-insensitive: all three spellings count
+            ("subdir3/file5.txt", "Error and ERROR and error", 3),
+        ]
+        expected = {}
+        for rel_path, text, count in cases:
+            expected[self.create_file(rel_path, text)] = count
+
+        self.assertEqual(f_657(self.test_dir), expected)
+
+
+# Run the test suite only when this file is executed directly as a script.
+if __name__ == "__main__":
+    run_tests()