Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/bigcode-project/open-eval i…
Browse files Browse the repository at this point in the history
…nto main
  • Loading branch information
terryyz committed Apr 25, 2024
2 parents 1a30bc3 + 4d74c59 commit 7aab8ee
Show file tree
Hide file tree
Showing 125 changed files with 5,252 additions and 8,909 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,11 @@ pip install -r requirements.txt
If you notice any third-party libraries that are not included in the `requirements.txt` file but used in the `data/process.py` file, please add them with the compatible versions to the `requirements.txt` file.
### How to Validate Data?
We build a GitHub action to validate the data. The action is based on the `script/bash.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separate into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
We built a GitHub action to validate the data. The action is based on `script/run.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed by `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separated into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
If you want to validate the data locally, you can run the following command:
```bash
sh script/bash.sh
sh script/run.sh
```
If you find any failed test cases, please fix the data in the `data/raw` folder based on the failed problem IDs. The refinement should be based on the [How to Refine Data?](#how-to-refine-data) section.
141 changes: 141 additions & 0 deletions data/clean/f_657_simon_chien_edit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import re
import os
import glob


def f_657(dir_path):
    """
    Search for occurrences of the word "error" in all text files within a
    specified directory and its subdirectories.

    Parameters:
    dir_path (str): The path of the directory.

    Returns:
    dict: A dictionary with relative file paths as keys and the count of
    occurrences of the word "error" as values.

    Raises:
    - ValueError: If directory in dir_path does not exist.

    Requirements:
    - re: For regex pattern matching.
    - os: For retrieving relative file paths.
    - glob: For fetching all text file paths in the directory.

    The function specifically searches for the word "error" in text files
    (with the extension ".txt").
    This function is NOT case sensitive, e.g. also "ERROr" will be counted.
    Only the whole word is counted, so "errors" does not match.
    Directories whose name happens to end in ".txt" are skipped.

    Example:
    >>> f_657("/path/to/directory")
    {'file1.txt': 2, 'subdir/file2.txt': 1}
    """

    if not os.path.isdir(dir_path):
        raise ValueError("Specified directory does not exist.")

    # Escape glob metacharacters ('*', '?', '[') in the directory itself so a
    # path such as "logs[1]" is taken literally instead of as a pattern.
    pattern = f'{glob.escape(str(dir_path))}/**/*.txt'
    # Compile once; the same whole-word, case-insensitive pattern is reused
    # for every file.
    error_word = re.compile(r'\berror\b', re.IGNORECASE)

    result = {}
    for file_path in glob.glob(pattern, recursive=True):
        # The glob also matches directories named "*.txt"; those cannot be
        # opened as text files, so skip them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        # Always record the file's count in the result, even if it is 0.
        result[os.path.relpath(file_path, dir_path)] = len(error_word.findall(content))

    return result


import unittest
import os
import shutil
import tempfile


def run_tests():
    """Collect every test in TestCases and run it with a text test runner."""
    suite = unittest.TestSuite()
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase is the supported equivalent.
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCases))
    runner = unittest.TextTestRunner()
    runner.run(suite)


class TestCases(unittest.TestCase):
    """Exercise f_657 against throwaway directory trees built per test."""

    def setUp(self):
        # Each test gets its own scratch directory.
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        # Drop the scratch directory and everything created inside it.
        shutil.rmtree(self.test_dir)

    def create_file(self, sub_path, content=""):
        """Write `content` to `sub_path` under the scratch directory.

        Missing parent folders are created. Returns the normalized relative
        path so it can be compared with the keys produced by f_657 on any
        platform.
        """
        target = os.path.join(self.test_dir, sub_path)
        parent = os.path.dirname(target)
        os.makedirs(parent, exist_ok=True)
        with open(target, 'w') as handle:
            handle.write(content)
        return os.path.normpath(sub_path)

    def test_non_existent(self):
        # A directory that does not exist must be rejected with ValueError.
        missing = os.path.join(self.test_dir, "non_existent")
        self.assertRaises(ValueError, f_657, missing)

    def test_empty_folder(self):
        # No text files at all yields an empty mapping.
        self.assertEqual(f_657(self.test_dir), {})

    def test_files_with_errors(self):
        # (relative path, file content, expected number of matches)
        cases = [
            ("1.txt", "error\nERROR\nErrOr", 3),
            ("subfolder1/2.txt", "", 0),
            ("subfolder2/3.txt", "error\nerror error", 3),
        ]
        expected = {}
        for rel_path, text, count in cases:
            expected[self.create_file(rel_path, text)] = count

        self.assertEqual(f_657(self.test_dir), expected)

    def test_case_sensitive_and_realistic_text(self):
        # Deeply nested file with mixed-case occurrences.
        rel_path = self.create_file(
            'nested/folder1/folder2/error_log.txt',
            'Error\nerror\nERROR',
        )
        self.assertEqual(f_657(self.test_dir), {rel_path: 3})

    def test_exact_word_matching(self):
        # Only the exact word 'error' counts; look-alikes such as 'errors'
        # or 'erro' must be ignored.
        cases = [
            ("file1.txt", "error error error", 3),
            ("subdir/file2.txt", "errors error erro errors", 1),
            ("subdir2/nested/file3.txt", "an error occurred", 1),
            ("subdir3/file4.txt", "no errors here", 0),
            # Case-insensitive: all three spellings count.
            ("subdir3/file5.txt", "Error and ERROR and error", 3),
        ]
        expected = {}
        for rel_path, text, count in cases:
            expected[self.create_file(rel_path, text)] = count

        self.assertEqual(f_657(self.test_dir), expected)


# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_tests()
Loading

0 comments on commit 7aab8ee

Please sign in to comment.