- 
                Notifications
    You must be signed in to change notification settings 
- Fork 1.1k
Commit 57cadf8
⚡️ Speed up function 
### 📄 111% (1.11x) speedup for ***`check_for_nltk_package` in
`unstructured/nlp/tokenize.py`***
⏱️ Runtime : **`57.7 milliseconds`** **→** **`27.3 milliseconds`** (best
of `101` runs)
### 📝 Explanation and details
Here’s an optimized version of your program. The main improvements are:
- Eliminates the unnecessary list and loop for constructing `paths`;
instead, uses a generator expression so memory is not allocated for an
intermediate list.
- Uses `os.path.join` only if needed, otherwise leaves the original
path.
- Caches the result by using a local variable within the function
instead of constructing the list first.
- Overall reduced allocations & faster iteration.
- Avoid creating and storing a full list with potentially many paths,
instead lazily generate them as needed by `nltk.find`.
This is as fast as possible, given the external dependencies (nltk’s own
`find()` algorithm).
✅ **Correctness verification report:**
| Test                        | Status            |
| --------------------------- | ----------------- |
| ⚙️ Existing Unit Tests | 🔘 **None Found** |
| 🌀 Generated Regression Tests | ✅ **796 Passed** |
| ⏪ Replay Tests | ✅ **8 Passed** |
| 🔎 Concolic Coverage Tests | 🔘 **None Found** |
|📊 Tests Coverage       | 100.0% |
<details>
<summary>🌀 Generated Regression Tests and Runtime</summary>
```python
from __future__ import annotations
import os
import shutil
import tempfile
import nltk
# imports
import pytest  # used for our unit tests
from unstructured.nlp.tokenize import check_for_nltk_package
# unit tests
# -------------------
# Basic Test Cases
# -------------------
def test_existing_corpus():
    # Probe a standard tokenizer model ('punkt') that is usually present
    # when nltk_data is installed.
    codeflash_output = check_for_nltk_package('punkt', 'tokenizers') # 117μs -> 76.8μs (53.7% faster)
    # 'punkt' may or may not be installed in this environment, so only the
    # return type is pinned; both True and False are acceptable outcomes.
    assert isinstance(codeflash_output, bool)
def test_nonexistent_package():
    # A package name that exists nowhere must never be reported as present.
    codeflash_output = check_for_nltk_package('nonexistent_package_xyz', 'corpora') # 100μs -> 59.6μs (68.8% faster)
    assert codeflash_output is False
def test_existing_wordnet_corpus():
    # Test with a common corpus
    codeflash_output = check_for_nltk_package('wordnet', 'corpora') # 97.5μs -> 55.7μs (75.2% faster)
def test_existing_stopwords():
    # Test with another common corpus
    codeflash_output = check_for_nltk_package('stopwords', 'corpora') # 96.0μs -> 55.3μs (73.6% faster)
# -------------------
# Edge Test Cases
# -------------------
def test_empty_package_name():
    # Empty package name should not be found
    codeflash_output = check_for_nltk_package('', 'corpora') # 99.5μs -> 57.4μs (73.3% faster)
def test_empty_package_category():
    # Empty category should not be found
    codeflash_output = check_for_nltk_package('punkt', '') # 98.4μs -> 56.2μs (75.2% faster)
def test_empty_both():
    # With both the package and the category empty there is nothing to
    # resolve, so the lookup must fail.
    codeflash_output = check_for_nltk_package('', '') # 18.1μs -> 19.3μs (5.86% slower)
    assert codeflash_output is False
def test_special_characters_in_name():
    # Special characters in package name should not be found
    codeflash_output = check_for_nltk_package('!@#$%^&*()', 'corpora') # 119μs -> 72.4μs (65.1% faster)
def test_special_characters_in_category():
    # Special characters in category should not be found
    codeflash_output = check_for_nltk_package('punkt', '!!!') # 96.8μs -> 56.3μs (71.9% faster)
def test_case_sensitivity():
    # NLTK is case-sensitive, so wrong case should not be found
    codeflash_output = check_for_nltk_package('PUNKT', 'tokenizers') # 96.5μs -> 55.9μs (72.6% faster)
def test_path_without_nltk_data():
    # Simulate a search-path entry that does NOT end with 'nltk_data';
    # the function is expected to append 'nltk_data' itself and still
    # locate the package placed underneath it.
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a fake nltk_data/tokenizers/punkt layout inside tmpdir.
        nltk_data_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
        os.makedirs(nltk_data_dir)
        # Place a dummy file standing in for the 'punkt' package.
        with open(os.path.join(nltk_data_dir, 'punkt'), 'w') as f:
            f.write('dummy')
        # Temporarily prepend tmpdir (no 'nltk_data' suffix) to the search path.
        orig_paths = list(nltk.data.path)
        nltk.data.path.insert(0, tmpdir)
        try:
            # The package must now be discoverable via the adjusted path.
            codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
            assert codeflash_output is True
        finally:
            nltk.data.path = orig_paths
def test_path_with_nltk_data():
    # Simulate a search-path entry that already ends with 'nltk_data';
    # the function must use it as-is and find the package.
    with tempfile.TemporaryDirectory() as tmpdir:
        nltk_data_dir = os.path.join(tmpdir, 'nltk_data')
        tokenizers_dir = os.path.join(nltk_data_dir, 'tokenizers')
        os.makedirs(tokenizers_dir)
        with open(os.path.join(tokenizers_dir, 'punkt'), 'w') as f:
            f.write('dummy')
        orig_paths = list(nltk.data.path)
        nltk.data.path.insert(0, nltk_data_dir)
        try:
            codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
            assert codeflash_output is True
        finally:
            nltk.data.path = orig_paths
def test_oserror_on_invalid_path(monkeypatch):
    # Point nltk.data.path at a single directory that cannot exist; the
    # function should swallow the lookup failure and return False rather
    # than raising.  Using monkeypatch (previously requested but unused)
    # both isolates the search path and restores it automatically.
    monkeypatch.setattr(nltk.data, 'path', ['/nonexistent_dir_xyz_123'])
    codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
    assert codeflash_output is False
def test_unicode_package_name():
    # Unicode in package name should not be found
    codeflash_output = check_for_nltk_package('punkté', 'tokenizers') # 108μs -> 64.8μs (66.7% faster)
def test_unicode_category_name():
    # Unicode in category name should not be found
    codeflash_output = check_for_nltk_package('punkt', 'tokenizersé') # 102μs -> 59.0μs (73.0% faster)
# -------------------
# Large Scale Test Cases
# -------------------
def test_large_number_of_paths():
    # Simulate a large number of nltk.data.path entries, with the real
    # package hidden behind many empty directories.
    orig_paths = list(nltk.data.path)
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create many fake paths; only the last one contains the package.
            fake_paths = []
            for i in range(100):
                fake_dir = os.path.join(tmpdir, f"fake_{i}")
                os.makedirs(fake_dir)
                fake_paths.append(fake_dir)
            # Add the real one at the end.
            real_dir = os.path.join(tmpdir, 'real_nltk_data', 'tokenizers')
            os.makedirs(real_dir)
            with open(os.path.join(real_dir, 'punkt'), 'w') as f:
                f.write('dummy')
            nltk.data.path[:] = fake_paths + [os.path.join(tmpdir, 'real_nltk_data')]
            # Must still find the package despite the many misses first.
            codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
            assert codeflash_output is True
    finally:
        # Restore the global search path even if the assertion fails.
        nltk.data.path = orig_paths
def test_large_number_of_missing_packages():
    # Test that all missing packages are not found efficiently
    for i in range(100):
        codeflash_output = check_for_nltk_package(f'nonexistent_pkg_{i}', 'corpora')
def test_large_number_of_categories():
    # Test many different categories, all missing
    for i in range(100):
        codeflash_output = check_for_nltk_package('punkt', f'category_{i}')
def test_many_paths_with_some_invalid():
    # Mix nonexistent path entries with one valid nltk_data directory;
    # the invalid entries must be skipped without error.
    orig_paths = list(nltk.data.path)
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            valid_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
            os.makedirs(valid_dir)
            with open(os.path.join(valid_dir, 'punkt'), 'w') as f:
                f.write('dummy')
            fake_paths = [f'/nonexistent_{i}' for i in range(50)]
            nltk.data.path[:] = fake_paths + [os.path.join(tmpdir, 'nltk_data')]
            codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
            assert codeflash_output is True
    finally:
        # Restore the global search path even if the assertion fails.
        nltk.data.path = orig_paths
def test_performance_many_checks():
    # Performance-style check: the same valid package queried repeatedly
    # must succeed every time.
    with tempfile.TemporaryDirectory() as tmpdir:
        nltk_data_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
        os.makedirs(nltk_data_dir)
        with open(os.path.join(nltk_data_dir, 'punkt'), 'w') as f:
            f.write('dummy')
        orig_paths = list(nltk.data.path)
        nltk.data.path.insert(0, os.path.join(tmpdir, 'nltk_data'))
        try:
            for _ in range(100):
                codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
                assert codeflash_output is True
        finally:
            nltk.data.path = orig_paths
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
from __future__ import annotations
import os
import nltk
# imports
import pytest  # used for our unit tests
from unstructured.nlp.tokenize import check_for_nltk_package
# unit tests
# ----------- BASIC TEST CASES -----------
def test_existing_corpus_package():
    # Test with a commonly available corpus package, e.g., 'punkt'
    # Should return True if 'punkt' is installed
    codeflash_output = check_for_nltk_package('punkt', 'tokenizers'); result = codeflash_output # 110μs -> 66.0μs (68.2% faster)
def test_nonexistent_package_returns_false():
    # The test name promises a False check — make it actually assert it.
    codeflash_output = check_for_nltk_package('not_a_real_package', 'corpora') # 100μs -> 59.0μs (70.2% faster)
    assert codeflash_output is False
def test_existing_grammar_package():
    # Test with a grammar package that may exist
    codeflash_output = check_for_nltk_package('sample_grammar', 'grammars'); result = codeflash_output # 98.2μs -> 56.2μs (74.8% faster)
def test_existing_corpus_category():
    # Test with a corpus that is often installed by default
    codeflash_output = check_for_nltk_package('words', 'corpora'); result = codeflash_output # 96.9μs -> 55.1μs (75.8% faster)
def test_existing_stemmer_package():
    # Test for a stemmer package
    codeflash_output = check_for_nltk_package('porter.pickle', 'stemmers'); result = codeflash_output # 98.0μs -> 55.3μs (77.2% faster)
# ----------- EDGE TEST CASES -----------
def test_empty_package_name():
    # Test with empty package name
    codeflash_output = check_for_nltk_package('', 'corpora') # 99.0μs -> 57.0μs (73.9% faster)
def test_empty_category_name():
    # Test with empty category name
    codeflash_output = check_for_nltk_package('punkt', '') # 96.7μs -> 54.9μs (76.1% faster)
def test_both_empty():
    # Test with both package and category names empty
    codeflash_output = check_for_nltk_package('', '') # 18.1μs -> 19.4μs (6.87% slower)
def test_package_name_with_special_characters():
    # Special characters in the package name must not resolve to a package.
    # NOTE(review): the original line had an unterminated string literal
    # ("'!@#, 'corpora')") — a SyntaxError; reconstructed to match the
    # sibling special-character test in the first module.
    codeflash_output = check_for_nltk_package('!@#$%^&*()', 'corpora')
    assert codeflash_output is False
def test_category_name_with_special_characters():
    # Special characters in the category name must not resolve.
    # NOTE(review): closing quote restored — the original "'!@#)" was an
    # unterminated string literal (SyntaxError).
    codeflash_output = check_for_nltk_package('punkt', '!@#')
    assert codeflash_output is False
def test_package_name_with_path_traversal():
    # Test with directory traversal in package name
    codeflash_output = check_for_nltk_package('../punkt', 'tokenizers') # 63.7μs -> 44.7μs (42.5% faster)
def test_category_name_with_path_traversal():
    # Test with directory traversal in category name
    codeflash_output = check_for_nltk_package('punkt', '../tokenizers') # 178μs -> 75.5μs (137% faster)
def test_case_sensitivity():
    # NLTK lookups are path-based, so on a case-sensitive filesystem the
    # wrong-cased name should not be found.  On case-insensitive
    # filesystems (macOS/Windows defaults) 'Punkt' may legitimately match,
    # so only the return types are pinned unconditionally.
    codeflash_output = check_for_nltk_package('punkt', 'tokenizers'); result_lower = codeflash_output # 95.6μs -> 54.0μs (77.0% faster)
    codeflash_output = check_for_nltk_package('Punkt', 'tokenizers'); result_upper = codeflash_output # 81.4μs -> 41.5μs (96.2% faster)
    # Replaces the original dead `if result_lower: pass` with real checks.
    assert isinstance(result_lower, bool)
    assert isinstance(result_upper, bool)
def test_leading_trailing_spaces():
    # Leading/trailing spaces should not resolve to a valid package
    codeflash_output = check_for_nltk_package(' punkt ', 'tokenizers') # 96.2μs -> 54.0μs (78.2% faster)
    codeflash_output = check_for_nltk_package('punkt', ' tokenizers ') # 82.0μs -> 42.2μs (94.3% faster)
def test_numeric_package_and_category():
    # Numeric names are very unlikely to exist
    codeflash_output = check_for_nltk_package('12345', '67890') # 93.6μs -> 53.1μs (76.4% faster)
def test_package_name_with_unicode():
    # Test with unicode characters in package name
    codeflash_output = check_for_nltk_package('😀', 'corpora') # 110μs -> 66.9μs (64.6% faster)
def test_category_name_with_unicode():
    # Test with unicode characters in category name
    codeflash_output = check_for_nltk_package('punkt', '😀') # 103μs -> 60.1μs (72.3% faster)
def test_package_and_category_with_long_names():
    # Very long names should not exist and should not cause errors
    long_name = 'a' * 255
    codeflash_output = check_for_nltk_package(long_name, long_name) # 127μs -> 79.0μs (61.1% faster)
def test_package_and_category_with_slashes():
    # Slashes in names should not resolve to valid packages
    codeflash_output = check_for_nltk_package('punkt/other', 'tokenizers') # 125μs -> 62.9μs (99.4% faster)
    codeflash_output = check_for_nltk_package('punkt', 'tokenizers/other') # 108μs -> 47.8μs (127% faster)
# ----------- LARGE SCALE TEST CASES -----------
def test_large_number_of_nonexistent_packages():
    # Test performance/scalability with many non-existent packages
    for i in range(100):
        name = f"not_a_real_package_{i}"
        codeflash_output = check_for_nltk_package(name, 'corpora')
def test_large_number_of_nonexistent_categories():
    # Test performance/scalability with many non-existent categories
    for i in range(100):
        cat = f"not_a_real_category_{i}"
        codeflash_output = check_for_nltk_package('punkt', cat)
def test_large_number_of_random_combinations():
    # Test a large number of random package/category combinations
    for i in range(100):
        pkg = f"pkg_{i}"
        cat = f"cat_{i}"
        codeflash_output = check_for_nltk_package(pkg, cat)
def test_large_scale_existing_and_nonexisting():
    # Mix of likely existing and non-existing packages
    likely_existing = ['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger']
    for pkg in likely_existing:
        codeflash_output = check_for_nltk_package(pkg, 'corpora'); result = codeflash_output # 74.8μs -> 34.1μs (119% faster)
    # Now add a batch of non-existing ones
    for i in range(50):
        codeflash_output = check_for_nltk_package(f"noexist_{i}", 'corpora')
def test_large_scale_edge_cases():
    # Edge-like names in large scale
    for i in range(50):
        weird_name = f"../noexist_{i}"
        codeflash_output = check_for_nltk_package(weird_name, 'corpora')
        codeflash_output = check_for_nltk_package('punkt', weird_name)
# ----------- DETERMINISM AND TYPE TESTS -----------
def test_return_type_is_bool():
    # The function should always return a bool, regardless of input.
    # The original loop body was `pass`, so the advertised type check
    # never ran — perform it for every input pair.
    inputs = [
        ('punkt', 'tokenizers'),
        ('not_a_real_package', 'corpora'),
        ('', ''),
        ('😀', '😀'),
        ('../punkt', 'tokenizers'),
        ('punkt', '../tokenizers'),
    ]
    for pkg, cat in inputs:
        assert isinstance(check_for_nltk_package(pkg, cat), bool)
def test_function_is_deterministic():
    # Two identical calls must agree — the original captured both results
    # but never compared them.
    pkg, cat = 'punkt', 'tokenizers'
    codeflash_output = check_for_nltk_package(pkg, cat); result1 = codeflash_output # 105μs -> 57.4μs (83.5% faster)
    codeflash_output = check_for_nltk_package(pkg, cat); result2 = codeflash_output # 81.0μs -> 41.0μs (97.6% faster)
    assert result1 == result2
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
</details>
To edit these changes `git checkout
codeflash/optimize-check_for_nltk_package-mcftixl5` and push.
[](https://codeflash.ai)
---------
Signed-off-by: Saurabh Misra <misra.saurabh1@gmail.com>
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>

⚡️ Speed up function `check_for_nltk_package` by 111% (#4081)

1 parent cc635c9 · commit 57cadf8 · Copy full SHA for 57cadf8
File tree
Expand file treeCollapse file tree
2 files changed
+6
-6
lines changedOpen diff view settings
Filter options
- unstructured/nlp
Expand file treeCollapse file tree
2 files changed
+6
-6
lines changedOpen diff view settings
Collapse file
+1Lines changed: 1 addition & 0 deletions
- Display the source diff
- Display the rich diff
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
|  | |||
| 2 | 2 |  | |
| 3 | 3 |  | |
| 4 | 4 |  | |
|  | 5 | + | |
| 5 | 6 |  | |
| 6 | 7 |  | |
| 7 | 8 |  | |
|  | |||
Collapse file
unstructured/nlp/tokenize.py
Copy file name to clipboardExpand all lines: unstructured/nlp/tokenize.py+5-6Lines changed: 5 additions & 6 deletions
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
|  | |||
| 14 | 14 |  | |
| 15 | 15 |  | |
| 16 | 16 |  | |
| 17 |  | - | |
| 18 |  | - | |
| 19 |  | - | |
| 20 |  | - | |
| 21 |  | - | |
|  | 17 | + | |
|  | 18 | + | |
|  | 19 | + | |
|  | 20 | + | |
| 22 | 21 |  | |
| 23 | 22 |  | |
| 24 |  | - | |
|  | 23 | + | |
| 25 | 24 |  | |
| 26 | 25 |  | |
| 27 | 26 |  | |
|  | |||
0 commit comments