60 changes: 38 additions & 22 deletions cycode/cli/code_scanner.py
@@ -190,10 +190,32 @@ def scan_ci(context: click.Context) -> None:
@click.pass_context
def scan_path(context: click.Context, path: str) -> None:
logger.debug('Starting path scan process, %s', {'path': path})
files_to_scan = get_relevant_files_in_path(path=path, exclude_patterns=['**/.git/**', '**/.cycode/**'])
files_to_scan = exclude_irrelevant_files(context, files_to_scan)
logger.debug('Found all relevant files for scanning %s', {'path': path, 'file_to_scan_count': len(files_to_scan)})
scan_disk_files(context, path, files_to_scan)

progress_bar = context.obj['progress_bar']

all_files_to_scan = get_relevant_files_in_path(path=path, exclude_patterns=['**/.git/**', '**/.cycode/**'])

# we double the progress bar section length because the files are processed twice:
# the first pass builds the file list with respect to the exclude patterns (exclusion can take seconds to run),
# the second pass reads the files' content
progress_bar_section_len = len(all_files_to_scan) * 2
progress_bar.set_section_length(ProgressBarSection.PREPARE_LOCAL_FILES, progress_bar_section_len)

relevant_files_to_scan = exclude_irrelevant_files(context, all_files_to_scan)

# after the first pass (exclusion) finishes,
# we must update the progress bar section length with respect to the excluded files.
# we will no longer process 2x the file count, because some files were excluded,
# so we subtract the excluded files count
# from the progress bar section length
excluded_files_count = len(all_files_to_scan) - len(relevant_files_to_scan)
progress_bar_section_len = progress_bar_section_len - excluded_files_count
progress_bar.set_section_length(ProgressBarSection.PREPARE_LOCAL_FILES, progress_bar_section_len)

logger.debug(
'Found all relevant files for scanning %s', {'path': path, 'file_to_scan_count': len(relevant_files_to_scan)}
)
scan_disk_files(context, path, relevant_files_to_scan)
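
As a quick illustration of the accounting described in the comments above (the numbers are hypothetical):

# hypothetical numbers to illustrate the section-length bookkeeping
all_files_count = 1000      # files found on disk
excluded_files_count = 200  # files dropped by exclude_irrelevant_files

section_len = all_files_count * 2      # both passes counted up front -> 2000
section_len -= excluded_files_count    # excluded files never reach the second pass -> 1800
assert section_len == all_files_count + (all_files_count - excluded_files_count)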


@click.command(short_help='Use this command to scan the content that was not committed yet')
@@ -300,17 +322,15 @@ def scan_disk_files(context: click.Context, path: str, files_to_scan: List[str])

is_git_diff = False

progress_bar.set_section_length(ProgressBarSection.PREPARE_LOCAL_FILES, len(files_to_scan))

documents: List[Document] = []
for file in files_to_scan:
progress_bar.update(ProgressBarSection.PREPARE_LOCAL_FILES)

with open(file, 'r', encoding='UTF-8') as f:
try:
documents.append(Document(file, f.read(), is_git_diff))
except UnicodeDecodeError:
continue
content = get_file_content(file)
if not content:
continue

documents.append(Document(file, content, is_git_diff))

perform_pre_scan_documents_actions(context, scan_type, documents, is_git_diff)
scan_documents(context, documents, is_git_diff=is_git_diff, scan_parameters=scan_parameters)
@@ -826,12 +846,16 @@ def exclude_irrelevant_documents_to_scan(context: click.Context, documents_to_sc

def exclude_irrelevant_files(context: click.Context, filenames: List[str]) -> List[str]:
scan_type = context.obj['scan_type']
progress_bar = context.obj['progress_bar']

relevant_files = []
for filename in filenames:
progress_bar.update(ProgressBarSection.PREPARE_LOCAL_FILES)
if _is_relevant_file_to_scan(scan_type, filename):
relevant_files.append(filename)

is_sub_path.cache_clear() # free up memory

return relevant_files


@@ -1066,20 +1090,12 @@ def _is_file_extension_supported(scan_type: str, filename: str) -> bool:
filename = filename.lower()

if scan_type == consts.INFRA_CONFIGURATION_SCAN_TYPE:
return any(
filename.endswith(supported_file_extension)
for supported_file_extension in consts.INFRA_CONFIGURATION_SCAN_SUPPORTED_FILES
)
return filename.endswith(consts.INFRA_CONFIGURATION_SCAN_SUPPORTED_FILES)

if scan_type == consts.SCA_SCAN_TYPE:
return any(
filename.endswith(supported_file) for supported_file in consts.SCA_CONFIGURATION_SCAN_SUPPORTED_FILES
)
return filename.endswith(consts.SCA_CONFIGURATION_SCAN_SUPPORTED_FILES)

return all(
not filename.endswith(file_extension_to_ignore)
for file_extension_to_ignore in consts.SECRET_SCAN_FILE_EXTENSIONS_TO_IGNORE
)
return not filename.endswith(consts.SECRET_SCAN_FILE_EXTENSIONS_TO_IGNORE)


def _does_file_exceed_max_size_limit(filename: str) -> bool:
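The _is_file_extension_supported refactor above works because str.endswith accepts a tuple of suffixes, which is also why the constants in consts.py below are changed from lists to tuples. A small sketch (the file names are made up):

SUPPORTED_SUFFIXES = ('.tf', '.yaml', 'dockerfile')  # illustrative subset of the real constant

print('main.tf'.lower().endswith(SUPPORTED_SUFFIXES))    # True
print('readme.md'.lower().endswith(SUPPORTED_SUFFIXES))  # False
# passing a list instead of a tuple would raise TypeError, hence the tuples in consts.py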
12 changes: 6 additions & 6 deletions cycode/cli/consts.py
@@ -7,9 +7,9 @@
SCA_SCAN_TYPE = 'sca'
SAST_SCAN_TYPE = 'sast'

INFRA_CONFIGURATION_SCAN_SUPPORTED_FILES = ['.tf', '.tf.json', '.json', '.yaml', '.yml', 'dockerfile']
INFRA_CONFIGURATION_SCAN_SUPPORTED_FILES = ('.tf', '.tf.json', '.json', '.yaml', '.yml', 'dockerfile')

SECRET_SCAN_FILE_EXTENSIONS_TO_IGNORE = [
SECRET_SCAN_FILE_EXTENSIONS_TO_IGNORE = (
'.7z',
'.bmp',
'.bz2',
@@ -39,9 +39,9 @@
'.deb',
'.obj',
'.model',
]
)

SCA_CONFIGURATION_SCAN_SUPPORTED_FILES = [
SCA_CONFIGURATION_SCAN_SUPPORTED_FILES = (
'cargo.lock',
'cargo.toml',
'composer.json',
@@ -73,9 +73,9 @@
'pipfile.lock',
'requirements.txt',
'setup.py',
]
)

SCA_EXCLUDED_PATHS = ['node_modules']
SCA_EXCLUDED_PATHS = ('node_modules',)
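
Note the trailing comma in the single-element tuple above; without it, the parentheses are only grouping and the value stays a plain string. A quick illustration:

not_a_tuple = ('node_modules')   # this is just the str 'node_modules'
a_tuple = ('node_modules',)      # this is a 1-element tuple
print(type(not_a_tuple).__name__, type(a_tuple).__name__)  # str tuple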

PROJECT_FILES_BY_ECOSYSTEM_MAP = {
'crates': ['Cargo.lock', 'Cargo.toml'],
2 changes: 2 additions & 0 deletions cycode/cli/user_settings/configuration_manager.py
@@ -1,4 +1,5 @@
import os
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
from uuid import uuid4
@@ -61,6 +62,7 @@ def get_verbose_flag_from_environment_variables(self) -> bool:
value = self._get_value_from_environment_variables(consts.VERBOSE_ENV_VAR_NAME, '')
return value.lower() in ('true', '1')

@lru_cache(maxsize=None) # noqa: B019
def get_exclusions_by_scan_type(self, scan_type: str) -> Dict:
local_exclusions = self.local_config_file_manager.get_exclusions_by_scan_type(scan_type)
global_exclusions = self.global_config_file_manager.get_exclusions_by_scan_type(scan_type)
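
For context, lru_cache on a method caches per (instance, arguments) pair, which is exactly what flake8-bugbear flags with B019 (the cache keeps a reference to self alive). A minimal sketch of the pattern, using an illustrative class rather than the real ConfigurationManager:

from functools import lru_cache

class ConfigReader:  # hypothetical stand-in for the real configuration manager
    @lru_cache(maxsize=None)  # noqa: B019 - the cache holds a reference to self
    def get_exclusions(self, scan_type: str) -> dict:
        print(f'expensive read for {scan_type}')  # runs only once per (instance, scan_type)
        return {'paths': (), 'values': ()}

reader = ConfigReader()
reader.get_exclusions('secret')  # performs the read and caches the result
reader.get_exclusions('secret')  # returned straight from the cache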
31 changes: 20 additions & 11 deletions cycode/cli/utils/path_utils.py
@@ -1,27 +1,31 @@
import os
from pathlib import Path
from typing import AnyStr, Generator, Iterable, List, Optional
from functools import lru_cache
from typing import AnyStr, Iterable, List, Optional

import pathspec
from binaryornot.check import is_binary


def get_relevant_files_in_path(path: str, exclude_patterns: Iterable[str]) -> List[str]:
absolute_path = get_absolute_path(path)

if not os.path.isfile(absolute_path) and not os.path.isdir(absolute_path):
raise FileNotFoundError(f'the specified path was not found, path: {path}')
raise FileNotFoundError(f'the specified path was not found, path: {absolute_path}')

if os.path.isfile(absolute_path):
return [absolute_path]

directory_files_paths = _get_all_existing_files_in_directory(absolute_path)
file_paths = set({str(file_path) for file_path in directory_files_paths})
spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, exclude_patterns)
exclude_file_paths = set(spec.match_files(file_paths))
all_file_paths = set(_get_all_existing_files_in_directory(absolute_path))

path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, exclude_patterns)
excluded_file_paths = set(path_spec.match_files(all_file_paths))

relevant_file_paths = all_file_paths - excluded_file_paths

return [file_path for file_path in (file_paths - exclude_file_paths) if os.path.isfile(file_path)]
return [file_path for file_path in relevant_file_paths if os.path.isfile(file_path)]
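
A minimal sketch of the gitignore-style matching used above (the file paths here are made up):

import pathspec

exclude_patterns = ['**/.git/**', '**/.cycode/**']  # same patterns scan_path passes in

path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, exclude_patterns)

all_files = {'repo/.git/config', 'repo/src/app.py', 'repo/.cycode/config.yaml'}
excluded = set(path_spec.match_files(all_files))  # {'repo/.git/config', 'repo/.cycode/config.yaml'}

print(all_files - excluded)  # {'repo/src/app.py'}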


@lru_cache(maxsize=None)
def is_sub_path(path: str, sub_path: str) -> bool:
try:
common_path = os.path.commonpath([get_absolute_path(path), get_absolute_path(sub_path)])
@@ -49,9 +53,14 @@ def get_path_by_os(filename: str) -> str:
return filename.replace('/', os.sep)


def _get_all_existing_files_in_directory(path: str) -> Generator[Path, None, None]:
directory = Path(path)
return directory.rglob(r'*')
def _get_all_existing_files_in_directory(path: str) -> List[str]:
files: List[str] = []

for root, _, filenames in os.walk(path):
for filename in filenames:
files.append(os.path.join(root, filename))

return files


def is_path_exists(path: str) -> bool:
3 changes: 0 additions & 3 deletions cycode/cli/utils/progress_bar.py
@@ -40,9 +40,6 @@ class ProgressBarSectionInfo(NamedTuple):
ProgressBarSection.PREPARE_LOCAL_FILES: ProgressBarSectionInfo(
ProgressBarSection.PREPARE_LOCAL_FILES, 'Prepare local files', start_percent=0, stop_percent=5
),
# TODO(MarshalX): could be added in the future
# ProgressBarSection.UPLOAD_FILES: ProgressBarSectionInfo(
# ),
ProgressBarSection.SCAN: ProgressBarSectionInfo(
ProgressBarSection.SCAN, 'Scan in progress', start_percent=5, stop_percent=95
),
5 changes: 5 additions & 0 deletions tests/conftest.py
@@ -17,6 +17,11 @@
TEST_FILES_PATH = Path(__file__).parent.joinpath('test_files').absolute()


@pytest.fixture(scope='session')
def test_files_path() -> Path:
return TEST_FILES_PATH


@pytest.fixture(scope='session')
def scan_client() -> ScanClient:
return create_scan_client(_CLIENT_ID, _CLIENT_SECRET, hide_response_log=False)
1 change: 1 addition & 0 deletions tests/test_files/.test_env
@@ -0,0 +1 @@
TELEGRAM_BOT_TOKEN=923445010:AAGWKwWTNx_6RAuRdcp2kWax5_JltwkF2Lw
94 changes: 94 additions & 0 deletions tests/test_performance_get_all_files.py
@@ -0,0 +1,94 @@
import glob
import logging
import os
import timeit
from pathlib import Path
from typing import Dict, List, Tuple, Union

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def filter_files(paths: List[Union[Path, str]]) -> List[str]:
return [str(path) for path in paths if os.path.isfile(path)]


def get_all_files_glob(path: Union[Path, str]) -> List[str]:
# DOESN'T RETURN HIDDEN FILES, SO IT CAN'T BE USED
# and it doesn't show the best performance either
if not str(path).endswith(os.sep):
path = f'{path}{os.sep}'

return filter_files(glob.glob(f'{path}**', recursive=True))


def get_all_files_walk(path: str) -> List[str]:
files = []

for root, _, filenames in os.walk(path):
for filename in filenames:
files.append(os.path.join(root, filename))

return files


def get_all_files_listdir(path: str) -> List[str]:
files = []

def _(sub_path: str) -> None:
items = os.listdir(sub_path)

for item in items:
item_path = os.path.join(sub_path, item)

if os.path.isfile(item_path):
files.append(item_path)
elif os.path.isdir(item_path):
_(item_path)

_(path)
return files


def get_all_files_rglob(path: str) -> List[str]:
return filter_files(list(Path(path).rglob(r'*')))


def test_get_all_files_performance(test_files_path: str) -> None:
results: Dict[str, Tuple[int, float]] = {}
for func in {
get_all_files_rglob,
get_all_files_listdir,
get_all_files_walk,
}:
name = func.__name__
start_time = timeit.default_timer()

files_count = len(func(test_files_path))

executed_time = timeit.default_timer() - start_time
results[name] = (files_count, executed_time)

logger.info(f'Time result {name}: {executed_time}')
logger.info(f'Files count {name}: {files_count}')

files_counts = [result[0] for result in results.values()]
assert len(set(files_counts)) == 1 # all should be equal

logger.info(f'Benchmark TOP with ({files_counts[0]}) files:')
for func_name, result in sorted(results.items(), key=lambda x: x[1][1]):
logger.info(f'- {func_name}: {result[1]}')

# according to my (MarshalX) local tests, the fastest is get_all_files_walk


if __name__ == '__main__':
# provide a path with thousands of files
huge_dir_path = '/Users/ilyasiamionau/projects/cycode/'
test_get_all_files_performance(huge_dir_path)

# Output:
# INFO:__main__:Benchmark TOP with (94882) files:
# INFO:__main__:- get_all_files_walk: 0.717258458
# INFO:__main__:- get_all_files_listdir: 1.4648628330000002
# INFO:__main__:- get_all_files_rglob: 2.368291458