From 284a558303d73b4a1753b3174640b61cdc04dbe6 Mon Sep 17 00:00:00 2001 From: Devin Kenneth Gibson Date: Wed, 24 Jan 2024 22:20:07 -0500 Subject: [PATCH] Add new search scheduler implementation for improved simplicity and robustness. (#240) --- .../scripts/native/search.py | 128 +------- .../clp_package_utils/scripts/start_clp.py | 164 +++++++++- .../clp_package_utils/scripts/stop_clp.py | 11 + .../clp-py-utils/clp_py_utils/clp_config.py | 32 ++ .../clp-py-utils/clp_py_utils/clp_logging.py | 40 +++ .../clp_py_utils/create-db-tables.py | 6 + .../initialize-search-scheduler-db.py | 46 +++ components/core/src/clp/clo/CMakeLists.txt | 2 - .../core/src/clp/clo/CommandLineArguments.cpp | 25 +- .../core/src/clp/clo/CommandLineArguments.hpp | 6 - .../clp/clo/ControllerMonitoringThread.cpp | 47 --- .../clp/clo/ControllerMonitoringThread.hpp | 31 -- components/core/src/clp/clo/clo.cpp | 144 +-------- .../executor/celeryconfig.py | 2 - .../executor/search/__init__.py | 0 .../executor/search/celery.py | 5 + .../executor/search/celeryconfig.py | 32 ++ .../executor/search/fs_search_task.py | 103 +++++++ .../job_orchestration/executor/search_task.py | 4 +- .../job_orchestration/job_config.py | 8 +- .../search_scheduler/__init__.py | 0 .../search_scheduler/common.py | 24 ++ .../search_scheduler/search_scheduler.py | 283 ++++++++++++++++++ components/job-orchestration/pyproject.toml | 4 + .../package-template/src/etc/clp-config.yml | 6 + 25 files changed, 787 insertions(+), 366 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/clp_logging.py create mode 100644 components/clp-py-utils/clp_py_utils/initialize-search-scheduler-db.py delete mode 100644 components/core/src/clp/clo/ControllerMonitoringThread.cpp delete mode 100644 components/core/src/clp/clo/ControllerMonitoringThread.hpp create mode 100644 components/job-orchestration/job_orchestration/executor/search/__init__.py create mode 100644 components/job-orchestration/job_orchestration/executor/search/celery.py create mode 100644 components/job-orchestration/job_orchestration/executor/search/celeryconfig.py create mode 100644 components/job-orchestration/job_orchestration/executor/search/fs_search_task.py create mode 100644 components/job-orchestration/job_orchestration/search_scheduler/__init__.py create mode 100644 components/job-orchestration/job_orchestration/search_scheduler/common.py create mode 100644 components/job-orchestration/job_orchestration/search_scheduler/search_scheduler.py diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/search.py b/components/clp-package-utils/clp_package_utils/scripts/native/search.py index b7bb77f8d..bca76974c 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/search.py @@ -2,18 +2,15 @@ import argparse import asyncio -import datetime import logging import multiprocessing import pathlib -import socket import sys import time from contextlib import closing import msgpack import pymongo -import zstandard from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -21,13 +18,13 @@ get_clp_home ) from clp_py_utils.clp_config import ( - CLP_METADATA_TABLE_PREFIX, + SEARCH_JOBS_TABLE_NAME, Database, ResultsCache ) from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.job_config import SearchConfig -from job_orchestration.scheduler.constants import JobStatus +from job_orchestration.search_scheduler.common import JobStatus # Setup logging # 
Create logger @@ -75,81 +72,31 @@ def process_error_callback(err): def create_and_monitor_job_in_db(db_config: Database, results_cache: ResultsCache, wildcard_query: str, begin_timestamp: int | None, - end_timestamp: int | None, path_filter: str, - search_controller_host: str, search_controller_port: int): + end_timestamp: int | None, path_filter: str): search_config = SearchConfig( - search_controller_host=search_controller_host, - search_controller_port=search_controller_port, - wildcard_query=wildcard_query, + query_string=wildcard_query, begin_timestamp=begin_timestamp, end_timestamp=end_timestamp, path_filter=path_filter ) sql_adapter = SQL_Adapter(db_config) - zstd_cctx = zstandard.ZstdCompressor(level=3) with closing(sql_adapter.create_connection(True)) as \ db_conn, closing(db_conn.cursor(dictionary=True)) as db_cursor: # Create job - db_cursor.execute(f"INSERT INTO `search_jobs` (`search_config`) VALUES (%s)", - (zstd_cctx.compress(msgpack.packb(search_config.dict())),)) + db_cursor.execute(f"INSERT INTO `{SEARCH_JOBS_TABLE_NAME}` (`search_config`) VALUES (%s)", + (msgpack.packb(search_config.dict()),)) db_conn.commit() job_id = db_cursor.lastrowid - next_pagination_id = 0 - pagination_limit = 64 - num_tasks_added = 0 - query_base_conditions = [] - if begin_timestamp is not None: - query_base_conditions.append(f"`end_timestamp` >= {begin_timestamp}") - if end_timestamp is not None: - query_base_conditions.append(f"`begin_timestamp` <= {end_timestamp}") - while True: - # Get next `limit` rows - query_conditions = query_base_conditions + [f"`pagination_id` >= {next_pagination_id}"] - query = f""" - SELECT `id` FROM {CLP_METADATA_TABLE_PREFIX}archives - WHERE {" AND ".join(query_conditions)} - LIMIT {pagination_limit} - """ - db_cursor.execute(query) - rows = db_cursor.fetchall() - if len(rows) == 0: - break - - # Insert tasks - db_cursor.execute(f""" - INSERT INTO `search_tasks` (`job_id`, `archive_id`, `scheduled_time`) - VALUES ({"), (".join(f"{job_id}, '{row['id']}', '{datetime.datetime.utcnow()}'" for row in rows)}) - """) - db_conn.commit() - num_tasks_added += len(rows) - - if len(rows) < pagination_limit: - # Less than limit rows returned, so there are no more rows - break - next_pagination_id += pagination_limit - - # Mark job as scheduled - db_cursor.execute(f""" - UPDATE `search_jobs` - SET num_tasks={num_tasks_added}, status = '{JobStatus.SCHEDULED}' - WHERE id = {job_id} - """) - db_conn.commit() - # Wait for the job to be marked complete - job_complete = False - while not job_complete: - db_cursor.execute(f"SELECT `status`, `status_msg` FROM `search_jobs` WHERE `id` = {job_id}") + while True: + db_cursor.execute(f"SELECT `status` FROM `{SEARCH_JOBS_TABLE_NAME}` WHERE `id` = {job_id}") # There will only ever be one row since it's impossible to have more than one job with the same ID - row = db_cursor.fetchall()[0] - if JobStatus.SUCCEEDED == row['status']: - job_complete = True - elif JobStatus.FAILED == row['status']: - logger.error(row['status_msg']) - job_complete = True + new_status = db_cursor.fetchall()[0]['status'] db_conn.commit() + if new_status in (JobStatus.SUCCESS, JobStatus.FAILED, JobStatus.CANCELLED): + break time.sleep(0.5) @@ -159,50 +106,17 @@ def create_and_monitor_job_in_db(db_config: Database, results_cache: ResultsCach print(f"{document['original_path']}: {document['message']}", end='') -async def worker_connection_handler(reader: asyncio.StreamReader, writer: asyncio.StreamWriter): - try: - buf = await reader.read(1024) - if b'' == buf: - # Worker 
closed - return - except asyncio.CancelledError: - return - finally: - writer.close() - - async def do_search(db_config: Database, results_cache: ResultsCache, wildcard_query: str, - begin_timestamp: int | None, end_timestamp: int | None, path_filter: str, host: str): - # Start a server - try: - server = await asyncio.start_server(client_connected_cb=worker_connection_handler, host=host, port=0, - family=socket.AF_INET) - except asyncio.CancelledError: - # Search cancelled - return - port = server.sockets[0].getsockname()[1] - - server_task = asyncio.ensure_future(server.serve_forever()) - + begin_timestamp: int | None, end_timestamp: int | None, path_filter: str): db_monitor_task = asyncio.ensure_future( run_function_in_process(create_and_monitor_job_in_db, db_config, results_cache, wildcard_query, - begin_timestamp, end_timestamp, path_filter, host, port)) + begin_timestamp, end_timestamp, path_filter)) # Wait for the job to complete or an error to occur - pending = [server_task, db_monitor_task] try: - done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) - if db_monitor_task in done: - server.close() - await server.wait_closed() - else: - logger.error("server task unexpectedly returned") - db_monitor_task.cancel() - await db_monitor_task - except asyncio.CancelledError: - server.close() - await server.wait_closed() await db_monitor_task + except asyncio.CancelledError: + pass def main(argv): @@ -237,18 +151,8 @@ def main(argv): logger.exception("Failed to load config.") return -1 - # Get IP of local machine - host_ip = None - for ip in set(socket.gethostbyname_ex(socket.gethostname())[2]): - host_ip = ip - break - if host_ip is None: - logger.error("Could not determine IP of local machine.") - return -1 - asyncio.run(do_search(clp_config.database, clp_config.results_cache, parsed_args.wildcard_query, - parsed_args.begin_time, parsed_args.end_time, parsed_args.file_path, - host_ip)) + parsed_args.begin_time, parsed_args.end_time, parsed_args.file_path)) return 0 diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index fda5e4889..e2c41b06d 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -11,6 +11,8 @@ import yaml +from pydantic import BaseModel + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, CONTAINER_CLP_HOME, @@ -35,6 +37,8 @@ QUEUE_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME, + SEARCH_SCHEDULER_COMPONENT_NAME, + SEARCH_WORKER_COMPONENT_NAME, WORKER_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -330,6 +334,153 @@ def start_scheduler(instance_id: str, clp_config: CLPConfig, container_clp_confi logger.info(f"Started {SCHEDULER_COMPONENT_NAME}.") +def start_search_scheduler(instance_id: str, clp_config: CLPConfig, container_clp_config: CLPConfig, + mounts: CLPDockerMounts): + component_name = SEARCH_SCHEDULER_COMPONENT_NAME + logger.info(f"Starting {component_name}...") + + container_name = f'clp-{component_name}-{instance_id}' + if container_exists(container_name): + logger.info(f"{SEARCH_SCHEDULER_COMPONENT_NAME} already running.") + return + + container_config_filename = f'{container_name}.yml' + container_config_file_path = clp_config.logs_directory / container_config_filename + with open(container_config_file_path, 'w') as f: + 
yaml.safe_dump(container_clp_config.dump_to_primitive_dict(), f) + + logs_dir = clp_config.logs_directory / component_name + logs_dir.mkdir(parents=True, exist_ok=True) + container_logs_dir = container_clp_config.logs_directory / component_name + + clp_site_packages_dir = CONTAINER_CLP_HOME / 'lib' / 'python3' / 'site-packages' + container_start_cmd = [ + 'docker', 'run', + '-di', + '--network', 'host', + '-w', str(CONTAINER_CLP_HOME), + '--rm', + '--name', container_name, + '-e', f'PYTHONPATH={clp_site_packages_dir}', + '-e', f'BROKER_URL=amqp://' + f'{container_clp_config.queue.username}:{container_clp_config.queue.password}@' + f'{container_clp_config.queue.host}:{container_clp_config.queue.port}', + '-e', f'RESULT_BACKEND=rpc://' + f'{container_clp_config.queue.username}:{container_clp_config.queue.password}@' + f'{container_clp_config.queue.host}:{container_clp_config.queue.port}', + '-e', f'CLP_LOGS_DIR={container_logs_dir}', + '-e', f'CLP_LOGGING_LEVEL={clp_config.search_scheduler.logging_level}', + '-u', f'{os.getuid()}:{os.getgid()}', + '--mount', str(mounts.clp_home), + ] + necessary_mounts = [ + mounts.logs_dir, + ] + for mount in necessary_mounts: + if mount: + container_start_cmd.append('--mount') + container_start_cmd.append(str(mount)) + container_start_cmd.append(clp_config.execution_container) + + scheduler_cmd = [ + 'python3', '-u', '-m', + 'job_orchestration.search_scheduler.search_scheduler', + '--config', str(container_clp_config.logs_directory / container_config_filename), + ] + cmd = container_start_cmd + scheduler_cmd + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + + logger.info(f"Started {component_name}.") + + +def start_search_worker(instance_id: str, clp_config: CLPConfig, container_clp_config: CLPConfig, + num_cpus: int, mounts: CLPDockerMounts): + celery_method = 'job_orchestration.executor.search' + celery_route = f"{QueueName.SEARCH}" + generic_start_worker( + SEARCH_WORKER_COMPONENT_NAME, + instance_id, + clp_config, + clp_config.search_worker, + container_clp_config, + celery_method, + celery_route, + num_cpus, + mounts + ) + + +def generic_start_worker(component_name: str, instance_id: str, clp_config: CLPConfig, worker_config: BaseModel, + container_clp_config: CLPConfig, celery_method: str, celery_route: str, + num_cpus: int, mounts: CLPDockerMounts): + logger.info(f"Starting {component_name}...") + + container_name = f'clp-{component_name}-{instance_id}' + if container_exists(container_name): + logger.info(f"{component_name} already running.") + return + + validate_worker_config(clp_config) + + logs_dir = clp_config.logs_directory / component_name + logs_dir.mkdir(parents=True, exist_ok=True) + container_logs_dir = container_clp_config.logs_directory / component_name + + # Create necessary directories + clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + + clp_site_packages_dir = CONTAINER_CLP_HOME / 'lib' / 'python3' / 'site-packages' + container_start_cmd = [ + 'docker', 'run', + '-di', + '--network', 'host', + '-w', str(CONTAINER_CLP_HOME), + '--rm', + '--name', container_name, + '-e', f'PYTHONPATH={clp_site_packages_dir}', + '-e', f'BROKER_URL=amqp://' + f'{container_clp_config.queue.username}:{container_clp_config.queue.password}@' + f'{container_clp_config.queue.host}:{container_clp_config.queue.port}', + '-e', f'RESULT_BACKEND=rpc://' + f'{container_clp_config.queue.username}:{container_clp_config.queue.password}@' + f'{container_clp_config.queue.host}:{container_clp_config.queue.port}', + '-e', 
f'CLP_HOME={CONTAINER_CLP_HOME}', + '-e', f'CLP_DATA_DIR={container_clp_config.data_directory}', + '-e', f'CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}', + '-e', f'CLP_LOGS_DIR={container_logs_dir}', + '-e', f'CLP_LOGGING_LEVEL={worker_config.logging_level}', + '-u', f'{os.getuid()}:{os.getgid()}', + '--mount', str(mounts.clp_home), + ] + necessary_mounts = [ + mounts.data_dir, + mounts.logs_dir, + mounts.archives_output_dir, + mounts.input_logs_dir, + ] + for mount in necessary_mounts: + if mount: + container_start_cmd.append('--mount') + container_start_cmd.append(str(mount)) + container_start_cmd.append(clp_config.execution_container) + + worker_cmd = [ + 'python3', str(clp_site_packages_dir / 'bin' / 'celery'), + '-A', + celery_method, + 'worker', + '--concurrency', str(num_cpus), + '--loglevel', 'WARNING', + '-f', str(container_logs_dir / "worker.log"), + '-Q', celery_route, + '-n', component_name, + ] + cmd = container_start_cmd + worker_cmd + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + + logger.info(f"Started {component_name}.") + + def start_worker(instance_id: str, clp_config: CLPConfig, container_clp_config: CLPConfig, num_cpus: int, mounts: CLPDockerMounts): logger.info(f"Starting {WORKER_COMPONENT_NAME}...") @@ -385,7 +536,7 @@ def start_worker(instance_id: str, clp_config: CLPConfig, container_clp_config: 'worker', '--concurrency', str(num_cpus), '--loglevel', 'WARNING', - '-Q', f"{QueueName.COMPRESSION},{QueueName.SEARCH}", + '-Q', f"{QueueName.COMPRESSION}", ] cmd = container_start_cmd + worker_cmd subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -429,9 +580,12 @@ def main(argv): clp_config = validate_and_load_config_file(config_file_path, default_config_file_path, clp_home) # Validate and load necessary credentials - if component_name in ['', DB_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME]: + if component_name in ['', DB_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME, + SEARCH_SCHEDULER_COMPONENT_NAME]: validate_and_load_db_credentials_file(clp_config, clp_home, True) - if component_name in ['', QUEUE_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME, WORKER_COMPONENT_NAME]: + if component_name in ['', QUEUE_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME, + WORKER_COMPONENT_NAME, SEARCH_SCHEDULER_COMPONENT_NAME, + SEARCH_WORKER_COMPONENT_NAME]: validate_and_load_queue_credentials_file(clp_config, clp_home, True) clp_config.validate_data_dir() @@ -475,6 +629,10 @@ def main(argv): start_results_cache(instance_id, clp_config, conf_dir) if '' == component_name or SCHEDULER_COMPONENT_NAME == component_name: start_scheduler(instance_id, clp_config, container_clp_config, mounts) + if '' == component_name or SEARCH_SCHEDULER_COMPONENT_NAME == component_name: + start_search_scheduler(instance_id, clp_config, container_clp_config, mounts) + if '' == component_name or SEARCH_WORKER_COMPONENT_NAME == component_name: + start_search_worker(instance_id, clp_config, container_clp_config, num_cpus, mounts) if '' == component_name or WORKER_COMPONENT_NAME == component_name: start_worker(instance_id, clp_config, container_clp_config, num_cpus, mounts) except Exception as ex: diff --git a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py index d1c8c6397..44186bc0e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py @@ -16,6 +16,8 @@ DB_COMPONENT_NAME, QUEUE_COMPONENT_NAME, 
RESULTS_CACHE_COMPONENT_NAME, + SEARCH_SCHEDULER_COMPONENT_NAME, + SEARCH_WORKER_COMPONENT_NAME, SCHEDULER_COMPONENT_NAME, WORKER_COMPONENT_NAME ) @@ -89,10 +91,19 @@ def main(argv): if '' == component_name or WORKER_COMPONENT_NAME == component_name: stop_container(f'clp-{WORKER_COMPONENT_NAME}-{instance_id}') + if '' == component_name or SEARCH_WORKER_COMPONENT_NAME == component_name: + stop_container(f'clp-{SEARCH_WORKER_COMPONENT_NAME}-{instance_id}') if '' == component_name or SCHEDULER_COMPONENT_NAME == component_name: container_name = f'clp-{SCHEDULER_COMPONENT_NAME}-{instance_id}' stop_container(container_name) + container_config_file_path = logs_dir / f'{container_name}.yml' + if container_config_file_path.exists(): + container_config_file_path.unlink() + if '' == component_name or SEARCH_SCHEDULER_COMPONENT_NAME == component_name: + container_name = f'clp-{SEARCH_SCHEDULER_COMPONENT_NAME}-{instance_id}' + stop_container(container_name) + container_config_file_path = logs_dir / f'{container_name}.yml' if container_config_file_path.exists(): container_config_file_path.unlink() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index e25f15819..7ccb56da7 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, validator from .core import get_config_value, make_config_path_absolute, read_yaml_config_file, validate_path_could_be_dir +from .clp_logging import is_valid_logging_level, get_valid_logging_level # Constants # Component names @@ -11,10 +12,13 @@ QUEUE_COMPONENT_NAME = 'queue' RESULTS_CACHE_COMPONENT_NAME = 'results_cache' SCHEDULER_COMPONENT_NAME = 'scheduler' +SEARCH_SCHEDULER_COMPONENT_NAME = 'search_scheduler' +SEARCH_WORKER_COMPONENT_NAME = 'search_worker' WORKER_COMPONENT_NAME = 'worker' CLP_DEFAULT_CREDENTIALS_FILE_PATH = pathlib.Path('etc') / 'credentials.yml' CLP_METADATA_TABLE_PREFIX = 'clp_' +SEARCH_JOBS_TABLE_NAME = 'distributed_search_jobs' class Database(BaseModel): @@ -96,11 +100,37 @@ def get_clp_connection_params_and_type(self, disable_localhost_socket_connection connection_params_and_type['ssl_cert'] = self.ssl_cert return connection_params_and_type +def _validate_logging_level(cls, field): + if not is_valid_logging_level(field): + raise ValueError( + f"{cls.__name__}: '{field}' is not a valid logging level. 
Use one of" + f" {get_valid_logging_level()}" + ) + class Scheduler(BaseModel): jobs_poll_delay: int = 1 # seconds +class SearchScheduler(BaseModel): + jobs_poll_delay: float = 0.1 # seconds + logging_level: str = 'INFO' + + @validator('logging_level') + def validate_logging_level(cls, field): + _validate_logging_level(cls, field) + return field + + +class SearchWorker(BaseModel): + logging_level: str = 'INFO' + + @validator('logging_level') + def validate_logging_level(cls, field): + _validate_logging_level(cls, field) + return field + + class ResultsCache(BaseModel): host: str = 'localhost' port: int = 27017 @@ -174,6 +204,8 @@ class CLPConfig(BaseModel): queue: Queue = Queue() results_cache: ResultsCache = ResultsCache() scheduler: Scheduler = Scheduler() + search_scheduler: SearchScheduler = SearchScheduler() + search_worker: SearchWorker = SearchWorker() credentials_file_path: pathlib.Path = CLP_DEFAULT_CREDENTIALS_FILE_PATH archive_output: ArchiveOutput = ArchiveOutput() diff --git a/components/clp-py-utils/clp_py_utils/clp_logging.py b/components/clp-py-utils/clp_py_utils/clp_logging.py new file mode 100644 index 000000000..419432072 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/clp_logging.py @@ -0,0 +1,40 @@ +import logging + +LOGGING_LEVEL_MAPPING = { + 'INFO': logging.INFO, + 'DEBUG': logging.DEBUG, + 'WARN': logging.WARNING, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL +} + +def get_logging_formatter(): + return logging.Formatter("%(asctime)s %(name)s [%(levelname)s] %(message)s") + +def get_logger(name: str): + logger = logging.getLogger(name) + # Setup console logging + logging_console_handler = logging.StreamHandler() + logging_console_handler.setFormatter(get_logging_formatter()) + logger.addHandler(logging_console_handler) + # Prevent double logging from sub loggers + logger.propagate = False + return logger + + +def get_valid_logging_level(): + return [i for i in LOGGING_LEVEL_MAPPING.keys()] + + +def is_valid_logging_level(level: str): + return level in LOGGING_LEVEL_MAPPING + + +def set_logging_level(logger: logging.Logger, level: str): + if not is_valid_logging_level(level): + logger.warning(f"Invalid logging level: {level}, using INFO as default") + logger.setLevel(logging.INFO) + return + logger.setLevel(LOGGING_LEVEL_MAPPING[level]) + diff --git a/components/clp-py-utils/clp_py_utils/create-db-tables.py b/components/clp-py-utils/clp_py_utils/create-db-tables.py index b591a0d0a..9bd0a5b89 100644 --- a/components/clp-py-utils/clp_py_utils/create-db-tables.py +++ b/components/clp-py-utils/clp_py_utils/create-db-tables.py @@ -36,6 +36,12 @@ def main(argv): ] subprocess.run(cmd, check=True) + cmd = [ + 'python3', str(script_dir / 'initialize-search-scheduler-db.py'), + '--config', str(config_file_path) + ] + subprocess.run(cmd, check=True) + return 0 diff --git a/components/clp-py-utils/clp_py_utils/initialize-search-scheduler-db.py b/components/clp-py-utils/clp_py_utils/initialize-search-scheduler-db.py new file mode 100644 index 000000000..b3aa12622 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/initialize-search-scheduler-db.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +import argparse +import sys +from contextlib import closing + +from clp_py_utils.clp_config import Database, SEARCH_JOBS_TABLE_NAME +from clp_py_utils.clp_logging import get_logger +from clp_py_utils.core import read_yaml_config_file +from job_orchestration.search_scheduler.common import JobStatus +from sql_adapter import SQL_Adapter + 
+logger = get_logger(__file__) + +def main(argv): + args_parser = argparse.ArgumentParser(description="Sets up tables for the search scheduler.") + args_parser.add_argument('--config', required=True, help="Database config file.") + parsed_args = args_parser.parse_args(argv[1:]) + + try: + database_config = Database.parse_obj(read_yaml_config_file(parsed_args.config)) + if database_config is None: + raise ValueError(f"Database configuration file '{parsed_args.config}' is empty.") + sql_adapter = SQL_Adapter(database_config) + with closing(sql_adapter.create_connection(True)) as scheduling_db, \ + closing(scheduling_db.cursor(dictionary=True)) as scheduling_db_cursor: + scheduling_db_cursor.execute(f""" + CREATE TABLE IF NOT EXISTS `{SEARCH_JOBS_TABLE_NAME}` ( + `id` INT NOT NULL AUTO_INCREMENT, + `status` INT NOT NULL DEFAULT '{JobStatus.PENDING}', + `submission_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), + `search_config` VARBINARY(60000) NOT NULL, + PRIMARY KEY (`id`) USING BTREE, + INDEX `JOB_STATUS` (`status`) USING BTREE + ) ROW_FORMAT=DYNAMIC + """) + + scheduling_db.commit() + except: + logger.exception("Failed to create search scheduler tables.") + return -1 + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/core/src/clp/clo/CMakeLists.txt b/components/core/src/clp/clo/CMakeLists.txt index 7ac8a36ed..aab338047 100644 --- a/components/core/src/clp/clo/CMakeLists.txt +++ b/components/core/src/clp/clo/CMakeLists.txt @@ -108,8 +108,6 @@ set( clo.cpp CommandLineArguments.cpp CommandLineArguments.hpp - ControllerMonitoringThread.cpp - ControllerMonitoringThread.hpp ResultsCacheClient.cpp ResultsCacheClient.hpp ) diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index 7a5d98a29..87bfd3d0b 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -97,12 +97,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { po::options_description hidden_positional_options; // clang-format off hidden_positional_options.add_options()( - "search-controller-host", - po::value(&m_search_controller_host) - )( - "search-controller-port", - po::value(&m_search_controller_port) - )( "mongodb-uri", po::value(&m_mongodb_uri) )( @@ -120,8 +114,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { ); // clang-format on po::positional_options_description positional_options_description; - positional_options_description.add("search-controller-host", 1); - positional_options_description.add("search-controller-port", 1); positional_options_description.add("mongodb-uri", 1); positional_options_description.add("mongodb-collection", 1); positional_options_description.add("archive-path", 1); @@ -178,10 +170,9 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { cerr << "Examples:" << endl; cerr << R"( # Search ARCHIVE_PATH for " ERROR " and send results to )" R"(mongodb://127.0.0.1:27017/test "result" collection )" - R"(and use localhost:5555 as the search controller)" << endl; cerr << " " << get_program_name() - << R"(localhost 5555 mongodb://127.0.0.1:27017/test result )" + << R"(mongodb://127.0.0.1:27017/test result )" R"(ARCHIVE_PATH " ERROR ")" << endl; cerr << endl; @@ -198,16 +189,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { return ParsingResult::InfoCommand; } - // Validate search controller host was specified - if 
(m_search_controller_host.empty()) { - throw invalid_argument("SEARCH_CONTROLLER_HOST not specified or empty."); - } - - // Validate search controller port was specified - if (m_search_controller_port.empty()) { - throw invalid_argument("SEARCH_CONTROLLER_PORT not specified or empty."); - } - // Validate mongodb uri was specified if (m_mongodb_uri.empty()) { throw invalid_argument("MONGODB_URI not specified or empty."); @@ -291,9 +272,7 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { } void CommandLineArguments::print_basic_usage() const { - cerr << "Usage: " << get_program_name() - << " [OPTIONS] SEARCH_CONTROLLER_HOST SEARCH_CONTROLLER_PORT " - "MONGODB_URI MONGODB_COLLECTION " + cerr << "Usage: " << get_program_name() << " [OPTIONS] MONGODB_URI MONGODB_COLLECTION " << R"(ARCHIVE_PATH "WILDCARD STRING" [FILE])" << endl; } } // namespace clp::clo diff --git a/components/core/src/clp/clo/CommandLineArguments.hpp b/components/core/src/clp/clo/CommandLineArguments.hpp index 82890f3e2..c2233e819 100644 --- a/components/core/src/clp/clo/CommandLineArguments.hpp +++ b/components/core/src/clp/clo/CommandLineArguments.hpp @@ -23,10 +23,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { // Methods ParsingResult parse_arguments(int argc, char const* argv[]) override; - std::string const& get_search_controller_host() const { return m_search_controller_host; } - - std::string const& get_search_controller_port() const { return m_search_controller_port; } - std::string const& get_mongodb_uri() const { return m_mongodb_uri; } std::string const& get_mongodb_collection() const { return m_mongodb_collection; } @@ -50,8 +46,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { void print_basic_usage() const override; // Variables - std::string m_search_controller_host; - std::string m_search_controller_port; std::string m_mongodb_uri; std::string m_mongodb_collection; uint64_t m_batch_size; diff --git a/components/core/src/clp/clo/ControllerMonitoringThread.cpp b/components/core/src/clp/clo/ControllerMonitoringThread.cpp deleted file mode 100644 index 0e5a4589a..000000000 --- a/components/core/src/clp/clo/ControllerMonitoringThread.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include "ControllerMonitoringThread.hpp" - -#include - -#include "../networking/socket_utils.hpp" -#include "../spdlog_with_specializations.hpp" - -namespace clp::clo { -void ControllerMonitoringThread::thread_method() { - // Wait for the controller socket to close - constexpr size_t cBufLen = 4096; - char buf[cBufLen]; - size_t num_bytes_received; - for (bool exit = false; false == exit;) { - auto error_code - = networking::try_receive(m_controller_socket_fd, buf, cBufLen, num_bytes_received); - switch (error_code) { - case ErrorCode_EndOfFile: - // Controller closed the connection - m_query_cancelled = true; - exit = true; - break; - case ErrorCode_Success: - // Unexpectedly received data - SPDLOG_ERROR( - "Unexpected received {} bytes of data from controller.", - num_bytes_received - ); - break; - case ErrorCode_BadParam: - SPDLOG_ERROR("Bad parameter sent to try_receive.", num_bytes_received); - exit = true; - break; - case ErrorCode_errno: - SPDLOG_ERROR("Failed to receive data from controller, errno={}.", errno); - exit = true; - break; - default: - SPDLOG_ERROR("Unexpected error from try_receive, error_code={}.", error_code); - exit = true; - break; - } - } - - close(m_controller_socket_fd); -} -} // namespace clp::clo diff --git 
a/components/core/src/clp/clo/ControllerMonitoringThread.hpp b/components/core/src/clp/clo/ControllerMonitoringThread.hpp deleted file mode 100644 index 5c273be5d..000000000 --- a/components/core/src/clp/clo/ControllerMonitoringThread.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef CLP_CLO_CONTROLLERMONITORINGTHREAD_HPP -#define CLP_CLO_CONTROLLERMONITORINGTHREAD_HPP - -#include "../Thread.hpp" - -namespace clp::clo { -/** - * A thread that waits for the controller to close the connection at which time it will indicate the - * query has been cancelled. - */ -class ControllerMonitoringThread : public Thread { -public: - // Constructor - ControllerMonitoringThread(int controller_socket_fd) - : m_controller_socket_fd(controller_socket_fd), - m_query_cancelled(false) {} - - std::atomic_bool const& get_query_cancelled() const { return m_query_cancelled; } - -protected: - // Methods - void thread_method() override; - -private: - // Variables - int m_controller_socket_fd; - std::atomic_bool m_query_cancelled; -}; -} // namespace clp::clo - -#endif // CLP_CLO_CONTROLLERMONITORINGTHREAD_HPP diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index fdcfd2fce..0f83db713 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -1,5 +1,3 @@ -#include - #include #include @@ -9,13 +7,11 @@ #include "../Defs.h" #include "../Grep.hpp" -#include "../networking/socket_utils.hpp" #include "../Profiler.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/Constants.hpp" #include "../Utils.hpp" #include "CommandLineArguments.hpp" -#include "ControllerMonitoringThread.hpp" #include "ResultsCacheClient.hpp" using clp::clo::CommandLineArguments; @@ -48,21 +44,11 @@ enum class SearchFilesResult { Success }; -/** - * Connects to the search controller - * @param controller_host - * @param controller_port - * @return -1 on failure - * @return Search controller socket file descriptor otherwise - */ -static int -connect_to_search_controller(string const& controller_host, string const& controller_port); /** * Searches all files referenced by a given database cursor * @param query * @param archive * @param file_metadata_ix - * @param query_cancelled * @param results_cache_client * @return SearchFilesResult::OpenFailure on failure to open a compressed file * @return SearchFilesResult::ResultSendFailure on failure to send a result @@ -72,78 +58,25 @@ static SearchFilesResult search_files( Query& query, Archive& archive, MetadataDB::FileIterator& file_metadata_ix, - std::atomic_bool const& query_cancelled, ResultsCacheClient& results_cache_client ); /** * Searches an archive with the given path * @param command_line_args * @param archive_path - * @param query_cancelled * @param results_cache_client * @return true on success, false otherwise */ static bool search_archive( CommandLineArguments const& command_line_args, boost::filesystem::path const& archive_path, - std::atomic_bool const& query_cancelled, ResultsCacheClient& results_cache_client ); -static int -connect_to_search_controller(string const& controller_host, string const& controller_port) { - // Get address info for controller - struct addrinfo hints = {}; - // Address can be IPv4 or IPV6 - hints.ai_family = AF_UNSPEC; - // TCP socket - hints.ai_socktype = SOCK_STREAM; - hints.ai_flags = 0; - hints.ai_protocol = 0; - struct addrinfo* addresses_head = nullptr; - int error = getaddrinfo( - controller_host.c_str(), - controller_port.c_str(), - &hints, - &addresses_head - ); 
- if (0 != error) { - SPDLOG_ERROR("Failed to get address information for search controller, error={}", error); - return -1; - } - - // Try each address until a socket can be created and connected to - int controller_socket_fd = -1; - for (auto curr = addresses_head; nullptr != curr; curr = curr->ai_next) { - // Create socket - controller_socket_fd = socket(curr->ai_family, curr->ai_socktype, curr->ai_protocol); - if (-1 == controller_socket_fd) { - continue; - } - - // Connect to address - if (connect(controller_socket_fd, curr->ai_addr, curr->ai_addrlen) != -1) { - break; - } - - // Failed to connect, so close socket - close(controller_socket_fd); - controller_socket_fd = -1; - } - freeaddrinfo(addresses_head); - if (-1 == controller_socket_fd) { - SPDLOG_ERROR("Failed to connect to search controller, errno={}", errno); - return -1; - } - - return controller_socket_fd; -} - static SearchFilesResult search_files( Query& query, Archive& archive, MetadataDB::FileIterator& file_metadata_ix, - std::atomic_bool const& query_cancelled, ResultsCacheClient& results_cache_client ) { SearchFilesResult result = SearchFilesResult::Success; @@ -168,14 +101,13 @@ static SearchFilesResult search_files( } query.make_sub_queries_relevant_to_segment(compressed_file.get_segment_id()); - while (false == query_cancelled - && Grep::search_and_decompress( - query, - archive, - compressed_file, - compressed_message, - decompressed_message - )) + while (Grep::search_and_decompress( + query, + archive, + compressed_file, + compressed_message, + decompressed_message + )) { results_cache_client.add_result( compressed_file.get_orig_path(), @@ -193,7 +125,6 @@ static SearchFilesResult search_files( static bool search_archive( CommandLineArguments const& command_line_args, boost::filesystem::path const& archive_path, - std::atomic_bool const& query_cancelled, ResultsCacheClient& results_cache_client ) { if (false == boost::filesystem::exists(archive_path)) { @@ -267,13 +198,7 @@ static bool search_archive( auto& file_metadata_ix = *file_metadata_ix_ptr; for (auto segment_id : ids_of_segments_to_search) { file_metadata_ix.set_segment_id(segment_id); - auto result = search_files( - query, - archive_reader, - file_metadata_ix, - query_cancelled, - results_cache_client - ); + auto result = search_files(query, archive_reader, file_metadata_ix, results_cache_client); if (SearchFilesResult::ResultSendFailure == result) { // Stop search now since results aren't reaching the controller break; @@ -312,14 +237,6 @@ int main(int argc, char const* argv[]) { break; } - int controller_socket_fd = connect_to_search_controller( - command_line_args.get_search_controller_host(), - command_line_args.get_search_controller_port() - ); - if (-1 == controller_socket_fd) { - return -1; - } - mongocxx::instance mongocxx_instance{}; ResultsCacheClient results_cache_client( command_line_args.get_mongodb_uri(), @@ -329,19 +246,9 @@ int main(int argc, char const* argv[]) { auto const archive_path = boost::filesystem::path(command_line_args.get_archive_path()); - clp::clo::ControllerMonitoringThread controller_monitoring_thread(controller_socket_fd); - controller_monitoring_thread.start(); - int return_value = 0; try { - if (false - == search_archive( - command_line_args, - archive_path, - controller_monitoring_thread.get_query_cancelled(), - results_cache_client - )) - { + if (false == search_archive(command_line_args, archive_path, results_cache_client)) { return_value = -1; } } catch (TraceableException& e) { @@ -365,38 +272,5 @@ int main(int 
argc, char const* argv[]) { } return_value = -1; } - - // Unblock the controller monitoring thread if it's blocked - auto shutdown_result = shutdown(controller_socket_fd, SHUT_RDWR); - if (0 != shutdown_result) { - if (ENOTCONN != shutdown_result) { - SPDLOG_ERROR("Failed to shutdown socket, error={}", shutdown_result); - } // else connection already disconnected, so nothing to do - } - - try { - controller_monitoring_thread.join(); - } catch (TraceableException& e) { - auto error_code = e.get_error_code(); - if (ErrorCode_errno == error_code) { - SPDLOG_ERROR( - "Failed to join with controller monitoring thread: {}:{} {}, errno={}", - e.get_filename(), - e.get_line_number(), - e.what(), - errno - ); - } else { - SPDLOG_ERROR( - "Failed to join with controller monitoring thread: {}:{} {}, error_code={}", - e.get_filename(), - e.get_line_number(), - e.what(), - error_code - ); - } - return_value = -1; - } - return return_value; } diff --git a/components/job-orchestration/job_orchestration/executor/celeryconfig.py b/components/job-orchestration/job_orchestration/executor/celeryconfig.py index f0322863f..af9e55b0b 100644 --- a/components/job-orchestration/job_orchestration/executor/celeryconfig.py +++ b/components/job-orchestration/job_orchestration/executor/celeryconfig.py @@ -7,14 +7,12 @@ worker_prefetch_multiplier = 1 imports = [ 'job_orchestration.executor.compression_task', - 'job_orchestration.executor.search_task' ] # Queue settings task_queue_max_priority = TASK_QUEUE_HIGHEST_PRIORITY task_routes = { 'job_orchestration.executor.compression_task.compress': QueueName.COMPRESSION, - 'job_orchestration.executor.search_task.search': QueueName.SEARCH } task_create_missing_queues = True diff --git a/components/job-orchestration/job_orchestration/executor/search/__init__.py b/components/job-orchestration/job_orchestration/executor/search/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/job-orchestration/job_orchestration/executor/search/celery.py b/components/job-orchestration/job_orchestration/executor/search/celery.py new file mode 100644 index 000000000..7320426d8 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/search/celery.py @@ -0,0 +1,5 @@ +from celery import Celery +from . import celeryconfig # type: ignore + +app = Celery("search") +app.config_from_object(celeryconfig) diff --git a/components/job-orchestration/job_orchestration/executor/search/celeryconfig.py b/components/job-orchestration/job_orchestration/executor/search/celeryconfig.py new file mode 100644 index 000000000..b283b70b8 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/search/celeryconfig.py @@ -0,0 +1,32 @@ +import os + +from job_orchestration.scheduler.constants import QueueName + +imports = ("job_orchestration.executor.search.fs_search_task") + +task_routes = { + 'job_orchestration.executor.search.fs_search_task.search': QueueName.SEARCH, +} +task_create_missing_queues = True + +broker_url = os.getenv('BROKER_URL') +result_backend = os.getenv('RESULT_BACKEND') + +result_persistent = True + +# Differentiate between tasks that have started v.s. tasks still in queue +task_track_started = True + +accept_content = [ + "application/json", # json + "application/x-python-serialize", # pickle +] + +result_accept_content = [ + "application/json", # json + "application/x-python-serialize", # pickle +] + +# TODO: Choose a different serialization format for tasks and results. Sticking with json is +# probably not a good idea. 
+result_serializer = "json" diff --git a/components/job-orchestration/job_orchestration/executor/search/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/search/fs_search_task.py new file mode 100644 index 000000000..86503c8b2 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/search/fs_search_task.py @@ -0,0 +1,103 @@ +import logging +import os +import sys +import signal +import subprocess +from pathlib import Path +from typing import Any, Dict + +from celery.app.task import Task +from celery.utils.log import get_task_logger + +from clp_py_utils.clp_logging import set_logging_level + +from job_orchestration.executor.search.celery import app +from job_orchestration.job_config import SearchConfig, SearchTaskResult + +# Setup logging +logger = get_task_logger(__name__) + +@app.task(bind=True) +def search( + self: Task, + job_id: str, + search_config_obj: dict, + archive_id: str, + results_cache_uri: str, +) -> Dict[str, Any]: + task_id = str(self.request.id) + clp_home = Path(os.getenv("CLP_HOME")) + archive_directory = Path(os.getenv('CLP_ARCHIVE_OUTPUT_DIR')) + clp_logs_dir = Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + + # Setup logging to file + worker_logs_dir = clp_logs_dir / job_id + worker_logs_dir.mkdir(exist_ok=True, parents=True) + set_logging_level(logger, clp_logging_level) + clo_log_path = worker_logs_dir / f"{task_id}-clo.log" + clo_log_file = open(clo_log_path, "w") + + logger.info(f"Started task for job {job_id}") + + search_config = SearchConfig.parse_obj(search_config_obj) + search_cmd = [ + str(clp_home / "bin" / "clo"), + results_cache_uri, + job_id, + str(archive_directory / archive_id), + search_config.query_string, + ] + + if search_config.begin_timestamp is not None: + search_cmd.append('--tge') + search_cmd.append(str(search_config.begin_timestamp)) + if search_config.end_timestamp is not None: + search_cmd.append('--tle') + search_cmd.append(str(search_config.end_timestamp)) + if search_config.path_filter is not None: + search_cmd.append(search_config.path_filter) + + logger.info(f'Running: {" ".join(search_cmd)}') + search_successful = False + search_proc = subprocess.Popen( + search_cmd, + preexec_fn=os.setpgrp, + close_fds=True, + stdout=clo_log_file, + stderr=clo_log_file, + ) + + def sigterm_handler(_signo, _stack_frame): + logger.debug("Entered sigterm handler") + if search_proc.poll() is None: + logger.debug("Trying to kill search process") + # Kill the process group in case the search process also forked + os.killpg(os.getpgid(search_proc.pid), signal.SIGTERM) + os.waitpid(search_proc.pid, 0) + logger.info(f"Cancelling search task.") + # Add 128 to follow convention for exit codes from signals + # https://tldp.org/LDP/abs/html/exitcodes.html#AEN23549 + sys.exit(_signo + 128) + + # Register the function to kill the child process at exit + signal.signal(signal.SIGTERM, sigterm_handler) + + logger.info("Waiting for search to finish") + # communicate is equivalent to wait in this case, but avoids deadlocks if we switch to piping + # stdout/stderr in the future. 
+ search_proc.communicate() + return_code = search_proc.returncode + if 0 != return_code: + logger.error(f"Failed search task for job {job_id} - return_code={return_code}") + else: + search_successful = True + logger.info(f"Search task completed for job {job_id}") + + # Close log files + clo_log_file.close() + + return SearchTaskResult( + success=search_successful, + task_id=task_id, + ).dict() diff --git a/components/job-orchestration/job_orchestration/executor/search_task.py b/components/job-orchestration/job_orchestration/executor/search_task.py index cf3d77237..7fe573d94 100644 --- a/components/job-orchestration/job_orchestration/executor/search_task.py +++ b/components/job-orchestration/job_orchestration/executor/search_task.py @@ -38,7 +38,7 @@ def run_clo(job_id: int, task_id: int, clp_home: pathlib.Path, archive_output_di results_cache_uri, str(job_id), str(archive_output_dir / archive_id), - search_config.wildcard_query + search_config.query_string ] if search_config.begin_timestamp is not None: cmd.append('--tge') @@ -76,7 +76,7 @@ def run_clo(job_id: int, task_id: int, clp_home: pathlib.Path, archive_output_di return search_successful, f"See {stderr_filename} in logs directory." -@app.task() +#@app.task() def search(job_id: int, task_id: int, search_config_json: str, archive_id: str, results_cache_uri: str): clp_home = os.getenv('CLP_HOME') diff --git a/components/job-orchestration/job_orchestration/job_config.py b/components/job-orchestration/job_orchestration/job_config.py index cb0cd255c..20f9aa08e 100644 --- a/components/job-orchestration/job_orchestration/job_config.py +++ b/components/job-orchestration/job_orchestration/job_config.py @@ -30,9 +30,11 @@ class ClpIoConfig(BaseModel): class SearchConfig(BaseModel): - search_controller_host: str - search_controller_port: int - wildcard_query: str + query_string: str begin_timestamp: typing.Optional[int] = None end_timestamp: typing.Optional[int] = None path_filter: typing.Optional[str] = None + +class SearchTaskResult(BaseModel): + success: bool + task_id: str diff --git a/components/job-orchestration/job_orchestration/search_scheduler/__init__.py b/components/job-orchestration/job_orchestration/search_scheduler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/job-orchestration/job_orchestration/search_scheduler/common.py b/components/job-orchestration/job_orchestration/search_scheduler/common.py new file mode 100644 index 000000000..094b7288c --- /dev/null +++ b/components/job-orchestration/job_orchestration/search_scheduler/common.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from enum import IntEnum, auto + +# When adding new states always add them to the end of this enum +# and make necessary changes in the UI, Search Scheduler, and Reducer +class JobStatus(IntEnum): + PENDING = 0 + RUNNING = auto() + SUCCESS = auto() + FAILED = auto() + CANCELLING = auto() + CANCELLED = auto() + + @staticmethod + def from_str(label: str) -> JobStatus: + return JobStatus[label.upper()] + + def __str__(self) -> str: + return str(self.value) + + def to_str(self) -> str: + return str(self.name) + diff --git a/components/job-orchestration/job_orchestration/search_scheduler/search_scheduler.py b/components/job-orchestration/job_orchestration/search_scheduler/search_scheduler.py new file mode 100644 index 000000000..8217b20b8 --- /dev/null +++ b/components/job-orchestration/job_orchestration/search_scheduler/search_scheduler.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 + +import argparse +import 
contextlib +import logging +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + +import msgpack +import pathlib + +import celery +from clp_py_utils.clp_config import SEARCH_JOBS_TABLE_NAME, CLP_METADATA_TABLE_PREFIX +from job_orchestration.executor.search.fs_search_task import search +from job_orchestration.job_config import SearchConfig, SearchTaskResult + +from pydantic import ValidationError + +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_logging import get_logger, get_logging_formatter, set_logging_level +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.sql_adapter import SQL_Adapter + +from .common import JobStatus # type: ignore + +# Setup logging +logger = get_logger("search-job-handler") + +class SearchJob: + def __init__(self, async_task_result: any) -> None: + self.async_task_result: any = async_task_result + +# Dictionary of active jobs indexed by job id +active_jobs : Dict[str, SearchJob] = {} + +def cancel_job(job_id): + global active_jobs + active_jobs[job_id].async_task_result.revoke(terminate=True) + try: + active_jobs[job_id].async_task_result.get() + except Exception: + pass + del active_jobs[job_id] + + +def fetch_new_search_jobs(db_cursor) -> list: + db_cursor.execute(f""" + SELECT {SEARCH_JOBS_TABLE_NAME}.id as job_id, + {SEARCH_JOBS_TABLE_NAME}.status as job_status, + {SEARCH_JOBS_TABLE_NAME}.search_config, + {SEARCH_JOBS_TABLE_NAME}.submission_time + FROM {SEARCH_JOBS_TABLE_NAME} + WHERE {SEARCH_JOBS_TABLE_NAME}.status={JobStatus.PENDING} + """) + return db_cursor.fetchall() + + +def fetch_cancelling_search_jobs(db_cursor) -> list: + db_cursor.execute(f""" + SELECT {SEARCH_JOBS_TABLE_NAME}.id as job_id + FROM {SEARCH_JOBS_TABLE_NAME} + WHERE {SEARCH_JOBS_TABLE_NAME}.status={JobStatus.CANCELLING} + """) + return db_cursor.fetchall() + + +def set_job_status( + db_conn, job_id: str, status: JobStatus, prev_status: Optional[JobStatus] = None, **kwargs +) -> bool: + field_set_expressions = [f'{k}="{v}"' for k, v in kwargs.items()] + field_set_expressions.append(f"status={status}") + update = f'UPDATE {SEARCH_JOBS_TABLE_NAME} SET {", ".join(field_set_expressions)} WHERE id={job_id}' + + if prev_status is not None: + update += f' AND status={prev_status}' + + with contextlib.closing(db_conn.cursor()) as cursor: + cursor.execute(update) + db_conn.commit() + rval = cursor.rowcount != 0 + return rval + + +def handle_cancelling_search_jobs(db_conn) -> None: + global active_jobs + + with contextlib.closing(db_conn.cursor(dictionary=True)) as cursor: + cancelling_jobs = fetch_cancelling_search_jobs(cursor) + db_conn.commit() + + for job in cancelling_jobs: + job_id = job['job_id'] + if job_id in active_jobs: + cancel_job(job_id) + if set_job_status(db_conn, job_id, JobStatus.CANCELLED, prev_status=JobStatus.CANCELLING): + logger.info(f"Cancelled job {job_id}.") + else: + logger.error(f"Failed to cancel job {job_id}.") + + +def get_archives_for_search( + db_conn, + search_config: SearchConfig, +): + query = f"""SELECT id as archive_id + FROM {CLP_METADATA_TABLE_PREFIX}archives + """ + filter_clauses = [] + if search_config.end_timestamp is not None: + filter_clauses.append(f"begin_timestamp <= {search_config.end_timestamp}") + if search_config.begin_timestamp is not None: + filter_clauses.append(f"end_timestamp >= {search_config.begin_timestamp}") + if len(filter_clauses) > 0: + query += " WHERE " + " AND ".join(filter_clauses) + query += " ORDER BY end_timestamp DESC" + + with 
contextlib.closing(db_conn.cursor(dictionary=True)) as cursor: + cursor.execute( + query + ) + archives_for_search = [archive['archive_id'] for archive in cursor.fetchall()] + db_conn.commit() + return archives_for_search + + +def get_task_group_for_job( + archives_for_search: List[str], + job_id: str, + search_config: SearchConfig, + results_cache_uri: str, +): + search_config_obj = search_config.dict() + return celery.group( + search.s( + job_id=job_id, + archive_id=archive_id, + search_config_obj=search_config_obj, + results_cache_uri=results_cache_uri, + ) for archive_id in archives_for_search + ) + + +def dispatch_search_job( + archives_for_search: List[str], + job_id: str, + search_config: SearchConfig, + results_cache_uri: str +) -> None: + global active_jobs + task_group = get_task_group_for_job(archives_for_search, job_id, search_config, results_cache_uri) + active_jobs[job_id] = SearchJob(task_group.apply_async()) + + +def handle_pending_search_jobs(db_conn, results_cache_uri: str) -> None: + global active_jobs + + with contextlib.closing(db_conn.cursor(dictionary=True)) as cursor: + new_jobs = fetch_new_search_jobs(cursor) + db_conn.commit() + + for job in new_jobs: + logger.debug(f"Got job {job['job_id']} with status {job['job_status']}.") + search_config_obj = SearchConfig.parse_obj(msgpack.unpackb(job['search_config'])) + archives_for_search = get_archives_for_search(db_conn, search_config_obj) + if len(archives_for_search) == 0: + if set_job_status(db_conn, job['job_id'], JobStatus.SUCCESS, job['job_status']): + logger.info(f"No matching archives, skipping job {job['job_id']}.") + continue + + dispatch_search_job(archives_for_search, str(job['job_id']), search_config_obj, results_cache_uri) + if set_job_status(db_conn, job['job_id'], JobStatus.RUNNING, job['job_status']): + logger.info(f"Dispatched job {job['job_id']} with {len(archives_for_search)} archives to search.") + + +def try_getting_task_result(async_task_result): + """ + Ideally, we'd use this code: + + if not async_task_result.ready(): + return None + return async_task_result.get() + + But because of https://github.com/celery/celery/issues/4084, wew have to use the following + timeout based approach until we switch to the Redis result backend. 
+ """ + try: + return async_task_result.get(timeout=0.1) + except celery.exceptions.TimeoutError: + return None + + +def check_job_status_and_update_db(db_conn): + global active_jobs + + for job_id in list(active_jobs.keys()): + try: + returned_results = try_getting_task_result(active_jobs[job_id].async_task_result) + except Exception as e: + logger.error(f"Job `{job_id}` failed: {e}.") + # clean up + del active_jobs[job_id] + set_job_status(db_conn, job_id, JobStatus.FAILED, JobStatus.RUNNING) + continue + + if returned_results is not None: + new_job_status = JobStatus.SUCCESS + for task_result_obj in returned_results: + task_result = SearchTaskResult.parse_obj(task_result_obj) + if not task_result.success: + task_id = task_result.task_id + new_job_status = JobStatus.FAILED + logger.debug(f"Task {task_id} failed - result {task_result}.") + + del active_jobs[job_id] + + if set_job_status(db_conn, job_id, new_job_status, JobStatus.RUNNING): + if new_job_status != JobStatus.FAILED: + logger.info(f"Completed job {job_id}.") + else: + logger.info(f"Completed job {job_id} with failing tasks.") + + +def handle_jobs( + db_conn, + results_cache_uri: str, + jobs_poll_delay: float, +) -> None: + while True: + handle_pending_search_jobs(db_conn, results_cache_uri) + handle_cancelling_search_jobs(db_conn) + check_job_status_and_update_db(db_conn) + time.sleep(jobs_poll_delay) + + +def main(argv: List[str]) -> int: + args_parser = argparse.ArgumentParser(description="Wait for and run search jobs.") + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + + parsed_args = args_parser.parse_args(argv[1:]) + + # Setup logging to file + log_file = Path(os.getenv("CLP_LOGS_DIR")) / "search_scheduler.log" + logging_file_handler = logging.FileHandler(filename=log_file, encoding="utf-8") + logging_file_handler.setFormatter(get_logging_formatter()) + logger.addHandler(logging_file_handler) + + # Update logging level based on config + set_logging_level(logger, os.getenv("CLP_LOGGING_LEVEL")) + + # Load configuration + config_path = pathlib.Path(parsed_args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(config_path)) + except ValidationError as err: + logger.error(err) + return -1 + except Exception as ex: + logger.error(ex) + return -1 + + sql_adapter = SQL_Adapter(clp_config.database) + + logger.debug(f"Job polling interval {clp_config.search_scheduler.jobs_poll_delay} seconds.") + try: + with contextlib.closing(sql_adapter.create_connection(True)) as db_conn: + logger.info(f"Connected to archive database {clp_config.database.host}:{clp_config.database.port}.") + logger.info("Search scheduler started.") + handle_jobs( + db_conn=db_conn, + results_cache_uri=clp_config.results_cache.get_uri(), + jobs_poll_delay=clp_config.search_scheduler.jobs_poll_delay, + ) + except Exception: + logger.exception(f"Uncaught exception in job handling loop.") + + return 0 + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index 487ee8538..b99bb0e22 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -8,7 +8,11 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" celery = "^5.3.6" +# mariadb version must be compatible with libmariadev installed in runtime env. 
+# See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies +mariadb = "~1.0.11" msgpack = "^1.0.7" +mysql-connector-python = "^8.2.0" pika = "^1.3.2" pydantic = "^1.10.13" PyYAML = "^6.0.1" diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index 1882f8270..291752f53 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -25,6 +25,12 @@ # port: 27017 # db_name: "clp-search" # +#search_scheduler: +# jobs_poll_delay: 0.1 # seconds +# logging_level: "INFO" +#search_worker: +# logging_level: "INFO" +# ## Where archives should be output to #archive_output: # directory: "var/data/archives"
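A note on job cancellation with the new scheduler: none of the package scripts in this patch expose a cancel command, but the scheduler's handle_cancelling_search_jobs() loop revokes the Celery task group and moves a job from CANCELLING to CANCELLED whenever it finds a row in that state. The sketch below shows how a client could request cancellation by flipping a RUNNING job's status. The helper name and the caller-supplied connection are assumptions for illustration only; the table name and status values come from this patch, and the cursor style matches the mysql-connector usage in the scripts above.

from contextlib import closing

from clp_py_utils.clp_config import SEARCH_JOBS_TABLE_NAME
from job_orchestration.search_scheduler.common import JobStatus


def request_job_cancellation(db_conn, job_id: int) -> bool:
    """
    Marks a RUNNING search job as CANCELLING so the search scheduler cancels it on
    its next poll. Returns False if the job was not in the RUNNING state.
    """
    with closing(db_conn.cursor()) as cursor:
        # JobStatus is an IntEnum whose __str__ returns the numeric value, so it can
        # be interpolated directly, matching the scheduler's own queries.
        cursor.execute(
            f"UPDATE `{SEARCH_JOBS_TABLE_NAME}` SET `status` = {JobStatus.CANCELLING}"
            f" WHERE `id` = %s AND `status` = {JobStatus.RUNNING}",
            (job_id,),
        )
        db_conn.commit()
        return cursor.rowcount != 0

Guarding the UPDATE with the current status mirrors the prev_status check in set_job_status(): it keeps a late cancellation request from overwriting a job that has already reached SUCCESS or FAILED.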