From 62b657f0c4dceb04a6ae1a82e54a931d0eb0e6a8 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Thu, 27 Mar 2025 07:08:54 +0100 Subject: [PATCH 01/10] TA: additional notes concerning department views --- .../manual/release-notes/03_major_enhancements.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/markdown/manual/release-notes/03_major_enhancements.md b/doc/markdown/manual/release-notes/03_major_enhancements.md index 783af6588..5af83cb01 100644 --- a/doc/markdown/manual/release-notes/03_major_enhancements.md +++ b/doc/markdown/manual/release-notes/03_major_enhancements.md @@ -15,6 +15,7 @@ The following is a brief list of Department View features and the underlying cha - Departments/ACLs can be assigned to cluster objects such as the global configuration, hosts, queues, projects, ... or resource quota sets to allow or deny access to these objects. This restricts the access rights for user jobs (see *users_lists* in sge_conf(5), sge_host_conf(5), sge_queue_conf(5), sge_pe(5) or sge_resource_quota(5)) and the visibility of cluster objects in the user interface if the department view is enabled. - The department view is enabled when a certain command line switch is used (`qhost/qstat/qselect, ... -sdv`). - Various default files allow managers to force a user to view the department view. (`sge_qstat`, `sge_select`, ...). This will automatically hide all details about objects that do not belong to the department. +- Cluster managers will always see all objects, regardless of the department view setting. ### Prevent Denial of Service Attacks @@ -33,17 +34,18 @@ Example: ``` gdi_request_limits=*:add:job:john:*=500, + *:add:job:eng-users:@eng-hosts=100, *:add:job:*:*=50, qstat:get:*:*:*=60000 ``` In this example: - The first rule allows user `john` to submit 500 jobs per second. -- The second rule allows all other users to submit 50 jobs per second. -- The third rule allows 60,000 `qstat` requests per second. 
-- All rules apply on all hosts independent where the client command is executed. +- The second rule allows all users in the `eng-users` user list to submit 100 jobs per second on hosts in the `@eng-hosts` host group. +- The third rule allows all other users to submit 50 jobs per second. +- The fourth rule allows 60,000 `qstat` requests per second. -These rules are independent of the submit client used (e.g., `qsub`, `qrsh`, DRMAA client, or GUI). If a user exceeds the limit, the submit client will display an error message indicating the violated limit rule. +If a user exceeds the limit, the used command line application will display an error message indicating the violated limit rule. Note that one `qstat` command can trigger multiple GDI requests depending on the switches used. For example, `qstat -f` can query up to 15 different objects (job, queue, execution host, etc.) with one command. Therefore, the limit should be set high enough to allow users to get all necessary information in one command. For instance, a limit of 60,000 `get` requests allows about 5,000 `qstat -f` commands or 60,000 `qstat -j` commands per second. From 7737c249bcfa40cd175508d1f3e15c9b7bad3382 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Tue, 8 Apr 2025 14:45:46 +0200 Subject: [PATCH 02/10] EH: CS-206: Introduce new GDI category objects --- source/libs/sgeobj/cull/sge_ct_CT_L.h | 15 ++++++------- source/libs/sgeobj/json/CT.json | 25 ++++++++++++--------- source/libs/sgeobj/lwdb/ocs_CT_attributes.h | 8 +++---- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/source/libs/sgeobj/cull/sge_ct_CT_L.h b/source/libs/sgeobj/cull/sge_ct_CT_L.h index f91e8d51b..2b14fc932 100644 --- a/source/libs/sgeobj/cull/sge_ct_CT_L.h +++ b/source/libs/sgeobj/cull/sge_ct_CT_L.h @@ -32,16 +32,15 @@ * * An object of this type describes a category of jobs. 
* +* SGE_ULONG(CT_id) - Category ID +* Unique ID of a category +* * SGE_STRING(CT_str) - Category String * String holding all elements of a category (requests, user, project, ...). * * SGE_ULONG(CT_refcount) - Reference Count * Number of jobs referencing this category. * -* SGE_INT(CT_count) - Count -* Number of jobs of this category used in this schuling run -* If -1, then CT_refcount is used -* * SGE_BOOL(CT_rejected) - Rejected * Has this category been rejected as it can not be dispatched now? * @@ -64,9 +63,9 @@ */ enum { - CT_str = CT_LOWERBOUND, + CT_id = CT_LOWERBOUND, + CT_str, CT_refcount, - CT_count, CT_rejected, CT_cache, CT_messages_added, @@ -76,9 +75,9 @@ enum { }; LISTDEF(CT_Type) + SGE_ULONG(CT_id, CULL_UNIQUE | CULL_HASH) SGE_STRING(CT_str, CULL_UNIQUE | CULL_HASH) SGE_ULONG(CT_refcount, CULL_DEFAULT) - SGE_INT(CT_count, CULL_DEFAULT) SGE_BOOL(CT_rejected, CULL_DEFAULT) SGE_LIST(CT_cache, CCT_Type, CULL_DEFAULT) SGE_BOOL(CT_messages_added, CULL_DEFAULT) @@ -88,9 +87,9 @@ LISTDEF(CT_Type) LISTEND NAMEDEF(CTN) + NAME("CT_id") NAME("CT_str") NAME("CT_refcount") - NAME("CT_count") NAME("CT_rejected") NAME("CT_cache") NAME("CT_messages_added") diff --git a/source/libs/sgeobj/json/CT.json b/source/libs/sgeobj/json/CT.json index 6a69041a3..69e3d1473 100644 --- a/source/libs/sgeobj/json/CT.json +++ b/source/libs/sgeobj/json/CT.json @@ -5,7 +5,20 @@ "line": "An object of this type describes a category of jobs." 
}], "cullPrefix": "CT", - "attributes": [{ + "attributes": [ + { + "name": "id", + "summary": "Category ID", + "description": [{ + "line": "Unique ID of a category" + }], + "type": "lUlongT", + "flags": [{ + "name": "UNIQUE" + }, { + "name": "HASH" + }] + }, { "name": "str", "summary": "Category String", "description": [{ @@ -25,16 +38,6 @@ }], "type": "lUlongT", "flags": [] - }, { - "name": "count", - "summary": "Count", - "description": [{ - "line": "Number of jobs of this category used in this schuling run" - }, { - "line": "If -1, then CT_refcount is used" - }], - "type": "lIntT", - "flags": [] }, { "name": "rejected", "summary": "Rejected", diff --git a/source/libs/sgeobj/lwdb/ocs_CT_attributes.h b/source/libs/sgeobj/lwdb/ocs_CT_attributes.h index 653fd2b8f..66e6eab1f 100644 --- a/source/libs/sgeobj/lwdb/ocs_CT_attributes.h +++ b/source/libs/sgeobj/lwdb/ocs_CT_attributes.h @@ -29,9 +29,9 @@ namespace ocs { enum { - CT_str = 7050, + CT_id = 7050, + CT_str, CT_refcount, - CT_count, CT_rejected, CT_cache, CT_messages_added, @@ -41,9 +41,9 @@ enum { }; constexpr const int CT_Type[] = { + CT_id, CT_str, CT_refcount, - CT_count, CT_rejected, CT_cache, CT_messages_added, @@ -54,9 +54,9 @@ constexpr const int CT_Type[] = { }; #define CT_ATTRIBUTES \ + {CT_id, "CT_id", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::UNORDERED_UNIQUE, false, false}, \ {CT_str, "CT_str", AttributeStatic::STRING, nullptr, AttributeStatic::NO_POS, AttributeStatic::UNORDERED_UNIQUE, false, false}, \ {CT_refcount, "CT_refcount", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, false}, \ - {CT_count, "CT_count", AttributeStatic::INT, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, false}, \ {CT_rejected, "CT_rejected", AttributeStatic::BOOL, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, false}, \ {CT_cache, "CT_cache", AttributeStatic::LIST, nullptr, AttributeStatic::NO_POS, 
AttributeStatic::NO_HASH, false, false}, \ {CT_messages_added, "CT_messages_added", AttributeStatic::BOOL, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, false}, \ From 29912b797b9f66ba153503a992a3df201eb891ff Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Tue, 8 Apr 2025 14:50:38 +0200 Subject: [PATCH 03/10] EH: CS-206: Introduce new GDI category objects --- source/libs/sgeobj/cull/sge_job_JB_L.h | 8 +++++++- source/libs/sgeobj/json/JB.json | 10 ++++++++++ source/libs/sgeobj/lwdb/ocs_JB_attributes.h | 7 +++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/source/libs/sgeobj/cull/sge_job_JB_L.h b/source/libs/sgeobj/cull/sge_job_JB_L.h index bf38d8592..687e0ff34 100644 --- a/source/libs/sgeobj/cull/sge_job_JB_L.h +++ b/source/libs/sgeobj/cull/sge_job_JB_L.h @@ -381,6 +381,9 @@ * SGE_ULONG(JB_sync_options) - sync options * Bits that have been specified to the -sync switch. * +* SGE_ULONG(JB_category_id) - Category ID (CAT_id) +* Category ID (CAT_id). Within the scheduler the field category refers to the category object. +* */ enum { @@ -468,7 +471,8 @@ enum { JB_submission_command_line, JB_grp_list, JB_joker, - JB_sync_options + JB_sync_options, + JB_category_id }; LISTDEF(JB_Type) @@ -557,6 +561,7 @@ LISTDEF(JB_Type) SGE_LIST(JB_grp_list, ST_Type, CULL_SPOOL) SGE_LIST(JB_joker, VA_Type, CULL_SPOOL) SGE_ULONG(JB_sync_options, CULL_SPOOL) + SGE_ULONG(JB_category_id, CULL_HASH) LISTEND NAMEDEF(JBN) @@ -645,6 +650,7 @@ NAMEDEF(JBN) NAME("JB_grp_list") NAME("JB_joker") NAME("JB_sync_options") + NAME("JB_category_id") NAMEEND #define JB_SIZE sizeof(JBN)/sizeof(char *) diff --git a/source/libs/sgeobj/json/JB.json b/source/libs/sgeobj/json/JB.json index a80fff2d2..5ca8226df 100644 --- a/source/libs/sgeobj/json/JB.json +++ b/source/libs/sgeobj/json/JB.json @@ -1061,5 +1061,15 @@ "flags": [{ "name": "SPOOL" }] + }, { + "name": "category_id", + "summary": "Category ID (CAT_id)", + "description": [{ + "line": "Category ID (CAT_id). 
Within the scheduler the field category refers to the category object." + }], + "type": "lUlongT", + "flags": [{ + "name": "HASH" + }] }] } diff --git a/source/libs/sgeobj/lwdb/ocs_JB_attributes.h b/source/libs/sgeobj/lwdb/ocs_JB_attributes.h index 16eb2b3fb..bb06f8bb0 100644 --- a/source/libs/sgeobj/lwdb/ocs_JB_attributes.h +++ b/source/libs/sgeobj/lwdb/ocs_JB_attributes.h @@ -113,7 +113,8 @@ enum { JB_submission_command_line, JB_grp_list, JB_joker, - JB_sync_options + JB_sync_options, + JB_category_id }; constexpr const int JB_Type[] = { @@ -202,6 +203,7 @@ constexpr const int JB_Type[] = { JB_grp_list, JB_joker, JB_sync_options, + JB_category_id, AttributeStatic::END_OF_ATTRIBUTES }; @@ -290,7 +292,8 @@ constexpr const int JB_Type[] = { {JB_submission_command_line, "JB_submission_command_line", AttributeStatic::STRING, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ {JB_grp_list, "JB_grp_list", AttributeStatic::LIST, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ {JB_joker, "JB_joker", AttributeStatic::LIST, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ - {JB_sync_options, "JB_sync_options", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true} \ + {JB_sync_options, "JB_sync_options", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ + {JB_category_id, "JB_category_id", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::UNORDERED_UNIQUE, false, false} \ } // end namespace From 534440c2d95bcde2178844e0cb6cc7def8597c65 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Tue, 8 Apr 2025 14:52:32 +0200 Subject: [PATCH 04/10] EH: CS-207: Trigger category add/del as part of job add/del --- source/daemons/common/CMakeLists.txt | 1 - source/daemons/common/category.cc | 224 ------------ source/daemons/qmaster/CMakeLists.txt | 2 +- source/daemons/qmaster/msg_qmaster.h | 
2 +- source/daemons/qmaster/ocs_CategoryQmaster.cc | 272 +++++++++++++++ source/daemons/qmaster/ocs_CategoryQmaster.h | 64 ++++ .../qmaster/ocs_JsonAccountingFileWriter.cc | 6 +- .../qmaster/ocs_JsonReportingFileWriter.cc | 6 +- source/daemons/qmaster/setup_qmaster.cc | 25 +- source/daemons/qmaster/sge_c_gdi.cc | 276 +++++++-------- source/daemons/qmaster/sge_c_gdi.h | 3 +- source/daemons/qmaster/sge_give_jobs.cc | 19 +- source/daemons/qmaster/sge_job_qmaster.cc | 119 ++++--- .../qmaster/sge_persistence_qmaster.cc | 1 - .../qmaster/sge_qmaster_process_message.cc | 2 +- .../daemons/qmaster/sge_reporting_qmaster.cc | 10 +- .../daemons/qmaster/sge_sched_job_category.cc | 319 ------------------ .../daemons/qmaster/sge_sched_job_category.h | 77 ----- .../daemons/qmaster/sge_sched_prepare_data.cc | 120 ++++--- .../daemons/qmaster/sge_sched_prepare_data.h | 7 + .../qmaster/sge_sched_process_events.cc | 13 +- source/daemons/qmaster/sge_sched_thread.cc | 26 +- .../daemons/qmaster/sge_thread_scheduler.cc | 15 +- source/daemons/qmaster/sge_userprj_qmaster.cc | 26 +- source/daemons/qmaster/sge_userset_qmaster.cc | 35 +- source/libs/evm/sge_event_master.cc | 10 +- source/libs/gdi/ocs_gdi_Target.cc | 1 + source/libs/gdi/ocs_gdi_Target.h | 3 +- source/libs/mir/sge_mirror.cc | 32 ++ source/libs/sgeobj/CMakeLists.txt | 2 + source/libs/sgeobj/msg_sgeobjlib.h | 3 + source/libs/sgeobj/ocs_Category.cc | 140 ++++++++ source/libs/sgeobj/ocs_Category.h | 37 ++ source/libs/sgeobj/sge_event.cc | 20 +- source/libs/sgeobj/sge_event.h | 6 + source/libs/sgeobj/sge_object.h | 1 + source/libs/sgeobj/sge_resource_quota.cc | 1 + source/libs/sgeobj/sge_resource_quota.h | 1 + .../libs/sgeobj/sge_resource_quota_service.cc | 221 ++++++++++++ .../sgeobj/sge_resource_quota_service.h} | 11 +- source/libs/spool/flatfile/sge_flatfile.cc | 43 ++- source/libs/spool/flatfile/sge_flatfile.h | 2 + .../libs/spool/flatfile/sge_flatfile_obj.cc | 7 + source/libs/spool/flatfile/sge_flatfile_obj.h | 1 + 44 files 
changed, 1250 insertions(+), 962 deletions(-) delete mode 100644 source/daemons/common/category.cc create mode 100644 source/daemons/qmaster/ocs_CategoryQmaster.cc create mode 100644 source/daemons/qmaster/ocs_CategoryQmaster.h delete mode 100644 source/daemons/qmaster/sge_sched_job_category.cc delete mode 100644 source/daemons/qmaster/sge_sched_job_category.h create mode 100644 source/libs/sgeobj/ocs_Category.cc create mode 100644 source/libs/sgeobj/ocs_Category.h create mode 100644 source/libs/sgeobj/sge_resource_quota_service.cc rename source/{daemons/common/category.h => libs/sgeobj/sge_resource_quota_service.h} (82%) diff --git a/source/daemons/common/CMakeLists.txt b/source/daemons/common/CMakeLists.txt index eca2b61df..1067c6472 100644 --- a/source/daemons/common/CMakeLists.txt +++ b/source/daemons/common/CMakeLists.txt @@ -22,7 +22,6 @@ set(LIBRARY_NAME daemonscommon) set(LIBRARY_SOURCES admin_mail.cc - category.cc err_trace.cc lock.cc mail.cc diff --git a/source/daemons/common/category.cc b/source/daemons/common/category.cc deleted file mode 100644 index a2684e392..000000000 --- a/source/daemons/common/category.cc +++ /dev/null @@ -1,224 +0,0 @@ -/*___INFO__MARK_BEGIN__*/ -/************************************************************************* - * - * The Contents of this file are made available subject to the terms of - * the Sun Industry Standards Source License Version 1.2 - * - * Sun Microsystems Inc., March, 2001 - * - * - * Sun Industry Standards Source License Version 1.2 - * ================================================= - * The contents of this file are subject to the Sun Industry Standards - * Source License Version 1.2 (the "License"); You may not use this file - * except in compliance with the License. 
You may obtain a copy of the - * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * - * Software provided under this License is provided on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, - * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. - * See the License for the specific provisions governing your rights and - * obligations concerning the Software. - * - * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * - * Copyright: 2001 by Sun Microsystems, Inc. - * - * All Rights Reserved. - * - * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH - * - ************************************************************************/ -/*___INFO__MARK_END__*/ -#include - -#include "uti/sge_dstring.h" -#include "uti/sge_log.h" -#include "uti/sge_rmon_macros.h" - -#include "sgeobj/sge_job.h" -#include "sgeobj/sge_userprj.h" -#include "sgeobj/sge_ja_task.h" - -#include "sched/sge_resource_quota_schedd.h" - -#if 0 /* TODO: EB: ST: should this be enabled again? 
*/ - -/* struct containing the cull field position of the job target structures - and the reduced order elements */ -typedef struct { - int JB_hard_queue_list_pos; - int JB_master_hard_queue_list_pos; - int JB_hard_resource_list_pos; - int JB_soft_resource_list_pos; - int JB_checkpoint_name_pos; - int JB_type_pos; - int JB_owner_pos; - int JB_group_pos; - int JB_project_pos; - int JB_pe_pos; - int JB_range_pos; - int JB_ar_pos; -} order_pos_t; - -typedef struct { - pthread_mutex_t cull_order_mutex; /* gards the last_update access */ - order_pos_t cull_order_pos; /* stores cull positions in the job, ja-task, and order structure */ -} sge_category_t; - -static sge_category_t Category_Control = {PTHREAD_MUTEX_INITIALIZER, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}}; - -#endif - -/*-------------------------------------------------------------------------*/ -/* build the category string */ -/* the category string includes now the soft requests */ -/*-------------------------------------------------------------------------*/ -/****** category/sge_build_job_category_dstring() ****************************** -* NAME -* sge_build_job_category_dstring() -- build the category string -* -* SYNOPSIS -* void sge_build_job_category_dstring(dstring *category_str, lListElem -* *job, lList *acl_list) -* -* FUNCTION -* The following parameter are put into the category: -* hard_queue_list -* master_hard_queue_list -* hard_resource_list -* soft_resource_list -* checkpoint_name -* type -* -* owner/group: -U user_lists -* Omitted, if user_lists/xuser_lists were not used in -* host_conf(5), sge_pe(5) and queue_conf(5). In sge_conf(5) -* user_lists/xuser_lists still can be used, as it causes -* jobs already be rejected at submit time. -* -* project: -P user_lists -* Omitted, if projects/xprojects were not used in -* host_conf(5), sge_pe(5) and queue_conf(5). In sge_conf(5) -* projects/xprojects still can be used, as it cuases -* jobs already be rejected at submit time. 
-* -* pe -* -* INPUTS -* dstring *category_str - target string, contains the category or nothing -* lListElem *job - the job for the category creating -* lList *acl_list - global access list -* -* NOTES -* MT-NOTE: sge_build_job_category_dstring() is MT safe as long as the caller is -* -*******************************************************************************/ -void sge_build_job_category_dstring(dstring *category_str, lListElem *job, const lList *acl_list, const lList *prj_list, bool *did_project, const lList *rqs_list) { - DENTER(TOP_LAYER); - -#if 0 - sge_mutex_lock("cull_order_mutex", __func__, __LINE__, &Category_Control.cull_order_mutex); - if (Category_Control.cull_order_pos.JB_hard_queue_list_pos == -1) { - Category_Control.cull_order_pos.JB_checkpoint_name_pos = lGetPosViaElem(job, JB_checkpoint_name, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_soft_resource_list_pos = lGetPosViaElem(job, JB_soft_resource_list, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_master_hard_queue_list_pos = lGetPosViaElem(job, JB_master_hard_queue_list, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_hard_queue_list_pos = lGetPosViaElem(job, JB_hard_queue_list, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_owner_pos = lGetPosViaElem(job, JB_owner, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_group_pos = lGetPosViaElem(job, JB_group, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_hard_resource_list_pos = lGetPosViaElem(job, JB_hard_resource_list, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_type_pos = lGetPosViaElem(job, JB_type, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_project_pos = lGetPosViaElem(job, JB_project, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_ar_pos = lGetPosViaElem(job, JB_ar, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_pe_pos = lGetPosViaElem(job, JB_pe, SGE_NO_ABORT); - Category_Control.cull_order_pos.JB_range_pos = lGetPosViaElem(job, JB_pe_range, SGE_NO_ABORT); - } - 
sge_mutex_unlock("cull_order_mutex", __func__, __LINE__, &Category_Control.cull_order_mutex); -#endif - - // owner (user, UNIX group, and ACLs) - const char *owner = lGetString(job, JB_owner); - const char *group = lGetString(job, JB_group); - const lList *grp_list = lGetList(job, JB_grp_list); - sge_unparse_acl_dstring(category_str, owner, group, grp_list, acl_list, "-U"); - - // -u if referenced in resource quota sets - // - // TODO: A possible performance enhancement is to split user and group inside category. - // Some users are only referenced by the unix group. Their jobs could be grouped - // together by referencing only the group in the category string - if (sge_user_is_referenced_in_rqs(rqs_list, owner, group, grp_list, acl_list)) { - sge_dstring_append(category_str, "-u "); - sge_dstring_append(category_str, owner); - sge_dstring_append_char(category_str, ' '); - } - - // -scope global -hard -q - sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_GLOBAL, true), "-scope global -hard -q"); - - // -scope master -hard -q - sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_MASTER, true), "-scope master -hard -q"); - - // -scope slave -hard -q - sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_SLAVE, true), "-scope slave -hard -q"); - - - // -scope global -hard -l - sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_GLOBAL, true), "-scope global -hard -l"); - - // -scope master -hard -l - sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_MASTER, true), "-scope master -hard -l"); - - // -scope slave -hard -l - sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_SLAVE, true), "-scope slave -hard -l"); - - // TODO: evaluate if soft requests should be part of the category string -#if 1 - // -scope global -soft -q - sge_unparse_queue_list_dstring(category_str, 
job_get_queue_listRW(job, JRS_SCOPE_GLOBAL, false), "-scope global -soft -q"); - - // -scope global -soft -l - sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_GLOBAL, false), "-scope global -soft -l"); -#endif - - // -pe pe_name pe_range - sge_unparse_pe_dstring(category_str, job, lGetPosViaElem(job, JB_pe, SGE_NO_ABORT), lGetPosViaElem(job, JB_pe_range, SGE_NO_ABORT), "-pe"); - - // -ckpt ckpt_name - sge_unparse_string_option_dstring(category_str, job, lGetPosViaElem(job, JB_checkpoint_name, SGE_NO_ABORT), "-ckpt"); - - // interactive job type - if (JOB_TYPE_IS_IMMEDIATE(lGetPosUlong(job, lGetPosViaElem(job, JB_type, SGE_NO_ABORT)))) { - sge_dstring_append(category_str, "-I y "); - } - - // -P project - { - int project_nm = lGetPosViaElem(job, JB_project, SGE_NO_ABORT); - const char *project = lGetPosString(job, project_nm); - - if (project != nullptr) { - const lListElem *prj = lGetElemStr(prj_list, PR_name, project); - - if (prj != nullptr && lGetBool(prj, PR_consider_with_categories)) { - if (did_project) { - *did_project = true; - } - sge_unparse_string_option_dstring(category_str, job, project_nm, "-P"); - } else { - if (did_project) { - *did_project = false; - } - } - } - } - - // -ar ar_id - sge_unparse_ulong_option_dstring(category_str, job, lGetPosViaElem(job, JB_ar, SGE_NO_ABORT), "-ar"); - - // remove the last white space that the last unparse function has written - sge_dstring_strip_white_space_at_eol(category_str); - - DRETURN_VOID; -} diff --git a/source/daemons/qmaster/CMakeLists.txt b/source/daemons/qmaster/CMakeLists.txt index 41363a9c3..7253cc5af 100644 --- a/source/daemons/qmaster/CMakeLists.txt +++ b/source/daemons/qmaster/CMakeLists.txt @@ -26,6 +26,7 @@ set(QMASTER_SOURCES job_report_qmaster.cc ocs_BaseAccountingFileWriter.cc ocs_BaseReportingFileWriter.cc + ocs_CategoryQmaster.cc ocs_JsonAccountingFileWriter.cc ocs_JsonReportingFileWriter.cc ocs_MirrorReaderDataStore.cc @@ -65,7 +66,6 @@ 
set(QMASTER_SOURCES sge_reporting_qmaster.cc sge_resource_quota_qmaster.cc sge_rusage.cc - sge_sched_job_category.cc sge_sched_order.cc sge_sched_prepare_data.cc sge_sched_process_events.cc diff --git a/source/daemons/qmaster/msg_qmaster.h b/source/daemons/qmaster/msg_qmaster.h index 8262a8801..686a57da7 100644 --- a/source/daemons/qmaster/msg_qmaster.h +++ b/source/daemons/qmaster/msg_qmaster.h @@ -148,7 +148,7 @@ #define MSG_MEM_MALLOC _MESSAGE(33121, _("malloc failure")) #define MSG_SGETEXT_UNKNOWNOP _MESSAGE(33122, _("unknown operation")) -#define MSG_SGETEXT_OPNOIMPFORTARGET _MESSAGE(33125, _("operation not implemented for target")) +#define MSG_SGETEXT_OPNOIMPFORTARGET_S _MESSAGE(33125, _("operation not implemented for target in " SFN)) #define MSG_SGETEXT_NOADMINHOST_S _MESSAGE(33126, _("denied: host " SFQ " is no admin host")) #define MSG_SGETEXT_NOSUBMITHOST_S _MESSAGE(33127, _("denied: host " SFQ " is no submit host")) #define MSG_SGETEXT_NOSUBMITORADMINHOST_S _MESSAGE(33128, _("denied: host " SFQ " is neither submit nor admin host")) diff --git a/source/daemons/qmaster/ocs_CategoryQmaster.cc b/source/daemons/qmaster/ocs_CategoryQmaster.cc new file mode 100644 index 000000000..94654cece --- /dev/null +++ b/source/daemons/qmaster/ocs_CategoryQmaster.cc @@ -0,0 +1,272 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "uti/sge_log.h" +#include "uti/sge_rmon_macros.h" +#include "uti/sge_dstring.h" + +#include "sgeobj/ocs_Category.h" +#include "sgeobj/sge_job.h" + +#include "ocs_CategoryQmaster.h" +#include "ocs_DataStore.h" +#include "sge_event_master.h" + +/****************************************************** + * + * Description: + * + * Categories are used to speed up the job dispatching + * in the scheduler. Before the job dispatching starts, + * the categories have to be built for new jobs and + * reset for existing jobs. A new job gets a reference + * to its category regardless if it is existing or not. + * + * This is done with: + * - ocs::CategoryQmaster::attach_job() + * - ocs::CategoryQmaster::reattach_all_jobs() + * - ocs::CategoryQmaster::reset_tmp_data() + * + * During the dispatch run for a job, the category caches + * all hosts and queues, which are not suitable for that + * category. This leads to a speed improvement when other + * jobs of the same category are matched. In addition to + * the host and queues, it has to cache the generated messages + * as well, since they are not generated again. If a category + * cannot run in the cluster at all, the category is rejected + * and the messages are added for all jobs in the category. + * + * This is done for simple and parallel jobs. In addition it + * also caches the results of soft request matching. Since + * a job can only soft request a fixed resource, it is not + * changing during a scheduling run and the soft request violations + * for a given queue are the same for all jobs in one + * category. 
+ * + ******************************************************/ + +bool +ocs::CategoryQmaster::attach_job(lList **master_category_list, lListElem **category, lListElem *job, + const lList *master_userset_list, const lList *master_project_list, + const lList *master_rqs_list, bool send_events, u_long32 gdi_session) { + DENTER(TOP_LAYER); + + // check if the input parameters are valid + if (master_category_list == nullptr || category == nullptr || job == nullptr) { + DRETURN(false); + } + + // check if the category list is already created + if (*master_category_list == nullptr) { + *master_category_list = lCreateList("master category list", CT_Type); + } + + // build the category string + dstring category_str = DSTRING_INIT; + Category::build_string(&category_str, job, master_userset_list, master_project_list, master_rqs_list); + const char *cat_str = sge_dstring_get_string(&category_str); + + DPRINTF("category string: %s\n", cat_str); + + // get the category or create a new one + bool is_new = false; + *category = lGetElemStrRW(*master_category_list, CT_str, cat_str); + if (*category == nullptr) { + *category = lAddElemStr(master_category_list, CT_str, cat_str, CT_Type); + lSetUlong(*category, CT_id, Category::get_next_id()); + is_new = true; + } + sge_dstring_free(&category_str); + + // Increase the reference count + lSetUlong(*category, CT_refcount, lGetUlong(*category, CT_refcount) + 1); + + // Point to the category in the job + u_long32 category_id = lGetUlong(*category, CT_id); + lSetUlong(job, JB_category_id, category_id); + + // Send events if required + if (send_events) { + ev_event category_event_type = is_new ? 
sgeE_CATEGORY_ADD : sgeE_CATEGORY_MOD; + sge_add_event(0, category_event_type, category_id, 0, nullptr, + nullptr, nullptr, *category, gdi_session); + } + + DRETURN(true); +} + +bool +ocs::CategoryQmaster::detach_job(lList **master_category_list, lListElem *job, bool send_events, u_long32 gdi_session) { + DENTER(TOP_LAYER); + + // both arguments are dereferenced below, so reject the call if either one is missing + if (master_category_list == nullptr || job == nullptr) { + DRETURN(false); + } + lListElem *category = lGetElemUlongRW(*master_category_list, CT_id, lGetUlong(job, JB_category_id)); + if (category == nullptr) { + DRETURN(false); + } + + // drop one reference; releasing the last reference removes the category from the list + bool is_del = false; + u_long32 refcount = lGetUlong(category, CT_refcount); + if (refcount > 1) { + lSetUlong(category, CT_refcount, refcount - 1); + } else { + lRemoveElem(*master_category_list, &category); + is_del = true; + } + + if (send_events) { + ev_event category_event = is_del ? sgeE_CATEGORY_DEL : sgeE_CATEGORY_MOD; + sge_add_event(0, category_event, lGetUlong(job, JB_category_id), 0, + nullptr, nullptr, nullptr, category, gdi_session); + } + + DRETURN(true); +} + +void +ocs::CategoryQmaster::reattach_job(lList **master_category_list, lListElem *job, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session) { + DENTER(TOP_LAYER); + + // remove the job from current category + detach_job(master_category_list, job, send_events, gdi_session); + + // add the job to the new category + lListElem *category; + attach_job(master_category_list, &category, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); + DRETURN_VOID; +} + +void +ocs::CategoryQmaster::attach_all_jobs(lList *master_job_list, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session) { + DENTER(TOP_LAYER); + lList **master_category_list = 
DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + + // add all jobs to the category list, create categories if they do not exist + lListElem *job; + for_each_rw(job, master_job_list) { + lListElem *category = nullptr; + attach_job(master_category_list, &category, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); + } + DRETURN_VOID; +} + +void +ocs::CategoryQmaster::reattach_all_jobs(lList *master_job_list, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session) { + DENTER(TOP_LAYER); + lList **master_category_list = DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + + lListElem *job; + for_each_rw(job, master_job_list) { + reattach_job(master_category_list, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); + } + DRETURN_VOID; +} + +/****** ocs::CategoryQmaster::reset_tmp_data() ********************************* +* NAME +* ocs::CategoryQmaster::reset_tmp_data() -- resets the category temp information +* +* SYNOPSIS +* void ocs::CategoryQmaster::reset_tmp_data() +* +* FUNCTION +* Some information in the category should only live through one scheduling run. 
+* This information is reset by this call for every category in the master category list: +* - the per-category cache (CT_cache), after freeing the buffers referenced by CCT_pe_job_slots +* - the flag that records whether the messages were already added (CT_messages_added) +* - the rejected flag (CT_rejected) +* - the reservation-rejected flag (CT_reservation_rejected) +* - the cached resource contribution (CT_rc_valid, CT_resource_contribution) +* +* RESULT +* void +* +* NOTES +* MT-NOTE: ocs::CategoryQmaster::reset_tmp_data() is not MT safe +* +*******************************************************************************/ +void +ocs::CategoryQmaster::reset_tmp_data() { + DENTER(TOP_LAYER); + + lList *master_category_list = *DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + lListElem *cat; + for_each_rw (cat, master_category_list) { + + // deallocate memory stored in the cache itself + lListElem *cache; + for_each_rw (cache, lGetList(cat, CT_cache)) { + auto *range = static_cast(lGetRef(cache, CCT_pe_job_slots)); + sge_free(&range); + lSetRef(cache, CCT_pe_job_slots, nullptr); + } + + // reset the cache and the messages added flag + lSetList(cat, CT_cache, nullptr); + lSetBool(cat, CT_messages_added, false); + + // reset information if category was rejected + lSetBool(cat, CT_rejected, false); + lSetBool(cat, CT_reservation_rejected, false); + + // reset cached resource contribution + lSetBool(cat, CT_rc_valid, false); + lSetDouble(cat, CT_resource_contribution, 0.0); + } + + DRETURN_VOID; +} + +void +ocs::CategoryQmaster::refresh_cat_data_in_job(lList *master_category_list, lListElem *job) { + DENTER(TOP_LAYER); + u_long32 category_id = lGetUlong(job, JB_category_id); + lListElem *category = lGetElemUlongRW(master_category_list, CT_id, category_id); + + DPRINTF("###### category id: %lu (%p)\n", category_id, category); + + lSetRef(job, JB_category, category); + DRETURN_VOID; +} + +void +ocs::CategoryQmaster::refresh_cat_data_all_jobs(lList *master_category_list, lList *master_job_list) { + DENTER(TOP_LAYER); + + if (master_category_list == nullptr || master_job_list == nullptr) { + DRETURN_VOID; + } + + lListElem *job; + for_each_rw(job, master_job_list) { + 
refresh_cat_data_in_job(master_category_list, job); + } + DRETURN_VOID; +} diff --git a/source/daemons/qmaster/ocs_CategoryQmaster.h b/source/daemons/qmaster/ocs_CategoryQmaster.h new file mode 100644 index 000000000..c07413aaa --- /dev/null +++ b/source/daemons/qmaster/ocs_CategoryQmaster.h @@ -0,0 +1,64 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "cull/cull.h" + +#include "gdi/ocs_gdi_Packet.h" +#include "gdi/ocs_gdi_Task.h" + +#include "sge_c_gdi.h" + +namespace ocs { + class CategoryQmaster { + public: + static bool + attach_job(lList **master_category_list, lListElem **category, lListElem *job, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session); + + static bool + detach_job(lList **master_category_list, lListElem *job, bool send_events, u_long32 gdi_session); + + static void + reattach_job(lList **master_category_list, lListElem *job, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session); + + static void + refresh_cat_data_in_job(lList *master_category_list, lListElem *job); + + static void + attach_all_jobs(lList *master_job_list, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session); + + static void + reattach_all_jobs(lList *master_job_list, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session); + + static void + reset_tmp_data(); + + static void + refresh_cat_data_all_jobs(lList *master_category_list, lList *master_job_list); + }; +} diff --git a/source/daemons/qmaster/ocs_JsonAccountingFileWriter.cc b/source/daemons/qmaster/ocs_JsonAccountingFileWriter.cc index 1961591db..c86459f1f 100644 --- a/source/daemons/qmaster/ocs_JsonAccountingFileWriter.cc +++ b/source/daemons/qmaster/ocs_JsonAccountingFileWriter.cc @@ -18,8 +18,7 @@ ***************************************************************************/ /*___INFO__MARK_END_NEW__*/ -#include "category.h" - +#include "sgeobj/ocs_Category.h" #include "sgeobj/ocs_DataStore.h" #include 
"uti/sge_log.h" @@ -44,8 +43,7 @@ namespace ocs { DSTRING_STATIC(category_dstring, MAX_STRING_SIZE); // get category string - sge_build_job_category_dstring(&category_dstring, job, master_userset_list, master_project_list, nullptr, - master_rqs_list); + Category::build_string(&category_dstring, job, master_userset_list, master_project_list, master_rqs_list); const char *category_string = sge_dstring_get_string(&category_dstring); // get accounting data diff --git a/source/daemons/qmaster/ocs_JsonReportingFileWriter.cc b/source/daemons/qmaster/ocs_JsonReportingFileWriter.cc index 626701e48..c285ade44 100644 --- a/source/daemons/qmaster/ocs_JsonReportingFileWriter.cc +++ b/source/daemons/qmaster/ocs_JsonReportingFileWriter.cc @@ -18,11 +18,10 @@ ***************************************************************************/ /*___INFO__MARK_END_NEW__*/ -#include "category.h" - #include "sched/sge_resource_utilization.h" #include "sched/sge_sharetree_printing.h" +#include "sgeobj/ocs_Category.h" #include "sgeobj/ocs_DataStore.h" #include "sgeobj/sge_centry.h" #include "sgeobj/sge_cqueue.h" @@ -58,8 +57,7 @@ namespace ocs { DSTRING_STATIC(category_dstring, MAX_STRING_SIZE); // get category string - sge_build_job_category_dstring(&category_dstring, job, master_userset_list, master_project_list, nullptr, - master_rqs_list); + Category::build_string(&category_dstring, job, master_userset_list, master_project_list, master_rqs_list); const char *category_string = sge_dstring_get_string(&category_dstring); // get accounting data diff --git a/source/daemons/qmaster/setup_qmaster.cc b/source/daemons/qmaster/setup_qmaster.cc index bc27371ec..f6d40721c 100644 --- a/source/daemons/qmaster/setup_qmaster.cc +++ b/source/daemons/qmaster/setup_qmaster.cc @@ -79,6 +79,7 @@ #include "sched/debit.h" +#include "ocs_CategoryQmaster.h" #include "sge_resource_quota_qmaster.h" #include "sge_advance_reservation_qmaster.h" #include "sge_qinstance_qmaster.h" @@ -1086,6 +1087,7 @@ setup_qmaster() { 
time_end = time(nullptr); answer_list_output(&answer_list); + { u_long32 saved_logginglevel = log_state_get_log_level(); log_state_set_log_level(LOG_INFO); @@ -1113,7 +1115,8 @@ setup_qmaster() { sge_task_depend_init(jep, &answer_list, ocs::SessionManager::GDI_SESSION_NONE); centry_list_fill_request(job_get_hard_resource_listRW(jep), - nullptr, *ocs::DataStore::get_master_list(SGE_TYPE_CENTRY), false, true, false); + nullptr, *ocs::DataStore::get_master_list(SGE_TYPE_CENTRY), + false, true, false); /* need to update JSUSPENDED_ON_SUBORDINATE since task spooling is not triggered upon queue un/-suspension */ @@ -1186,6 +1189,14 @@ setup_qmaster() { init_categories(); + // Create all categories + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); + ocs::CategoryQmaster::attach_all_jobs(master_job_list, master_userset_list, + master_project_list, master_rqs_list, false, ocs::SessionManager::GDI_SESSION_NONE); + DRETURN(0); } @@ -1422,13 +1433,17 @@ static void init_categories() { /* * now set categories flag with usersets/projects used as ACL */ - for_each_ep(ep, p_list) - if ((prj = prj_list_locate(master_project_list, lGetString(ep, PR_name)))) + for_each_ep(ep, p_list) { + if ((prj = prj_list_locate(master_project_list, lGetString(ep, PR_name)))) { lSetBool(prj, PR_consider_with_categories, true); + } + } - for_each_ep(ep, u_list) - if ((acl = lGetElemStrRW(master_userset_list, US_name, lGetString(ep, US_name)))) + for_each_ep(ep, u_list) { + if ((acl = lGetElemStrRW(master_userset_list, US_name, lGetString(ep, US_name)))) { lSetBool(acl, US_consider_with_categories, true); + } + } lFreeList(&p_list); lFreeList(&u_list); diff --git a/source/daemons/qmaster/sge_c_gdi.cc 
b/source/daemons/qmaster/sge_c_gdi.cc index 94de2b0c5..7ae667359 100644 --- a/source/daemons/qmaster/sge_c_gdi.cc +++ b/source/daemons/qmaster/sge_c_gdi.cc @@ -60,6 +60,7 @@ #include "gdi/ocs_gdi_Task.h" #include "gdi/ocs_gdi_Command.h" +#include "ocs_CategoryQmaster.h" #include "sge_follow.h" #include "sge_advance_reservation_qmaster.h" #include "sge_thread_scheduler.h" @@ -143,7 +144,7 @@ static bool sge_chck_mod_perm_user(const ocs::gdi::Packet *packet, lList **alpp, u_long32 target); static bool -sge_task_check_get_perm_host(ocs::gdi::Packet *packet, ocs::gdi::Task *task, monitoring_t *monitor); +sge_task_check_get_perm_host(ocs::gdi::Packet *packet, ocs::gdi::Task *task); static bool sge_chck_mod_perm_host(const ocs::gdi::Packet *packet, lList **alpp, u_long32 target); @@ -191,7 +192,8 @@ static gdi_object_t gdi_object[] = { {ocs::gdi::Target::SGE_HGRP_LIST, HGRP_name, HGRP_Type, "host group", SGE_TYPE_HGROUP, hgroup_mod, hgroup_spool, hgroup_success}, {ocs::gdi::Target::SGE_AR_LIST, AR_id, AR_Type, "advance reservation", SGE_TYPE_AR, ar_mod, ar_spool, ar_success}, {ocs::gdi::Target::SGE_DUMMY_LIST, 0, nullptr, "general request", SGE_TYPE_NONE, nullptr, nullptr, nullptr}, - {ocs::gdi::Target::NO_TARGET, 0, nullptr, nullptr, SGE_TYPE_NONE, nullptr, nullptr, nullptr} + {ocs::gdi::Target::SGE_CAT_LIST, CT_id, nullptr, "category", SGE_TYPE_CATEGORY, nullptr, nullptr, nullptr}, + {ocs::gdi::Target::NO_TARGET, 0, nullptr, nullptr, SGE_TYPE_NONE, nullptr, nullptr, nullptr} }; /* *INDENT-ON* */ @@ -255,13 +257,12 @@ sge_c_gdi_process_in_listener(ocs::gdi::Packet *packet, ocs::gdi::Task *task, } bool -sge_c_gdi_check_execution_permission(ocs::gdi::Packet *packet, ocs::gdi::Task *task, - monitoring_t *monitor) { +sge_c_gdi_check_execution_permission(ocs::gdi::Packet *packet, ocs::gdi::Task *task) { DENTER(TOP_LAYER); int operation = task->command; switch (operation) { case ocs::gdi::Command::SGE_GDI_GET: - DRETURN(sge_task_check_get_perm_host(packet, task, monitor)); + 
DRETURN(sge_task_check_get_perm_host(packet, task)); case ocs::gdi::Command::SGE_GDI_ADD: case ocs::gdi::Command::SGE_GDI_MOD: case ocs::gdi::Command::SGE_GDI_COPY: @@ -450,7 +451,7 @@ sge_c_gdi_get_in_listener(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi:: lFreeList(&(task->data_list)); // check the permission - if (!sge_task_check_get_perm_host(packet, task, monitor)) { + if (!sge_task_check_get_perm_host(packet, task)) { DRETURN_VOID; } @@ -489,11 +490,8 @@ sge_c_gdi_get_in_worker(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi::Ta answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_OK, ANSWER_QUALITY_END); DRETURN_VOID; case ocs::gdi::Target::SGE_SC_LIST: /* TODO EB: move this into the scheduler configuration, - and pack the list right away */ - { - lList *conf = nullptr; - - conf = sconf_get_config_list(); + and pack the list right away */ { + lList *conf = sconf_get_config_list(); task->data_list = lSelectHashPack("", conf, task->condition, task->enumeration, false, nullptr); task->do_select_pack_simultaneous = false; snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_GDI_OKNL); @@ -508,16 +506,16 @@ sge_c_gdi_get_in_worker(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi::Ta * If the scheduler is not available the information in the job info * messages are outdated. In this case we have to reject the request. 
*/ - if (task->target == ocs::gdi::Target::SGE_SME_LIST && - !sge_has_event_client(EV_ID_SCHEDD)) { - answer_list_add(&(task->answer_list), MSG_SGETEXT_JOBINFOMESSAGESOUTDATED, - STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR); + if (task->target == ocs::gdi::Target::SGE_SME_LIST && !sge_has_event_client(EV_ID_SCHEDD)) { + answer_list_add(&(task->answer_list), MSG_SGETEXT_JOBINFOMESSAGESOUTDATED, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR); } else if (ao == nullptr || ao->list_type == SGE_TYPE_NONE) { - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); } else { lList *data_source = *ocs::DataStore::get_master_list_rw(ao->list_type); + DPRINTF("Got list with " sge_uu32 " elements\n", lGetNumberOfElem(data_source)); + if (packet->is_intern_request) { /* intern requests need no pb so it is not necessary to postpone the operation */ task->data_list = lSelectHashPack("", data_source, task->condition, @@ -662,7 +660,7 @@ sge_c_gdi_add(ocs::gdi::Packet *packet, ocs::gdi::Task *task, default: if (!ao) { - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } @@ -740,7 +738,7 @@ sge_c_gdi_del(ocs::gdi::Packet *packet, ocs::gdi::Task *task, ocs::gdi::Command: packet->user, packet->host); break; default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } @@ -816,7 +814,7 @@ sge_c_gdi_del(ocs::gdi::Packet *packet, ocs::gdi::Task *task, ocs::gdi::Command: ar_del(packet, task, ep, &(task->answer_list), 
ocs::DataStore::get_master_list_rw(SGE_TYPE_AR), monitor); break; default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } /* switch target */ @@ -845,7 +843,7 @@ static void sge_c_gdi_copy(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi: packet, task, monitor); break; default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } @@ -907,7 +905,7 @@ sge_c_gdi_permcheck(ocs::gdi::Packet *packet, ocs::gdi::Task *task, monitoring_t sge_gdi_do_permcheck(packet, task); break; default: - WARNING(SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + WARNING(MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); } DRETURN_VOID; @@ -942,7 +940,7 @@ void sge_c_gdi_replace(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi::Tas } break; default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } @@ -984,7 +982,7 @@ sge_c_gdi_trigger_in_listener(ocs::gdi::Packet *packet, ocs::gdi::Task *task, mo DRETURN_VOID; default: // unknown operation for a listener thread - WARNING(SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + WARNING(MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); DRETURN_VOID; } @@ -1009,7 +1007,7 @@ sge_c_gdi_trigger_in_worker(ocs::gdi::Packet *packet, ocs::gdi::Task *task, moni DRETURN_VOID; default: // unknown operation for a 
worker thread - WARNING(SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + WARNING(MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); DRETURN_VOID; } @@ -1272,7 +1270,7 @@ static void sge_c_gdi_mod(gdi_object_t *ao, ocs::gdi::Packet *packet, ocs::gdi:: break; default: if (ao == nullptr) { - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); break; } @@ -1383,7 +1381,7 @@ sge_chck_mod_perm_user(const ocs::gdi::Packet *packet, lList **alpp, u_long32 ta break; } default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_OPNOIMPFORTARGET_S, __func__); answer_list_add(alpp, SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); DRETURN(false); } @@ -1391,160 +1389,118 @@ sge_chck_mod_perm_user(const ocs::gdi::Packet *packet, lList **alpp, u_long32 ta DRETURN(true); } +/** @brief checks modify-permissions of host for a given target + * + * Qmaster internal requests are not checked for permissions + * + * @param packet - packet to check + * @param alpp - answer list pointer + * @param target - target to check + * + * @return true if permission is granted, false otherwise + */ static bool -sge_chck_mod_perm_host(const ocs::gdi::Packet *packet, lList **alpp, u_long32 target) { +sge_chck_mod_perm_host(const ocs::gdi::Packet *packet, lList **alpp, const u_long32 target) { DENTER(TOP_LAYER); - /* check permissions of host */ - switch (target) { - - case ocs::gdi::Target::SGE_ORDER_LIST: - case ocs::gdi::Target::SGE_AH_LIST: - case ocs::gdi::Target::SGE_UO_LIST: - case ocs::gdi::Target::SGE_UM_LIST: - case ocs::gdi::Target::SGE_SH_LIST: - case ocs::gdi::Target::SGE_CQ_LIST: - case ocs::gdi::Target::SGE_CE_LIST: - case ocs::gdi::Target::SGE_PE_LIST: - 
case ocs::gdi::Target::SGE_CONF_LIST: - case ocs::gdi::Target::SGE_SC_LIST: - case ocs::gdi::Target::SGE_UU_LIST: - case ocs::gdi::Target::SGE_US_LIST: - case ocs::gdi::Target::SGE_PR_LIST: - case ocs::gdi::Target::SGE_STN_LIST: - case ocs::gdi::Target::SGE_CK_LIST: - case ocs::gdi::Target::SGE_CAL_LIST: - case ocs::gdi::Target::SGE_USER_MAPPING_LIST: - case ocs::gdi::Target::SGE_HGRP_LIST: - case ocs::gdi::Target::SGE_RQS_LIST: - case ocs::gdi::Target::SGE_MASTER_EVENT: - case ocs::gdi::Target::SGE_DUMMY_LIST: - - /* host must be SGE_AH_LIST */ - if (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST), packet->host)) { - ERROR(MSG_SGETEXT_NOADMINHOST_S, packet->host); - answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - DRETURN(false); - } - break; - - case ocs::gdi::Target::SGE_EH_LIST: - - /* host must be either admin host or exec host and execd */ - - if (!(host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST), packet->host) || - (host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST), packet->host) && - !strcmp(packet->commproc, prognames[EXECD])))) { - ERROR(MSG_SGETEXT_NOADMINHOST_S, packet->host); - answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - DRETURN(false); - } - break; - - case ocs::gdi::Target::SGE_JB_LIST: - /* host must be SGE_SH_LIST */ - if (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST), packet->host)) { - ERROR(MSG_SGETEXT_NOSUBMITHOST_S, packet->host); - answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - DRETURN(false); + if (!packet->is_intern_request) { + const lList *master_admin_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST); + bool is_admin_host = host_list_locate(master_admin_host_list, packet->host) != nullptr ? 
true : false; + const lList *master_submit_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST); + bool is_submit_host = host_list_locate(master_submit_host_list, packet->host) != nullptr ? true : false; + + switch (target) { + case ocs::gdi::Target::SGE_EH_LIST: { + const lList *master_exec_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST); + bool is_exec_host = host_list_locate(master_exec_host_list, packet->host) != nullptr ? true : false; + + // host must be either admin host + // or exec host and request has to come from execd + if (!(is_admin_host || (is_exec_host && !strcmp(packet->commproc, prognames[EXECD])))) { + ERROR(MSG_SGETEXT_NOADMINHOST_S, packet->host); + answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); + DRETURN(false); + } + break; } + case ocs::gdi::Target::SGE_EV_LIST: + // host must be admin host or submit host + if (!is_submit_host && !is_admin_host) { + ERROR(MSG_SGETEXT_NOSUBMITORADMINHOST_S, packet->host); + answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); + DRETURN(false); + } break; - case ocs::gdi::Target::SGE_EV_LIST: - /* to start an event client or if an event client - performs modify requests on itself - it must be on a submit or an admin host - */ - if ((!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST), packet->host)) - && (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST), packet->host))) { - ERROR(MSG_SGETEXT_NOSUBMITORADMINHOST_S, packet->host); - answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - DRETURN(false); - } + case ocs::gdi::Target::SGE_JB_LIST: + case ocs::gdi::Target::SGE_AR_LIST: + // host must be a submit host + if (!is_submit_host) { + ERROR(MSG_SGETEXT_NOSUBMITHOST_S, packet->host); + answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); + DRETURN(false); + } break; - case ocs::gdi::Target::SGE_AR_LIST: - /* host must be SGE_SH_LIST */ - 
if (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST), packet->host)) {
-            ERROR(MSG_SGETEXT_NOSUBMITHOST_S, packet->host);
-            answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR);
-            DRETURN(false);
-         }
+         default:
+            // for all other targets the host must be an admin host
+            if (!is_admin_host) {
+               ERROR(MSG_SGETEXT_NOADMINHOST_S, packet->host);
+               answer_list_add(alpp, SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR);
+               DRETURN(false);
+            }
             break;
-      default:
-         snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET);
-         answer_list_add(alpp, SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR);
-         DRETURN(false);
+      }
    }
    DRETURN(true);
 }
-
-/* EB: TODO: ST: skip execution of this function if it is internal GDI request */
-/* EB: TODO: ST: move usage of this code into listener in future */
+/** @brief checks get-permissions of host for a given target
+ *
+ * Qmaster internal requests are not checked for permissions
+ *
+ * @param packet - packet to check
+ * @param task - task to check; its answer list receives the error
+ *               message if permission is denied
+ *
+ * @return true if permission is granted, false otherwise
+ */
 static bool
-sge_task_check_get_perm_host(ocs::gdi::Packet *packet, ocs::gdi::Task *task, monitoring_t *monitor) {
-   bool ret = true;
-   u_long32 target;
-   char *host = nullptr;
-
+sge_task_check_get_perm_host(ocs::gdi::Packet *packet, ocs::gdi::Task *task) {
    DENTER(TOP_LAYER);
-   target = task->target;
-   host = packet->host;
+   // only external requests need to be checked (same rule as in sge_chck_mod_perm_host)
+   if (!packet->is_intern_request) {
-   /* check permissions of host */
-   switch (target) {
-      case ocs::gdi::Target::SGE_ORDER_LIST:
-      case ocs::gdi::Target::SGE_EV_LIST:
-      case ocs::gdi::Target::SGE_AH_LIST:
-      case ocs::gdi::Target::SGE_UO_LIST:
-      case ocs::gdi::Target::SGE_UM_LIST:
-      case ocs::gdi::Target::SGE_SH_LIST:
-      case ocs::gdi::Target::SGE_CQ_LIST:
-      case ocs::gdi::Target::SGE_CE_LIST:
-      case ocs::gdi::Target::SGE_PE_LIST:
-      case ocs::gdi::Target::SGE_SC_LIST:
- 
case ocs::gdi::Target::SGE_UU_LIST: - case ocs::gdi::Target::SGE_US_LIST: - case ocs::gdi::Target::SGE_PR_LIST: - case ocs::gdi::Target::SGE_STN_LIST: - case ocs::gdi::Target::SGE_CK_LIST: - case ocs::gdi::Target::SGE_CAL_LIST: - case ocs::gdi::Target::SGE_USER_MAPPING_LIST: - case ocs::gdi::Target::SGE_HGRP_LIST: - case ocs::gdi::Target::SGE_EH_LIST: - case ocs::gdi::Target::SGE_JB_LIST: - case ocs::gdi::Target::SGE_ZOMBIE_LIST: - case ocs::gdi::Target::SGE_SME_LIST: - case ocs::gdi::Target::SGE_RQS_LIST: - case ocs::gdi::Target::SGE_AR_LIST: - case ocs::gdi::Target::SGE_DUMMY_LIST: - /* host must be admin or submit host */ - if (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST), host) && - !host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST), host)) { - snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_NOSUBMITORADMINHOST_S, host); - answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - ret = false; - } - break; - case ocs::gdi::Target::SGE_CONF_LIST: - /* host must be admin or submit host or exec host */ - if (!host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST), host) && - !host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST), host) && - !host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST), host)) { - snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_NOSUBMITORADMINHOST_S, host); - answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); - ret = false; + const lList *master_admin_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST); + bool is_admin_host = host_list_locate(master_admin_host_list, packet->host) != nullptr ? true : false; + const lList *master_submit_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST); + bool is_submit_host = host_list_locate(master_submit_host_list, packet->host) != nullptr ? 
true : false; + + switch (task->target) { + case ocs::gdi::Target::SGE_CONF_LIST: { + const lList *master_exec_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST); + bool is_exec_host = host_list_locate(master_exec_host_list, packet->host) != nullptr ? true : false; + + // host must be either admin/submit or exec host + if (!is_admin_host && !is_submit_host && !is_exec_host) { + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_NOSUBMITORADMINHOST_S, packet->host); + answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); + DRETURN(false); + } + break; } + default: + // for all other targets host must be an admin host or submit host + if (!is_admin_host && !is_submit_host) { + snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_SGETEXT_NOSUBMITORADMINHOST_S, packet->host); + answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_EDENIED2HOST, ANSWER_QUALITY_ERROR); + DRETURN(false); + } break; - default: - snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_SGETEXT_OPNOIMPFORTARGET); - answer_list_add(&(task->answer_list), SGE_EVENT, STATUS_ENOIMP, ANSWER_QUALITY_ERROR); - ret = false; - return ret; + } } - DRETURN(ret); + DRETURN(true); } diff --git a/source/daemons/qmaster/sge_c_gdi.h b/source/daemons/qmaster/sge_c_gdi.h index 461c3d9e8..55ae9c742 100644 --- a/source/daemons/qmaster/sge_c_gdi.h +++ b/source/daemons/qmaster/sge_c_gdi.h @@ -98,8 +98,7 @@ sge_c_gdi_process_in_listener(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lList **answer_list, monitoring_t *monitor, bool has_next); bool -sge_c_gdi_check_execution_permission(ocs::gdi::Packet *packet, ocs::gdi::Task *task, - monitoring_t *monitor); +sge_c_gdi_check_execution_permission(ocs::gdi::Packet *packet, ocs::gdi::Task *task); void sge_c_gdi_process_in_worker(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lList **answer_list, diff --git a/source/daemons/qmaster/sge_give_jobs.cc b/source/daemons/qmaster/sge_give_jobs.cc index 15c284e49..1b7896876 100644 --- 
a/source/daemons/qmaster/sge_give_jobs.cc +++ b/source/daemons/qmaster/sge_give_jobs.cc @@ -51,6 +51,7 @@ #include "uti/sge_stdlib.h" #include "uti/sge_time.h" +#include "sgeobj/ocs_DataStore.h" #include "sgeobj/ocs_Session.h" #include "sgeobj/sge_ja_task.h" #include "sgeobj/sge_pe_task.h" @@ -68,15 +69,15 @@ #include "sgeobj/sge_ckpt.h" #include "sgeobj/sge_centry.h" #include "sgeobj/sge_cqueue.h" -#include "sgeobj/ocs_DataStore.h" +#include "sgeobj/sge_resource_quota.h" -#include "sched/sge_resource_quota_schedd.h" #include "sched/debit.h" #include "gdi/ocs_gdi_security.h" #include "spool/sge_spooling.h" +#include "ocs_CategoryQmaster.h" #include "ocs_ReportingFileWriter.h" #include "sge.h" #include "basis_types.h" @@ -1760,14 +1761,22 @@ sge_bury_job(const char *sge_root, lListElem *job, u_long32 job_id, lListElem *j spool_transaction(&answer_list, spool_get_default_context(), STC_commit); } - /* - * remove the job - */ + + // remove the job suser_unregister_job(job, master_suser_list); + + // update the category + lList **master_category_list = ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + ocs::CategoryQmaster::detach_job(master_category_list, job, !no_events, gdi_session); + + // send events if (!no_events) { sge_add_event(0, sgeE_JOB_DEL, job_id, ja_task_id, nullptr, nullptr, lGetString(job, JB_session), nullptr, gdi_session); + } + + // final job removal lRemoveElem(master_job_list, &job); } else { int is_enrolled = job_is_enrolled(job, ja_task_id); diff --git a/source/daemons/qmaster/sge_job_qmaster.cc b/source/daemons/qmaster/sge_job_qmaster.cc index 1355b8166..d7df89c42 100644 --- a/source/daemons/qmaster/sge_job_qmaster.cc +++ b/source/daemons/qmaster/sge_job_qmaster.cc @@ -51,6 +51,7 @@ #include "uti/sge_string.h" #include "uti/sge_time.h" +#include "sgeobj/ocs_Category.h" #include "sgeobj/sge_str.h" #include "sgeobj/sge_conf.h" #include "sgeobj/sge_object.h" @@ -95,6 +96,7 @@ #include "spool/sge_spooling.h" +#include "ocs_CategoryQmaster.h" 
#include "ocs_ReportingFileWriter.h" #include "sge_task_depend.h" #include "sge_persistence_qmaster.h" @@ -220,26 +222,23 @@ sge_gdi_add_job(lListElem **jep, lList **alpp, lList **lpp, DENTER(TOP_LAYER); + // JSV verification if enabled if (jsv_is_enabled(tc->thread_name)) { - struct timeval start_time{}; - struct timeval end_time{}; - int jsv_threshold = mconf_get_jsv_threshold(); - /* - * first verify before JSV is executed - */ + + // job verification so that data that is passed to JSV is correct ret = sge_job_verify_adjust(*jep, alpp, lpp, packet, task, monitor); if (ret != STATUS_OK) { DRETURN(ret); } - /* - * JSV verification - */ + // JSV verification with threshold handling + struct timeval start_time{}; + struct timeval end_time{}; gettimeofday(&start_time, nullptr); lret = jsv_do_verify(tc->thread_name, jep, alpp, true); gettimeofday(&end_time, nullptr); - if (((end_time.tv_sec - start_time.tv_sec) * 1000 + (end_time.tv_usec - start_time.tv_usec) / 1000) - > jsv_threshold || jsv_threshold == 0) { + int jsv_threshold = mconf_get_jsv_threshold(); + if (((end_time.tv_sec - start_time.tv_sec) * 1000 + (end_time.tv_usec - start_time.tv_usec) / 1000) > jsv_threshold || jsv_threshold == 0) { INFO(MSG_JSV_THRESHOLD_UU, lGetUlong(*jep, JB_job_number), static_cast((end_time.tv_sec - start_time.tv_sec) * 1000 + (end_time.tv_usec - start_time.tv_usec) / 1000)); } if (!lret) { @@ -247,18 +246,17 @@ sge_gdi_add_job(lListElem **jep, lList **alpp, lList **lpp, } } - /* - * second try to find something strange - */ + // final job verification ret = sge_job_verify_adjust(*jep, alpp, lpp, packet, task, monitor); if (ret != STATUS_OK) { DRETURN(ret); } - /* write script to file */ + // open a spooling transaction spool_transaction(alpp, spool_get_default_context(), STC_begin); - if (lGetString(*jep, JB_script_file) && - !JOB_TYPE_IS_BINARY(lGetUlong(*jep, JB_type))) { + + // write script to file separately, we do not want to hold it in memory + if (lGetString(*jep, 
JB_script_file) && !JOB_TYPE_IS_BINARY(lGetUlong(*jep, JB_type))) { if (!spool_write_script(alpp, lGetUlong(*jep, JB_job_number), *jep)) { spool_transaction(alpp, spool_get_default_context(), STC_rollback); ERROR(MSG_JOB_NOWRITE_US, lGetUlong(*jep, JB_job_number), strerror(errno)); @@ -266,11 +264,20 @@ sge_gdi_add_job(lListElem **jep, lList **alpp, lList **lpp, DRETURN(STATUS_EDISK); } } - - /* clean file out of memory */ lSetString(*jep, JB_script_ptr, nullptr); lSetUlong(*jep, JB_script_size, 0); + // create or assign a category + lList** master_category_list = ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + lListElem* category = nullptr; + lret = ocs::CategoryQmaster::attach_job(master_category_list, &category, *jep, + master_userset_list, master_project_list, master_rqs_list, + true, packet->gdi_session); + + // Handle job spooling and event if (!sge_event_spool(alpp, 0, sgeE_JOB_ADD, lGetUlong(*jep, JB_job_number), 0, nullptr, nullptr, nullptr, *jep, nullptr, nullptr, true, true, packet->gdi_session)) { @@ -283,14 +290,15 @@ sge_gdi_add_job(lListElem **jep, lList **alpp, lList **lpp, } DRETURN(STATUS_EDISK); } + + // close the spooling transaction spool_transaction(alpp, spool_get_default_context(), STC_commit); if (!job_is_array(*jep)) { DPRINTF("Added Job " sge_uu32"\n", lGetUlong(*jep, JB_job_number)); } else { job_get_submit_task_ids(*jep, &start, &end, &step); - DPRINTF("Added JobArray " sge_uu32"." sge_uu32"-" sge_uu32":" sge_uu32"\n", - lGetUlong(*jep, JB_job_number), start, end, step); + DPRINTF("Added JobArray " sge_uu32"." 
sge_uu32"-" sge_uu32":" sge_uu32"\n", lGetUlong(*jep, JB_job_number), start, end, step); } /* add into job list */ @@ -1255,7 +1263,6 @@ sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem /* operate on a cull copy of the job */ new_job = lCopyElem(jobep); - if (mod_job_attributes(packet, new_job, jep, &tmp_alp, &trigger)) { if (*alpp == nullptr) { *alpp = lCreateList("answer", AN_Type); @@ -1268,22 +1275,59 @@ sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem } if (!(trigger & VERIFY_EVENT)) { - dstring buffer = DSTRING_INIT; bool dbret; lList *answer_list = nullptr; + // @todo CS-1156: Why not also for PRIO_EVENT, RECHAIN_JID_HOLD and RECHAIN_JA_AD_HOLD if (trigger & MOD_EVENT) { lSetUlong(new_job, JB_version, lGetUlong(new_job, JB_version) + 1); } + // @todo CS-1155: add a transaction? + // open a spooling transaction + spool_transaction(alpp, spool_get_default_context(), STC_begin); + + + // if the job changed then check if the category changed as well and trigger the required events + if ((trigger & (PRIO_EVENT | MOD_EVENT)) > 0) { + // build the category string + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + dstring category_str = DSTRING_INIT; + ocs::Category::build_string(&category_str, new_job, master_userset_list, master_project_list, master_rqs_list); + const char *cat_str = sge_dstring_get_string(&category_str); + + // check if the category string changed + lList **master_category_list = ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + u_long32 old_category_id = lGetUlong(new_job, JB_category_id); + const lListElem *old_category = lGetElemUlong(*master_category_list, CT_id, old_category_id); + const char *old_cat_str = lGetString(old_category, CT_str); + + // check if the category 
string changed and trigger corresponding events + if (strcmp(old_cat_str, cat_str) != 0) { + // remove the job from the category + ocs::CategoryQmaster::detach_job(master_category_list, new_job, true, packet->gdi_session); + + // add the job to the new category + lListElem *category; + ocs::CategoryQmaster::attach_job(master_category_list, &category, new_job, + master_userset_list, master_project_list, master_rqs_list, + true, packet->gdi_session); + } + } + /* all job modifications to be saved on disk must be made in new_job */ + dstring buffer = DSTRING_INIT; dbret = spool_write_object(&answer_list, spool_get_default_context(), new_job, - job_get_key(jobid, 0, nullptr, &buffer), - SGE_TYPE_JOB, true); + job_get_key(jobid, 0, nullptr, &buffer), SGE_TYPE_JOB, true); answer_list_output(&answer_list); - if (!dbret) { ERROR(MSG_JOB_NOALTERNOWRITE_U, jobid); + + // @todo CS-1155: add a transaction? + spool_transaction(alpp, spool_get_default_context(), STC_rollback); + answer_list_add(alpp, SGE_EVENT, STATUS_EDISK, ANSWER_QUALITY_ERROR); sge_dstring_free(&buffer); lFreeList(&tmp_alp); @@ -1292,7 +1336,6 @@ sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem sge_free(&job_mod_name); DRETURN(STATUS_EDISK); } - sge_dstring_free(&buffer); /* all elems in tmp_alp need to be appended to alpp */ @@ -1323,8 +1366,7 @@ sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem DPRINTF(" JOB #" sge_uu32": P: " sge_uu32"\n", jobid, pre_ident); if ((suc_jobep = lGetElemUlongRW(*master_job_list, JB_job_number, pre_ident))) { - lListElem *temp_job = lGetElemUlongRW(lGetList(suc_jobep, JB_jid_successor_list), JRE_job_number, - jobid); + lListElem *temp_job = lGetElemUlongRW(lGetList(suc_jobep, JB_jid_successor_list), JRE_job_number, jobid); DPRINTF(" JOB " sge_uu32 " removed from trigger list of job " sge_uu32 "\n", jobid, pre_ident); lRemoveElem(lGetListRW(suc_jobep, JB_jid_successor_list), &temp_job); } @@ -1355,10 +1397,16 @@ 
sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem lInsertElem(*master_job_list, prev, new_job); } /* no need to spool these mods */ - if (trigger & RECHAIN_JID_HOLD) + if (trigger & RECHAIN_JID_HOLD) { job_suc_pre(new_job); - if (trigger & RECHAIN_JA_AD_HOLD) + } + if (trigger & RECHAIN_JA_AD_HOLD) { job_suc_pre_ad(new_job); + } + + // @todo CS-1155: add a transaction? + // close the spooling transaction + spool_transaction(alpp, spool_get_default_context(), STC_commit); INFO(MSG_SGETEXT_MODIFIEDINLIST_SSUS, packet->user, packet->host, jobid, MSG_JOB_JOB); } @@ -3739,7 +3787,6 @@ static int sge_delete_all_tasks_of_job(const ocs::gdi::Packet *packet, lList **a } if (deleted_unenrolled_tasks) { - if (existing_tasks > *deleted_tasks) { dstring buffer = DSTRING_INIT; /* write only the common part - pass only the jobid, no jatask or petask id */ @@ -3749,14 +3796,6 @@ static int sge_delete_all_tasks_of_job(const ocs::gdi::Packet *packet, lList **a SGE_TYPE_JOB, true); answer_list_output(&answer_list); sge_dstring_free(&buffer); - } else { - /* JG: TODO: this joblog seems to have an invalid job object! 
*/ -/* ocs::ReportingFileWriter::create_job_logs(nullptr, sge_get_gmt(), JL_DELETED, ruser, rhost, nullptr, job, nullptr, nullptr, MSG_LOG_DELETED); */ - -#if 0 /* EB: TODO: this should not be necessary because events have been sent in sge_commit_job() above */ - sge_add_event(start_time, sgeE_JOB_DEL, job_number, 0, - nullptr, nullptr, dupped_session, nullptr); -#endif } } diff --git a/source/daemons/qmaster/sge_persistence_qmaster.cc b/source/daemons/qmaster/sge_persistence_qmaster.cc index 7e48889fb..f792cff84 100644 --- a/source/daemons/qmaster/sge_persistence_qmaster.cc +++ b/source/daemons/qmaster/sge_persistence_qmaster.cc @@ -462,7 +462,6 @@ sge_event_spool(lList **answer_list, u_long64 timestamp, ev_event event, u_long3 element = object; object_type = SGE_TYPE_AR; break; - default: /* nothing to spool */ object_type = SGE_TYPE_ALL; diff --git a/source/daemons/qmaster/sge_qmaster_process_message.cc b/source/daemons/qmaster/sge_qmaster_process_message.cc index 325c6d1c7..83ba18524 100644 --- a/source/daemons/qmaster/sge_qmaster_process_message.cc +++ b/source/daemons/qmaster/sge_qmaster_process_message.cc @@ -435,7 +435,7 @@ do_gdi_packet(ocs::gdi::ClientServerBase::struct_msg_t *aMsg, monitoring_t *moni // - admin/submit/exec host if (local_ret) { for (auto *task : packet->tasks) { - local_ret = sge_c_gdi_check_execution_permission(packet, task, monitor); + local_ret = sge_c_gdi_check_execution_permission(packet, task); if (!local_ret) { break; } diff --git a/source/daemons/qmaster/sge_reporting_qmaster.cc b/source/daemons/qmaster/sge_reporting_qmaster.cc index 2df6174f6..f1bab3649 100644 --- a/source/daemons/qmaster/sge_reporting_qmaster.cc +++ b/source/daemons/qmaster/sge_reporting_qmaster.cc @@ -44,6 +44,7 @@ #include "sgeobj/ocs_DataStore.h" #include "sched/sge_resource_utilization.h" +#include "sgeobj/ocs_Category.h" #include "sgeobj/sge_answer.h" #include "sgeobj/sge_centry.h" #include "sgeobj/sge_cqueue.h" @@ -63,9 +64,6 @@ #include 
"sge_reporting_qmaster.h" #include "sge_rusage.h" -#include "msg_common.h" -#include "category.h" - /****** qmaster/reporting/--Introduction *************************** * NAME * qmaster reporting -- creation of a reporting file @@ -267,8 +265,7 @@ ocs::ClassicAccountingFileWriter::create_acct_record(lList **answer_list, lListE if (!intermediate) { DSTRING_STATIC(category_dstring, MAX_STRING_SIZE); const char *category_string; - sge_build_job_category_dstring(&category_dstring, job, master_userset_list, master_project_list, - nullptr, master_rqs_list); + Category::build_string(&category_dstring, job, master_userset_list, master_project_list, master_rqs_list); category_string = sge_dstring_get_string(&category_dstring); dstring job_dstring = DSTRING_INIT; @@ -351,8 +348,7 @@ ocs::ClassicReportingFileWriter::create_acct_record(lList **answer_list, lListEl DSTRING_STATIC(category_dstring, MAX_STRING_SIZE); const char *category_string = nullptr; - sge_build_job_category_dstring(&category_dstring, job, master_userset_list, master_project_list, nullptr, - master_rqs_list); + Category::build_string(&category_dstring, job, master_userset_list, master_project_list, master_rqs_list); category_string = sge_dstring_get_string(&category_dstring); // reporting records will be written both for intermediate and final job reports diff --git a/source/daemons/qmaster/sge_sched_job_category.cc b/source/daemons/qmaster/sge_sched_job_category.cc deleted file mode 100644 index 3cdbaac64..000000000 --- a/source/daemons/qmaster/sge_sched_job_category.cc +++ /dev/null @@ -1,319 +0,0 @@ -/*___INFO__MARK_BEGIN__*/ -/************************************************************************* - * - * The Contents of this file are made available subject to the terms of - * the Sun Industry Standards Source License Version 1.2 - * - * Sun Microsystems Inc., March, 2001 - * - * - * Sun Industry Standards Source License Version 1.2 - * ================================================= - * The contents 
of this file are subject to the Sun Industry Standards - * Source License Version 1.2 (the "License"); You may not use this file - * except in compliance with the License. You may obtain a copy of the - * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * - * Software provided under this License is provided on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, - * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. - * See the License for the specific provisions governing your rights and - * obligations concerning the Software. - * - * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * - * Copyright: 2001 by Sun Microsystems, Inc. - * - * All Rights Reserved. - * - * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH - * - ************************************************************************/ -/*___INFO__MARK_END__*/ -#include -#include - -#include "uti/sge_dstring.h" -#include "uti/sge_log.h" -#include "uti/sge_rmon_macros.h" - -#include "cull/cull_sort.h" - -#include "sgeobj/sge_job.h" -#include "sgeobj/sge_qinstance.h" -#include "sgeobj/sge_range.h" -#include "sgeobj/sge_qinstance_state.h" -#include "sgeobj/sge_order.h" -#include "sgeobj/sge_centry.h" -#include "sgeobj/sge_schedd_conf.h" - -#include "comm/commlib.h" - -#include "sched/schedd_message.h" -#include "sched/sge_schedd_text.h" -#include "sched/sge_orders.h" -#include "sched/msg_schedd.h" - -#include "sge_sched_job_category.h" -#include "category.h" - -/****************************************************** - * - * Description: - * - * Categories are used to speed up the job dispatching - * in the scheduler. Before the job dispatching starts, - * the categories have to be build for new jobs and - * reseted for existing jobs. A new job gets a reference - * to its category regardless if it is existing or not. 
- * - * This is done with: - * - sge_add_job_category(lListElem *job, lList *acl_list) - * - sge_rebuild_job_category(lList *job_list, lList *acl_list) - * - int sge_reset_job_category() - * - * During the dispatch run for a job, the category caches - * all hosts and queues, which are not suitable for that - * category. This leads toa speed improvement when other - * jobs of the same category are matched. In addition to - * the host and queues, it has to cach the generated messages - * as well, since the are not generated again. If a category - * cannot run in the cluster at all, the category is rejected - * and the messages are added for all jobs in the category. - * - * This is done for simple and parallel jobs. In addition it - * also caches the results of soft request matching. Since - * a job can only soft request a fixed resource, it is not - * changing during a scheduling run and the soft request violation - * for a given queue are the same for all jobs in one - * category. - * - ******************************************************/ - - -/* Categories of the job are managed here */ -static lList *CATEGORY_LIST = nullptr; /* Category list, which contains the categories referenced - * in the job structure. 
It is used for the resource matching - * type = CT_Type - */ - -static bool reb_cat = true; - -/*-------------------------------------------------------------------------*/ - -/*-------------------------------------------------------------------------*/ -/* add jobs' category to the global category list, if it doesn't */ -/* already exist, and reference the category in the job element */ -/* */ -/* NOTE: this function is not MT-Safe, because it uses global variables */ -/* */ -/* SG: TODO: split this into separate functions */ -/*-------------------------------------------------------------------------*/ -int -sge_add_job_category(lListElem *job, const lList *acl_list, const lList *prj_list, const lList *rqs_list) { - DENTER(TOP_LAYER); - - static const char no_requests[] = "no-requests"; - dstring category_str = DSTRING_INIT; - bool did_project; - - /* First part: - Builds the category for the resource matching - */ - sge_build_job_category_dstring(&category_str, job, acl_list, prj_list, &did_project, rqs_list); - - const char *cstr; - if (sge_dstring_strlen(&category_str) == 0) { - cstr = sge_dstring_copy_string(&category_str, no_requests); - } else { - cstr = sge_dstring_get_string(&category_str); - } - - lListElem *cat = lGetElemStrRW(CATEGORY_LIST, CT_str, cstr); - if (cat == nullptr) { - cat = lAddElemStr(&CATEGORY_LIST, CT_str, cstr, CT_Type); - } - - // increment ref counter and set reference to this element - u_long32 rc = lGetUlong(cat, CT_refcount); - lSetUlong(cat, CT_refcount, ++rc); - lSetRef(job, JB_category, cat); - - // free category string - sge_dstring_free(&category_str); - - DRETURN(0); -} - -/*-------------------------------------------------------------------------*/ -/* delete jobs category if CT_refcount gets 0 */ -/*-------------------------------------------------------------------------*/ -int -sge_delete_job_category(lListElem *job) { - DENTER(TOP_LAYER); - - /* First part */ - auto *cat = static_cast(lGetRef(job, JB_category)); - if 
(CATEGORY_LIST != nullptr && cat != nullptr) { - u_long32 rc = lGetUlong(cat, CT_refcount); - if (rc > 1) { - lSetUlong(cat, CT_refcount, --rc); - } else { - const lListElem *cache = nullptr; - const lList *cache_list = lGetList(cat, CT_cache); - - DPRINTF("############## Removing %s from category list (refcount: " sge_uu32 ")\n", - lGetString(cat, CT_str), lGetUlong(cat, CT_refcount)); - - for_each_ep(cache, cache_list) { - auto range = static_cast(lGetRef(cache, CCT_pe_job_slots)); - sge_free(&range); - } - - lRemoveElem(CATEGORY_LIST, &cat); - } - } - lSetRef(job, JB_category, nullptr); - - DRETURN(0); -} - -/*-------------------------------------------------------------------------*/ -int -sge_is_job_category_rejected(const lListElem *job) { - int ret; - lListElem *cat = nullptr; - - DENTER(TOP_LAYER); - cat = (lListElem *) lGetRef(job, JB_category); - ret = sge_is_job_category_rejected_(cat); - DRETURN(ret); -} - -/*-------------------------------------------------------------------------*/ -int -sge_is_job_category_reservation_rejected(const lListElem *job) { - int ret; - lListElem *cat = nullptr; - - DENTER(TOP_LAYER); - cat = (lListElem *) lGetRef(job, JB_category); - ret = sge_is_job_category_reservation_rejected_(cat); - DRETURN(ret); -} - -/*-------------------------------------------------------------------------*/ -bool -sge_is_job_category_rejected_(lRef cat) { - return lGetBool((lListElem *) cat, CT_rejected); -} - -/*-------------------------------------------------------------------------*/ -bool -sge_is_job_category_reservation_rejected_(lRef cat) { - return lGetBool((lListElem *) cat, CT_reservation_rejected); -} - -/*-------------------------------------------------------------------------*/ -void -sge_reject_category(lRef cat, bool with_reservation) { - lSetBool((lListElem *) cat, CT_rejected, true); - if (with_reservation) { - lSetBool((lListElem *) cat, CT_reservation_rejected, true); - } -} - 
-/*-------------------------------------------------------------------------*/ -/* rebuild the category references */ -/*-------------------------------------------------------------------------*/ -int -sge_rebuild_job_category(const lList *job_list, const lList *acl_list, const lList *prj_list, const lList *rqs_list) { - lListElem *job; - - DENTER(TOP_LAYER); - - if (!reb_cat) { - DRETURN(0); - } - - DPRINTF("### ### ### ### REBUILDING CATEGORIES ### ### ### ###\n"); - - lFreeList(&CATEGORY_LIST); - - for_each_rw (job, job_list) { - sge_add_job_category(job, acl_list, prj_list, rqs_list); - } - - reb_cat = false; - - DRETURN(0); -} - -int -sge_category_count() { - return lGetNumberOfElem(CATEGORY_LIST); -} - -/****** sge_category/sge_reset_job_category() ********************************** -* NAME -* sge_reset_job_category() -- resets the category temp information -* -* SYNOPSIS -* int sge_reset_job_category() -* -* FUNCTION -* Some information in the category should only life throu one scheduling run. 
-* These informations are reseted in the call: -* - dispatching messages -* - soft violations -* - not suitable cluster -* - the flag that identifies, if the messages are already added to the schedd infos -* - something with the resource reservation -* -* RESULT -* int - always 0 -* -* NOTES -* MT-NOTE: sge_reset_job_category() is not MT safe -* -*******************************************************************************/ -int -sge_reset_job_category() { - DENTER(TOP_LAYER); - - lListElem *cat; - for_each_rw (cat, CATEGORY_LIST) { - // deallocate memory stored in the cache itself - lListElem *cache; - for_each_rw(cache, lGetList(cat, CT_cache)) { - auto *range = static_cast(lGetRef(cache, CCT_pe_job_slots)); - sge_free(&range); - lSetRef(cache, CCT_pe_job_slots, nullptr); - } - - // now assignment (@todo make it boolean in master branch) - lSetBool(cat, CT_rejected, false); - - // reservation assignment (@todo make it boolean in master branch) - lSetBool(cat, CT_reservation_rejected, false); - - // @todo remove in master branch. 
This field is unused - lSetInt(cat, CT_count, -1); - - // reset the cache and the messages added flag - lSetList(cat, CT_cache, nullptr); - lSetBool(cat, CT_messages_added, false); - - // reset cached resource contribution - lSetBool(cat, CT_rc_valid, false); - lSetDouble(cat, CT_resource_contribution, 0.0); - } - - DRETURN(0); -} - -void -set_rebuild_categories(bool new_value) { - reb_cat = new_value; -} diff --git a/source/daemons/qmaster/sge_sched_job_category.h b/source/daemons/qmaster/sge_sched_job_category.h deleted file mode 100644 index 0bc749c77..000000000 --- a/source/daemons/qmaster/sge_sched_job_category.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once -/*___INFO__MARK_BEGIN__*/ -/************************************************************************* - * - * The Contents of this file are made available subject to the terms of - * the Sun Industry Standards Source License Version 1.2 - * - * Sun Microsystems Inc., March, 2001 - * - * - * Sun Industry Standards Source License Version 1.2 - * ================================================= - * The contents of this file are subject to the Sun Industry Standards - * Source License Version 1.2 (the "License"); You may not use this file - * except in compliance with the License. You may obtain a copy of the - * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * - * Software provided under this License is provided on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, - * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. - * See the License for the specific provisions governing your rights and - * obligations concerning the Software. - * - * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * - * Copyright: 2001 by Sun Microsystems, Inc. - * - * All Rights Reserved. 
- * - * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH - * - ************************************************************************/ -/*___INFO__MARK_END__*/ - -/* -** ------------ to be called from within the data model layer -*/ - -int -sge_add_job_category(lListElem *job, const lList *acl_list, const lList *prj_list, const lList *lirs_list); - -int -sge_delete_job_category(lListElem *job); - -void -set_rebuild_categories(bool new_value); - -int -sge_rebuild_job_category(const lList *job_list, const lList *acl_list, const lList *prj_list, const lList *lirs_list); - -lList *sge_category_job_copy(lList *queue_list, lList **orders, bool monitor_next_run); - -/* -** ------------ to be called from within the decision-making layer -*/ - -void -sge_reject_category(lRef cat, bool with_reservation); - -bool -sge_is_job_category_rejected_(lRef cat); - -bool -sge_is_job_category_reservation_rejected_(lRef cat); - -int -sge_is_job_category_rejected(const lListElem *job); - -int -sge_is_job_category_reservation_rejected(const lListElem *job); - -int -sge_reset_job_category(); - -int -sge_category_count(); diff --git a/source/daemons/qmaster/sge_sched_prepare_data.cc b/source/daemons/qmaster/sge_sched_prepare_data.cc index 7c2295bdd..cad259a08 100644 --- a/source/daemons/qmaster/sge_sched_prepare_data.cc +++ b/source/daemons/qmaster/sge_sched_prepare_data.cc @@ -56,10 +56,11 @@ #include "evc/sge_event_client.h" #include "sge.h" -#include "sge_sched_job_category.h" #include "sge_sched_prepare_data.h" #include "sge_sched_process_events.h" +#include + static const int cqueue_field_ids[] = { CQ_name, CQ_hostlist, @@ -153,6 +154,7 @@ static const int qinstance_field_ids[] = { static const int job_nm[] = { JB_job_number, JB_category, + JB_category_id, JB_request_set_list, JB_owner, JB_group, @@ -498,6 +500,7 @@ sge_process_schedd_conf_event_after(sge_evc_class_t *evc, sge_object_type type, return SGE_EMA_OK; } +#if 0 sge_callback_result 
sge_process_project_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { @@ -542,6 +545,7 @@ sge_process_project_event_before(sge_evc_class_t *evc, sge_object_type type, DRETURN(SGE_EMA_OK); } +#endif sge_callback_result sge_process_schedd_monitor_event(sge_evc_class_t *evc, sge_object_type type, @@ -562,6 +566,37 @@ sge_process_global_config_event(sge_evc_class_t *evc, sge_object_type type, DRETURN(SGE_EMA_OK); } +sge_callback_result +sge_process_category_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { + DENTER(TOP_LAYER); + + // we handle only mod and del events + if (action != SGE_EMA_DEL && action != SGE_EMA_MOD) { + DRETURN(SGE_EMA_OK); + } + + // find the category + u_long32 category_id = lGetUlong(event, ET_intkey); + lList *master_category_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); + lListElem *category = lGetElemUlongRW(master_category_list, CT_id, category_id); + + // should not be possible + if (category == nullptr) { + ERROR(MSG_CANTFINDCATINMASTERLIST_U, category_id); + DRETURN(SGE_EMA_FAILURE); + } + + // remove tmp data + const lList *cache_list = lGetList(category, CT_cache); + const lListElem *cache = nullptr; + for_each_ep(cache, cache_list) { + auto range = static_cast(lGetRef(cache, CCT_pe_job_slots)); + sge_free(&range); + } + + DRETURN(SGE_EMA_OK); +} + sge_callback_result sge_process_job_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { @@ -584,6 +619,7 @@ sge_process_job_event_before(sge_evc_class_t *evc, sge_object_type type, DRETURN(SGE_EMA_OK); } +#if 0 switch (action) { case SGE_EMA_DEL: /* delete job category if necessary */ @@ -604,82 +640,58 @@ sge_process_job_event_before(sge_evc_class_t *evc, sge_object_type type, default: break; } +#endif DRETURN(SGE_EMA_OK); } sge_callback_result 
-sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, - sge_event_action action, lListElem *event, void *clientdata) { - u_long32 job_id = 0; - lListElem *job = nullptr; - +sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { DENTER(TOP_LAYER); - DPRINTF("callback processing job event after default rule\n"); + // Find job ID and job where we received an event (add, modify) + u_long32 job_id; + lListElem *job = nullptr; if (action == SGE_EMA_ADD || action == SGE_EMA_MOD) { job_id = lGetUlong(event, ET_intkey); job = lGetElemUlongRW(*ocs::DataStore::get_master_list(SGE_TYPE_JOB), JB_job_number, job_id); + if (job == nullptr) { - dstring id_dstring = DSTRING_INIT; + DSTRING_STATIC(id_dstring, 64); ERROR(MSG_CANTFINDJOBINMASTERLIST_S, job_get_id_string(job_id, 0, nullptr, &id_dstring)); - sge_dstring_free(&id_dstring); DRETURN(SGE_EMA_FAILURE); } - sge_do_priority_job(job); /* job got added or modified, recompute the priorities */ } + lList *master_category_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); switch (action) { - case SGE_EMA_LIST: - set_rebuild_categories(true); - sge_do_priority(*ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB), nullptr); /* recompute the priorities */ - break; - - case SGE_EMA_ADD: { - u_long32 start, end, step; + case SGE_EMA_LIST: { + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); - /* add job category */ - sge_add_job_category(job, *ocs::DataStore::get_master_list(SGE_TYPE_USERSET), - *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT), - *ocs::DataStore::get_master_list(SGE_TYPE_RQS)); + // refresh category references in all job objects + ocs::CategoryQmaster::refresh_cat_data_all_jobs(master_category_list, master_job_list); - job_get_submit_task_ids(job, &start, &end, &step); - - if (job_is_array(job)) { - DPRINTF("Added job-array " sge_uu32 "." 
sge_uu32 "-" sge_uu32 ":" sge_uu32 "\n", job_id, start, end, step); - } else { - DPRINTF("Added job " sge_uu32"\n", job_id); - } - } + // recompute the priorities for all jobs + sge_do_priority(master_job_list, nullptr); break; + } + case SGE_EMA_ADD: + // refresh category reference + ocs::CategoryQmaster::refresh_cat_data_in_job(master_category_list, job); + // recompute the priorities for the job + sge_do_priority_job(job); + break; case SGE_EMA_MOD: switch (lGetUlong(event, ET_type)) { - case sgeE_JOB_MOD: - /* - ** after changing the job, read category reference - ** for changed job - */ - - sge_add_job_category(job, - *ocs::DataStore::get_master_list(SGE_TYPE_USERSET), - *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT), - *ocs::DataStore::get_master_list(SGE_TYPE_RQS)); - break; - case sgeE_JOB_FINAL_USAGE: { - const char *pe_task_id; - - pe_task_id = lGetString(event, ET_strkey); + const char *pe_task_id = lGetString(event, ET_strkey); - /* ignore FINAL_USAGE for a pe task here */ if (pe_task_id == nullptr) { - u_long32 ja_task_id; - lListElem *ja_task; - - ja_task_id = lGetUlong(event, ET_intkey2); - ja_task = job_search_task(job, nullptr, ja_task_id); + // FINAL_USAGE of a pe task is ignored here; only the array task is marked as finished + u_long32 ja_task_id = lGetUlong(event, ET_intkey2); + lListElem *ja_task = job_search_task(job, nullptr, ja_task_id); if (ja_task == nullptr) { ERROR(MSG_CANTFINDTASKINJOB_UU, ja_task_id, job_id); @@ -688,12 +700,11 @@ sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, lSetUlong(ja_task, JAT_status, JFINISHED); } - } break; + } + case sgeE_JOB_MOD: case sgeE_JOB_MOD_SCHED_PRIORITY: - break; - default: break; } @@ -736,6 +747,7 @@ sge_process_ja_task_event_after(sge_evc_class_t *evc, sge_object_type type, DRETURN(SGE_EMA_OK); } +#if 0 /****** sge_process_events/sge_process_userset_event_before() ****************** * NAME * sge_process_userset_event_before() -- ??? 
@@ -804,4 +816,4 @@ sge_process_userset_event_before(sge_evc_class_t *evc, sge_object_type type, sge DRETURN(SGE_EMA_OK); } - +#endif diff --git a/source/daemons/qmaster/sge_sched_prepare_data.h b/source/daemons/qmaster/sge_sched_prepare_data.h index 78d50573d..fae81e2d9 100644 --- a/source/daemons/qmaster/sge_sched_prepare_data.h +++ b/source/daemons/qmaster/sge_sched_prepare_data.h @@ -68,9 +68,11 @@ ensure_valid_what_and_where(sge_where_what_t *where_what); void free_what_and_where(sge_where_what_t *where_what); +#if 0 sge_callback_result sge_process_project_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata); +#endif sge_callback_result sge_process_schedd_conf_event_before(sge_evc_class_t *evc, sge_object_type type, @@ -100,7 +102,12 @@ sge_callback_result sge_process_schedd_monitor_event(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata); +sge_callback_result +sge_process_category_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata); + +#if 0 sge_callback_result sge_process_userset_event_before(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata); +#endif diff --git a/source/daemons/qmaster/sge_sched_process_events.cc b/source/daemons/qmaster/sge_sched_process_events.cc index 42c864e99..41d561a15 100644 --- a/source/daemons/qmaster/sge_sched_process_events.cc +++ b/source/daemons/qmaster/sge_sched_process_events.cc @@ -40,24 +40,18 @@ /* common/ */ #include "basis_types.h" -#include "sge.h" #include "uti/sge_mtutil.h" #include "uti/sge_rmon_macros.h" #include "uti/sge_bootstrap_files.h" -#include "sgeobj/sge_conf.h" -#include "sgeobj/sge_report.h" #include "sgeobj/sge_schedd_conf.h" #include "mir/sge_mirror.h" #include "evc/sge_event_client.h" -#include "gdi/ocs_gdi_Client.h" - #include "sge_sched_process_events.h" #include 
"sge_sched_prepare_data.h" -#include "sge_sched_job_category.h" /****** qmaster/sge_thread_scheduler/event_update_func() ************************** * NAME @@ -144,21 +138,22 @@ subscribe_scheduler(sge_evc_class_t *evc, sge_where_what_t *where_what) sge_mirror_subscribe(evc, SGE_TYPE_JOB, sge_process_job_event_before, sge_process_job_event_after, nullptr, where_what->where_job, where_what->what_job); sge_mirror_subscribe(evc, SGE_TYPE_JATASK, nullptr, sge_process_ja_task_event_after, nullptr, where_what->where_jat, where_what->what_jat); sge_mirror_subscribe(evc, SGE_TYPE_PE, nullptr, nullptr, nullptr, nullptr, where_what->what_pe); - + sge_mirror_subscribe(evc, SGE_TYPE_CATEGORY, sge_process_category_event_before, nullptr, nullptr, nullptr, nullptr); + /* we do *not* subscribe reduced elements for TYPE_PETASK: * event master currently cannot handle this, see IZ 3216 * sge_mirror_subscribe(evc, SGE_TYPE_PETASK, nullptr, nullptr, nullptr, nullptr, where_what->what_pet); */ sge_mirror_subscribe(evc, SGE_TYPE_PETASK, nullptr, nullptr, nullptr, nullptr, nullptr); - sge_mirror_subscribe(evc, SGE_TYPE_PROJECT, sge_process_project_event_before, nullptr, nullptr, nullptr, nullptr); + sge_mirror_subscribe(evc, SGE_TYPE_PROJECT, nullptr, nullptr, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_QINSTANCE, nullptr, nullptr, nullptr, where_what->where_all_queue, where_what->what_queue); sge_mirror_subscribe(evc, SGE_TYPE_RQS, nullptr, nullptr, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_SCHEDD_CONF, sge_process_schedd_conf_event_before, sge_process_schedd_conf_event_after, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_SCHEDD_MONITOR, nullptr, sge_process_schedd_monitor_event, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_SHARETREE, nullptr, nullptr, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_USER, nullptr, nullptr, nullptr, nullptr, nullptr); - sge_mirror_subscribe(evc, SGE_TYPE_USERSET, 
sge_process_userset_event_before, nullptr, nullptr, nullptr, nullptr); + sge_mirror_subscribe(evc, SGE_TYPE_USERSET, nullptr, nullptr, nullptr, nullptr, nullptr); set_job_flushing(evc); diff --git a/source/daemons/qmaster/sge_sched_thread.cc b/source/daemons/qmaster/sge_sched_thread.cc index 92bb4ce1a..47c2c2a3f 100644 --- a/source/daemons/qmaster/sge_sched_thread.cc +++ b/source/daemons/qmaster/sge_sched_thread.cc @@ -45,7 +45,6 @@ #include "uti/sge_rmon_macros.h" #include "uti/sge_thread_ctrl.h" #include "uti/sge_time.h" -#include "uti/sge_unistd.h" #include "sgeobj/sge_answer.h" #include "sgeobj/sge_conf.h" @@ -53,21 +52,13 @@ #include "sgeobj/sge_schedd_conf.h" #include "sgeobj/sge_job.h" #include "sgeobj/sge_qinstance.h" -#include "sgeobj/sge_ja_task.h" -#include "sgeobj/sge_userset.h" #include "sgeobj/sge_qinstance_state.h" -#include "sgeobj/sge_userprj.h" -#include "sgeobj/sge_sharetree.h" #include "sgeobj/sge_host.h" -#include "sgeobj/sge_centry.h" #include "sgeobj/sge_ckpt.h" -#include "sgeobj/sge_pe.h" #include "sgeobj/sge_range.h" #include "sgeobj/sge_order.h" -#include "sgeobj/sge_ulong.h" #include "sgeobj/sge_grantedres.h" -#include "mir/sge_mirror.h" #include "evc/sge_event_client.h" #include "evm/sge_event_master.h" @@ -87,17 +78,10 @@ #include "sched/sort_hosts.h" #include "sched/debit.h" -#include "sge_sched_prepare_data.h" -#include "sge_sched_job_category.h" +#include "ocs_CategorySchedd.h" #include "basis_types.h" #include "sge.h" -#include "setup_qmaster.h" -#include "sge_sched_process_events.h" -#include "sge_qmaster_threads.h" #include "sge_follow.h" -#include "sge_follow.h" -#include "sge_qmaster_threads.h" -#include "sge_sched_process_events.h" #include "sge_sched_thread.h" #include "sge_sched_order.h" #include "sge_sched_thread_rsmap.h" @@ -742,7 +726,7 @@ static int dispatch_jobs(sge_evc_class_t *evc, scheduler_all_data_t *lists, orde if (nreservation < max_reserve && lGetBool(orig_job, JB_reserve) && 
!JOB_TYPE_IS_IMMEDIATE(lGetUlong(orig_job, JB_type)) && - !sge_is_job_category_reservation_rejected(orig_job)) { + !ocs::CategorySchedd::job_is_category_reservation_rejected(orig_job)) { is_reserve = true; } else { is_reserve = false; @@ -750,7 +734,7 @@ static int dispatch_jobs(sge_evc_class_t *evc, scheduler_all_data_t *lists, orde // Don't need to look for a 'now' assignment if the last job // of this category got no 'now' assignment either - is_start = sge_is_job_category_rejected(orig_job) == 0; + is_start =ocs::CategorySchedd:: job_is_category_rejected(orig_job) == 0; if (is_start || is_reserve) { lListElem *job = nullptr; @@ -904,7 +888,7 @@ static int dispatch_jobs(sge_evc_class_t *evc, scheduler_all_data_t *lists, orde if ((cat = (lListElem *) lGetRef(orig_job, JB_category))) { DPRINTF("SKIP JOB (R)" sge_uu32 " of category '%s' (rc: " sge_uu32 ")\n", job_id, lGetString(cat, CT_str), lGetUlong(cat, CT_refcount)); - sge_reject_category(cat, false); + ocs::CategorySchedd::job_reject_category(orig_job, false); } /* here no reservation was made for a job that couldn't be started now or the job is not dispatch-able at all */ @@ -935,7 +919,7 @@ static int dispatch_jobs(sge_evc_class_t *evc, scheduler_all_data_t *lists, orde if ((cat = (lListElem *) lGetRef(orig_job, JB_category))) { DPRINTF("SKIP JOB (N)" sge_uu32 " of category '%s' (rc: " sge_uu32 ")\n", job_id, lGetString(cat, CT_str), lGetUlong(cat, CT_refcount)); - sge_reject_category(cat, is_reserve); + ocs::CategorySchedd::job_reject_category(orig_job, is_reserve); } } /* fall through to DISPATCH_NEVER_JOB */ diff --git a/source/daemons/qmaster/sge_thread_scheduler.cc b/source/daemons/qmaster/sge_thread_scheduler.cc index 4fe9214dc..b4caae4d8 100644 --- a/source/daemons/qmaster/sge_thread_scheduler.cc +++ b/source/daemons/qmaster/sge_thread_scheduler.cc @@ -64,14 +64,13 @@ #include "sched/schedd_monitor.h" #include "basis_types.h" +#include "ocs_CategoryQmaster.h" #include "ocs_ReportingFileWriter.h" 
-#include "sge_sched_job_category.h" #include "sge_sched_order.h" #include "sge_thread_main.h" #include "sge_thread_scheduler.h" #include "setup_qmaster.h" #include "sge_sched_process_events.h" - #include "sge.h" #include "msg_common.h" #include "msg_qmaster.h" @@ -633,6 +632,7 @@ sge_scheduler_main(void *arg) { const lList *master_hgrp_list = *ocs::DataStore::get_master_list(SGE_TYPE_HGROUP); const lList *master_sharetree_list = *ocs::DataStore::get_master_list(SGE_TYPE_SHARETREE); const lList *master_config_list = *ocs::DataStore::get_master_list(SGE_TYPE_CONFIG); + const lList *master_catergory_list = *ocs::DataStore::get_master_list(SGE_TYPE_CATEGORY); /* delay scheduling for test purposes, see issue GE-3306 */ if (SGE_TEST_DELAY_SCHEDULING > 0) { @@ -679,6 +679,7 @@ sge_scheduler_main(void *arg) { lGetNumberOfLeafs(nullptr, master_sharetree_list, STN_children) ); +#if 0 /* rebuild all job categories * - when the scheduler config changed * - when the projects were added/modified/deleted @@ -687,6 +688,7 @@ sge_scheduler_main(void *arg) { * */ sge_rebuild_job_category(master_job_list, master_userset_list, master_project_list, master_rqs_list); +#endif PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM7); double prof_init = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, nullptr); @@ -713,7 +715,10 @@ sge_scheduler_main(void *arg) { * - the resource request dependent urgency contribution is cached * per job category */ +#if 0 sge_reset_job_category(); +#endif + ocs::CategoryQmaster::reset_tmp_data(); // prepare data for the scheduler itself copy.host_list = lCopyList(nullptr, master_exechost_list); @@ -893,9 +898,9 @@ sge_scheduler_main(void *arg) { if (prof_is_active(SGE_PROF_CUSTOM6)) { PROFILING("PROF: schedd run took: %.3f s (init: %.3f s, copy: %.3f s, " - "run:%.3f, free: %.3f s, jobs: " sge_uu32 ", categories: %d/%d)", - prof_total, prof_init, prof_copy, prof_run, prof_free, - lGetNumberOfElem(*ocs::DataStore::get_master_list(SGE_TYPE_JOB)), 
sge_category_count(), 0); + "run:%.3f, free: %.3f s, jobs: " sge_uu32 ", categories: %d/%d)", + prof_total, prof_init, prof_copy, prof_run, prof_free, + lGetNumberOfElem(master_job_list), lGetNumberOfElem(master_catergory_list), 0); } if (getenv("SGE_ND") != nullptr) { printf("--------------STOP-SCHEDULER-RUN-------------\n"); diff --git a/source/daemons/qmaster/sge_userprj_qmaster.cc b/source/daemons/qmaster/sge_userprj_qmaster.cc index 5ecb2954c..1d9407f22 100644 --- a/source/daemons/qmaster/sge_userprj_qmaster.cc +++ b/source/daemons/qmaster/sge_userprj_qmaster.cc @@ -58,6 +58,7 @@ #include "sgeobj/sge_suser.h" #include "sgeobj/ocs_DataStore.h" +#include "ocs_CategoryQmaster.h" #include "sge_utility_qmaster.h" #include "sge_userprj_qmaster.h" #include "sge_userset_qmaster.h" @@ -253,13 +254,23 @@ userprj_success(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem *ep, l obj_mod_event = sgeE_USER_MOD; } + bool reattach_categories = false; for_each_ep(rqs, *(ocs::DataStore::get_master_list(SGE_TYPE_RQS))) { if (scope_is_referenced_rqs(rqs, obj_filter, lGetString(ep, obj_key))) { lSetBool(ep, obj_consider, true); + reattach_categories = true; break; } } + if (reattach_categories) { + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + ocs::CategoryQmaster::reattach_all_jobs(master_job_list, master_userset_list, master_project_list, master_rqs_list, true, packet->gdi_session); + } + sge_add_event(0, old_ep ? 
obj_mod_event : obj_add_event, 0, 0, lGetString(ep, obj_key), nullptr, nullptr, ep, packet->gdi_session); DRETURN(0); @@ -721,11 +732,11 @@ static bool project_still_used(const char *p) { * MT-NOTE: project_update_categories() is not MT safe *******************************************************************************/ void project_update_categories(const lList *added, const lList *removed, u_long64 gdi_session) { + DENTER(TOP_LAYER); const lListElem *ep; const char *p; lListElem *prj; - - DENTER(TOP_LAYER); + bool reattach_categories = false; for_each_ep(ep, added) { p = lGetString(ep, PR_name); @@ -733,6 +744,7 @@ void project_update_categories(const lList *added, const lList *removed, u_long6 prj = lGetElemStrRW(*ocs::DataStore::get_master_list(SGE_TYPE_PROJECT), PR_name, p); if (prj && !lGetBool(prj, PR_consider_with_categories)) { lSetBool(prj, PR_consider_with_categories, true); + reattach_categories = true; sge_add_event(0, sgeE_PROJECT_MOD, 0, 0, p, nullptr, nullptr, prj, gdi_session); } } @@ -744,10 +756,20 @@ void project_update_categories(const lList *added, const lList *removed, u_long6 if (prj && !project_still_used(p)) { lSetBool(prj, PR_consider_with_categories, false); + reattach_categories = true; sge_add_event(0, sgeE_PROJECT_MOD, 0, 0, p, nullptr, nullptr, prj, gdi_session); } } + // reattach all jobs to categories and consider the new project attributes + if (reattach_categories) { + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + ocs::CategoryQmaster::reattach_all_jobs(master_job_list, master_userset_list, master_project_list, master_rqs_list, true, gdi_session); + } + DRETURN_VOID; } diff --git a/source/daemons/qmaster/sge_userset_qmaster.cc 
b/source/daemons/qmaster/sge_userset_qmaster.cc index 8dc4a80c9..16603b438 100644 --- a/source/daemons/qmaster/sge_userset_qmaster.cc +++ b/source/daemons/qmaster/sge_userset_qmaster.cc @@ -49,8 +49,10 @@ #include "sched/valid_queue_user.h" -#include "sge.h" #include "evm/sge_event_master.h" + +#include "ocs_CategoryQmaster.h" +#include "sge.h" #include "sge_userset_qmaster.h" #include "sge_persistence_qmaster.h" #include "sge_utility_qmaster.h" @@ -530,12 +532,12 @@ static bool userset_still_used(const char *u) { * MT-NOTE: userset_update_categories() is not MT safe *******************************************************************************/ void userset_update_categories(const lList *added, const lList *removed, u_long64 gdi_session) { + DENTER(TOP_LAYER); const lListElem *ep; const char *u; lListElem *acl; const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); - - DENTER(TOP_LAYER); + bool reattach_categories = false; for_each_ep(ep, added) { u = lGetString(ep, US_name); @@ -543,6 +545,7 @@ void userset_update_categories(const lList *added, const lList *removed, u_long6 acl = lGetElemStrRW(master_userset_list, US_name, u); if (acl && !lGetBool(acl, US_consider_with_categories)) { lSetBool(acl, US_consider_with_categories, true); + reattach_categories = true; sge_add_event(0, sgeE_USERSET_MOD, 0, 0, u, nullptr, nullptr, acl, gdi_session); } } @@ -554,10 +557,20 @@ void userset_update_categories(const lList *added, const lList *removed, u_long6 if (acl && !userset_still_used(u)) { lSetBool(acl, US_consider_with_categories, false); + reattach_categories = true; sge_add_event(0, sgeE_USERSET_MOD, 0, 0, u, nullptr, nullptr, acl, gdi_session); } } + // reattach all jobs to categories and consider the new userset attributes + if (reattach_categories) { + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList 
*master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + ocs::CategoryQmaster::reattach_all_jobs(master_job_list, master_userset_list, master_project_list, master_rqs_list, true, gdi_session); + } + DRETURN_VOID; } @@ -788,20 +801,19 @@ int userset_spool(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lList **alpp, *******************************************************************************/ int userset_success(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem *ep, lListElem *old_ep, gdi_object_t *object, lList **ppList, monitoring_t *monitor) { - const char *userset_name; + DENTER(TOP_LAYER); dstring ds = DSTRING_INIT; const lListElem *rqs; const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); - - DENTER(TOP_LAYER); - - userset_name = lGetString(ep, US_name); + const char *userset_name = lGetString(ep, US_name); /* set consider with categories */ + bool reattach_categories = false; sge_dstring_sprintf(&ds, "@%s", userset_name); for_each_ep(rqs, master_rqs_list) { if (scope_is_referenced_rqs(rqs, RQR_filter_users, sge_dstring_get_string(&ds))) { lSetBool(ep, US_consider_with_categories, true); + reattach_categories = true; break; } } @@ -811,6 +823,13 @@ int userset_success(ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem *e sge_change_queue_version_acl(packet, task, userset_name); } + if (reattach_categories) { + lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); + const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); + const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); + ocs::CategoryQmaster::reattach_all_jobs(master_job_list, master_userset_list, master_project_list, master_rqs_list, true, packet->gdi_session); + } + sge_add_event(0, old_ep ? 
sgeE_USERSET_MOD : sgeE_USERSET_ADD, 0, 0, userset_name, nullptr, nullptr, ep, packet->gdi_session); diff --git a/source/libs/evm/sge_event_master.cc b/source/libs/evm/sge_event_master.cc index 28b46b8b8..e79b970a8 100644 --- a/source/libs/evm/sge_event_master.cc +++ b/source/libs/evm/sge_event_master.cc @@ -242,7 +242,7 @@ const int SOURCE_LIST[LIST_MAX][3] = { ***************************************************** */ -#define total_update_eventsMAX 21 +#define total_update_eventsMAX 22 const int total_update_events[total_update_eventsMAX + 1] = {sgeE_ADMINHOST_LIST, sgeE_CALENDAR_LIST, @@ -250,6 +250,7 @@ const int total_update_events[total_update_eventsMAX + 1] = {sgeE_ADMINHOST_LIST sgeE_CENTRY_LIST, sgeE_CONFIG_LIST, sgeE_EXECHOST_LIST, + sgeE_CATEGORY_LIST, sgeE_JOB_LIST, sgeE_JOB_SCHEDD_INFO_LIST, sgeE_MANAGER_LIST, @@ -274,6 +275,7 @@ const int block_events[total_update_eventsMAX][9] = { {sgeE_CENTRY_ADD, sgeE_CENTRY_DEL, sgeE_CENTRY_MOD, -1, -1, -1, -1, -1, -1}, {sgeE_CONFIG_ADD, sgeE_CONFIG_DEL, sgeE_CONFIG_MOD, -1, -1, -1, -1, -1, -1}, {sgeE_EXECHOST_ADD, sgeE_EXECHOST_DEL, sgeE_EXECHOST_MOD, -1, -1, -1, -1, -1, -1}, + {sgeE_CATEGORY_ADD, sgeE_CATEGORY_DEL, sgeE_CATEGORY_MOD, -1, -1, -1, -1, -1, -1}, {sgeE_JOB_ADD, sgeE_JOB_DEL, sgeE_JOB_MOD, sgeE_JOB_MOD_SCHED_PRIORITY, sgeE_JOB_USAGE, sgeE_JOB_FINAL_USAGE, sgeE_JOB_FINISH, -1, -1}, {sgeE_JOB_SCHEDD_INFO_ADD, sgeE_JOB_SCHEDD_INFO_DEL, sgeE_JOB_SCHEDD_INFO_MOD, -1, -1, -1, -1, -1, -1}, {sgeE_MANAGER_ADD, sgeE_MANAGER_DEL, sgeE_MANAGER_MOD, -1, -1, -1, -1, -1, -1}, @@ -768,6 +770,7 @@ sge_event_master_process_mod_event_client(const lListElem *request, monitoring_t check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_CENTRY_LIST); check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_CONFIG_LIST); check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_EXECHOST_LIST); + check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_CATEGORY_LIST); 
check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_JOB_LIST); check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_JOB_SCHEDD_INFO_LIST); check_send_new_subscribed_list(old_sub, new_sub, event_client, sgeE_MANAGER_LIST); @@ -1966,6 +1969,7 @@ init_send_events() { SEND_EVENTS[sgeE_CENTRY_LIST] = true; SEND_EVENTS[sgeE_CONFIG_LIST] = true; SEND_EVENTS[sgeE_EXECHOST_LIST] = true; + SEND_EVENTS[sgeE_CATEGORY_LIST] = true; SEND_EVENTS[sgeE_JOB_LIST] = true; SEND_EVENTS[sgeE_JOB_SCHEDD_INFO_LIST] = true; SEND_EVENTS[sgeE_MANAGER_LIST] = true; @@ -2373,6 +2377,7 @@ total_update(lListElem *event_client, u_long64 gdi_session) total_update_event(event_client, sgeE_CENTRY_LIST, false, gdi_session); total_update_event(event_client, sgeE_CONFIG_LIST, false, gdi_session); total_update_event(event_client, sgeE_EXECHOST_LIST, false, gdi_session); + total_update_event(event_client, sgeE_CATEGORY_LIST, false, gdi_session); total_update_event(event_client, sgeE_JOB_LIST, false, gdi_session); total_update_event(event_client, sgeE_JOB_SCHEDD_INFO_LIST, false, gdi_session); total_update_event(event_client, sgeE_MANAGER_LIST, false, gdi_session); @@ -2913,6 +2918,9 @@ static void total_update_event(lListElem *event_client, ev_event type, bool new_ case sgeE_AR_LIST: lp = *ocs::DataStore::get_master_list(SGE_TYPE_AR); break; + case sgeE_CATEGORY_LIST: + lp = *ocs::DataStore::get_master_list(SGE_TYPE_CATEGORY); + break; default: WARNING(MSG_EVE_TOTALUPDATENOTHANDLINGEVENT_I, type); DRETURN_VOID; diff --git a/source/libs/gdi/ocs_gdi_Target.cc b/source/libs/gdi/ocs_gdi_Target.cc index 7d1eec43c..6177e3252 100644 --- a/source/libs/gdi/ocs_gdi_Target.cc +++ b/source/libs/gdi/ocs_gdi_Target.cc @@ -48,6 +48,7 @@ std::string ocs::gdi::Target::targetToString(const TargetValue target) { case SGE_RQS_LIST: return "SGE_RQS_LIST"; case SGE_AR_LIST: return "SGE_AR_LIST"; case SGE_DUMMY_LIST: return "SGE_DUMMY_LIST"; + case SGE_CAT_LIST: return "SGE_CAT_LIST"; default: 
return "UNKNOWN_TARGET"; } } diff --git a/source/libs/gdi/ocs_gdi_Target.h b/source/libs/gdi/ocs_gdi_Target.h index 3c413496c..e5f85d021 100644 --- a/source/libs/gdi/ocs_gdi_Target.h +++ b/source/libs/gdi/ocs_gdi_Target.h @@ -53,7 +53,8 @@ namespace ocs::gdi { SGE_HGRP_LIST, SGE_RQS_LIST, SGE_AR_LIST, - SGE_DUMMY_LIST + SGE_DUMMY_LIST, + SGE_CAT_LIST, }; static std::string targetToString(TargetValue target); diff --git a/source/libs/mir/sge_mirror.cc b/source/libs/mir/sge_mirror.cc index 4ce55e741..ce5c1870b 100644 --- a/source/libs/mir/sge_mirror.cc +++ b/source/libs/mir/sge_mirror.cc @@ -163,6 +163,7 @@ static const mirror_description dev_mirror_base[SGE_TYPE_ALL] = { { nullptr, generic_update_master_list, nullptr, nullptr }, /*rqs*/ { nullptr, ar_update_master_list, nullptr, nullptr }, /*advance reservation*/ { nullptr, nullptr, nullptr, nullptr }, /*jobscripts*/ + { nullptr, generic_update_master_list, nullptr, nullptr }, // sgeE_CATEGORY_LIST }; /*-------------------------*/ @@ -719,6 +720,18 @@ sge_mirror_subscribe_internal(sge_evc_class_t *evc, sge_object_type type, case SGE_TYPE_JOBSCRIPT: ret = SGE_EM_NOT_INITIALIZED; break; + case SGE_TYPE_CATEGORY: + evc->ec_subscribe(evc, sgeE_CATEGORY_LIST); + evc->ec_subscribe(evc, sgeE_CATEGORY_ADD); + evc->ec_subscribe(evc, sgeE_CATEGORY_DEL); + evc->ec_subscribe(evc, sgeE_CATEGORY_MOD); + if (what_el && where_el) { + evc->ec_mod_subscription_where(evc, sgeE_CATEGORY_LIST, what_el, where_el); + evc->ec_mod_subscription_where(evc, sgeE_CATEGORY_ADD, what_el, where_el); + evc->ec_mod_subscription_where(evc, sgeE_CATEGORY_DEL, what_el, where_el); + evc->ec_mod_subscription_where(evc, sgeE_CATEGORY_MOD, what_el, where_el); + } + break; default: ret = SGE_EM_BAD_ARG; break; @@ -954,6 +967,12 @@ sge_mirror_unsubscribe_internal(sge_evc_class_t *evc, sge_object_type type) { break; case SGE_TYPE_JOBSCRIPT: DRETURN(SGE_EM_NOT_INITIALIZED); + case SGE_TYPE_CATEGORY: + evc->ec_unsubscribe(evc, sgeE_CATEGORY_LIST); + 
evc->ec_unsubscribe(evc, sgeE_CATEGORY_ADD); + evc->ec_unsubscribe(evc, sgeE_CATEGORY_DEL); + evc->ec_unsubscribe(evc, sgeE_CATEGORY_MOD); + break; default: ERROR("received invalid event group %d", type); DRETURN(SGE_EM_BAD_ARG); @@ -1405,6 +1424,19 @@ sge_mirror_process_event_list_(sge_evc_class_t *evc, lList *event_list) ret = sge_mirror_process_event(evc, mirror_base, SGE_TYPE_AR, SGE_EMA_MOD, event); break; + case sgeE_CATEGORY_LIST: + ret = sge_mirror_process_event(evc, mirror_base, SGE_TYPE_CATEGORY, SGE_EMA_LIST, event); + break; + case sgeE_CATEGORY_ADD: + ret = sge_mirror_process_event(evc, mirror_base, SGE_TYPE_CATEGORY, SGE_EMA_ADD, event); + break; + case sgeE_CATEGORY_DEL: + ret = sge_mirror_process_event(evc, mirror_base, SGE_TYPE_CATEGORY, SGE_EMA_DEL, event); + break; + case sgeE_CATEGORY_MOD: + ret = sge_mirror_process_event(evc, mirror_base, SGE_TYPE_CATEGORY, SGE_EMA_MOD, event); + break; + default: break; } diff --git a/source/libs/sgeobj/CMakeLists.txt b/source/libs/sgeobj/CMakeLists.txt index aa113a9ab..36feef651 100644 --- a/source/libs/sgeobj/CMakeLists.txt +++ b/source/libs/sgeobj/CMakeLists.txt @@ -68,6 +68,7 @@ set(LIBRARY_SOURCES cull_parse_util.cc ocs_binding_io.cc ocs_BindingFinder.cc + ocs_Category.cc ocs_DataStore.cc ocs_HostTopology.cc ocs_Session.cc @@ -118,6 +119,7 @@ set(LIBRARY_SOURCES sge_range.cc sge_report.cc sge_resource_quota.cc + sge_resource_quota_service.cc sge_schedd_conf.cc sge_sharetree.cc sge_str.cc diff --git a/source/libs/sgeobj/msg_sgeobjlib.h b/source/libs/sgeobj/msg_sgeobjlib.h index 25009e3ae..64352da6a 100644 --- a/source/libs/sgeobj/msg_sgeobjlib.h +++ b/source/libs/sgeobj/msg_sgeobjlib.h @@ -103,6 +103,9 @@ #define MSG_EVENT_MODOBJECTX_USS _MESSAGE(64092, _(sge_uu32". EVENT MOD " SFN " " SFN)) #define MSG_EVENT_OBJECTLISTXELEMENTS_USI _MESSAGE(64093, _(sge_uu32". EVENT " SFN " LIST %d Elements")) #define MSG_EVENT_MESSAGE_US _MESSAGE(64094, _(sge_uu32". 
EVENT " SFN)) +#define MSG_EVENT_ADDOBJECTX_USU _MESSAGE(64095, _(sge_uu32". EVENT ADD " SFN " " sge_uu32)) +#define MSG_EVENT_DELOBJECTX_USU _MESSAGE(64096, _(sge_uu32". EVENT DEL " SFN " " sge_uu32)) +#define MSG_EVENT_MODOBJECTX_USU _MESSAGE(64097, _(sge_uu32". EVENT MOD " SFN " " sge_uu32)) #define MSG_EVENT_MODSCHEDDPRIOOFJOBXTOY_USI _MESSAGE(64100, _(sge_uu32". EVENT MODIFY SCHEDULING PRIORITY OF JOB " SFN " TO %d")) #define MSG_EVENT_JOBXUSAGE_US _MESSAGE(64101, _(sge_uu32". EVENT JOB " SFN " USAGE")) diff --git a/source/libs/sgeobj/ocs_Category.cc b/source/libs/sgeobj/ocs_Category.cc new file mode 100644 index 000000000..984ff81c1 --- /dev/null +++ b/source/libs/sgeobj/ocs_Category.cc @@ -0,0 +1,140 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include + +#include "uti/sge_dstring.h" +#include "uti/sge_log.h" +#include "uti/sge_rmon_macros.h" + +#include "ocs_Category.h" +#include "sge_job.h" +#include "sge_resource_quota_service.h" +#include "sge_userprj.h" + +u_long32 ocs::Category::next_id = 0; + +/** @brief Build the job category string + * + * This function builds the job category string for a job. The category string is + * used to group jobs together for scheduling purposes. 
The function takes a + * dstring object to store the category string and a job object to extract + * information from. It also takes lists used to identify whether a job indirectly + * references certain configuration objects (e.g. resource quota sets). + * + * @param category_str The target string, contains the category or nothing + * @param job The job for which the category string is built + * @param acl_list Global access list + * @param prj_list Project list + * @param rqs_list Resource quota set list + */ +void +ocs::Category::build_string(dstring *category_str, lListElem *job, + const lList *acl_list, const lList *prj_list, const lList *rqs_list) { + DENTER(TOP_LAYER); + + // owner (user, UNIX group, and ACLs) + const char *owner = lGetString(job, JB_owner); + const char *group = lGetString(job, JB_group); + const lList *grp_list = lGetList(job, JB_grp_list); + sge_unparse_acl_dstring(category_str, owner, group, grp_list, acl_list, "-U"); + + // -u if referenced in resource quota sets + // + // TODO: A possible performance enhancement is to split user and group inside category. + // Some users are only referenced by the unix group. 
Their jobs could be grouped + // together by referencing only the group in the category string + if (sge_user_is_referenced_in_rqs(rqs_list, owner, group, grp_list, acl_list)) { + sge_dstring_append(category_str, "-u "); + sge_dstring_append(category_str, owner); + sge_dstring_append_char(category_str, ' '); + } + + // -scope global -hard -q + sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_GLOBAL, true), "-scope global -hard -q"); + + // -scope master -hard -q + sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_MASTER, true), "-scope master -hard -q"); + + // -scope slave -hard -q + sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_SLAVE, true), "-scope slave -hard -q"); + + + // -scope global -hard -l + sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_GLOBAL, true), "-scope global -hard -l"); + + // -scope master -hard -l + sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_MASTER, true), "-scope master -hard -l"); + + // -scope slave -hard -l + sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_SLAVE, true), "-scope slave -hard -l"); + + // TODO: evaluate if soft requests should be part of the category string +#if 1 + // -scope global -soft -q + sge_unparse_queue_list_dstring(category_str, job_get_queue_listRW(job, JRS_SCOPE_GLOBAL, false), "-scope global -soft -q"); + + // -scope global -soft -l + sge_unparse_resource_list_dstring(category_str, job_get_resource_listRW(job, JRS_SCOPE_GLOBAL, false), "-scope global -soft -l"); +#endif + + // -pe pe_name pe_range + sge_unparse_pe_dstring(category_str, job, lGetPosViaElem(job, JB_pe, SGE_NO_ABORT), lGetPosViaElem(job, JB_pe_range, SGE_NO_ABORT), "-pe"); + + // -ckpt ckpt_name + sge_unparse_string_option_dstring(category_str, job, lGetPosViaElem(job, JB_checkpoint_name, SGE_NO_ABORT), "-ckpt"); + + // interactive 
job type + if (JOB_TYPE_IS_IMMEDIATE(lGetPosUlong(job, lGetPosViaElem(job, JB_type, SGE_NO_ABORT)))) { + sge_dstring_append(category_str, "-I y "); + } + + // -P project + int project_nm = lGetPosViaElem(job, JB_project, SGE_NO_ABORT); + const char *project = lGetPosString(job, project_nm); + + if (project != nullptr) { + const lListElem *prj = lGetElemStr(prj_list, PR_name, project); + + if (prj != nullptr && lGetBool(prj, PR_consider_with_categories)) { + sge_unparse_string_option_dstring(category_str, job, project_nm, "-P"); + } + } + + // -ar ar_id + sge_unparse_ulong_option_dstring(category_str, job, lGetPosViaElem(job, JB_ar, SGE_NO_ABORT), "-ar"); + + // remove the last white space that the last unparse function has written + sge_dstring_strip_white_space_at_eol(category_str); + + // avoid null pointer as category string in case job has no specific requests + if (sge_dstring_get_string(category_str) == nullptr) { + sge_dstring_append(category_str, "-"); + } + + DRETURN_VOID; +} + +lListElem * +ocs::Category::create_new(lListElem *job) { + lListElem *category = lCreateElem(CT_Type); + lSetUlong64(category, CT_id, get_next_id()); + return category; +} \ No newline at end of file diff --git a/source/libs/sgeobj/ocs_Category.h b/source/libs/sgeobj/ocs_Category.h new file mode 100644 index 000000000..a5ceb19bb --- /dev/null +++ b/source/libs/sgeobj/ocs_Category.h @@ -0,0 +1,37 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "cull/cull.h" + +#include "sgeobj/cull/sge_ct_CT_L.h" + +namespace ocs { + class Category { + static u_long32 next_id; + + public: + static u_long32 get_next_id() { + return next_id++; + } + static void build_string(dstring *category_str, lListElem *job, const lList *acl_list, const lList *prj_list, const lList *rqs_list); + static lListElem *create_new(lListElem *job); + }; +} diff --git a/source/libs/sgeobj/sge_event.cc b/source/libs/sgeobj/sge_event.cc index ff81f8551..2468bca14 100644 --- a/source/libs/sgeobj/sge_event.cc +++ b/source/libs/sgeobj/sge_event.cc @@ -420,13 +420,27 @@ const char *event_text(const lListElem *event, dstring *buffer) sge_dstring_sprintf(buffer, MSG_EVENT_OBJECTLISTXELEMENTS_USI, number, "ADVANCE RESERVATION", n); break; case sgeE_AR_ADD: - sge_dstring_sprintf(buffer, MSG_EVENT_ADDOBJECTX_USS, number, "ADVANCE RESERVATION", strkey); + sge_dstring_sprintf(buffer, MSG_EVENT_ADDOBJECTX_USU, number, "ADVANCE RESERVATION", intkey); break; case sgeE_AR_DEL: - sge_dstring_sprintf(buffer, MSG_EVENT_DELOBJECTX_USS, number, "ADVANCE RESERVATION", strkey); + sge_dstring_sprintf(buffer, MSG_EVENT_DELOBJECTX_USU, number, "ADVANCE RESERVATION", intkey); break; case sgeE_AR_MOD: - sge_dstring_sprintf(buffer, MSG_EVENT_MODOBJECTX_USS, number, "ADVANCE RESERVATION", strkey); + sge_dstring_sprintf(buffer, MSG_EVENT_MODOBJECTX_USU, number, "ADVANCE RESERVATION", intkey); + break; + + /* -------------------- */ + 
case sgeE_CATEGORY_LIST: + sge_dstring_sprintf(buffer, MSG_EVENT_OBJECTLISTXELEMENTS_USI, number, "CATEGORY", n); + break; + case sgeE_CATEGORY_ADD: + sge_dstring_sprintf(buffer, MSG_EVENT_ADDOBJECTX_USU, number, "CATEGORY", intkey); + break; + case sgeE_CATEGORY_DEL: + sge_dstring_sprintf(buffer, MSG_EVENT_DELOBJECTX_USU, number, "CATEGORY", intkey); + break; + case sgeE_CATEGORY_MOD: + sge_dstring_sprintf(buffer, MSG_EVENT_MODOBJECTX_USU, number, "CATEGORY", intkey); break; /* -------------------- */ diff --git a/source/libs/sgeobj/sge_event.h b/source/libs/sgeobj/sge_event.h index 87b62c663..1121eb22d 100644 --- a/source/libs/sgeobj/sge_event.h +++ b/source/libs/sgeobj/sge_event.h @@ -278,6 +278,11 @@ typedef enum { sgeE_ACK_TIMEOUT, + sgeE_CATEGORY_LIST, // events for job categories + sgeE_CATEGORY_ADD, + sgeE_CATEGORY_DEL, + sgeE_CATEGORY_MOD, + sgeE_EVENTSIZE } ev_event; @@ -299,6 +304,7 @@ typedef bool (*evm_ack_func_t)( ((x)==sgeE_CENTRY_LIST) || \ ((x)==sgeE_CONFIG_LIST) || \ ((x)==sgeE_EXECHOST_LIST) || \ + ((x)==sgeE_CATEGORY_LIST) || \ ((x)==sgeE_JOB_LIST) || \ ((x)==sgeE_JOB_SCHEDD_INFO_LIST) || \ ((x)==sgeE_MANAGER_LIST) || \ diff --git a/source/libs/sgeobj/sge_object.h b/source/libs/sgeobj/sge_object.h index 1d2e22636..7b48f0654 100644 --- a/source/libs/sgeobj/sge_object.h +++ b/source/libs/sgeobj/sge_object.h @@ -150,6 +150,7 @@ typedef enum { SGE_TYPE_RQS, /*28*/ SGE_TYPE_AR, /*29*/ SGE_TYPE_JOBSCRIPT, /*30*/ + SGE_TYPE_CATEGORY, /*31*/ /* diff --git a/source/libs/sgeobj/sge_resource_quota.cc b/source/libs/sgeobj/sge_resource_quota.cc index f5af398aa..1c5dc31db 100644 --- a/source/libs/sgeobj/sge_resource_quota.cc +++ b/source/libs/sgeobj/sge_resource_quota.cc @@ -1612,3 +1612,4 @@ bool rqs_replace_request_verify(lList **answer_list, const lList *request) DRETURN(true); } + diff --git a/source/libs/sgeobj/sge_resource_quota.h b/source/libs/sgeobj/sge_resource_quota.h index 0485e4159..b9c5ddafb 100644 --- a/source/libs/sgeobj/sge_resource_quota.h 
+++ b/source/libs/sgeobj/sge_resource_quota.h @@ -98,3 +98,4 @@ bool rqs_replace_request_verify(lList **answer_list, const lList *request); bool rqs_filter_match(lListElem *filter, int filter_type, const char *value, const lList *master_userset_list, const lList *master_hgroup_list, const char *group, const lList *grp_list); + diff --git a/source/libs/sgeobj/sge_resource_quota_service.cc b/source/libs/sgeobj/sge_resource_quota_service.cc new file mode 100644 index 000000000..d30f420eb --- /dev/null +++ b/source/libs/sgeobj/sge_resource_quota_service.cc @@ -0,0 +1,221 @@ +/*___INFO__MARK_BEGIN__*/ +/************************************************************************* + * + * The Contents of this file are made available subject to the terms of + * the Sun Industry Standards Source License Version 1.2 + * + * Sun Microsystems Inc., March, 2001 + * + * + * Sun Industry Standards Source License Version 1.2 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.2 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2001 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH + * + ************************************************************************/ +/*___INFO__MARK_END__*/ + +#include +#include + +#include "uti/sge_hostname.h" +#include "uti/sge_log.h" +#include "uti/sge_parse_num_par.h" +#include "uti/sge_rmon_macros.h" + +#include "sched/sge_select_queue.h" +#include "sched/sort_hosts.h" + +#include "sge_str.h" +#include "sge_resource_quota.h" +#include "sge_resource_quota_service.h" +#include "sge_object.h" + +static bool is_global(const lListElem *rule, int nm) +{ + lListElem *filter = lGetObject(rule, nm); + if (!filter) + return true; + if (lGetSubStr(filter, ST_name, "*", RQRF_scope) && lGetNumberOfElem(lGetList(filter, RQRF_xscope))==0) + return true; + return false; +} + +/****** sge_resource_quota_schedd/is_cqueue_global() *************************** +* NAME +* is_cqueue_global() -- Global rule with regards to cluster queues? +* +* SYNOPSIS +* bool is_cqueue_global(const lListElem *rule) +* +* INPUTS +* const lListElem *rule - RQR_Type +* +* RESULT +* bool - True if cluster queues play no role with the rule +* +* NOTES +* MT-NOTE: is_cqueue_global() is MT safe +*******************************************************************************/ +bool is_cqueue_global(const lListElem *rule) +{ + return is_global(rule, RQR_filter_queues); +} + + +/****** sge_resource_quota_schedd/is_host_global() ***************************** +* NAME +* is_host_global() -- Global rule with regards to hosts? 
+* +* SYNOPSIS +* bool is_host_global(const lListElem *rule) +* +* FUNCTION +* Return true if hosts play no role with the rule +* +* INPUTS +* const lListElem *rule - RQR_Type +* +* RESULT +* bool - True if hosts play no role with the rule +* +* NOTES +* MT-NOTE: is_host_global() is MT safe +*******************************************************************************/ +bool is_host_global(const lListElem *rule) +{ + return is_global(rule, RQR_filter_hosts); +} + +static bool is_expand(const lListElem *rule, int nm) +{ + lListElem *filter = lGetObject(rule, nm); + if (filter && lGetBool(filter, RQRF_expand)) + return true; + else + return false; +} + + +/****** sge_resource_quota_schedd/is_host_expand() ***************************** +* NAME +* is_host_expand() -- Returns true if rule expands on hosts +* +* SYNOPSIS +* bool is_host_expand(const lListElem *rule) +* +* FUNCTION +* Returns true if rule expands on hosts. +* +* INPUTS +* const lListElem *rule - RQR_Type +* +* RESULT +* bool - True if rule expands on hosts +* +* EXAMPLE +* "hosts {*}" returns true +* "hosts @allhosts" returns false +* +* NOTES +* MT-NOTE: is_host_expand() is MT safe +*******************************************************************************/ +bool is_host_expand(const lListElem *rule) +{ + return is_expand(rule, RQR_filter_hosts); +} + +/****** sge_resource_quota_schedd/is_cqueue_expand() *************************** +* NAME +* is_cqueue_expand() -- Returns true if rule expands on cluster queues +* +* SYNOPSIS +* bool is_cqueue_expand(const lListElem *rule) +* +* FUNCTION +* Returns true if rule expands on cluster queues. 
+* +* INPUTS +* const lListElem *rule - RQR_Type +* +* RESULT +* bool - True if rule expands on hosts +* +* EXAMPLE +* "queues {*}" returns true +* "queues Q001,Q002" returns false +* +* NOTES +* MT-NOTE: is_cqueue_expand() is MT safe +*******************************************************************************/ +bool is_cqueue_expand(const lListElem *rule) +{ + return is_expand(rule, RQR_filter_queues); +} + + +/****** sge_resource_quota_service/sge_user_is_referenced_in_rqs() ******************* +* NAME +* sge_user_is_referenced_in_rqs() -- search for user reference in rqs +* +* SYNOPSIS +* bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, const char *group, +* const lList *grp_list, const lList *acl_list) +* +* FUNCTION +* Walks every rule (RQS_rule) of every resource quota set and reports whether a rule whose user filter is expanded or not global matches the user via rqs_filter_match() +* +* INPUTS +* const lList *rqs - resource quota set list +* const char *user - user to search +* const char *group, const lList *grp_list - user's group and group list, both handed to rqs_filter_match() +* const lList *acl_list - acl list for user resolving +* +* RESULT +* bool - true if user was found +* false if user was not found +* +* NOTES +* MT-NOTE: sge_user_is_referenced_in_rqs() is MT safe +* +*******************************************************************************/ +bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, const char *group, const lList *grp_list, const lList *acl_list) +{ + bool ret = false; + const lListElem *ep; + + for_each_ep(ep, rqs) { + const lList *rule_list = lGetList(ep, RQS_rule); + const lListElem *rule; + + for_each_ep(rule, rule_list) { + /* only rules whose user filter is expanded or not global are considered; the first one that matches this user ends the search */ + if ((is_expand(rule, RQR_filter_users) || !is_global(rule, RQR_filter_users)) && + rqs_filter_match(lGetObject(rule, RQR_filter_users), FILTER_USERS, user, acl_list, nullptr, group, grp_list)) { + ret = true; + break; + } + } + if (ret) { + break; + } + } + return ret; +} diff --git a/source/daemons/common/category.h b/source/libs/sgeobj/sge_resource_quota_service.h similarity index 82% rename from 
source/daemons/common/category.h rename to source/libs/sgeobj/sge_resource_quota_service.h index f72123d2a..6add9646c 100644 --- a/source/daemons/common/category.h +++ b/source/libs/sgeobj/sge_resource_quota_service.h @@ -33,8 +33,11 @@ ************************************************************************/ /*___INFO__MARK_END__*/ -#include "uti/sge_dstring.h" -#include "cull/cull.h" +bool is_cqueue_global(const lListElem *rule); +bool is_host_global(const lListElem *rule); + +bool is_cqueue_expand(const lListElem *rule); +bool is_host_expand(const lListElem *rule); + +bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, const char *group, const lList *grp_list, const lList *acl_list); -void -sge_build_job_category_dstring(dstring *category_str, lListElem *job, const lList *acl_list, const lList *prj_list, bool *did_project, const lList *lirs_list); diff --git a/source/libs/spool/flatfile/sge_flatfile.cc b/source/libs/spool/flatfile/sge_flatfile.cc index f98b056bc..d75b9541d 100644 --- a/source/libs/spool/flatfile/sge_flatfile.cc +++ b/source/libs/spool/flatfile/sge_flatfile.cc @@ -297,6 +297,46 @@ const spool_flatfile_instr qconf_ce_list_sfi = { NoName, NoName, NoName } }; +const spool_flatfile_instr qconf_cat_sfi = +{ + nullptr, + true, + false, + false, + true, + false, + false, + true, + false, + ' ', + '\n', + '\0', + '\0', + '\0', + &qconf_sub_name_value_space_sfi, + { NoName, NoName, NoName } +}; + +const spool_flatfile_instr qconf_cat_list_sfi = +{ + nullptr, + false, + true, + true, + false, + true, + false, + true, + false, + '\0', + ' ', + '\0', + '\0', + '\n', + nullptr, + { NoName, NoName, NoName } +}; + const spool_flatfile_instr qconf_sub_rqs_sfi = { nullptr, @@ -823,8 +863,7 @@ spool_flatfile_write_list(lList **answer_list, DRETURN(nullptr); } - result = spool_flatfile_write_data(answer_list, data, data_len, destination, - filepath); + result = spool_flatfile_write_data(answer_list, data, data_len, destination, filepath); /* 
cleanup */ sge_dstring_free(&char_buffer); diff --git a/source/libs/spool/flatfile/sge_flatfile.h b/source/libs/spool/flatfile/sge_flatfile.h index 788555ca4..a18e52aab 100644 --- a/source/libs/spool/flatfile/sge_flatfile.h +++ b/source/libs/spool/flatfile/sge_flatfile.h @@ -137,6 +137,8 @@ extern const spool_flatfile_instr qconf_sub_name_value_comma_sfi; extern const spool_flatfile_instr qconf_sub_comma_sfi; extern const spool_flatfile_instr qconf_param_sfi; extern const spool_flatfile_instr qconf_sub_param_sfi; +extern const spool_flatfile_instr qconf_cat_sfi; +extern const spool_flatfile_instr qconf_cat_list_sfi; extern const spool_flatfile_instr qconf_comma_sfi; extern const spool_flatfile_instr qconf_ce_sfi; extern const spool_flatfile_instr qconf_ce_list_sfi; diff --git a/source/libs/spool/flatfile/sge_flatfile_obj.cc b/source/libs/spool/flatfile/sge_flatfile_obj.cc index 3bff25904..57c70ee88 100644 --- a/source/libs/spool/flatfile/sge_flatfile_obj.cc +++ b/source/libs/spool/flatfile/sge_flatfile_obj.cc @@ -340,6 +340,13 @@ spooling_field CE_fields[] = { { NoName, 11, nullptr, false, nullptr, false, nullptr, nullptr, nullptr} }; +spooling_field CAT_fields[] = { + { CT_id, 10, "id", false, nullptr, false, nullptr, nullptr, nullptr}, + { CT_refcount, 10, "rcount", false, nullptr, false, nullptr, nullptr, nullptr}, + { CT_str, 10, "str", false, nullptr, false, nullptr, nullptr, nullptr}, + { NoName, 10, nullptr, false, nullptr, false, nullptr, nullptr, nullptr} +}; + spooling_field CAL_fields[] = { { CAL_name, 16, "calendar_name", false, nullptr, false, nullptr, nullptr, nullptr}, { CAL_year_calendar, 16, "year", false, nullptr, false, nullptr, nullptr, nullptr}, diff --git a/source/libs/spool/flatfile/sge_flatfile_obj.h b/source/libs/spool/flatfile/sge_flatfile_obj.h index 5b1b395e3..e5d8e2c98 100644 --- a/source/libs/spool/flatfile/sge_flatfile_obj.h +++ b/source/libs/spool/flatfile/sge_flatfile_obj.h @@ -38,6 +38,7 @@ #define MAX_NUM_FIELDS 60 extern 
spooling_field CAL_fields[]; +extern spooling_field CAT_fields[]; extern spooling_field CK_fields[]; extern spooling_field CE_fields[]; extern spooling_field HGRP_fields[]; From 528ca944a31a9c24d59c311c1f906d92838de85f Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Tue, 8 Apr 2025 14:54:22 +0200 Subject: [PATCH 05/10] EH: CS-208: Replace scheduler categories by GDI categories --- source/libs/sched/CMakeLists.txt | 3 +- source/libs/sched/debit.cc | 4 +- source/libs/sched/msg_schedd.h | 1 + source/libs/sched/ocs_CategorySchedd.cc | 53 + source/libs/sched/ocs_CategorySchedd.h | 36 + source/libs/sched/sge_select_queue.cc | 98 +- source/libs/sched/sge_select_queue.h | 5 + ...uota_schedd.cc => sge_select_queue_rqs.cc} | 2007 +++++++---------- ..._quota_schedd.h => sge_select_queue_rqs.h} | 50 +- test/daemons/common/test_common_category.cc | 7 +- test/daemons/qmaster/CMakeLists.txt | 2 +- 11 files changed, 1090 insertions(+), 1176 deletions(-) create mode 100644 source/libs/sched/ocs_CategorySchedd.cc create mode 100644 source/libs/sched/ocs_CategorySchedd.h rename source/libs/sched/{sge_resource_quota_schedd.cc => sge_select_queue_rqs.cc} (79%) rename source/libs/sched/{sge_resource_quota_schedd.h => sge_select_queue_rqs.h} (73%) diff --git a/source/libs/sched/CMakeLists.txt b/source/libs/sched/CMakeLists.txt index e6dedf9d7..c06f16269 100644 --- a/source/libs/sched/CMakeLists.txt +++ b/source/libs/sched/CMakeLists.txt @@ -23,6 +23,7 @@ set(LIBRARY_NAME sched) set(LIBRARY_SOURCES debit.cc load_correction.cc + ocs_CategorySchedd.cc schedd_message.cc schedd_monitor.cc sge_complex_schedd.cc @@ -31,10 +32,10 @@ set(LIBRARY_SOURCES sge_orders.cc sge_pe_schedd.cc sge_qeti.cc - sge_resource_quota_schedd.cc sge_resource_utilization.cc sge_schedd_text.cc sge_select_queue.cc + sge_select_queue_rqs.cc sge_serf.cc sge_sharetree_printing.cc sge_support.cc diff --git a/source/libs/sched/debit.cc b/source/libs/sched/debit.cc index 9d5a26fe4..d2b5ddaca 100644 --- 
a/source/libs/sched/debit.cc +++ b/source/libs/sched/debit.cc @@ -46,13 +46,13 @@ #include "sgeobj/sge_job.h" #include "sgeobj/sge_ja_task.h" #include "sgeobj/sge_qinstance.h" -#include "sgeobj/sge_subordinate.h" #include "sgeobj/sge_order.h" #include "sgeobj/sge_schedd_conf.h" +#include "sgeobj/sge_subordinate.h" #include "sgeobj/sge_host.h" #include "sgeobj/sge_advance_reservation.h" +#include "sgeobj/sge_resource_quota.h" -#include "sge_resource_quota_schedd.h" #include "sge_resource_utilization.h" #include "subordinate_schedd.h" #include "sge_select_queue.h" diff --git a/source/libs/sched/msg_schedd.h b/source/libs/sched/msg_schedd.h index cbe9f1651..0bf511cea 100644 --- a/source/libs/sched/msg_schedd.h +++ b/source/libs/sched/msg_schedd.h @@ -240,5 +240,6 @@ #define MSG_INVALID_PARAM_SETTING_S _MESSAGE(47402, _("Invalid scheduler param setting: " SFQ)) #define MSG_SCONF_NO_CONFIG _MESSAGE(47403, _("sconf_printf_config: no config to validate")) #define MSG_QINSTANCE_VALUEMISSINGMASTERDOWN_S _MESSAGE(47404, _("error: no value for " SFQ " because execd is in unknown state")) +#define MSG_CANTFINDCATINMASTERLIST_U _MESSAGE(47405, _("could not find category " sge_uu32 " in master list")) // clang-format on diff --git a/source/libs/sched/ocs_CategorySchedd.cc b/source/libs/sched/ocs_CategorySchedd.cc new file mode 100644 index 000000000..958e5a229 --- /dev/null +++ b/source/libs/sched/ocs_CategorySchedd.cc @@ -0,0 +1,53 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "uti/sge_log.h" +#include "uti/sge_rmon_macros.h" + +#include "sgeobj/sge_job.h" +#include "sgeobj/ocs_Category.h" + +#include "ocs_CategorySchedd.h" +/** Returns the CT_rejected flag of the category element referenced by the job's JB_category. */ +int +ocs::CategorySchedd::job_is_category_rejected(const lListElem *job) { + DENTER(TOP_LAYER); + auto *cat = static_cast<lListElem *>(lGetRef(job, JB_category)); + int ret = lGetBool(cat, CT_rejected); + DRETURN(ret); +} +/** Returns the CT_reservation_rejected flag of the category element referenced by the job's JB_category. */ +int +ocs::CategorySchedd::job_is_category_reservation_rejected(const lListElem *job) { + DENTER(TOP_LAYER); + auto *cat = static_cast<lListElem *>(lGetRef(job, JB_category)); + int ret = lGetBool(cat, CT_reservation_rejected); + DRETURN(ret); +} +/** Sets CT_rejected on the category referenced by the job's JB_category and, if with_reservation is true, CT_reservation_rejected as well. */ +void +ocs::CategorySchedd::job_reject_category(const lListElem *job, bool with_reservation) { + auto *cat = static_cast<lListElem *>(lGetRef(job, JB_category)); + + lSetBool(cat, CT_rejected, true); + if (with_reservation) { + lSetBool(cat, CT_reservation_rejected, true); + } +} diff --git a/source/libs/sched/ocs_CategorySchedd.h b/source/libs/sched/ocs_CategorySchedd.h new file mode 100644 index 000000000..ea8bf6bb6 --- /dev/null +++ b/source/libs/sched/ocs_CategorySchedd.h @@ -0,0 +1,36 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "cull/cull.h" + +namespace ocs { + class CategorySchedd { + public: + static void + job_reject_category(const lListElem *job, bool with_reservation); + + static int + job_is_category_rejected(const lListElem *job); + + static int + job_is_category_reservation_rejected(const lListElem *job); + }; +} \ No newline at end of file diff --git a/source/libs/sched/sge_select_queue.cc b/source/libs/sched/sge_select_queue.cc index cbd3f2c86..d873af00e 100755 --- a/source/libs/sched/sge_select_queue.cc +++ b/source/libs/sched/sge_select_queue.cc @@ -37,6 +37,7 @@ #include #include +#include "uti/sge.h" #include "uti/sge_bitfield.h" #include "uti/sge_hostname.h" #include "uti/sge_log.h" @@ -67,6 +68,8 @@ #include "sgeobj/sge_qref.h" #include "sgeobj/sge_advance_reservation.h" #include "sgeobj/sge_userset.h" +#include "sgeobj/sge_resource_quota.h" +#include "sgeobj/sge_resource_quota_service.h" #include "basis_types.h" #include "schedd_message.h" @@ -74,12 +77,10 @@ #include "sge_complex_schedd.h" #include "sge_pe_schedd.h" #include "sge_qeti.h" -#include "sge_resource_quota_schedd.h" #include "sge_resource_utilization.h" #include "sge_schedd_text.h" #include "sge_select_queue.h" -#include "uti/sge.h" -#include "valid_queue_user.h" +#include "sge_select_queue_rqs.h" #include "sgeobj/cull/sge_select_queue_LDR_L.h" #include "sgeobj/cull/sge_select_queue_QRL_L.h" @@ -6837,3 +6838,94 @@ sge_ar_swap_resource_lists(sge_assignment_t &a) { 
DRETURN_VOID; } + +/****** sge_select_queue/parallel_limit_slots_by_time() ******************** +* NAME +* parallel_limit_slots_by_time() -- Determine number of slots avail. within +* time frame +* +* SYNOPSIS +* dispatch_t parallel_limit_slots_by_time(const sge_assignment_t *a, int *slots, +* lListElem *centry, lListElem *limit, dstring *rue_name, lListElem *qep, +* bool need_master, bool is_master_queue) +* +* FUNCTION +* Builds a one-element CE list (copy of centry with CE_doubleval set to the limit's RQRL_dvalue) and a one-element RUE list (copy of the RQRL_usage entry named rue_name, or a new RUE element if there is none, renamed to the limit's RQRL_name) and passes both to parallel_rc_slots_by_time() +* +* INPUTS +* const sge_assignment_t *a - job info structure (in) +* int *slots - out: free slots +* lListElem *centry - Load information for the resource +* lListElem *limit - limitation (RQRL_Type) +* dstring *rue_name - rue_name saved in limit sublist RQRL_usage +* lListElem *qep - queue instance (QU_Type) +* bool need_master, bool is_master_queue - forwarded unchanged to parallel_rc_slots_by_time() +* +* RESULT +* dispatch_t - result of parallel_rc_slots_by_time(), e.g. DISPATCH_OK got an assignment +* - DISPATCH_NEVER_CAT no assignment for all jobs of that category +* +* NOTES +* MT-NOTE: parallel_limit_slots_by_time() is not MT safe +* +* SEE ALSO +* parallel_rc_slots_by_time +*******************************************************************************/ +dispatch_t +parallel_limit_slots_by_time(const sge_assignment_t *a, int *slots, lListElem *centry, + lListElem *limit, dstring *rue_name, lListElem *qep, bool need_master, + bool is_master_queue) +{ + lList *tmp_centry_list = lCreateList("", CE_Type); + lList *tmp_rue_list = lCreateList("", RUE_Type); + lListElem *tmp_centry_elem = nullptr; + lListElem *tmp_rue_elem = nullptr; + const lList *rue_list = lGetList(limit, RQRL_usage); + dispatch_t result = DISPATCH_NEVER_CAT; + + DENTER(TOP_LAYER); + + /* create tmp_centry_list */ + tmp_centry_elem = lCopyElem(centry); + lSetDouble(tmp_centry_elem, CE_doubleval, lGetDouble(limit, RQRL_dvalue)); + lAppendElem(tmp_centry_list, tmp_centry_elem); + + /* create tmp_rue_list */ + tmp_rue_elem = lCopyElem(lGetElemStr(rue_list, RUE_name, sge_dstring_get_string(rue_name))); + if (tmp_rue_elem == nullptr) { + DPRINTF("RD: 1\n"); + tmp_rue_elem = 
lCreateElem(RUE_Type); + } +#if 0 +{ + const char *object_name = "bla"; + const lListElem *rde; + DPRINTF("resource utilization: %s \"%s\" %f utilized now\n", + object_name?object_name:"", lGetString(tmp_rue_elem, RUE_name), + lGetDouble(tmp_rue_elem, RUE_utilized_now)); + for_each_ep(rde, lGetList(tmp_rue_elem, RUE_utilized)) { + DPRINTF("\t" sge_u64 " %f\n", lGetUlong64(rde, RDE_time), lGetDouble(rde, RDE_amount)); + } + DPRINTF("resource utilization: %s \"%s\" %f utilized now non-exclusive\n", + object_name?object_name:"", lGetString(tmp_rue_elem, RUE_name), + lGetDouble(tmp_rue_elem, RUE_utilized_now_nonexclusive)); + for_each_ep(rde, lGetList(tmp_rue_elem, RUE_utilized_nonexclusive)) { + DPRINTF("\t" sge_u64 " %f\n", lGetUlong64(rde, RDE_time), lGetDouble(rde, RDE_amount)); + } +} +#endif + + lSetString(tmp_rue_elem, RUE_name, lGetString(limit, RQRL_name)); + lAppendElem(tmp_rue_list, tmp_rue_elem); + + result = parallel_rc_slots_by_time(a, slots, + tmp_centry_list, tmp_rue_list, nullptr, + false, qep, DOMINANT_LAYER_RQS, 0.0, RQS_TAG, need_master, is_master_queue, + false, SGE_RQS_NAME, true); + + lFreeList(&tmp_centry_list); + lFreeList(&tmp_rue_list); + + DRETURN(result); +} + diff --git a/source/libs/sched/sge_select_queue.h b/source/libs/sched/sge_select_queue.h index 72fedc5d7..0da125848 100644 --- a/source/libs/sched/sge_select_queue.h +++ b/source/libs/sched/sge_select_queue.h @@ -224,3 +224,8 @@ dispatch_t cqueue_match_static(const char *cqname, sge_assignment_t *a); void sge_ar_swap_resource_lists(sge_assignment_t &a); + +dispatch_t +parallel_limit_slots_by_time(const sge_assignment_t *a, int *slots, lListElem *centry, + lListElem *limit, dstring *rue_name, lListElem *qep, bool need_master, + bool is_master_queue); diff --git a/source/libs/sched/sge_resource_quota_schedd.cc b/source/libs/sched/sge_select_queue_rqs.cc similarity index 79% rename from source/libs/sched/sge_resource_quota_schedd.cc rename to source/libs/sched/sge_select_queue_rqs.cc 
index ec49e7398..dc487ad29 100644 --- a/source/libs/sched/sge_resource_quota_schedd.cc +++ b/source/libs/sched/sge_select_queue_rqs.cc @@ -1,885 +1,631 @@ /*___INFO__MARK_BEGIN__*/ /************************************************************************* - * + * * The Contents of this file are made available subject to the terms of * the Sun Industry Standards Source License Version 1.2 - * + * * Sun Microsystems Inc., March, 2001 - * - * + * + * * Sun Industry Standards Source License Version 1.2 * ================================================= * The contents of this file are subject to the Sun Industry Standards * Source License Version 1.2 (the "License"); You may not use this file * except in compliance with the License. You may obtain a copy of the * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * + * * Software provided under this License is provided on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. * See the License for the specific provisions governing your rights and * obligations concerning the Software. - * + * * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * + * * Copyright: 2001 by Sun Microsystems, Inc. - * + * * All Rights Reserved. 
- * + * * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH * ************************************************************************/ /*___INFO__MARK_END__*/ -#include -#include - -#include "uti/sge_hostname.h" #include "uti/sge_log.h" -#include "uti/sge_parse_num_par.h" #include "uti/sge_rmon_macros.h" +#include "uti/sge_parse_num_par.h" -#include "sgeobj/sge_centry.h" #include "sgeobj/ocs_DataStore.h" -#include "sgeobj/sge_str.h" -#include "sgeobj/sge_cqueue.h" +#include "sgeobj/sge_advance_reservation.h" +#include "sgeobj/sge_centry.h" +#include "sgeobj/sge_host.h" #include "sgeobj/sge_qinstance.h" #include "sgeobj/sge_job.h" #include "sgeobj/sge_resource_quota.h" -#include "sgeobj/sge_object.h" -#include "sgeobj/sge_job.h" -#include "sgeobj/sge_pe.h" -#include "sgeobj/sge_host.h" +#include "sgeobj/sge_resource_quota_service.h" +#include "sgeobj/sge_resource_utilization.h" + +#include "sched/schedd_message.h" +#include "sched/msg_schedd.h" #include "sge_complex_schedd.h" +#include "sge_schedd_text.h" #include "sge_select_queue.h" -#include "sge_resource_quota_schedd.h" +#include "sge_select_queue_rqs.h" #include "sort_hosts.h" -#include "sge_schedd_text.h" -#include "schedd_message.h" - -static void rqs_can_optimize(const lListElem *rule, bool *host, bool *queue, sge_assignment_t *a); - -static void rqs_expand_cqueues(const lListElem *rule, sge_assignment_t *a); -static void rqs_expand_hosts(const lListElem *rule, sge_assignment_t *a); - -static bool is_cqueue_global(const lListElem *rule); -static bool is_host_global(const lListElem *rule); - -static bool is_cqueue_expand(const lListElem *rule); -static bool is_host_expand(const lListElem *rule); -static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a); -static bool host_shadowed(const lListElem *rule, sge_assignment_t *a); - -static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a); -static void rqs_excluded_cqueues(const lListElem *rule, 
sge_assignment_t *a); - - -/****** sge_resource_quota_schedd/rqs_set_dynamical_limit() *********************** +/****** sge_resource_quota_schedd/rqs_limitation_reached() ********************* * NAME -* rqs_set_dynamical_limit() -- evaluate dynamical limit +* rqs_limitation_reached() -- is the limitation reached for a queue instance * * SYNOPSIS -* bool rqs_set_dynamical_limit(lListElem *limit, lListElem -* *global_host, lListElem *exec_host, lList *centry) +* static bool rqs_limitation_reached(sge_assignment_t *a, lListElem *rule, +* const char* host, const char* queue) * * FUNCTION -* The function evaluates if neccessary the dynamical limit for a host and -* sets the evaluated double value in the given limitation element (RQRL_dvalue). -* -* A evaluation is neccessary if the limit boolean RQRL_dynamic is true. This -* field is set by qmaster during the rule set verification +* The function verifies no limitation is reached for the specific job request +* and queue instance * * INPUTS -* lListElem *limit - limitation (RQRL_Type) -* lListElem *global_host - global host (EH_Type) -* lListElem *exec_host - exec host (EH_Type) -* lList *centry - consumable resource list (CE_Type) +* sge_assignment_t *a - job info structure +* const lListElem *rule - rqsource quota rule (RQR_Type) +* const char* host - host name +* const char* queue - queue name +* u_long64 *start - start time of job * * RESULT -* bool - always true +* static dispatch_t - DISPATCH_OK job can be scheduled +* DISPATCH_NEVER_CAT no jobs of this category will be scheduled +* DISPATCH_NOT_AT_TIME job can be scheduled later +* DISPATCH_MISSING_ATTR rule does not match requested attributes * * NOTES -* MT-NOTE: rqs_set_dynamical_limit() is MT safe +* MT-NOTE: rqs_limitation_reached() is not MT safe * *******************************************************************************/ -bool -rqs_set_dynamical_limit(lListElem *limit, lListElem *global_host, lListElem *exec_host, const lList *centry) { +static 
dispatch_t rqs_limitation_reached(sge_assignment_t *a, const lListElem *rule, const char* host, const char* queue, u_long64 *start) +{ + dispatch_t ret = DISPATCH_MISSING_ATTR; + const lList *limit_list = nullptr; + lListElem * limit = nullptr; + static lListElem *implicit_slots_request = nullptr; + lListElem *exec_host = host_list_locate(a->host_list, host); + dstring rue_name = DSTRING_INIT; + dstring reason = DSTRING_INIT; DENTER(TOP_LAYER); - if (lGetBool(limit, RQRL_dynamic)) { - double dynamic_limit = scaled_mixed_load(lGetString(limit, RQRL_value), global_host, exec_host, centry); - DPRINTF("found a dynamic limit for host %s with value %d\n", lGetHost(exec_host, EH_name), (int)dynamic_limit); - lSetDouble(limit, RQRL_dvalue, dynamic_limit); - } + if (implicit_slots_request == nullptr) { + implicit_slots_request = lCreateElem(CE_Type); + lSetString(implicit_slots_request, CE_name, SGE_ATTR_SLOTS); + lSetString(implicit_slots_request, CE_stringval, "1"); + lSetDouble(implicit_slots_request, CE_doubleval, 1); + } - DRETURN(true); -} + limit_list = lGetList(rule, RQR_limit); + for_each_rw(limit, limit_list) { + bool is_forced = false; + const char *limit_name = lGetString(limit, RQRL_name); + lListElem *raw_centry = centry_list_locate(a->centry_list, limit_name); -/****** sge_resource_quota_schedd/rqs_match_assignment() *********************** -* NAME -* rqs_match_assignment() -- match resource quota rule any queue instance -* -* SYNOPSIS -* static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t -* *a) -* -* FUNCTION -* Check whether a resource quota rule can match any queue instance. If -* if does not match due to users/projects/pes scope one can rule this -* out. -* -* Note: As long as rqs_match_assignment() is not used for parallel jobs -* passing nullptr as PE request is perfectly fine. 
-* -* INPUTS -* const lListElem *rule - Resource quota rule -* sge_assignment_t *a - Scheduler assignment -* -* RESULT -* static bool - True if it matches -* -* NOTES -* MT-NOTE: rqs_match_assignment() is MT safe -*******************************************************************************/ -static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t *a) -{ - return (rqs_filter_match(lGetObject(rule, RQR_filter_projects), FILTER_PROJECTS, a->project, nullptr, nullptr, nullptr, nullptr) && - rqs_filter_match(lGetObject(rule, RQR_filter_users), FILTER_USERS, a->user, a->acl_list, nullptr, a->group, a->grp_list) && - rqs_filter_match(lGetObject(rule, RQR_filter_pes), FILTER_PES, nullptr, nullptr, nullptr, nullptr, nullptr))?true:false; -} + if (raw_centry == nullptr) { + DPRINTF("ignoring limit %s because not defined", limit_name); + continue; + } else { + DPRINTF("checking limit %s\n", lGetString(raw_centry, CE_name)); + } + is_forced = lGetUlong(raw_centry, CE_requestable) == REQU_FORCED; + lList *job_centry_list = job_get_hard_resource_listRW(a->job); + // @todo CS-400: we only need job_centry. Have a function searching it in the 3 possible request lists + lListElem *job_centry = centry_list_locate(job_centry_list, limit_name); -/****** sge_resource_quota_schedd/cqueue_shadowed() **************************** -* NAME -* cqueue_shadowed() -- Check for cluster queue rule before current rule -* -* SYNOPSIS -* static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a) -* -* FUNCTION -* Check whether there is any cluster queue specific rule before the -* current rule. 
-* -* INPUTS -* const lListElem *rule - Current rule -* sge_assignment_t *a - Scheduler assignment -* -* RESULT -* static bool - True if shadowed -* -* EXAMPLE -* limit queue Q001 to F001=1 -* limit host gridware to F001=0 (--> returns 'true' due to 'Q001' meaning -* that gridware can't be generelly ruled out ) -* -* NOTES -* MT-NOTE: cqueue_shadowed() is MT safe -*******************************************************************************/ -static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a) -{ - while ((rule = lPrev(rule))) { - if (rqs_match_assignment(rule, a) && !is_cqueue_global(rule)) { - return true; + /* check for implicit slot and default request */ + if (job_centry == nullptr) { + if (strcmp(lGetString(raw_centry, CE_name), SGE_ATTR_SLOTS) == 0) { + job_centry = implicit_slots_request; + } else if (lGetString(raw_centry, CE_defaultval) != nullptr && lGetUlong(raw_centry, CE_consumable)) { + double request; + parse_ulong_val(&request, nullptr, lGetUlong(raw_centry, CE_valtype), lGetString(raw_centry, CE_defaultval), nullptr, 0); + + /* default requests with zero value are ignored */ + if (request == 0.0 && lGetUlong(raw_centry, CE_relop) != CMPLXEXCL_OP) { + continue; + } + lSetString(raw_centry, CE_stringval, lGetString(raw_centry, CE_defaultval)); + lSetDouble(raw_centry, CE_doubleval, request); + job_centry = raw_centry; + DPRINTF("using default request for %s!\n", lGetString(raw_centry, CE_name)); + } else if (is_forced) { + schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, + SCHEDD_INFO_NOTREQFORCEDRES); + ret = DISPATCH_NEVER_CAT; + break; + } else { + /* ignoring because centry was not requested and is no consumable */ + DPRINTF("complex not requested!\n"); + continue; + } + } + + { + lList *tmp_centry_list = lCreateList("", CE_Type); + lList *tmp_rue_list = lCreateList("", RUE_Type); + lListElem *tmp_centry_elem = nullptr; + lListElem *tmp_rue_elem = nullptr; + + if (rqs_set_dynamical_limit(limit, a->gep, 
exec_host, a->centry_list)) { + const lList *rue_list = lGetList(limit, RQRL_usage); + u_long64 tmp_time = a->start; + + /* create tmp_centry_list */ + tmp_centry_elem = lCopyElem(raw_centry); + lSetString(tmp_centry_elem, CE_stringval, lGetString(limit, RQRL_value)); + lSetDouble(tmp_centry_elem, CE_doubleval, lGetDouble(limit, RQRL_dvalue)); + lAppendElem(tmp_centry_list, tmp_centry_elem); + + /* create tmp_rue_list */ + rqs_get_rue_string(&rue_name, rule, a->user, a->project, host, queue, nullptr); + tmp_rue_elem = lCopyElem(lGetElemStr(rue_list, RUE_name, sge_dstring_get_string(&rue_name))); + if (tmp_rue_elem == nullptr) { + tmp_rue_elem = lCreateElem(RUE_Type); + } + lSetString(tmp_rue_elem, RUE_name, limit_name); + lAppendElem(tmp_rue_list, tmp_rue_elem); + + sge_dstring_clear(&reason); + ret = ri_time_by_slots(a, job_centry, nullptr, tmp_centry_list, tmp_rue_list, + nullptr, &reason, false, 1, DOMINANT_LAYER_RQS, 0.0, &tmp_time, + SGE_RQS_NAME); + if (ret != DISPATCH_OK) { + DPRINTF("denied because: %s\n", sge_dstring_get_string(&reason)); + lFreeList(&tmp_rue_list); + lFreeList(&tmp_centry_list); + break; + } + + if (a->is_reservation && ret == DISPATCH_OK) { + *start = tmp_time; + } + + lFreeList(&tmp_rue_list); + lFreeList(&tmp_centry_list); + } } } - return false; + + sge_dstring_free(&reason); + sge_dstring_free(&rue_name); + + DRETURN(ret); } -/****** sge_resource_quota_schedd/host_shadowed() ****************************** + +/****** sge_resource_quota_schedd/rqs_exceeded_sort_out() ********************** * NAME -* host_shadowed() -- Check for host rule before current rule +* rqs_exceeded_sort_out() -- Rule out queues/hosts whenever possible * * SYNOPSIS -* static bool host_shadowed(const lListElem *rule, sge_assignment_t *a) +* bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule, +* const dstring *rule_name, const char* queue_name, const char* host_name) * * FUNCTION -* Check whether there is any host specific rule before the -* 
current rule. +* This function tries to rule out hosts and cluster queues after a +* quota exeeding was found for a limitation rule with specific queue +* instance. +* +* When a limitation was exeeded that applies to the entire +* cluster 'true' is returned, 'false' otherwise. * * INPUTS -* const lListElem *rule - Current rule -* sge_assignment_t *a - Scheduler assignment +* sge_assignment_t *a - Scheduler assignment type +* const lListElem *rule - The exeeded rule +* const dstring *rule_name - Name of the rule (monitoring only) +* const char* queue_name - Cluster queue name +* const char* host_name - Host name * * RESULT -* static bool - True if shadowed -* -* EXAMPLE -* limit host gridware to F001=1 -* limit queue Q001 to F001=0 (--> returns 'true' due to 'gridware' meaning -* that Q001 can't be generelly ruled out ) +* bool - True upon global limits exceeding * * NOTES -* MT-NOTE: host_shadowed() is MT safe +* MT-NOTE: rqs_exceeded_sort_out() is MT safe *******************************************************************************/ -static bool host_shadowed(const lListElem *rule, sge_assignment_t *a) +static bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, + const char* queue_name, const char* host_name) { - while ((rule = lPrev(rule))) { - if (rqs_match_assignment(rule, a) && !is_host_global(rule)) { - return true; - } + bool cq_global = is_cqueue_global(rule); + bool eh_global = is_host_global(rule); + + DENTER(TOP_LAYER); + + if ((!cq_global && !eh_global) || (cq_global && eh_global && + (is_cqueue_expand(rule) || is_host_expand(rule)))) { /* failure at queue instance limit */ + DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", + sge_dstring_get_string(rule_name), queue_name, host_name); + DRETURN(false); } - return false; -} -/****** sge_resource_quota_schedd/cqueue_shadowed_by() ************************* -* NAME -* cqueue_shadowed_by() -- Check rules shadowing current cluster 
queue rule -* -* SYNOPSIS -* static bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, -* sge_assignment_t *a) -* -* FUNCTION -* Check if cluster queue in current rule is shadowed. -* -* INPUTS -* const char *cqname - Cluster queue name to check -* const lListElem *rule - Current rule -* sge_assignment_t *a - Assignment -* -* RESULT -* static bool - True if shadowed -* -* EXAMPLE -* limits queues Q001,Q002 to F001=1 -* limits queues Q002,Q003 to F001=1 (--> returns 'true' for Q002 and 'false' for Q003) -* -* NOTES -* MT-NOTE: cqueue_shadowed_by() is MT safe -*******************************************************************************/ -static bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, sge_assignment_t *a) -{ - while ((rule = lPrev(rule))) { - if (rqs_match_assignment(rule, a) && - rqs_filter_match(lGetObject(rule, RQR_filter_queues), FILTER_QUEUES, cqname, nullptr, nullptr, nullptr, nullptr)) { - return true; + if (cq_global && eh_global) { /* failure at a global limit */ + bool host_shadowed, queue_shadowed; + + rqs_can_optimize(rule, &host_shadowed, &queue_shadowed, a); + if (!host_shadowed && !queue_shadowed) { + DPRINTF("GLOBAL: resource quota set %s deny job execution globally\n", sge_dstring_get_string(rule_name)); + DRETURN(true); + } + + if (host_shadowed && queue_shadowed) { + rqs_excluded_cqueues(rule, a); + rqs_excluded_hosts(rule, a); + DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); + DRETURN(false); + } + + if (queue_shadowed) { + rqs_excluded_cqueues(rule, a); + DPRINTF("QUEUE: resource quota set %s deny job execution in all its queues\n", sge_dstring_get_string(rule_name)); + } else { /* must be host_shadowed */ + rqs_excluded_hosts(rule, a); + DPRINTF("HOST: resource quota set %s deny job execution in all its queues\n", sge_dstring_get_string(rule_name)); } + + DRETURN(false); } - return false; -} + if (!cq_global) { 
/* failure at a cluster queue limit */ -/****** sge_resource_quota_schedd/host_shadowed_by() *************************** -* NAME -* host_shadowed_by() -- ??? -* -* SYNOPSIS -* static bool host_shadowed_by(const char *host, const lListElem *rule, -* sge_assignment_t *a) -* -* FUNCTION -* Check if host in current rule is shadowed. -* -* INPUTS -* const char *cqname - Host name to check -* const lListElem *rule - Current rule -* sge_assignment_t *a - Assignment -* -* RESULT -* static bool - True if shadowed -* -* EXAMPLE -* limits hosts host1,host2 to F001=1 -* limits hosts host2,host3 to F001=1 (--> returns 'true' for host2 and 'false' for host3) -* -* NOTES -* MT-NOTE: host_shadowed_by() is MT safe -*******************************************************************************/ -static bool host_shadowed_by(const char *host, const lListElem *rule, sge_assignment_t *a) -{ - while ((rule = lPrev(rule))) { - if (rqs_match_assignment(rule, a) && - rqs_filter_match(lGetObject(rule, RQR_filter_hosts), FILTER_HOSTS, host, nullptr, a->hgrp_list, nullptr, nullptr)) { - return true; + if (host_shadowed(rule, a)) { + DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); + DRETURN(false); + } + + if (lGetBool(lGetObject(rule, RQR_filter_queues), RQRF_expand)) { + lAddElemStr(&(a->skip_cqueue_list), CTI_name, queue_name, CTI_Type); + DPRINTF("QUEUE: resource quota set %s deny job execution in queue %s\n", sge_dstring_get_string(rule_name), queue_name); + } else { + rqs_expand_cqueues(rule, a); + DPRINTF("QUEUE: resource quota set %s deny job execution in all its queues\n", sge_dstring_get_string(rule_name)); } + + DRETURN(false); } - return false; + /* must be (!eh_global) */ + { /* failure at a host limit */ + + if (cqueue_shadowed(rule, a)) { + DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); + 
DRETURN(false); + } + + if (lGetBool(lGetObject(rule, RQR_filter_hosts), RQRF_expand)) { + lAddElemStr(&(a->skip_host_list), CTI_name, host_name, CTI_Type); + DPRINTF("HOST: resource quota set %s deny job execution at host %s\n", sge_dstring_get_string(rule_name), host_name); + } else { + rqs_expand_hosts(rule, a); + DPRINTF("HOST: resource quota set %s deny job execution at all its hosts\n", sge_dstring_get_string(rule_name)); + } + + DRETURN(false); + } } -/****** sge_resource_quota_schedd/rqs_can_optimize() *************************** +/****** sge_resource_quota_schedd/rqs_exceeded_sort_out_par() ****************** * NAME -* rqs_can_optimize() -- Poke whether a queue/host negation can be made +* rqs_exceeded_sort_out_par() -- Rule out queues/hosts whenever possible * * SYNOPSIS -* static void rqs_can_optimize(const lListElem *rule, bool *host, bool -* *queue, sge_assignment_t *a) +* void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem +* *rule, const dstring *rule_name, const char* queue_name, const char* +* host_name) * * FUNCTION -* A global limit was hit with 'rule'. This function helps to determine -* to what exend we can profit from that situation. If there is no -* previous matching rule within the same rule set any other queue/host -* can be skipped. +* Function wrapper around rqs_exceeded_sort_out() for parallel jobs. +* In contrast to the sequential case global limit exeeding is handled +* by adding all cluster queue names to the a->skip_cqueue_list. * * INPUTS -* const lListElem *rule - Rule -* bool *host - Any previous rule with a host scope? -* bool *queue - Any previous rule with a queue scope? 
-* sge_assignment_t *a - Scheduler assignment +* sge_assignment_t *a - Scheduler assignment type +* const lListElem *rule - The exeeded rule +* const dstring *rule_name - Name of the rule (monitoring only) +* const char* queue_name - Cluster queue name +* const char* host_name - Host name * * NOTES -* MT-NOTE: rqs_can_optimize() is MT safe +* MT-NOTE: rqs_exceeded_sort_out_par() is MT safe *******************************************************************************/ -static void rqs_can_optimize(const lListElem *rule, bool *host, bool *queue, sge_assignment_t *a) +static void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, + const char* queue_name, const char* host_name) { - bool host_shadowed = false, queue_shadowed = false; - - const lListElem *prev = rule; - while ((prev = lPrev(prev))) { - if (!rqs_match_assignment(rule, a)) - continue; - if (!is_host_global(prev)) - host_shadowed = true; - if (!is_cqueue_global(prev)) - queue_shadowed = true; + if (rqs_exceeded_sort_out(a, rule, rule_name, queue_name, host_name)) { + rqs_expand_hosts(rule, a); } - - *host = host_shadowed; - *queue = queue_shadowed; - - return; } -/****** sge_resource_quota_schedd/rqs_excluded_cqueues() *********************** +/****** sge_resource_quota_schedd/parallel_rqs_slots_by_time() ****************** * NAME -* rqs_excluded_cqueues() -- Find excluded queues +* parallel_rqs_slots_by_time() -- Dertermine number of slots avail within +* time frame * * SYNOPSIS -* static void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a) +* dispatch_t parallel_rqs_slots_by_time(const sge_assignment_t *a, +* int *slots, const char *host, const char *queue) * * FUNCTION -* Find queues that are excluded by previous rules. +* This function iterates for a queue instance over all resource quota sets +* and evaluates the number of slots available. 
* * INPUTS -* const lListElem *rule - The rule -* sge_assignment_t *a - Scheduler assignement +* const sge_assignment_t *a - job info structure (in) +* int *slots - out: # free slots +* lListElem *qep - QU_Type Elem * -* EXAMPLE -* limit projects {*} queues !Q001 to F001=1 -* limit to F001=0 ( ---> returns Q001 in a->skip_cqueue_list) +* RESULT +* static dispatch_t - DISPATCH_OK got an assignment +* - DISPATCH_NEVER_CAT no assignment for all jobs af that category * * NOTES -* MT-NOTE: rqs_excluded_cqueues() is MT safe +* MT-NOTE: parallel_rqs_slots_by_time() is not MT safe +* +* SEE ALSO +* ri_slots_by_time() +* *******************************************************************************/ -static void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a) +dispatch_t +parallel_rqs_slots_by_time(sge_assignment_t *a, int *slots, lListElem *qep, bool need_master, + bool is_master_queue) { - const lListElem *cq; - const lListElem *prev; - int ignored = 0, excluded = 0; + dispatch_t result = DISPATCH_OK; + int tslots = INT_MAX; + const char* queue = lGetString(qep, QU_qname); + const char* host = lGetHost(qep, QU_qhostname); DENTER(TOP_LAYER); - for_each_ep(cq, *ocs::DataStore::get_master_list(SGE_TYPE_CQUEUE)) { - const char *cqname = lGetString(cq, CQ_name); - bool exclude = true; + if (lGetNumberOfElem(a->rqs_list) != 0) { + const char* user = a->user; + const char* group = a->group; + const lList *grp_list = a->grp_list; + const char* project = a->project; + const char* pe = a->pe_name; + lListElem *rql; + const lListElem *rqs; + // @todo can we used static dstrings? What size would be needed? 
+ dstring dstr_rule_name = DSTRING_INIT; + dstring dstr_rue_string = DSTRING_INIT; + dstring dstr_limit_name = DSTRING_INIT; + lListElem *exec_host = host_list_locate(a->host_list, host); - if (lGetElemStr(a->skip_cqueue_list, CTI_name, cqname)) { - ignored++; - continue; - } + SCHED_PROF_INC(a->pi, par_rqs); - prev = rule; - while ((prev = lPrev(prev))) { - if (!rqs_match_assignment(rule, a)) - continue; + for_each_ep(rqs, a->rqs_list) { + lListElem *rule = nullptr; - if (rqs_filter_match(lGetObject(prev, RQR_filter_queues), FILTER_QUEUES, cqname, nullptr, nullptr, nullptr, nullptr)) { - exclude = false; - break; + /* ignore disabled rule sets */ + if (!lGetBool(rqs, RQS_enabled)) { + continue; } - } - if (exclude) { - lAddElemStr(&(a->skip_cqueue_list), CTI_name, cqname, CTI_Type); - excluded++; - } - } - - if (ignored + excluded == 0) { - CRITICAL("not a single queue excluded in rqs_excluded_cqueues()\n"); - } + sge_dstring_clear(&dstr_rule_name); + rule = rqs_get_matching_rule(rqs, user, group, grp_list, project, pe, host, queue, a->acl_list, a->hgrp_list, &dstr_rule_name); + if (rule != nullptr) { + lListElem *limit = nullptr; + const char *limit_s; + rqs_get_rue_string(&dstr_rue_string, rule, user, project, host, queue, pe); + limit_s = sge_dstring_sprintf(&dstr_limit_name, "%s=%s", sge_dstring_get_string(&dstr_rule_name), sge_dstring_get_string(&dstr_rue_string)); - DRETURN_VOID; -} + /* reuse earlier result */ + if ((rql=lGetElemStrRW(a->limit_list, RQL_name, limit_s))) { + result = (dispatch_t)lGetInt(rql, RQL_result); + tslots = MIN(tslots, lGetInt(rql, RQL_slots)); -/****** sge_resource_quota_schedd/rqs_excluded_hosts() ************************* -* NAME -* rqs_excluded_hosts() -- Find excluded hosts -* -* SYNOPSIS -* static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a) -* -* FUNCTION -* Find hosts that are excluded by previous rules. 
-* -* INPUTS -* const lListElem *rule - The rule -* sge_assignment_t *a - Scheduler assignement -* -* EXAMPLE -* limit projects {*} queues !gridware to F001=1 -* limit to F001=0 ( ---> returns gridware in skip_host_list) -* -* NOTES -* MT-NOTE: rqs_excluded_hosts() is MT safe -*******************************************************************************/ -static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a) -{ - const lListElem *eh; - const lListElem *prev; - int ignored = 0, excluded = 0; + // build the minimum + lAndUlongBitMask(qep, QU_tagged4schedule, lGetUlong(rql, RQL_tagged4schedule)); - DENTER(TOP_LAYER); + DPRINTF("parallel_rqs_slots_by_time(%s@%s) result %d slots %d for " SFQ " (cache)\n", + queue, host, result, tslots, limit_s); + } else { + int ttslots = INT_MAX; - for_each_ep(eh, a->host_list) { - const char *hname = lGetHost(eh, EH_name); - bool exclude = true; + u_long32 tagged_for_schedule_old = lGetUlong(qep, QU_tagged4schedule); /* default value or set in match_static_queue() */ + lSetUlong(qep, QU_tagged4schedule, TAG4SCHED_ALL); - if (lGetElemStr(a->skip_host_list, CTI_name, hname)) { - ignored++; - continue; - } + for_each_rw(limit, lGetList(rule, RQR_limit)) { + const char *limit_name = lGetString(limit, RQRL_name); - prev = rule; - while ((prev = lPrev(prev))) { - if (!rqs_match_assignment(rule, a)) - continue; + lListElem *raw_centry = centry_list_locate(a->centry_list, limit_name); + if (raw_centry == nullptr) { + DPRINTF("ignoring limit %s because not defined", limit_name); + continue; + } else { + DPRINTF("checking limit %s\n", lGetString(raw_centry, CE_name)); + } - if (rqs_filter_match(lGetObject(prev, RQR_filter_hosts), FILTER_HOSTS, hname, nullptr, a->hgrp_list, nullptr, nullptr)) { - exclude = false; - break; + lList *job_centry_list = job_get_hard_resource_listRW(a->job); // @todo CS-400 need to check all request lists + // @todo do we really need to pass the whole job_centry_list info functions below, + // 
or could we create a sub-list with just the one job_entry element? + // And would we have to copy-back info like CE_tagged? + lListElem *job_centry = centry_list_locate(job_centry_list, limit_name); + + /* found a rule, now check limit */ + if (lGetUlong(raw_centry, CE_consumable)) { + + rqs_get_rue_string(&dstr_rue_string, rule, user, project, host, queue, pe); + + if (rqs_set_dynamical_limit(limit, a->gep, exec_host, a->centry_list)) { + int tttslots = INT_MAX; + result = parallel_limit_slots_by_time(a, &tttslots, raw_centry, + limit, &dstr_rue_string, qep, + need_master, is_master_queue); + ttslots = MIN(ttslots, tttslots); + if (result == DISPATCH_NOT_AT_TIME) { + /* can still be interesting for reservation and as slave task for per_job_consumables */ + result = DISPATCH_OK; + } else if (result != DISPATCH_OK) { + break; + } + } else { + result = DISPATCH_NEVER_CAT; + break; + } + } else if (job_centry != nullptr) { + char availability_text[2048]; + + lSetString(raw_centry, CE_stringval, lGetString(limit, RQRL_value)); + if (compare_complexes(1, raw_centry, job_centry, availability_text, false, false) != 1) { + result = DISPATCH_NEVER_CAT; + break; + } + } + + } + + DPRINTF("parallel_rqs_slots_by_time(%s@%s) result %d slots %d for " SFQ " (fresh)\n", + queue, host, result, ttslots, limit_s); + + /* store result for reuse */ + rql = lAddElemStr(&(a->limit_list), RQL_name, limit_s, RQL_Type); + lSetInt(rql, RQL_result, result); + lSetInt(rql, RQL_slots, ttslots); + lSetUlong(rql, RQL_tagged4schedule, lGetUlong(qep, QU_tagged4schedule)); + + /* reset QU_tagged4schedule if necessary */ + lAndUlongBitMask(qep, QU_tagged4schedule, tagged_for_schedule_old); + + tslots = MIN(tslots, ttslots); + } + + if (result != DISPATCH_OK || tslots == 0) { + DPRINTF("RQS PARALLEL SORT OUT\n"); + schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, + SCHEDD_INFO_CANNOTRUNRQSGLOBAL_SS, + sge_dstring_get_string(&dstr_rue_string), sge_dstring_get_string(&dstr_rule_name)); + 
rqs_exceeded_sort_out_par(a, rule, &dstr_rule_name, queue, host); + } + + if (result != DISPATCH_OK || tslots == 0) { + break; + } } } - if (exclude) { - lAddElemStr(&(a->skip_host_list), CTI_name, hname, CTI_Type); - excluded++; - } + sge_dstring_free(&dstr_rue_string); + sge_dstring_free(&dstr_rule_name); + sge_dstring_free(&dstr_limit_name); } - if (ignored + excluded == 0) { - CRITICAL("not a single host excluded in rqs_excluded_hosts()\n"); - } + *slots = tslots; - DRETURN_VOID; + DPRINTF("parallel_rqs_slots_by_time(%s@%s) finalresult %d slots %d\n", queue, host, result, *slots); + + DRETURN(result); } -/****** sge_resource_quota_schedd/rqs_expand_cqueues() ************************* +/****** sge_resource_quota_schedd/rqs_match_assignment() *********************** * NAME -* rqs_expand_cqueues() -- Add all matching cqueues to the list +* rqs_match_assignment() -- match resource quota rule any queue instance * * SYNOPSIS -* void rqs_expand_cqueues(const lListElem *rule) +* static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t +* *a) * * FUNCTION -* The names of all cluster queues that match the rule are added to -* the skip list without duplicates. +* Check whether a resource quota rule can match any queue instance. If +* if does not match due to users/projects/pes scope one can rule this +* out. +* +* Note: As long as rqs_match_assignment() is not used for parallel jobs +* passing nullptr as PE request is perfectly fine. 
* * INPUTS -* const lListElem *rule - RQR_Type +* const lListElem *rule - Resource quota rule +* sge_assignment_t *a - Scheduler assignment +* +* RESULT +* static bool - True if it matches * * NOTES -* MT-NOTE: rqs_expand_cqueues() is not MT safe +* MT-NOTE: rqs_match_assignment() is MT safe *******************************************************************************/ -static void rqs_expand_cqueues(const lListElem *rule, sge_assignment_t *a) +static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t *a) { - const lListElem *cq; - const char *cqname; - lListElem *qfilter = lGetObject(rule, RQR_filter_queues); - - DENTER(TOP_LAYER); - - for_each_ep(cq, *ocs::DataStore::get_master_list(SGE_TYPE_CQUEUE)) { - cqname = lGetString(cq, CQ_name); - if (lGetElemStr(a->skip_cqueue_list, CTI_name, cqname)) - continue; - if (rqs_filter_match(qfilter, FILTER_QUEUES, cqname, nullptr, nullptr, nullptr, nullptr) && !cqueue_shadowed_by(cqname, rule, a)) - lAddElemStr(&(a->skip_cqueue_list), CTI_name, cqname, CTI_Type); - } - - DRETURN_VOID; + return (rqs_filter_match(lGetObject(rule, RQR_filter_projects), FILTER_PROJECTS, a->project, nullptr, nullptr, nullptr, nullptr) && + rqs_filter_match(lGetObject(rule, RQR_filter_users), FILTER_USERS, a->user, a->acl_list, nullptr, a->group, a->grp_list) && + rqs_filter_match(lGetObject(rule, RQR_filter_pes), FILTER_PES, nullptr, nullptr, nullptr, nullptr, nullptr))?true:false; } -/****** sge_resource_quota_schedd/rqs_expand_hosts() *************************** + +/****** sge_resource_quota_schedd/rqs_can_optimize() *************************** * NAME -* rqs_expand_hosts() -- Add all matching hosts to the list +* rqs_can_optimize() -- Poke whether a queue/host negation can be made * * SYNOPSIS -* void rqs_expand_hosts(const lListElem *rule, lList **skip_host_list, -* const lList *host_list, lList *hgrp_list) +* static void rqs_can_optimize(const lListElem *rule, bool *host, bool +* *queue, sge_assignment_t *a) * * FUNCTION -* 
The names of all hosts that match the rule are added to -* the skip list without duplicates. +* A global limit was hit with 'rule'. This function helps to determine +* to what exend we can profit from that situation. If there is no +* previous matching rule within the same rule set any other queue/host +* can be skipped. * * INPUTS -* const lListElem *rule - RQR_Type -* const lList *host_list - EH_Type +* const lListElem *rule - Rule +* bool *host - Any previous rule with a host scope? +* bool *queue - Any previous rule with a queue scope? +* sge_assignment_t *a - Scheduler assignment * * NOTES -* MT-NOTE: rqs_expand_hosts() is MT safe +* MT-NOTE: rqs_can_optimize() is MT safe *******************************************************************************/ -static void rqs_expand_hosts(const lListElem *rule, sge_assignment_t *a) +void rqs_can_optimize(const lListElem *rule, bool *host, bool *queue, sge_assignment_t *a) { - const lListElem *eh; - const char *hname; - lListElem *hfilter = lGetObject(rule, RQR_filter_hosts); + bool host_shadowed = false, queue_shadowed = false; - for_each_ep(eh, a->host_list) { - hname = lGetHost(eh, EH_name); - if (lGetElemStr(a->skip_host_list, CTI_name, hname)) + const lListElem *prev = rule; + while ((prev = lPrev(prev))) { + if (!rqs_match_assignment(rule, a)) continue; - if (rqs_filter_match(hfilter, FILTER_HOSTS, hname, nullptr, a->hgrp_list, nullptr, nullptr) && !host_shadowed_by(hname, rule, a)) - lAddElemStr(&(a->skip_host_list), CTI_name, hname, CTI_Type); + if (!is_host_global(prev)) + host_shadowed = true; + if (!is_cqueue_global(prev)) + queue_shadowed = true; } - return; -} + *host = host_shadowed; + *queue = queue_shadowed; -static bool is_global(const lListElem *rule, int nm) -{ - lListElem *filter = lGetObject(rule, nm); - if (!filter) - return true; - if (lGetSubStr(filter, ST_name, "*", RQRF_scope) && lGetNumberOfElem(lGetList(filter, RQRF_xscope))==0) - return true; - return false; + return; } -/****** 
sge_resource_quota_schedd/is_cqueue_global() *************************** +/****** sge_resource_quota_schedd/check_and_debit_rqs_slots() ********************* * NAME -* is_cqueue_global() -- Global rule with regards to cluster queues? +* check_and_debit_rqs_slots() -- Determine RQS limit slot amount and debit * * SYNOPSIS -* bool is_cqueue_global(const lListElem *rule) +* static void check_and_debit_rqs_slots(sge_assignment_t *a, const char +* *host, const char *queue, int *slots, dstring +* *rule_name, dstring *rue_name, dstring *limit_name) * -* INPUTS -* const lListElem *rule - RQR_Type +* FUNCTION +* The function determines the final slot amount due +* to all resource quota limitations that apply for the queue instance. +* Both slot amounts get debited from the a->limit_list to keep track +* of still available amounts per resource quota limit. * -* RESULT -* bool - True if cluster queues play no role with the rule +* INPUTS +* sge_assignment_t *a - Assignment data structure +* const char *host - hostname +* const char *queue - queuename +* int *slots - needed/available slots +* dstring *rule_name - caller maintained buffer +* dstring *rue_name - caller maintained buffer +* dstring *limit_name - caller maintained buffer * * NOTES -* MT-NOTE: is_cqueue_global() is MT safe +* MT-NOTE: check_and_debit_rqs_slots() is MT safe *******************************************************************************/ -static bool is_cqueue_global(const lListElem *rule) -{ - return is_global(rule, RQR_filter_queues); -} - - -/****** sge_resource_quota_schedd/is_host_global() ***************************** -* NAME -* is_host_global() -- Global rule with regards to hosts? 
-* -* SYNOPSIS -* bool is_host_global(const lListElem *rule) -* -* FUNCTION -* Return true if hosts play no role with the rule -* -* INPUTS -* const lListElem *rule - RQR_Type -* -* RESULT -* bool - True if hosts play no role with the rule -* -* NOTES -* MT-NOTE: is_host_global() is MT safe -*******************************************************************************/ -static bool is_host_global(const lListElem *rule) -{ - return is_global(rule, RQR_filter_hosts); -} - -static bool is_expand(const lListElem *rule, int nm) -{ - lListElem *filter = lGetObject(rule, nm); - if (filter && lGetBool(filter, RQRF_expand)) - return true; - else - return false; -} - - -/****** sge_resource_quota_schedd/is_host_expand() ***************************** -* NAME -* is_host_expand() -- Returns true if rule expands on hosts -* -* SYNOPSIS -* bool is_host_expand(const lListElem *rule) -* -* FUNCTION -* Returns true if rule expands on hosts. -* -* INPUTS -* const lListElem *rule - RQR_Type -* -* RESULT -* bool - True if rule expands on hosts -* -* EXAMPLE -* "hosts {*}" returns true -* "hosts @allhosts" returns false -* -* NOTES -* MT-NOTE: is_host_expand() is MT safe -*******************************************************************************/ -static bool is_host_expand(const lListElem *rule) -{ - return is_expand(rule, RQR_filter_hosts); -} - -/****** sge_resource_quota_schedd/is_cqueue_expand() *************************** -* NAME -* is_cqueue_expand() -- Returns true if rule expands on cluster queues -* -* SYNOPSIS -* bool is_cqueue_expand(const lListElem *rule) -* -* FUNCTION -* Returns true if rule expands on cluster queues. 
-* -* INPUTS -* const lListElem *rule - RQR_Type -* -* RESULT -* bool - True if rule expands on hosts -* -* EXAMPLE -* "queues {*}" returns true -* "queues Q001,Q002" returns false -* -* NOTES -* MT-NOTE: is_cqueue_expand() is MT safe -*******************************************************************************/ -static bool is_cqueue_expand(const lListElem *rule) -{ - return is_expand(rule, RQR_filter_queues); -} - -/****** sge_resource_quota_schedd/rqs_exceeded_sort_out() ********************** -* NAME -* rqs_exceeded_sort_out() -- Rule out queues/hosts whenever possible -* -* SYNOPSIS -* bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule, -* const dstring *rule_name, const char* queue_name, const char* host_name) -* -* FUNCTION -* This function tries to rule out hosts and cluster queues after a -* quota exeeding was found for a limitation rule with specific queue -* instance. -* -* When a limitation was exeeded that applies to the entire -* cluster 'true' is returned, 'false' otherwise. 
-* -* INPUTS -* sge_assignment_t *a - Scheduler assignment type -* const lListElem *rule - The exeeded rule -* const dstring *rule_name - Name of the rule (monitoring only) -* const char* queue_name - Cluster queue name -* const char* host_name - Host name -* -* RESULT -* bool - True upon global limits exceeding -* -* NOTES -* MT-NOTE: rqs_exceeded_sort_out() is MT safe -*******************************************************************************/ -static bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, - const char* queue_name, const char* host_name) -{ - bool cq_global = is_cqueue_global(rule); - bool eh_global = is_host_global(rule); - - DENTER(TOP_LAYER); - - if ((!cq_global && !eh_global) || (cq_global && eh_global && - (is_cqueue_expand(rule) || is_host_expand(rule)))) { /* failure at queue instance limit */ - DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", - sge_dstring_get_string(rule_name), queue_name, host_name); - DRETURN(false); - } - - if (cq_global && eh_global) { /* failure at a global limit */ - bool host_shadowed, queue_shadowed; - - rqs_can_optimize(rule, &host_shadowed, &queue_shadowed, a); - if (!host_shadowed && !queue_shadowed) { - DPRINTF("GLOBAL: resource quota set %s deny job execution globally\n", sge_dstring_get_string(rule_name)); - DRETURN(true); - } - - if (host_shadowed && queue_shadowed) { - rqs_excluded_cqueues(rule, a); - rqs_excluded_hosts(rule, a); - DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); - DRETURN(false); - } - - if (queue_shadowed) { - rqs_excluded_cqueues(rule, a); - DPRINTF("QUEUE: resource quota set %s deny job execution in all its queues\n", sge_dstring_get_string(rule_name)); - } else { /* must be host_shadowed */ - rqs_excluded_hosts(rule, a); - DPRINTF("HOST: resource quota set %s deny job execution in all its queues\n", 
sge_dstring_get_string(rule_name)); - } - - DRETURN(false); - } - - if (!cq_global) { /* failure at a cluster queue limit */ - - if (host_shadowed(rule, a)) { - DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); - DRETURN(false); - } - - if (lGetBool(lGetObject(rule, RQR_filter_queues), RQRF_expand)) { - lAddElemStr(&(a->skip_cqueue_list), CTI_name, queue_name, CTI_Type); - DPRINTF("QUEUE: resource quota set %s deny job execution in queue %s\n", sge_dstring_get_string(rule_name), queue_name); - } else { - rqs_expand_cqueues(rule, a); - DPRINTF("QUEUE: resource quota set %s deny job execution in all its queues\n", sge_dstring_get_string(rule_name)); - } - - DRETURN(false); - } - - /* must be (!eh_global) */ - { /* failure at a host limit */ - - if (cqueue_shadowed(rule, a)) { - DPRINTF("QUEUE INSTANCE: resource quota set %s deny job execution on %s@%s\n", sge_dstring_get_string(rule_name), queue_name, host_name); - DRETURN(false); - } - - if (lGetBool(lGetObject(rule, RQR_filter_hosts), RQRF_expand)) { - lAddElemStr(&(a->skip_host_list), CTI_name, host_name, CTI_Type); - DPRINTF("HOST: resource quota set %s deny job execution at host %s\n", sge_dstring_get_string(rule_name), host_name); - } else { - rqs_expand_hosts(rule, a); - DPRINTF("HOST: resource quota set %s deny job execution at all its hosts\n", sge_dstring_get_string(rule_name)); - } - - DRETURN(false); - } -} - -/****** sge_resource_quota_schedd/rqs_exceeded_sort_out_par() ****************** -* NAME -* rqs_exceeded_sort_out_par() -- Rule out queues/hosts whenever possible -* -* SYNOPSIS -* void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem -* *rule, const dstring *rule_name, const char* queue_name, const char* -* host_name) -* -* FUNCTION -* Function wrapper around rqs_exceeded_sort_out() for parallel jobs. 
-* In contrast to the sequential case global limit exeeding is handled -* by adding all cluster queue names to the a->skip_cqueue_list. -* -* INPUTS -* sge_assignment_t *a - Scheduler assignment type -* const lListElem *rule - The exeeded rule -* const dstring *rule_name - Name of the rule (monitoring only) -* const char* queue_name - Cluster queue name -* const char* host_name - Host name -* -* NOTES -* MT-NOTE: rqs_exceeded_sort_out_par() is MT safe -*******************************************************************************/ -static void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, - const char* queue_name, const char* host_name) -{ - if (rqs_exceeded_sort_out(a, rule, rule_name, queue_name, host_name)) { - rqs_expand_hosts(rule, a); - } -} - -/****** sge_resource_quota_schedd/sge_user_is_referenced_in_rqs() ******************** -* NAME -* sge_user_is_referenced_in_rqs() -- search for user reference in rqs -* -* SYNOPSIS -* bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, -* lList *acl_list) -* -* FUNCTION -* Search for a user reference in the resource quota sets -* -* INPUTS -* const lList *rqs - resource quota set list -* const char *user - user to search -* const char *group - user's group -* lList *acl_list - acl list for user resolving -* -* RESULT -* bool - true if user was found -* false if user was not found -* -* NOTES -* MT-NOTE: sge_user_is_referenced_in_rqs() is MT safe -* -*******************************************************************************/ -bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, const char *group, const lList *grp_list, const lList *acl_list) -{ - bool ret = false; - const lListElem *ep; - - for_each_ep(ep, rqs) { - const lList *rule_list = lGetList(ep, RQS_rule); - const lListElem *rule; - - for_each_ep(rule, rule_list) { - /* there may be no per-user limitation and also not limitation that is special for this user */ - if 
((is_expand(rule, RQR_filter_users) || !is_global(rule, RQR_filter_users)) && - rqs_filter_match(lGetObject(rule, RQR_filter_users), FILTER_USERS, user, acl_list, nullptr, group, grp_list)) { - ret = true; - break; - } - } - if (ret) { - break; - } - } - return ret; -} - - -/****** sge_resource_quota_schedd/check_and_debit_rqs_slots() ********************* -* NAME -* check_and_debit_rqs_slots() -- Determine RQS limit slot amount and debit -* -* SYNOPSIS -* static void check_and_debit_rqs_slots(sge_assignment_t *a, const char -* *host, const char *queue, int *slots, dstring -* *rule_name, dstring *rue_name, dstring *limit_name) -* -* FUNCTION -* The function determines the final slot amount due -* to all resource quota limitations that apply for the queue instance. -* Both slot amounts get debited from the a->limit_list to keep track -* of still available amounts per resource quota limit. -* -* INPUTS -* sge_assignment_t *a - Assignment data structure -* const char *host - hostname -* const char *queue - queuename -* int *slots - needed/available slots -* dstring *rule_name - caller maintained buffer -* dstring *rue_name - caller maintained buffer -* dstring *limit_name - caller maintained buffer -* -* NOTES -* MT-NOTE: check_and_debit_rqs_slots() is MT safe -*******************************************************************************/ -void parallel_check_and_debit_rqs_slots(sge_assignment_t *a, const char *host, const char *queue, - int *slots, dstring *rule_name, dstring *rue_name, dstring *limit_name) +void parallel_check_and_debit_rqs_slots(sge_assignment_t *a, const char *host, const char *queue, + int *slots, dstring *rule_name, dstring *rue_name, dstring *limit_name) { const lListElem *rqs, *rule; const char* user = a->user; @@ -973,542 +719,513 @@ void parallel_revert_rqs_slot_debitation(sge_assignment_t *a, const char *host, DRETURN_VOID; } -/****** sge_resource_quota_schedd/parallel_limit_slots_by_time() ******************** +/****** 
sge_resource_quota_schedd/rqs_by_slots() *********************************** * NAME -* parallel_limit_slots_by_time() -- Determine number of slots avail. within -* time frame +* rqs_by_slots() -- Check queue instance suitability due to RQS * * SYNOPSIS -* static dispatch_t parallel_limit_slots_by_time(const sge_assignment_t *a, -* lList *requests, int *slots, lListElem *centry, lListElem -* *limit, dstring rue_name) +* dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, +* const char *host, u_long64 *tt_rqs_all, bool *is_global, +* dstring *rue_string, dstring *limit_name, dstring *rule_name) * * FUNCTION -* ??? +* Checks (or determines earliest time) queue instance suitability +* according to resource quota set limits. +* +* For performance reasons RQS verification results are cached in +* a->limit_list. In addition unsuited queues and hosts are collected +* in a->skip_cqueue_list and a->skip_host_list so that ruling out +* chunks of queue instance becomes quite cheap. * * INPUTS -* const sge_assignment_t *a - job info structure (in) -* lList *requests - Job request list (CE_Type) -* int *slots - out: free slots -* lListElem *centry - Load information for the resource -* lListElem *limit - limitation (RQRL_Type) -* dstring rue_name - rue_name saved in limit sublist RQRL_usage -* lListElem *qep - queue instance (QU_Type) +* sge_assignment_t *a - assignment +* const char *queue - cluster queue name +* const char *host - host name +* u_long64 *tt_rqs_all - returns earliest time over all resource quotas +* bool *is_global - returns true if result is valid for any other queue +* dstring *rue_string - caller maintained buffer +* dstring *limit_name - caller maintained buffer +* dstring *rule_name - caller maintained buffer +* u_long64 tt_best - time of best solution found so far * * RESULT -* static dispatch_t - DISPATCH_OK got an assignment -* - DISPATCH_NEVER_CAT no assignment for all jobs af that category +* static dispatch_t - usual return values * * 
NOTES -* MT-NOTE: parallel_limit_slots_by_time() is not MT safe -* -* SEE ALSO -* parallel_rc_slots_by_time +* MT-NOTE: rqs_by_slots() is MT safe *******************************************************************************/ -static dispatch_t -parallel_limit_slots_by_time(const sge_assignment_t *a, int *slots, lListElem *centry, - lListElem *limit, dstring *rue_name, lListElem *qep, bool need_master, - bool is_master_queue) +dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, const char *host, + u_long64 *tt_rqs_all, bool *is_global, dstring *rue_string, dstring *limit_name, dstring *rule_name, u_long64 tt_best) { - lList *tmp_centry_list = lCreateList("", CE_Type); - lList *tmp_rue_list = lCreateList("", RUE_Type); - lListElem *tmp_centry_elem = nullptr; - lListElem *tmp_rue_elem = nullptr; - const lList *rue_list = lGetList(limit, RQRL_usage); - dispatch_t result = DISPATCH_NEVER_CAT; + const lListElem *rqs; + dispatch_t result = DISPATCH_OK; DENTER(TOP_LAYER); - /* create tmp_centry_list */ - tmp_centry_elem = lCopyElem(centry); - lSetDouble(tmp_centry_elem, CE_doubleval, lGetDouble(limit, RQRL_dvalue)); - lAppendElem(tmp_centry_list, tmp_centry_elem); + *is_global = false; - /* create tmp_rue_list */ - tmp_rue_elem = lCopyElem(lGetElemStr(rue_list, RUE_name, sge_dstring_get_string(rue_name))); - if (tmp_rue_elem == nullptr) { - DPRINTF("RD: 1\n"); - tmp_rue_elem = lCreateElem(RUE_Type); - } -#if 0 -{ - const char *object_name = "bla"; - const lListElem *rde; - DPRINTF("resource utilization: %s \"%s\" %f utilized now\n", - object_name?object_name:"", lGetString(tmp_rue_elem, RUE_name), - lGetDouble(tmp_rue_elem, RUE_utilized_now)); - for_each_ep(rde, lGetList(tmp_rue_elem, RUE_utilized)) { - DPRINTF("\t" sge_u64 " %f\n", lGetUlong64(rde, RDE_time), lGetDouble(rde, RDE_amount)); + if (lGetNumberOfElem(a->rqs_list) > 0) { + SCHED_PROF_INC(a->pi, seq_rqs); } - DPRINTF("resource utilization: %s \"%s\" %f utilized now non-exclusive\n", - 
object_name?object_name:"", lGetString(tmp_rue_elem, RUE_name), - lGetDouble(tmp_rue_elem, RUE_utilized_now_nonexclusive)); - for_each_ep(rde, lGetList(tmp_rue_elem, RUE_utilized_nonexclusive)) { - DPRINTF("\t" sge_u64 " %f\n", lGetUlong64(rde, RDE_time), lGetDouble(rde, RDE_amount)); + + for_each_ep(rqs, a->rqs_list) { + u_long64 tt_rqs = a->start; + const char *user = a->user; + const char *group = a->group; + const lList *grp_list = a->grp_list; + const char *project = a->project; + const lListElem *rule; + + if (!lGetBool(rqs, RQS_enabled)) { + continue; + } + + sge_dstring_clear(rule_name); + rule = rqs_get_matching_rule(rqs, user, group, grp_list, project, nullptr, host, queue, a->acl_list, a->hgrp_list, rule_name); + if (rule != nullptr) { + const char *limit; + lListElem *rql; + + /* need unique identifier for cache */ + rqs_get_rue_string(rue_string, rule, user, project, host, queue, nullptr); + sge_dstring_sprintf(limit_name, "%s=%s", sge_dstring_get_string(rule_name), sge_dstring_get_string(rue_string)); + limit = sge_dstring_get_string(limit_name); + + /* check limit or reuse earlier results */ + if ((rql=lGetElemStrRW(a->limit_list, RQL_name, limit))) { + tt_rqs = lGetUlong64(rql, RQL_time); + result = (dispatch_t)lGetInt(rql, RQL_result); + } else { + /* Check booked usage */ + result = rqs_limitation_reached(a, rule, host, queue, &tt_rqs); + + rql = lAddElemStr(&(a->limit_list), RQL_name, limit, RQL_Type); + lSetInt(rql, RQL_result, result); + lSetUlong64(rql, RQL_time, tt_rqs); + /* init with same value as QU_tagged4schedule */ + lSetUlong(rql, RQL_tagged4schedule, 2); + + if (result != DISPATCH_OK && result != DISPATCH_MISSING_ATTR) { + schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, + SCHEDD_INFO_CANNOTRUNRQSGLOBAL_SS, + sge_dstring_get_string(rue_string), sge_dstring_get_string(rule_name)); + if (rqs_exceeded_sort_out(a, rule, rule_name, queue, host)) { + *is_global = true; + } + } + } + + if (result == DISPATCH_MISSING_ATTR) { + 
result = DISPATCH_OK; + continue; + } + if (result != DISPATCH_OK) + break; + + if (a->is_reservation && tt_rqs >= tt_best) { + /* no need to further investigate these ones */ + if (rqs_exceeded_sort_out(a, rule, rule_name, queue, host)) + *is_global = true; + } + + *tt_rqs_all = MAX(*tt_rqs_all, tt_rqs); + } } -} -#endif - lSetString(tmp_rue_elem, RUE_name, lGetString(limit, RQRL_name)); - lAppendElem(tmp_rue_list, tmp_rue_elem); + if (!rqs) { + result = DISPATCH_OK; + } - result = parallel_rc_slots_by_time(a, slots, - tmp_centry_list, tmp_rue_list, nullptr, - false, qep, DOMINANT_LAYER_RQS, 0.0, RQS_TAG, need_master, is_master_queue, - false, SGE_RQS_NAME, true); - - lFreeList(&tmp_centry_list); - lFreeList(&tmp_rue_list); + if (result == DISPATCH_OK || result == DISPATCH_MISSING_ATTR) { + DPRINTF("rqs_by_slots(%s@%s) returns " sge_u64 "\n", queue, host, tt_rqs_all); + } else { + DPRINTF("rqs_by_slots(%s@%s) returns " sge_u64 " (%s)\n", queue, host, tt_rqs_all, *is_global?"global":"not global"); + } DRETURN(result); } - -/****** sge_resource_quota_schedd/parallel_rqs_slots_by_time() ****************** +/****** sge_resource_quota_schedd/rqs_expand_cqueues() ************************* * NAME -* parallel_rqs_slots_by_time() -- Dertermine number of slots avail within -* time frame +* rqs_expand_cqueues() -- Add all matching cqueues to the list * * SYNOPSIS -* dispatch_t parallel_rqs_slots_by_time(const sge_assignment_t *a, -* int *slots, const char *host, const char *queue) +* void rqs_expand_cqueues(const lListElem *rule) * * FUNCTION -* This function iterates for a queue instance over all resource quota sets -* and evaluates the number of slots available. +* The names of all cluster queues that match the rule are added to +* the skip list without duplicates. 
* * INPUTS -* const sge_assignment_t *a - job info structure (in) -* int *slots - out: # free slots -* lListElem *qep - QU_Type Elem -* -* RESULT -* static dispatch_t - DISPATCH_OK got an assignment -* - DISPATCH_NEVER_CAT no assignment for all jobs af that category +* const lListElem *rule - RQR_Type * * NOTES -* MT-NOTE: parallel_rqs_slots_by_time() is not MT safe -* -* SEE ALSO -* ri_slots_by_time() -* +* MT-NOTE: rqs_expand_cqueues() is not MT safe *******************************************************************************/ -dispatch_t -parallel_rqs_slots_by_time(sge_assignment_t *a, int *slots, lListElem *qep, bool need_master, - bool is_master_queue) +void rqs_expand_cqueues(const lListElem *rule, sge_assignment_t *a) { - dispatch_t result = DISPATCH_OK; - int tslots = INT_MAX; - const char* queue = lGetString(qep, QU_qname); - const char* host = lGetHost(qep, QU_qhostname); + const lListElem *cq; + const char *cqname; + lListElem *qfilter = lGetObject(rule, RQR_filter_queues); DENTER(TOP_LAYER); - if (lGetNumberOfElem(a->rqs_list) != 0) { - const char* user = a->user; - const char* group = a->group; - const lList *grp_list = a->grp_list; - const char* project = a->project; - const char* pe = a->pe_name; - lListElem *rql; - const lListElem *rqs; - // @todo can we used static dstrings? What size would be needed? 
- dstring dstr_rule_name = DSTRING_INIT; - dstring dstr_rue_string = DSTRING_INIT; - dstring dstr_limit_name = DSTRING_INIT; - lListElem *exec_host = host_list_locate(a->host_list, host); - - SCHED_PROF_INC(a->pi, par_rqs); - - for_each_ep(rqs, a->rqs_list) { - lListElem *rule = nullptr; - - /* ignore disabled rule sets */ - if (!lGetBool(rqs, RQS_enabled)) { - continue; - } - sge_dstring_clear(&dstr_rule_name); - rule = rqs_get_matching_rule(rqs, user, group, grp_list, project, pe, host, queue, a->acl_list, a->hgrp_list, &dstr_rule_name); - if (rule != nullptr) { - lListElem *limit = nullptr; - const char *limit_s; - rqs_get_rue_string(&dstr_rue_string, rule, user, project, host, queue, pe); - limit_s = sge_dstring_sprintf(&dstr_limit_name, "%s=%s", sge_dstring_get_string(&dstr_rule_name), sge_dstring_get_string(&dstr_rue_string)); - - /* reuse earlier result */ - if ((rql=lGetElemStrRW(a->limit_list, RQL_name, limit_s))) { - result = (dispatch_t)lGetInt(rql, RQL_result); - tslots = MIN(tslots, lGetInt(rql, RQL_slots)); - - // build the minimum - lAndUlongBitMask(qep, QU_tagged4schedule, lGetUlong(rql, RQL_tagged4schedule)); - - DPRINTF("parallel_rqs_slots_by_time(%s@%s) result %d slots %d for " SFQ " (cache)\n", - queue, host, result, tslots, limit_s); - } else { - int ttslots = INT_MAX; - - u_long32 tagged_for_schedule_old = lGetUlong(qep, QU_tagged4schedule); /* default value or set in match_static_queue() */ - lSetUlong(qep, QU_tagged4schedule, TAG4SCHED_ALL); - - for_each_rw(limit, lGetList(rule, RQR_limit)) { - const char *limit_name = lGetString(limit, RQRL_name); - - lListElem *raw_centry = centry_list_locate(a->centry_list, limit_name); - if (raw_centry == nullptr) { - DPRINTF("ignoring limit %s because not defined", limit_name); - continue; - } else { - DPRINTF("checking limit %s\n", lGetString(raw_centry, CE_name)); - } - - lList *job_centry_list = job_get_hard_resource_listRW(a->job); // @todo CS-400 need to check all request lists - // @todo do we 
really need to pass the whole job_centry_list info functions below, - // or could we create a sub-list with just the one job_entry element? - // And would we have to copy-back info like CE_tagged? - lListElem *job_centry = centry_list_locate(job_centry_list, limit_name); - - /* found a rule, now check limit */ - if (lGetUlong(raw_centry, CE_consumable)) { - - rqs_get_rue_string(&dstr_rue_string, rule, user, project, host, queue, pe); - - if (rqs_set_dynamical_limit(limit, a->gep, exec_host, a->centry_list)) { - int tttslots = INT_MAX; - result = parallel_limit_slots_by_time(a, &tttslots, raw_centry, - limit, &dstr_rue_string, qep, - need_master, is_master_queue); - ttslots = MIN(ttslots, tttslots); - if (result == DISPATCH_NOT_AT_TIME) { - /* can still be interesting for reservation and as slave task for per_job_consumables */ - result = DISPATCH_OK; - } else if (result != DISPATCH_OK) { - break; - } - } else { - result = DISPATCH_NEVER_CAT; - break; - } - } else if (job_centry != nullptr) { - char availability_text[2048]; - - lSetString(raw_centry, CE_stringval, lGetString(limit, RQRL_value)); - if (compare_complexes(1, raw_centry, job_centry, availability_text, false, false) != 1) { - result = DISPATCH_NEVER_CAT; - break; - } - } - - } - - DPRINTF("parallel_rqs_slots_by_time(%s@%s) result %d slots %d for " SFQ " (fresh)\n", - queue, host, result, ttslots, limit_s); - - /* store result for reuse */ - rql = lAddElemStr(&(a->limit_list), RQL_name, limit_s, RQL_Type); - lSetInt(rql, RQL_result, result); - lSetInt(rql, RQL_slots, ttslots); - lSetUlong(rql, RQL_tagged4schedule, lGetUlong(qep, QU_tagged4schedule)); - - /* reset QU_tagged4schedule if necessary */ - lAndUlongBitMask(qep, QU_tagged4schedule, tagged_for_schedule_old); + for_each_ep(cq, *ocs::DataStore::get_master_list(SGE_TYPE_CQUEUE)) { + cqname = lGetString(cq, CQ_name); + if (lGetElemStr(a->skip_cqueue_list, CTI_name, cqname)) + continue; + if (rqs_filter_match(qfilter, FILTER_QUEUES, cqname, nullptr, 
nullptr, nullptr, nullptr) && !cqueue_shadowed_by(cqname, rule, a)) + lAddElemStr(&(a->skip_cqueue_list), CTI_name, cqname, CTI_Type); + } - tslots = MIN(tslots, ttslots); - } + DRETURN_VOID; +} - if (result != DISPATCH_OK || tslots == 0) { - DPRINTF("RQS PARALLEL SORT OUT\n"); - schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, - SCHEDD_INFO_CANNOTRUNRQSGLOBAL_SS, - sge_dstring_get_string(&dstr_rue_string), sge_dstring_get_string(&dstr_rule_name)); - rqs_exceeded_sort_out_par(a, rule, &dstr_rule_name, queue, host); - } +/****** sge_resource_quota_schedd/rqs_expand_hosts() *************************** +* NAME +* rqs_expand_hosts() -- Add all matching hosts to the list +* +* SYNOPSIS +* void rqs_expand_hosts(const lListElem *rule, lList **skip_host_list, +* const lList *host_list, lList *hgrp_list) +* +* FUNCTION +* The names of all hosts that match the rule are added to +* the skip list without duplicates. +* +* INPUTS +* const lListElem *rule - RQR_Type +* const lList *host_list - EH_Type +* +* NOTES +* MT-NOTE: rqs_expand_hosts() is MT safe +*******************************************************************************/ +void rqs_expand_hosts(const lListElem *rule, sge_assignment_t *a) +{ + const lListElem *eh; + const char *hname; + lListElem *hfilter = lGetObject(rule, RQR_filter_hosts); - if (result != DISPATCH_OK || tslots == 0) { - break; - } - } - } - sge_dstring_free(&dstr_rue_string); - sge_dstring_free(&dstr_rule_name); - sge_dstring_free(&dstr_limit_name); + for_each_ep(eh, a->host_list) { + hname = lGetHost(eh, EH_name); + if (lGetElemStr(a->skip_host_list, CTI_name, hname)) + continue; + if (rqs_filter_match(hfilter, FILTER_HOSTS, hname, nullptr, a->hgrp_list, nullptr, nullptr) && !host_shadowed_by(hname, rule, a)) + lAddElemStr(&(a->skip_host_list), CTI_name, hname, CTI_Type); } - *slots = tslots; - - DPRINTF("parallel_rqs_slots_by_time(%s@%s) finalresult %d slots %d\n", queue, host, result, *slots); - - DRETURN(result); + return; } 
-/****** sge_resource_quota_schedd/rqs_limitation_reached() ********************* +/****** sge_resource_quota_schedd/cqueue_shadowed() **************************** * NAME -* rqs_limitation_reached() -- is the limitation reached for a queue instance +* cqueue_shadowed() -- Check for cluster queue rule before current rule * * SYNOPSIS -* static bool rqs_limitation_reached(sge_assignment_t *a, lListElem *rule, -* const char* host, const char* queue) +* static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a) * * FUNCTION -* The function verifies no limitation is reached for the specific job request -* and queue instance +* Check whether there is any cluster queue specific rule before the +* current rule. * * INPUTS -* sge_assignment_t *a - job info structure -* const lListElem *rule - rqsource quota rule (RQR_Type) -* const char* host - host name -* const char* queue - queue name -* u_long64 *start - start time of job +* const lListElem *rule - Current rule +* sge_assignment_t *a - Scheduler assignment * * RESULT -* static dispatch_t - DISPATCH_OK job can be scheduled -* DISPATCH_NEVER_CAT no jobs of this category will be scheduled -* DISPATCH_NOT_AT_TIME job can be scheduled later -* DISPATCH_MISSING_ATTR rule does not match requested attributes +* static bool - True if shadowed * -* NOTES -* MT-NOTE: rqs_limitation_reached() is not MT safe +* EXAMPLE +* limit queue Q001 to F001=1 +* limit host gridware to F001=0 (--> returns 'true' due to 'Q001' meaning +* that gridware can't be generelly ruled out ) * +* NOTES +* MT-NOTE: cqueue_shadowed() is MT safe *******************************************************************************/ -static dispatch_t rqs_limitation_reached(sge_assignment_t *a, const lListElem *rule, const char* host, const char* queue, u_long64 *start) +bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a) { - dispatch_t ret = DISPATCH_MISSING_ATTR; - const lList *limit_list = nullptr; - lListElem * limit = nullptr; - static 
lListElem *implicit_slots_request = nullptr; - lListElem *exec_host = host_list_locate(a->host_list, host); - dstring rue_name = DSTRING_INIT; - dstring reason = DSTRING_INIT; - - DENTER(TOP_LAYER); - - if (implicit_slots_request == nullptr) { - implicit_slots_request = lCreateElem(CE_Type); - lSetString(implicit_slots_request, CE_name, SGE_ATTR_SLOTS); - lSetString(implicit_slots_request, CE_stringval, "1"); - lSetDouble(implicit_slots_request, CE_doubleval, 1); - } - - limit_list = lGetList(rule, RQR_limit); - for_each_rw(limit, limit_list) { - bool is_forced = false; - const char *limit_name = lGetString(limit, RQRL_name); - lListElem *raw_centry = centry_list_locate(a->centry_list, limit_name); - - if (raw_centry == nullptr) { - DPRINTF("ignoring limit %s because not defined", limit_name); - continue; - } else { - DPRINTF("checking limit %s\n", lGetString(raw_centry, CE_name)); + while ((rule = lPrev(rule))) { + if (rqs_match_assignment(rule, a) && !is_cqueue_global(rule)) { + return true; } + } + return false; +} - is_forced = lGetUlong(raw_centry, CE_requestable) == REQU_FORCED; - lList *job_centry_list = job_get_hard_resource_listRW(a->job); - // @todo CS-400: we only need job_centry. 
Have a function searching it in the 3 possible request lists - lListElem *job_centry = centry_list_locate(job_centry_list, limit_name); - - /* check for implicit slot and default request */ - if (job_centry == nullptr) { - if (strcmp(lGetString(raw_centry, CE_name), SGE_ATTR_SLOTS) == 0) { - job_centry = implicit_slots_request; - } else if (lGetString(raw_centry, CE_defaultval) != nullptr && lGetUlong(raw_centry, CE_consumable)) { - double request; - parse_ulong_val(&request, nullptr, lGetUlong(raw_centry, CE_valtype), lGetString(raw_centry, CE_defaultval), nullptr, 0); - - /* default requests with zero value are ignored */ - if (request == 0.0 && lGetUlong(raw_centry, CE_relop) != CMPLXEXCL_OP) { - continue; - } - lSetString(raw_centry, CE_stringval, lGetString(raw_centry, CE_defaultval)); - lSetDouble(raw_centry, CE_doubleval, request); - job_centry = raw_centry; - DPRINTF("using default request for %s!\n", lGetString(raw_centry, CE_name)); - } else if (is_forced) { - schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, - SCHEDD_INFO_NOTREQFORCEDRES); - ret = DISPATCH_NEVER_CAT; - break; - } else { - /* ignoring because centry was not requested and is no consumable */ - DPRINTF("complex not requested!\n"); - continue; - } +/****** sge_resource_quota_schedd/host_shadowed() ****************************** +* NAME +* host_shadowed() -- Check for host rule before current rule +* +* SYNOPSIS +* static bool host_shadowed(const lListElem *rule, sge_assignment_t *a) +* +* FUNCTION +* Check whether there is any host specific rule before the +* current rule. 
+* +* INPUTS +* const lListElem *rule - Current rule +* sge_assignment_t *a - Scheduler assignment +* +* RESULT +* static bool - True if shadowed +* +* EXAMPLE +* limit host gridware to F001=1 +* limit queue Q001 to F001=0 (--> returns 'true' due to 'gridware' meaning +* that Q001 can't be generelly ruled out ) +* +* NOTES +* MT-NOTE: host_shadowed() is MT safe +*******************************************************************************/ +bool host_shadowed(const lListElem *rule, sge_assignment_t *a) +{ + while ((rule = lPrev(rule))) { + if (rqs_match_assignment(rule, a) && !is_host_global(rule)) { + return true; } + } + return false; +} - { - lList *tmp_centry_list = lCreateList("", CE_Type); - lList *tmp_rue_list = lCreateList("", RUE_Type); - lListElem *tmp_centry_elem = nullptr; - lListElem *tmp_rue_elem = nullptr; - - if (rqs_set_dynamical_limit(limit, a->gep, exec_host, a->centry_list)) { - const lList *rue_list = lGetList(limit, RQRL_usage); - u_long64 tmp_time = a->start; - - /* create tmp_centry_list */ - tmp_centry_elem = lCopyElem(raw_centry); - lSetString(tmp_centry_elem, CE_stringval, lGetString(limit, RQRL_value)); - lSetDouble(tmp_centry_elem, CE_doubleval, lGetDouble(limit, RQRL_dvalue)); - lAppendElem(tmp_centry_list, tmp_centry_elem); +/****** sge_resource_quota_schedd/rqs_excluded_cqueues() *********************** +* NAME +* rqs_excluded_cqueues() -- Find excluded queues +* +* SYNOPSIS +* static void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a) +* +* FUNCTION +* Find queues that are excluded by previous rules. 
+* +* INPUTS +* const lListElem *rule - The rule +* sge_assignment_t *a - Scheduler assignement +* +* EXAMPLE +* limit projects {*} queues !Q001 to F001=1 +* limit to F001=0 ( ---> returns Q001 in a->skip_cqueue_list) +* +* NOTES +* MT-NOTE: rqs_excluded_cqueues() is MT safe +*******************************************************************************/ +void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a) +{ + const lListElem *cq; + const lListElem *prev; + int ignored = 0, excluded = 0; - /* create tmp_rue_list */ - rqs_get_rue_string(&rue_name, rule, a->user, a->project, host, queue, nullptr); - tmp_rue_elem = lCopyElem(lGetElemStr(rue_list, RUE_name, sge_dstring_get_string(&rue_name))); - if (tmp_rue_elem == nullptr) { - tmp_rue_elem = lCreateElem(RUE_Type); - } - lSetString(tmp_rue_elem, RUE_name, limit_name); - lAppendElem(tmp_rue_list, tmp_rue_elem); - - sge_dstring_clear(&reason); - ret = ri_time_by_slots(a, job_centry, nullptr, tmp_centry_list, tmp_rue_list, - nullptr, &reason, false, 1, DOMINANT_LAYER_RQS, 0.0, &tmp_time, - SGE_RQS_NAME); - if (ret != DISPATCH_OK) { - DPRINTF("denied because: %s\n", sge_dstring_get_string(&reason)); - lFreeList(&tmp_rue_list); - lFreeList(&tmp_centry_list); - break; - } + DENTER(TOP_LAYER); - if (a->is_reservation && ret == DISPATCH_OK) { - *start = tmp_time; - } + for_each_ep(cq, *ocs::DataStore::get_master_list(SGE_TYPE_CQUEUE)) { + const char *cqname = lGetString(cq, CQ_name); + bool exclude = true; - lFreeList(&tmp_rue_list); - lFreeList(&tmp_centry_list); + if (lGetElemStr(a->skip_cqueue_list, CTI_name, cqname)) { + ignored++; + continue; + } + + prev = rule; + while ((prev = lPrev(prev))) { + if (!rqs_match_assignment(rule, a)) + continue; + + if (rqs_filter_match(lGetObject(prev, RQR_filter_queues), FILTER_QUEUES, cqname, nullptr, nullptr, nullptr, nullptr)) { + exclude = false; + break; } } + if (exclude) { + lAddElemStr(&(a->skip_cqueue_list), CTI_name, cqname, CTI_Type); + excluded++; + } } - 
sge_dstring_free(&reason); - sge_dstring_free(&rue_name); + if (ignored + excluded == 0) { + CRITICAL("not a single queue excluded in rqs_excluded_cqueues()\n"); + } - DRETURN(ret); + DRETURN_VOID; } -/****** sge_resource_quota_schedd/rqs_by_slots() *********************************** +/****** sge_resource_quota_schedd/rqs_excluded_hosts() ************************* * NAME -* rqs_by_slots() -- Check queue instance suitability due to RQS +* rqs_excluded_hosts() -- Find excluded hosts * * SYNOPSIS -* dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, -* const char *host, u_long64 *tt_rqs_all, bool *is_global, -* dstring *rue_string, dstring *limit_name, dstring *rule_name) +* static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a) * * FUNCTION -* Checks (or determines earliest time) queue instance suitability -* according to resource quota set limits. -* -* For performance reasons RQS verification results are cached in -* a->limit_list. In addition unsuited queues and hosts are collected -* in a->skip_cqueue_list and a->skip_host_list so that ruling out -* chunks of queue instance becomes quite cheap. +* Find hosts that are excluded by previous rules. 
* * INPUTS -* sge_assignment_t *a - assignment -* const char *queue - cluster queue name -* const char *host - host name -* u_long64 *tt_rqs_all - returns earliest time over all resource quotas -* bool *is_global - returns true if result is valid for any other queue -* dstring *rue_string - caller maintained buffer -* dstring *limit_name - caller maintained buffer -* dstring *rule_name - caller maintained buffer -* u_long64 tt_best - time of best solution found so far +* const lListElem *rule - The rule +* sge_assignment_t *a - Scheduler assignement * -* RESULT -* static dispatch_t - usual return values +* EXAMPLE +* limit projects {*} queues !gridware to F001=1 +* limit to F001=0 ( ---> returns gridware in skip_host_list) * * NOTES -* MT-NOTE: rqs_by_slots() is MT safe +* MT-NOTE: rqs_excluded_hosts() is MT safe *******************************************************************************/ -dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, const char *host, - u_long64 *tt_rqs_all, bool *is_global, dstring *rue_string, dstring *limit_name, dstring *rule_name, u_long64 tt_best) +void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a) { - const lListElem *rqs; - dispatch_t result = DISPATCH_OK; + const lListElem *eh; + const lListElem *prev; + int ignored = 0, excluded = 0; DENTER(TOP_LAYER); - *is_global = false; - - if (lGetNumberOfElem(a->rqs_list) > 0) { - SCHED_PROF_INC(a->pi, seq_rqs); - } - - for_each_ep(rqs, a->rqs_list) { - u_long64 tt_rqs = a->start; - const char *user = a->user; - const char *group = a->group; - const lList *grp_list = a->grp_list; - const char *project = a->project; - const lListElem *rule; + for_each_ep(eh, a->host_list) { + const char *hname = lGetHost(eh, EH_name); + bool exclude = true; - if (!lGetBool(rqs, RQS_enabled)) { + if (lGetElemStr(a->skip_host_list, CTI_name, hname)) { + ignored++; continue; } - sge_dstring_clear(rule_name); - rule = rqs_get_matching_rule(rqs, user, group, grp_list, project, 
nullptr, host, queue, a->acl_list, a->hgrp_list, rule_name); - if (rule != nullptr) { - const char *limit; - lListElem *rql; - - /* need unique identifier for cache */ - rqs_get_rue_string(rue_string, rule, user, project, host, queue, nullptr); - sge_dstring_sprintf(limit_name, "%s=%s", sge_dstring_get_string(rule_name), sge_dstring_get_string(rue_string)); - limit = sge_dstring_get_string(limit_name); - - /* check limit or reuse earlier results */ - if ((rql=lGetElemStrRW(a->limit_list, RQL_name, limit))) { - tt_rqs = lGetUlong64(rql, RQL_time); - result = (dispatch_t)lGetInt(rql, RQL_result); - } else { - /* Check booked usage */ - result = rqs_limitation_reached(a, rule, host, queue, &tt_rqs); - - rql = lAddElemStr(&(a->limit_list), RQL_name, limit, RQL_Type); - lSetInt(rql, RQL_result, result); - lSetUlong64(rql, RQL_time, tt_rqs); - /* init with same value as QU_tagged4schedule */ - lSetUlong(rql, RQL_tagged4schedule, 2); + prev = rule; + while ((prev = lPrev(prev))) { + if (!rqs_match_assignment(rule, a)) + continue; - if (result != DISPATCH_OK && result != DISPATCH_MISSING_ATTR) { - schedd_mes_add(a->monitor_alpp, a->monitor_next_run, a->job_id, - SCHEDD_INFO_CANNOTRUNRQSGLOBAL_SS, - sge_dstring_get_string(rue_string), sge_dstring_get_string(rule_name)); - if (rqs_exceeded_sort_out(a, rule, rule_name, queue, host)) { - *is_global = true; - } - } + if (rqs_filter_match(lGetObject(prev, RQR_filter_hosts), FILTER_HOSTS, hname, nullptr, a->hgrp_list, nullptr, nullptr)) { + exclude = false; + break; } + } + if (exclude) { + lAddElemStr(&(a->skip_host_list), CTI_name, hname, CTI_Type); + excluded++; + } + } - if (result == DISPATCH_MISSING_ATTR) { - result = DISPATCH_OK; - continue; - } - if (result != DISPATCH_OK) - break; + if (ignored + excluded == 0) { + CRITICAL("not a single host excluded in rqs_excluded_hosts()\n"); + } - if (a->is_reservation && tt_rqs >= tt_best) { - /* no need to further investigate these ones */ - if (rqs_exceeded_sort_out(a, rule, 
rule_name, queue, host)) - *is_global = true; - } + DRETURN_VOID; +} - *tt_rqs_all = MAX(*tt_rqs_all, tt_rqs); +/****** sge_resource_quota_schedd/cqueue_shadowed_by() ************************* +* NAME +* cqueue_shadowed_by() -- Check rules shadowing current cluster queue rule +* +* SYNOPSIS +* static bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, +* sge_assignment_t *a) +* +* FUNCTION +* Check if cluster queue in current rule is shadowed. +* +* INPUTS +* const char *cqname - Cluster queue name to check +* const lListElem *rule - Current rule +* sge_assignment_t *a - Assignment +* +* RESULT +* static bool - True if shadowed +* +* EXAMPLE +* limits queues Q001,Q002 to F001=1 +* limits queues Q002,Q003 to F001=1 (--> returns 'true' for Q002 and 'false' for Q003) +* +* NOTES +* MT-NOTE: cqueue_shadowed_by() is MT safe +*******************************************************************************/ +bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, sge_assignment_t *a) +{ + while ((rule = lPrev(rule))) { + if (rqs_match_assignment(rule, a) && + rqs_filter_match(lGetObject(rule, RQR_filter_queues), FILTER_QUEUES, cqname, nullptr, nullptr, nullptr, nullptr)) { + return true; } } - if (!rqs) { - result = DISPATCH_OK; + return false; +} + +/****** sge_resource_quota_schedd/host_shadowed_by() *************************** +* NAME +* host_shadowed_by() -- ??? +* +* SYNOPSIS +* static bool host_shadowed_by(const char *host, const lListElem *rule, +* sge_assignment_t *a) +* +* FUNCTION +* Check if host in current rule is shadowed. 
+* +* INPUTS +* const char *host - Host name to check +* const lListElem *rule - Current rule +* sge_assignment_t *a - Assignment +* +* RESULT +* static bool - True if shadowed +* +* EXAMPLE +* limits hosts host1,host2 to F001=1 +* limits hosts host2,host3 to F001=1 (--> returns 'true' for host2 and 'false' for host3) +* +* NOTES +* MT-NOTE: host_shadowed_by() is MT safe +*******************************************************************************/ +bool host_shadowed_by(const char *host, const lListElem *rule, sge_assignment_t *a) +{ + while ((rule = lPrev(rule))) { + if (rqs_match_assignment(rule, a) && + rqs_filter_match(lGetObject(rule, RQR_filter_hosts), FILTER_HOSTS, host, nullptr, a->hgrp_list, nullptr, nullptr)) { + return true; + } } - if (result == DISPATCH_OK || result == DISPATCH_MISSING_ATTR) { - DPRINTF("rqs_by_slots(%s@%s) returns " sge_u64 "\n", queue, host, tt_rqs_all); - } else { - DPRINTF("rqs_by_slots(%s@%s) returns " sge_u64 " (%s)\n", queue, host, tt_rqs_all, *is_global?"global":"not global"); + return false; +} + +/****** sge_resource_quota_schedd/rqs_set_dynamical_limit() *********************** +* NAME +* rqs_set_dynamical_limit() -- evaluate dynamical limit +* +* SYNOPSIS +* bool rqs_set_dynamical_limit(lListElem *limit, lListElem +* *global_host, lListElem *exec_host, lList *centry) +* +* FUNCTION +* The function evaluates if necessary the dynamical limit for a host and +* sets the evaluated double value in the given limitation element (RQRL_dvalue). +* +* An evaluation is necessary if the limit boolean RQRL_dynamic is true.
This +* field is set by qmaster during the rule set verification +* +* INPUTS +* lListElem *limit - limitation (RQRL_Type) +* lListElem *global_host - global host (EH_Type) +* lListElem *exec_host - exec host (EH_Type) +* lList *centry - consumable resource list (CE_Type) +* +* RESULT +* bool - always true +* +* NOTES +* MT-NOTE: rqs_set_dynamical_limit() is MT safe +* +*******************************************************************************/ +bool +rqs_set_dynamical_limit(lListElem *limit, lListElem *global_host, lListElem *exec_host, const lList *centry) { + + DENTER(TOP_LAYER); + + if (lGetBool(limit, RQRL_dynamic)) { + double dynamic_limit = scaled_mixed_load(lGetString(limit, RQRL_value), global_host, exec_host, centry); + DPRINTF("found a dynamic limit for host %s with value %d\n", lGetHost(exec_host, EH_name), (int)dynamic_limit); + lSetDouble(limit, RQRL_dvalue, dynamic_limit); } - DRETURN(result); + DRETURN(true); } + diff --git a/source/libs/sched/sge_resource_quota_schedd.h b/source/libs/sched/sge_select_queue_rqs.h similarity index 73% rename from source/libs/sched/sge_resource_quota_schedd.h rename to source/libs/sched/sge_select_queue_rqs.h index c3d7c0052..5288abf8c 100644 --- a/source/libs/sched/sge_resource_quota_schedd.h +++ b/source/libs/sched/sge_select_queue_rqs.h @@ -1,57 +1,67 @@ #pragma once /*___INFO__MARK_BEGIN__*/ /************************************************************************* - * + * * The Contents of this file are made available subject to the terms of * the Sun Industry Standards Source License Version 1.2 - * + * * Sun Microsystems Inc., March, 2001 - * - * + * + * * Sun Industry Standards Source License Version 1.2 * ================================================= * The contents of this file are subject to the Sun Industry Standards * Source License Version 1.2 (the "License"); You may not use this file * except in compliance with the License. 
You may obtain a copy of the * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html - * + * * Software provided under this License is provided on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. * See the License for the specific provisions governing your rights and * obligations concerning the Software. - * + * * The Initial Developer of the Original Code is: Sun Microsystems, Inc. - * + * * Copyright: 2001 by Sun Microsystems, Inc. - * + * * All Rights Reserved. - * + * * Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH * ************************************************************************/ /*___INFO__MARK_END__*/ -#include "sgeobj/sge_resource_quota.h" #include "sge_select_queue.h" -#include "sgeobj/cull/sge_resource_utilization_RUE_L.h" -#include "sgeobj/cull/sge_resource_utilization_RDE_L.h" - -bool rqs_set_dynamical_limit(lListElem *limit, lListElem *global_host, lListElem *exec_host, const lList *centry); - - -bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, const char *group, const lList *grp_list, const lList *acl_list); /* parallel assignments */ dispatch_t parallel_rqs_slots_by_time(sge_assignment_t *a, int *slots, lListElem *qep, bool need_master, bool is_master_queue); -void parallel_check_and_debit_rqs_slots(sge_assignment_t *a, const char *host, const char *queue, + +void parallel_check_and_debit_rqs_slots(sge_assignment_t *a, const char *host, const char *queue, int *slots, dstring *rule_name, dstring *rue_name, dstring *limit_name); -void parallel_revert_rqs_slot_debitation(sge_assignment_t *a, const char *host, const char *queue, + +void parallel_revert_rqs_slot_debitation(sge_assignment_t *a, const char *host, const char *queue, int slots, dstring *rule_name, dstring *rue_name, dstring *limit_name); /* 
sequential assignments */ -dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, const char *host, +dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, const char *host, u_long64 *tt_rqs_all, bool *is_global, dstring *rue_string, dstring *limit_name, dstring *rule_name, u_long64 tt_best); + +void rqs_can_optimize(const lListElem *rule, bool *host, bool *queue, sge_assignment_t *a); + +void rqs_expand_cqueues(const lListElem *rule, sge_assignment_t *a); +void rqs_expand_hosts(const lListElem *rule, sge_assignment_t *a); + +bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a); +bool host_shadowed(const lListElem *rule, sge_assignment_t *a); + +void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a); +void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a); + +bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, sge_assignment_t *a); +bool host_shadowed_by(const char *host, const lListElem *rule, sge_assignment_t *a); + +bool rqs_set_dynamical_limit(lListElem *limit, lListElem *global_host, lListElem *exec_host, const lList *centry); diff --git a/test/daemons/common/test_common_category.cc b/test/daemons/common/test_common_category.cc index e8e8a2ccf..0322f8d13 100644 --- a/test/daemons/common/test_common_category.cc +++ b/test/daemons/common/test_common_category.cc @@ -43,11 +43,10 @@ #include "cull/cull_multitype.h" #include "sgeobj/cull/sge_all_listsL.h" +#include "sgeobj/ocs_Category.h" #include "sgeobj/sge_job.h" #include "sgeobj/sge_range.h" -#include "category.h" - typedef struct { int test_nr; //< test number u_long32 type; //< the job type @@ -526,7 +525,7 @@ static double test_performance(lListElem *job_elem, const int max, const lList* gettimeofday(&before, nullptr); for (int i = 0; i < max; i++) { - sge_build_job_category_dstring(&category_str, job_elem, access_list, nullptr, nullptr, rqs_list); + ocs::Category::build_string(&category_str, job_elem, access_list, nullptr, rqs_list); 
sge_dstring_clear(&category_str); } @@ -585,7 +584,7 @@ static int test(const data_entry_t *test, const char *result, const int count) { if (job_elem != nullptr) { dstring category_str = DSTRING_INIT; - sge_build_job_category_dstring(&category_str, job_elem, access_list, project_list, nullptr, rqs_list); + ocs::Category::build_string(&category_str, job_elem, access_list, project_list, rqs_list); printf("got : <%s>\n", sge_dstring_get_string(&category_str)!=nullptr?sge_dstring_get_string(&category_str):""); diff --git a/test/daemons/qmaster/CMakeLists.txt b/test/daemons/qmaster/CMakeLists.txt index 1a928d71f..199d7dc16 100644 --- a/test/daemons/qmaster/CMakeLists.txt +++ b/test/daemons/qmaster/CMakeLists.txt @@ -29,6 +29,7 @@ add_test(NAME test_qmaster_timed_event COMMAND test_qmaster_timed_event) add_executable(test_qmaster_calendar test_qmaster_calendar.cc + ../../../source/daemons/qmaster/ocs_CategoryQmaster.cc ../../../source/daemons/qmaster/ocs_ReportingFileWriter.cc ../../../source/daemons/qmaster/ocs_BaseAccountingFileWriter.cc ../../../source/daemons/qmaster/ocs_BaseReportingFileWriter.cc @@ -65,7 +66,6 @@ add_executable(test_qmaster_calendar ../../../source/daemons/qmaster/sge_hgroup_qmaster.cc ../../../source/daemons/qmaster/sge_thread_scheduler.cc ../../../source/daemons/qmaster/sge_qmaster_threads.cc - ../../../source/daemons/qmaster/sge_sched_job_category.cc ../../../source/daemons/qmaster/sge_sched_process_events.cc ../../../source/daemons/qmaster/sge_sched_prepare_data.cc ../../../source/daemons/qmaster/sge_sched_thread.cc From b88cfc9c74c5cfada9877d370d77b6f8f9444e7d Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Tue, 8 Apr 2025 14:59:15 +0200 Subject: [PATCH 06/10] EH: CS-209: Implement client functionality to show details about categories --- source/clients/qconf/CMakeLists.txt | 2 + source/clients/qconf/msg_qconf.h | 2 + source/clients/qconf/ocs_qconf_Category.cc | 132 +++++++++++++++++++++ source/clients/qconf/ocs_qconf_Category.h | 41 
+++++++ source/clients/qconf/ocs_qconf_parse.cc | 33 +++++- source/clients/qevent/ocs_qevent.cc | 31 ++++- 6 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 source/clients/qconf/ocs_qconf_Category.cc create mode 100644 source/clients/qconf/ocs_qconf_Category.h diff --git a/source/clients/qconf/CMakeLists.txt b/source/clients/qconf/CMakeLists.txt index d0d805dc4..687e3263e 100644 --- a/source/clients/qconf/CMakeLists.txt +++ b/source/clients/qconf/CMakeLists.txt @@ -21,6 +21,7 @@ # source/clients/qconf add_executable( qconf + ocs_qconf_Category.cc ocs_qconf_cqueue.cc ocs_qconf_centry.cc sge_qconf_hgroup.cc @@ -31,6 +32,7 @@ add_executable( ../../common/sig_handlers.cc ../../common/sge_options.cc ../../common/usage.cc) + target_include_directories(qconf PUBLIC ${SGE_INCLUDES}) target_link_libraries( qconf diff --git a/source/clients/qconf/msg_qconf.h b/source/clients/qconf/msg_qconf.h index 3b535d9a9..5e99c5ee9 100644 --- a/source/clients/qconf/msg_qconf.h +++ b/source/clients/qconf/msg_qconf.h @@ -72,5 +72,7 @@ #define MSG_RQS_NOTFOUNDINFILE_SS _MESSAGE(5251, _("resource quota set " SFQ " not found in file " SFQ)) +#define MSG_CAT_DOESNOTEXIST_U _MESSAGE(5252, _("Category " sge_uu32 " does not exist")) + // clang-format on diff --git a/source/clients/qconf/ocs_qconf_Category.cc b/source/clients/qconf/ocs_qconf_Category.cc new file mode 100644 index 000000000..758b36c19 --- /dev/null +++ b/source/clients/qconf/ocs_qconf_Category.cc @@ -0,0 +1,132 @@ +/*___INFO__MARK_BEGIN_NEW__*/ +/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "uti/sge_log.h" +#include "uti/sge_rmon_macros.h" + +#include "cull/cull.h" + +#include "sgeobj/ocs_Category.h" +#include "sgeobj/sge_answer.h" + +#include "gdi/ocs_gdi_Client.h" + +#include "spool/flatfile/sge_flatfile.h" +#include "spool/flatfile/sge_flatfile_obj.h" + +#include "ocs_qconf_Category.h" +#include "msg_qconf.h" + +bool ocs::CategoryQconf::show_list(lList **answer_list) { + DENTER(TOP_LAYER); + + lList *cat_list = get_via_gdi(answer_list); + if (cat_list != nullptr) { + spool_flatfile_align_list(answer_list, (const lList *) cat_list, CAT_fields, 3); + + const char *filename = spool_flatfile_write_list(answer_list, cat_list, CAT_fields, &qconf_cat_list_sfi, + SP_DEST_STDOUT, SP_FORM_ASCII, nullptr, false); + sge_free(&filename); + lFreeList(&cat_list); + + } + + if (answer_list_has_error(answer_list)) { + DRETURN(false); + } + + DRETURN(true); +} + +bool ocs::CategoryQconf::show(lList **answer_list, u_long64 id) { + DENTER(TOP_LAYER); + + lListElem *centry = get_via_gdi(answer_list, id); + if (centry != nullptr) { + const char *filename = spool_flatfile_write_object(answer_list, centry, false, CAT_fields, &qconf_cat_sfi, SP_DEST_STDOUT, + SP_FORM_ASCII, nullptr, false); + sge_free(&filename); + lFreeElem(&centry); + if (answer_list_has_error(answer_list)) { + DRETURN(false); + } + } else { + answer_list_add_sprintf(answer_list, STATUS_ERROR1, ANSWER_QUALITY_ERROR, MSG_CAT_DOESNOTEXIST_U64, id); + DRETURN(false); + } +
+ DRETURN(true); +} + +lListElem * +ocs::CategoryQconf::get_via_gdi(lList **answer_list, u_long64 id) { + DENTER(TOP_LAYER); + + // Get the list via GDI + lList *cat_list = nullptr; + lEnumeration *what = lWhat("%T(ALL)", CT_Type); + lCondition *where = lWhere("%T(%I==%u)", CT_Type, CT_id, id); + lList *gdi_answer_list = gdi::Client::sge_gdi(gdi::Target::TargetValue::SGE_CAT_LIST, gdi::Command::SGE_GDI_GET, + gdi::SubCommand::SGE_GDI_SUB_NONE, &cat_list, where, what); + lFreeWhat(&what); + lFreeWhere(&where); + + // Return the answer list if there was an error + lListElem *ret = nullptr; + if (answer_list_has_error(&gdi_answer_list)) { + answer_list_replace(answer_list, &gdi_answer_list); + } + lFreeList(&gdi_answer_list); + + // Dechain the first element + if (cat_list != nullptr) { + ret = lDechainElem(cat_list, lFirstRW(cat_list)); + lFreeList(&cat_list); + } + + DRETURN(ret); +} + +lList * +ocs::CategoryQconf::get_via_gdi(lList **answer_list) { + DENTER(TOP_LAYER); + + // Get the list via GDI + lList *ret = nullptr; + lEnumeration *what = lWhat("%T(ALL)", CT_Type); + lList *gdi_answer_list = gdi::Client::sge_gdi(gdi::Target::TargetValue::SGE_CAT_LIST, gdi::Command::SGE_GDI_GET, + gdi::SubCommand::SGE_GDI_SUB_NONE, &ret, nullptr, what); + lFreeWhat(&what); + + // Return the answer list if there was an error + if (answer_list_has_error(&gdi_answer_list)) { + answer_list_replace(answer_list, &gdi_answer_list); + } + lFreeList(&gdi_answer_list); + + // Sort the list + if (ret != nullptr) { + lSortOrder *order = lParseSortOrderVarArg(lGetListDescr(ret), "%I+", CT_id); + lSortList(ret, order); + lFreeSortOrder(&order); + } + + DRETURN(ret); +} diff --git a/source/clients/qconf/ocs_qconf_Category.h b/source/clients/qconf/ocs_qconf_Category.h new file mode 100644 index 000000000..623bff3ea --- /dev/null +++ b/source/clients/qconf/ocs_qconf_Category.h @@ -0,0 +1,41 @@ +#pragma once +/*___INFO__MARK_BEGIN_NEW__*/ 
+/*************************************************************************** + * + * Copyright 2025 HPC-Gridware GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************/ +/*___INFO__MARK_END_NEW__*/ + +#include "cull/cull.h" + +#include "sgeobj/ocs_Category.h" + +namespace ocs { + class CategoryQconf { + public: + static bool + show_list(lList **answer_list); + + static bool + show(lList **answer_list, u_long64 id); + + static lList * + get_via_gdi(lList **answer_list); + + static lListElem * + get_via_gdi(lList **answer_list, u_long64 id); + }; +} \ No newline at end of file diff --git a/source/clients/qconf/ocs_qconf_parse.cc b/source/clients/qconf/ocs_qconf_parse.cc index 655e98951..09f91e9ce 100644 --- a/source/clients/qconf/ocs_qconf_parse.cc +++ b/source/clients/qconf/ocs_qconf_parse.cc @@ -78,6 +78,7 @@ #include "sge.h" #include "sge_options.h" #include "usage.h" +#include "ocs_qconf_Category.h" #include "ocs_qconf_acl.h" #include "ocs_qconf_parse.h" #include "sge_qconf_hgroup.h" @@ -3833,7 +3834,7 @@ int sge_parse_qconf(char *argv[]) } /*-----------------------------------------------------------------------------*/ - /* "-sc complex_name_list" */ + /* "-sc" */ if (strcmp("-sc", *spp) == 0) { lList *answer_list = nullptr; @@ -3847,6 +3848,21 @@ int sge_parse_qconf(char *argv[]) continue; } +/*-----------------------------------------------------------------------------*/ 
+ /* "-scatl complex_name_list" */ + + if (strcmp("-scatl", *spp) == 0) { + lList *answer_list = nullptr; + + if (!ocs::CategoryQconf::show_list(&answer_list)) { + show_answer(answer_list); + sge_parse_return = 1; + } + lFreeList(&answer_list); + spp++; + continue; + } + /*-----------------------------------------------------------------------------*/ /* "-scal calendar_name" */ if (strcmp("-scal", *spp) == 0) { @@ -4645,6 +4661,21 @@ int sge_parse_qconf(char *argv[]) continue; } + // -scat id + if (strcmp("-scat", *spp) == 0) { + lList *answer_list = nullptr; + + spp = sge_parser_get_next(spp); + u_long64 id = strtoull(*spp, nullptr, 10); + if (!ocs::CategoryQconf::show(&answer_list, id)) { + show_answer(answer_list); + sge_parse_return = 1; + } + lFreeList(&answer_list); + spp++; + continue; + } + /* "-sce attribute" */ if (strcmp("-sce", *spp) == 0) { lList *answer_list = nullptr; diff --git a/source/clients/qevent/ocs_qevent.cc b/source/clients/qevent/ocs_qevent.cc index 7e78da3ca..6e7ea5322 100644 --- a/source/clients/qevent/ocs_qevent.cc +++ b/source/clients/qevent/ocs_qevent.cc @@ -219,7 +219,24 @@ print_jatask_event([[maybe_unused]] sge_evc_class_t *evc, sge_object_type type, Global_jobs_registered--; fflush(stdout); } - + if (event_type == sgeE_CATEGORY_ADD) { + u_long category_id = lGetUlong(event, ET_intkey); + fprintf(stdout,"CATEGORY_ADD (%ld:ECL_TIME=" sge_u64 ")\n", category_id, timestamp); + Global_jobs_registered--; + fflush(stdout); + } + if (event_type == sgeE_CATEGORY_MOD) { + u_long category_id = lGetUlong(event, ET_intkey); + fprintf(stdout,"CATEGORY_MOD (%ld:ECL_TIME=" sge_u64 ")\n", category_id, timestamp); + Global_jobs_registered--; + fflush(stdout); + } + if (event_type == sgeE_CATEGORY_DEL) { + u_long category_id = lGetUlong(event, ET_intkey); + fprintf(stdout,"CATEGORY_DEL (%ld:ECL_TIME=" sge_u64 ")\n", category_id, timestamp); + Global_jobs_registered--; + fflush(stdout); + } } DRETURN(SGE_EMA_OK); } @@ -677,7 +694,13 @@ static void 
qevent_testsuite_mode(sge_evc_class_t *evc) sge_mirror_subscribe(evc, SGE_TYPE_JATASK, print_jatask_event, nullptr, nullptr, where, what); lFreeWhere(&where); lFreeWhat(&what); - + + where = nullptr; + what = lIntVector2What(JB_Type, job_nm); + sge_mirror_subscribe(evc, SGE_TYPE_CATEGORY, print_jatask_event, nullptr, nullptr, nullptr, nullptr); + lFreeWhere(&where); + lFreeWhat(&what); + /* we want a 5-second event delivery interval */ evc->ec_set_edtime(evc, 5); @@ -688,6 +711,10 @@ static void qevent_testsuite_mode(sge_evc_class_t *evc) evc->ec_set_flush(evc, sgeE_JOB_ADD, true, 1); evc->ec_set_flush(evc, sgeE_JOB_DEL, true, 1); + evc->ec_set_flush(evc, sgeE_CATEGORY_ADD, true, 1); + evc->ec_set_flush(evc, sgeE_CATEGORY_MOD, true, 1); + evc->ec_set_flush(evc, sgeE_CATEGORY_DEL, true, 1); + #endif /* QEVENT_SHOW_ALL */ while (!shut_me_down) { From 573e2e730e123a7d607b3b5206f686db285308f2 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Thu, 17 Apr 2025 12:05:18 +0200 Subject: [PATCH 07/10] EH: CS-1159: Ensure that categories get still unused ID as primary key --- source/clients/qconf/ocs_qconf_Category.cc | 4 +- source/clients/qconf/ocs_qconf_Category.h | 2 +- source/daemons/qmaster/ocs_CategoryQmaster.cc | 125 +++++++++++++++--- source/daemons/qmaster/ocs_CategoryQmaster.h | 18 ++- source/daemons/qmaster/setup_qmaster.cc | 98 ++------------ source/daemons/qmaster/sge_job_qmaster.cc | 10 +- .../daemons/qmaster/sge_sched_prepare_data.cc | 116 ---------------- .../qmaster/sge_sched_process_events.cc | 6 +- source/daemons/qmaster/sge_sched_thread.h | 1 + .../daemons/qmaster/sge_thread_scheduler.cc | 13 +- source/libs/sgeobj/cull/sge_job_JB_L.h | 2 +- source/libs/sgeobj/lwdb/ocs_JB_attributes.h | 2 +- source/libs/sgeobj/ocs_Category.cc | 7 - source/libs/sgeobj/ocs_Category.h | 10 +- source/libs/uti/sge_log.h | 3 +- source/libs/uti/sge_uidgid.cc | 5 + test/daemons/common/test_common_category.cc | 2 +- test/libs/uti/test_uti_uidgid.cc | 3 + 18 files changed, 171 
insertions(+), 256 deletions(-) diff --git a/source/clients/qconf/ocs_qconf_Category.cc b/source/clients/qconf/ocs_qconf_Category.cc index 758b36c19..78759f18f 100644 --- a/source/clients/qconf/ocs_qconf_Category.cc +++ b/source/clients/qconf/ocs_qconf_Category.cc @@ -55,7 +55,7 @@ bool ocs::CategoryQconf::show_list(lList **answer_list) { DRETURN(true); } -bool ocs::CategoryQconf::show(lList **answer_list, u_long64 id) { +bool ocs::CategoryQconf::show(lList **answer_list, u_long32 id) { DENTER(TOP_LAYER); lListElem *centry = get_via_gdi(answer_list, id); @@ -68,7 +68,7 @@ bool ocs::CategoryQconf::show(lList **answer_list, u_long64 id) { DRETURN(false); } } else { - answer_list_add_sprintf(answer_list, STATUS_ERROR1, ANSWER_QUALITY_ERROR, MSG_CAT_DOESNOTEXIST_U64, id); + answer_list_add_sprintf(answer_list, STATUS_ERROR1, ANSWER_QUALITY_ERROR, MSG_CAT_DOESNOTEXIST_U, id); DRETURN(false); } diff --git a/source/clients/qconf/ocs_qconf_Category.h b/source/clients/qconf/ocs_qconf_Category.h index 623bff3ea..35ae2afaa 100644 --- a/source/clients/qconf/ocs_qconf_Category.h +++ b/source/clients/qconf/ocs_qconf_Category.h @@ -30,7 +30,7 @@ namespace ocs { show_list(lList **answer_list); static bool - show(lList **answer_list, u_long64 id); + show(lList **answer_list, u_long32 id); static lList * get_via_gdi(lList **answer_list); diff --git a/source/daemons/qmaster/ocs_CategoryQmaster.cc b/source/daemons/qmaster/ocs_CategoryQmaster.cc index 94654cece..d0b504d16 100644 --- a/source/daemons/qmaster/ocs_CategoryQmaster.cc +++ b/source/daemons/qmaster/ocs_CategoryQmaster.cc @@ -23,11 +23,16 @@ #include "uti/sge_dstring.h" #include "sgeobj/ocs_Category.h" +#include "sgeobj/ocs_Session.h" #include "sgeobj/sge_job.h" #include "ocs_CategoryQmaster.h" #include "ocs_DataStore.h" #include "sge_event_master.h" +#include "sge_resource_quota_qmaster.h" +#include "sge_cqueue_qmaster.h" +#include "sge_pe_qmaster.h" +#include "sge_host_qmaster.h" 
/****************************************************** * @@ -62,14 +67,94 @@ * ******************************************************/ +void +ocs::CategoryQmaster::initialize_prj_and_uset_for_categories(lList *master_project_list, lList *master_userset_list, + const lList *master_rqs_list, const lList *master_cqueue_list, + const lList *master_pe_list, const lList *master_host_list) { + const lListElem *cq, *pe, *hep, *ep; + const lListElem *rqs; + lList *u_list = nullptr, *p_list = nullptr; + bool all_projects = false; + bool all_usersets = false; + + /* + * collect a list of references to usersets/projects used in + * the resource quota sets + */ + for_each_ep(rqs, master_rqs_list) { + if (!all_projects && !rqs_diff_projects(rqs, nullptr, &p_list, nullptr, master_project_list)) { + all_projects = true; + } + if (!all_usersets && !rqs_diff_usersets(rqs, nullptr, &u_list, nullptr, master_userset_list)) { + all_usersets = true; + } + if (all_usersets && all_projects) { + break; + } + } + + /* + * collect list of references to usersets/projects used as ACL + * with queue_conf(5), host_conf(5) and sge_pe(5) + */ + for_each_ep(cq, master_cqueue_list) { + cqueue_diff_projects(cq, nullptr, &p_list, nullptr); + cqueue_diff_usersets(cq, nullptr, &u_list, nullptr); + } + + for_each_ep(pe, master_pe_list) { + pe_diff_usersets(pe, nullptr, &u_list, nullptr); + } + + for_each_ep(hep, master_host_list) { + host_diff_projects(hep, nullptr, &p_list, nullptr); + host_diff_usersets(hep, nullptr, &u_list, nullptr); + } + + /* + * now set categories flag with usersets/projects used as ACL + */ + for_each_ep(ep, p_list) { + lListElem *prj = lGetElemStrRW(master_project_list, PR_name, lGetString(ep, PR_name)); + if (prj != nullptr) { + lSetBool(prj, PR_consider_with_categories, true); + } + } + + for_each_ep(ep, u_list) { + lListElem *acl = lGetElemStrRW(master_userset_list, US_name, lGetString(ep, US_name)); + if (acl != nullptr) { + lSetBool(acl, US_consider_with_categories, true); 
+ } + } + + lFreeList(&p_list); + lFreeList(&u_list); +} + +void +ocs::CategoryQmaster::initialize_prj_uset_and_create_categories(lList **master_category_list, lList *master_job_list, + lList *master_project_list, lList *master_userset_list, + const lList *master_rqs_list, const lList *master_cqueue_list, + const lList *master_pe_list, const lList *master_host_list) { + // Initialize projects and usersets for the used is categories + // names will be part of the category string if they are used in cqueue, PEs, hosts or RQS + initialize_prj_and_uset_for_categories(master_project_list, master_userset_list, master_rqs_list, + master_cqueue_list, master_pe_list, master_host_list); + + // Create all categories + attach_all_jobs(master_job_list, master_category_list, master_userset_list, master_project_list, + master_rqs_list, false, SessionManager::GDI_SESSION_NONE); +} + bool -ocs::CategoryQmaster::attach_job(lList **master_category_list, lListElem **category, lListElem *job, - const lList *master_userset_list, const lList *master_project_list, - const lList *master_rqs_list, bool send_events, u_long32 gdi_session) { +ocs::CategoryQmaster::attach_job(lList **master_category_list, lListElem *job, + const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, + bool send_events, u_long32 gdi_session) { DENTER(TOP_LAYER); // check if the input parameters are valid - if (master_category_list == nullptr || category == nullptr || job == nullptr) { + if (master_category_list == nullptr || job == nullptr) { DRETURN(false); } @@ -87,26 +172,26 @@ ocs::CategoryQmaster::attach_job(lList **master_category_list, lListElem **categ // get the category or create a new one bool is_new = false; - *category = lGetElemStrRW(*master_category_list, CT_str, cat_str); - if (*category == nullptr) { - *category = lAddElemStr(master_category_list, CT_str, cat_str, CT_Type); - lSetUlong(*category, CT_id, Category::get_next_id()); + lListElem *category = 
lGetElemStrRW(*master_category_list, CT_str, cat_str); + if (category == nullptr) { + category = lAddElemStr(master_category_list, CT_str, cat_str, CT_Type); + lSetUlong(category, CT_id, Category::get_next_id(*master_category_list)); is_new = true; } sge_dstring_free(&category_str); // Increase the reference count - lSetUlong(*category, CT_refcount, lGetUlong(*category, CT_refcount) + 1); + lSetUlong(category, CT_refcount, lGetUlong(category, CT_refcount) + 1); // Point to the category in the job - u_long32 category_id = lGetUlong(*category, CT_id); + u_long32 category_id = lGetUlong(category, CT_id); lSetUlong(job, JB_category_id, category_id); // Send events if required if (send_events) { ev_event category_event_type = is_new ? sgeE_CATEGORY_ADD : sgeE_CATEGORY_MOD; sge_add_event(0, category_event_type, category_id, 0, nullptr, - nullptr, nullptr, *category, gdi_session); + nullptr, nullptr, category, gdi_session); } DRETURN(true); @@ -121,9 +206,9 @@ ocs::CategoryQmaster::detach_job(lList **master_category_list, lListElem *job, b DRETURN(false); } lListElem *category = lGetElemUlongRW(*master_category_list, CT_id, lGetUlong(job, JB_category_id)); - if (category == nullptr) { - DRETURN(false); - } + + // each category that is referenced in a job should also exist + SGE_ASSERT(category != nullptr); // decrease the reference count or remove the category bool is_del = false; @@ -154,23 +239,20 @@ ocs::CategoryQmaster::reattach_job(lList **master_category_list, lListElem *job, detach_job(master_category_list, job, send_events, gdi_session); // add the job to the new category - lListElem *category; - attach_job(master_category_list, &category, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); + attach_job(master_category_list, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); DRETURN_VOID; } void -ocs::CategoryQmaster::attach_all_jobs(lList *master_job_list, 
+ocs::CategoryQmaster::attach_all_jobs(lList *master_job_list, lList **master_category_list, const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, bool send_events, u_long32 gdi_session) { DENTER(TOP_LAYER); - lList **master_category_list = DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); // add all jobs to the category list, create categories if they do not exist lListElem *job; for_each_rw(job, master_job_list) { - lListElem *category = nullptr; - attach_job(master_category_list, &category, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); + attach_job(master_category_list, job, master_userset_list, master_project_list, master_rqs_list, send_events, gdi_session); } DRETURN_VOID; } @@ -213,10 +295,9 @@ ocs::CategoryQmaster::reattach_all_jobs(lList *master_job_list, * *******************************************************************************/ void -ocs::CategoryQmaster::reset_tmp_data() { +ocs::CategoryQmaster::reset_tmp_data(lList *master_category_list) { DENTER(TOP_LAYER); - lList *master_category_list = *DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); lListElem *cat; for_each_rw (cat, master_category_list) { diff --git a/source/daemons/qmaster/ocs_CategoryQmaster.h b/source/daemons/qmaster/ocs_CategoryQmaster.h index c07413aaa..0cc1f2bdc 100644 --- a/source/daemons/qmaster/ocs_CategoryQmaster.h +++ b/source/daemons/qmaster/ocs_CategoryQmaster.h @@ -28,9 +28,21 @@ namespace ocs { class CategoryQmaster { + static void + initialize_prj_and_uset_for_categories(lList *master_project_list, lList *master_userset_list, + const lList *master_rqs_list, const lList *master_cqueue_list, + const lList *master_pe_list, const lList *master_host_list); + public: + static void + initialize_prj_uset_and_create_categories(lList **master_category_list, lList *master_job_list, + lList *master_project_list, lList *master_userset_list, + const lList *master_rqs_list, const lList 
*master_cqueue_list, + const lList *master_pe_list, const lList *master_host_list); + + static bool - attach_job(lList **master_category_list, lListElem **category, lListElem *job, + attach_job(lList **master_category_list, lListElem *job, const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, bool send_events, u_long32 gdi_session); @@ -46,7 +58,7 @@ namespace ocs { refresh_cat_data_in_job(lList *master_category_list, lListElem *job); static void - attach_all_jobs(lList *master_job_list, + attach_all_jobs(lList *master_job_list, lList **master_category_list, const lList *master_userset_list, const lList *master_project_list, const lList *master_rqs_list, bool send_events, u_long32 gdi_session); @@ -56,7 +68,7 @@ namespace ocs { bool send_events, u_long32 gdi_session); static void - reset_tmp_data(); + reset_tmp_data(lList *master_category_list); static void refresh_cat_data_all_jobs(lList *master_category_list, lList *master_job_list); diff --git a/source/daemons/qmaster/setup_qmaster.cc b/source/daemons/qmaster/setup_qmaster.cc index f6d40721c..043bf5285 100644 --- a/source/daemons/qmaster/setup_qmaster.cc +++ b/source/daemons/qmaster/setup_qmaster.cc @@ -143,9 +143,6 @@ remove_invalid_job_references(int user); static void debit_all_jobs_from_qs(); -static void -init_categories(); - /****** qmaster/setup_qmaster/sge_setup_qmaster() ****************************** * NAME @@ -1186,16 +1183,18 @@ setup_qmaster() { lFreeList(&alp); } - - init_categories(); - - // Create all categories - const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); - const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); - const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); + DPRINTF("post init of prj/uset and create of categories--------------------\n"); + lList **master_category_list = ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); lList 
*master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); - ocs::CategoryQmaster::attach_all_jobs(master_job_list, master_userset_list, - master_project_list, master_rqs_list, false, ocs::SessionManager::GDI_SESSION_NONE); + lList *master_userset_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_USERSET); + lList *master_project_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_PROJECT); + lList *master_rqs_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_RQS); + const lList *master_pe_list = *ocs::DataStore::get_master_list(SGE_TYPE_PE); + const lList *master_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST); + + ocs::CategoryQmaster::initialize_prj_uset_and_create_categories(master_category_list, master_job_list, + master_project_list, master_userset_list, master_rqs_list, + master_cqueue_list, master_pe_list, master_host_list); DRETURN(0); } @@ -1373,79 +1372,4 @@ static void debit_all_jobs_from_qs() { DRETURN_VOID; } -/****** setup_qmaster/init_categories() **************************************** -* NAME -* init_categories() -- Initialize usersets/projects wrts categories -* -* SYNOPSIS -* static void init_categories() -* -* FUNCTION -* Initialize usersets/projects wrts categories. 
-* -* NOTES -* MT-NOTE: init_categories() is not MT safe -*******************************************************************************/ -static void init_categories() { - const lListElem *cq, *pe, *hep, *ep; - lListElem *acl, *prj; - const lListElem *rqs; - lList *u_list = nullptr, *p_list = nullptr; - const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); - const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); - bool all_projects = false; - bool all_usersets = false; - - /* - * collect a list of references to usersets/projects used in - * the resource quota sets - */ - for_each_ep(rqs, *ocs::DataStore::get_master_list(SGE_TYPE_RQS)) { - if (!all_projects && !rqs_diff_projects(rqs, nullptr, &p_list, nullptr, master_project_list)) { - all_projects = true; - } - if (!all_usersets && !rqs_diff_usersets(rqs, nullptr, &u_list, nullptr, master_userset_list)) { - all_usersets = true; - } - if (all_usersets && all_projects) { - break; - } - } - - /* - * collect list of references to usersets/projects used as ACL - * with queue_conf(5), host_conf(5) and sge_pe(5) - */ - for_each_ep(cq, *ocs::DataStore::get_master_list(SGE_TYPE_CQUEUE)) { - cqueue_diff_projects(cq, nullptr, &p_list, nullptr); - cqueue_diff_usersets(cq, nullptr, &u_list, nullptr); - } - - for_each_ep(pe, *ocs::DataStore::get_master_list(SGE_TYPE_PE)) { - pe_diff_usersets(pe, nullptr, &u_list, nullptr); - } - - for_each_ep(hep, *ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST)) { - host_diff_projects(hep, nullptr, &p_list, nullptr); - host_diff_usersets(hep, nullptr, &u_list, nullptr); - } - - /* - * now set categories flag with usersets/projects used as ACL - */ - for_each_ep(ep, p_list) { - if ((prj = prj_list_locate(master_project_list, lGetString(ep, PR_name)))) { - lSetBool(prj, PR_consider_with_categories, true); - } - } - - for_each_ep(ep, u_list) { - if ((acl = lGetElemStrRW(master_userset_list, US_name, lGetString(ep, US_name)))) { 
- lSetBool(acl, US_consider_with_categories, true); - } - } - - lFreeList(&p_list); - lFreeList(&u_list); -} diff --git a/source/daemons/qmaster/sge_job_qmaster.cc b/source/daemons/qmaster/sge_job_qmaster.cc index d7df89c42..af5eab614 100644 --- a/source/daemons/qmaster/sge_job_qmaster.cc +++ b/source/daemons/qmaster/sge_job_qmaster.cc @@ -272,8 +272,7 @@ sge_gdi_add_job(lListElem **jep, lList **alpp, lList **lpp, const lList *master_userset_list = *ocs::DataStore::get_master_list(SGE_TYPE_USERSET); const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT); const lList *master_rqs_list = *ocs::DataStore::get_master_list(SGE_TYPE_RQS); - lListElem* category = nullptr; - lret = ocs::CategoryQmaster::attach_job(master_category_list, &category, *jep, + lret = ocs::CategoryQmaster::attach_job(master_category_list, *jep, master_userset_list, master_project_list, master_rqs_list, true, packet->gdi_session); @@ -1310,10 +1309,9 @@ sge_gdi_mod_job(const ocs::gdi::Packet *packet, ocs::gdi::Task *task, lListElem ocs::CategoryQmaster::detach_job(master_category_list, new_job, true, packet->gdi_session); // add the job to the new category - lListElem *category; - ocs::CategoryQmaster::attach_job(master_category_list, &category, new_job, - master_userset_list, master_project_list, master_rqs_list, - true, packet->gdi_session); + ocs::CategoryQmaster::attach_job(master_category_list, new_job, + master_userset_list, master_project_list, master_rqs_list, + true, packet->gdi_session); } } diff --git a/source/daemons/qmaster/sge_sched_prepare_data.cc b/source/daemons/qmaster/sge_sched_prepare_data.cc index cad259a08..58e027f45 100644 --- a/source/daemons/qmaster/sge_sched_prepare_data.cc +++ b/source/daemons/qmaster/sge_sched_prepare_data.cc @@ -566,85 +566,6 @@ sge_process_global_config_event(sge_evc_class_t *evc, sge_object_type type, DRETURN(SGE_EMA_OK); } -sge_callback_result -sge_process_category_event_before(sge_evc_class_t *evc, sge_object_type 
type, sge_event_action action, lListElem *event, void *clientdata) { - DENTER(TOP_LAYER); - - // we handle only mod and del events - if (action != SGE_EMA_DEL && action != SGE_EMA_MOD) { - DRETURN(SGE_EMA_OK); - } - - // find the category - u_long32 category_id = lGetUlong(event, ET_intkey); - lList *master_category_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); - lListElem *category = lGetElemUlongRW(master_category_list, CT_id, category_id); - - // should not be possible - if (category == nullptr) { - ERROR(MSG_CANTFINDCATINMASTERLIST_U, category_id); - DRETURN(SGE_EMA_FAILURE); - } - - // remove tmp data - const lList *cache_list = lGetList(category, CT_cache); - const lListElem *cache = nullptr; - for_each_ep(cache, cache_list) { - auto range = static_cast(lGetRef(cache, CCT_pe_job_slots)); - sge_free(&range); - } - - DRETURN(SGE_EMA_OK); -} - -sge_callback_result -sge_process_job_event_before(sge_evc_class_t *evc, sge_object_type type, - sge_event_action action, lListElem *event, void *clientdata) { - u_long32 job_id = 0; - lListElem *job = nullptr; - - DENTER(GDI_LAYER); - DPRINTF("callback processing job event before default rule\n"); - - if (action == SGE_EMA_DEL || action == SGE_EMA_MOD) { - job_id = lGetUlong(event, ET_intkey); - job = lGetElemUlongRW(*ocs::DataStore::get_master_list(SGE_TYPE_JOB), JB_job_number, job_id); - if (job == nullptr) { - dstring id_dstring = DSTRING_INIT; - ERROR(MSG_CANTFINDJOBINMASTERLIST_S, job_get_id_string(job_id, 0, nullptr, &id_dstring)); - sge_dstring_free(&id_dstring); - DRETURN(SGE_EMA_FAILURE); - } - } else { - DRETURN(SGE_EMA_OK); - } - -#if 0 - switch (action) { - case SGE_EMA_DEL: - /* delete job category if necessary */ - sge_delete_job_category(job); - break; - - case SGE_EMA_MOD: - switch (lGetUlong(event, ET_type)) { - case sgeE_JOB_MOD: - sge_delete_job_category(job); - break; - - default: - break; - } - break; - - default: - break; - } -#endif - - DRETURN(SGE_EMA_OK); -} - sge_callback_result 
sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *clientdata) { DENTER(TOP_LAYER); @@ -663,22 +584,15 @@ sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, sge_even } } - lList *master_category_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_CATEGORY); switch (action) { case SGE_EMA_LIST: { lList *master_job_list = *ocs::DataStore::get_master_list_rw(SGE_TYPE_JOB); - // refresh category references in all job objects - ocs::CategoryQmaster::refresh_cat_data_all_jobs(master_category_list, master_job_list); - // recompute the priorities for all jobs sge_do_priority(master_job_list, nullptr); break; } case SGE_EMA_ADD: - // refresh category reference - ocs::CategoryQmaster::refresh_cat_data_in_job(master_category_list, job); - // recompute the priorities for the job sge_do_priority_job(job); break; @@ -717,36 +631,6 @@ sge_process_job_event_after(sge_evc_class_t *evc, sge_object_type type, sge_even DRETURN(SGE_EMA_OK); } -/* If the last ja task of a job is deleted, - * remove the job category. - * Do we really need it? - * Isn't a job delete event sent after the last array task exited? 
- */ -sge_callback_result -sge_process_ja_task_event_after(sge_evc_class_t *evc, sge_object_type type, - sge_event_action action, lListElem *event, void *clientdata) { - DENTER(GDI_LAYER); - - if (action == SGE_EMA_DEL) { - const lListElem *job; - u_long32 job_id; - DPRINTF("callback processing ja_task event after default rule SGE_EMA_DEL\n"); - - job_id = lGetUlong(event, ET_intkey); - job = lGetElemUlong(*ocs::DataStore::get_master_list(SGE_TYPE_JOB), JB_job_number, job_id); - if (job == nullptr) { - dstring id_dstring = DSTRING_INIT; - ERROR(MSG_CANTFINDJOBINMASTERLIST_S, job_get_id_string(job_id, 0, nullptr, &id_dstring)); - sge_dstring_free(&id_dstring); - DRETURN(SGE_EMA_FAILURE); - } - } else { - DPRINTF("callback processing ja_task event after default rule\n"); - } - - DRETURN(SGE_EMA_OK); -} - #if 0 /****** sge_process_events/sge_process_userset_event_before() ****************** * NAME diff --git a/source/daemons/qmaster/sge_sched_process_events.cc b/source/daemons/qmaster/sge_sched_process_events.cc index 41d561a15..37e521de2 100644 --- a/source/daemons/qmaster/sge_sched_process_events.cc +++ b/source/daemons/qmaster/sge_sched_process_events.cc @@ -135,10 +135,10 @@ subscribe_scheduler(sge_evc_class_t *evc, sge_where_what_t *where_what) sge_mirror_subscribe(evc, SGE_TYPE_EXECHOST, nullptr, nullptr, nullptr, where_what->where_host, where_what->what_host); sge_mirror_subscribe(evc, SGE_TYPE_HGROUP, nullptr, nullptr, nullptr, nullptr, nullptr); sge_mirror_subscribe(evc, SGE_TYPE_CONFIG, nullptr, sge_process_global_config_event, nullptr, where_what->where_config, where_what->what_config); - sge_mirror_subscribe(evc, SGE_TYPE_JOB, sge_process_job_event_before, sge_process_job_event_after, nullptr, where_what->where_job, where_what->what_job); - sge_mirror_subscribe(evc, SGE_TYPE_JATASK, nullptr, sge_process_ja_task_event_after, nullptr, where_what->where_jat, where_what->what_jat); + sge_mirror_subscribe(evc, SGE_TYPE_JOB, nullptr, sge_process_job_event_after, 
nullptr, where_what->where_job, where_what->what_job); + sge_mirror_subscribe(evc, SGE_TYPE_JATASK, nullptr, nullptr, nullptr, where_what->where_jat, where_what->what_jat); sge_mirror_subscribe(evc, SGE_TYPE_PE, nullptr, nullptr, nullptr, nullptr, where_what->what_pe); - sge_mirror_subscribe(evc, SGE_TYPE_CATEGORY, sge_process_category_event_before, nullptr, nullptr, nullptr, nullptr); + sge_mirror_subscribe(evc, SGE_TYPE_CATEGORY, nullptr, nullptr, nullptr, nullptr, nullptr); /* we do *not* subscribe reduced elements for TYPE_PETASK: * event master currently cannot handle this, see IZ 3216 diff --git a/source/daemons/qmaster/sge_sched_thread.h b/source/daemons/qmaster/sge_sched_thread.h index f0ce8a8c7..4ccf3bcc3 100644 --- a/source/daemons/qmaster/sge_sched_thread.h +++ b/source/daemons/qmaster/sge_sched_thread.h @@ -55,6 +55,7 @@ typedef struct { const lList *hgrp_list; /* HGRP_Type */ lList *rqs_list; /* RQS_Type */ lList *ar_list; /* AR_Type */ + lList *category_list; /* CT_Type */ bool monitor_next_run; } scheduler_all_data_t; diff --git a/source/daemons/qmaster/sge_thread_scheduler.cc b/source/daemons/qmaster/sge_thread_scheduler.cc index b4caae4d8..549d90b95 100644 --- a/source/daemons/qmaster/sge_thread_scheduler.cc +++ b/source/daemons/qmaster/sge_thread_scheduler.cc @@ -632,7 +632,7 @@ sge_scheduler_main(void *arg) { const lList *master_hgrp_list = *ocs::DataStore::get_master_list(SGE_TYPE_HGROUP); const lList *master_sharetree_list = *ocs::DataStore::get_master_list(SGE_TYPE_SHARETREE); const lList *master_config_list = *ocs::DataStore::get_master_list(SGE_TYPE_CONFIG); - const lList *master_catergory_list = *ocs::DataStore::get_master_list(SGE_TYPE_CATEGORY); + const lList *master_category_list = *ocs::DataStore::get_master_list(SGE_TYPE_CATEGORY); /* delay scheduling for test purposes, see issue GE-3306 */ if (SGE_TEST_DELAY_SCHEDULING > 0) { @@ -718,7 +718,6 @@ sge_scheduler_main(void *arg) { #if 0 sge_reset_job_category(); #endif - 
ocs::CategoryQmaster::reset_tmp_data(); // prepare data for the scheduler itself copy.host_list = lCopyList(nullptr, master_exechost_list); @@ -774,8 +773,12 @@ sge_scheduler_main(void *arg) { } } + copy.category_list = lCopyList(nullptr, master_category_list); copy.job_list = lCopyList(nullptr, master_job_list); + // store category reference in each job + ocs::CategoryQmaster::refresh_cat_data_all_jobs(copy.category_list, copy.job_list); + /* no need to copy these lists, they are read only used */ copy.centry_list = master_centry_list; copy.ckpt_list = master_ckpt_list; @@ -848,6 +851,10 @@ sge_scheduler_main(void *arg) { double prof_run = prof_get_measurement_wallclock(SGE_PROF_CUSTOM7, true, nullptr); PROF_START_MEASUREMENT(SGE_PROF_CUSTOM7); + // free category cache within the category list + ocs::CategoryQmaster::reset_tmp_data(copy.category_list); + lFreeList(&(copy.category_list)); + /* ... which gets deleted after using */ lFreeList(&(copy.host_list)); lFreeList(&(copy.queue_list)); @@ -900,7 +907,7 @@ sge_scheduler_main(void *arg) { PROFILING("PROF: schedd run took: %.3f s (init: %.3f s, copy: %.3f s, " "run:%.3f, free: %.3f s, jobs: " sge_uu32 ", categories: %d/%d)", prof_total, prof_init, prof_copy, prof_run, prof_free, - lGetNumberOfElem(master_job_list), lGetNumberOfElem(master_catergory_list), 0); + lGetNumberOfElem(master_job_list), lGetNumberOfElem(master_category_list), 0); } if (getenv("SGE_ND") != nullptr) { printf("--------------STOP-SCHEDULER-RUN-------------\n"); diff --git a/source/libs/sgeobj/cull/sge_job_JB_L.h b/source/libs/sgeobj/cull/sge_job_JB_L.h index 687e0ff34..00913f5d7 100644 --- a/source/libs/sgeobj/cull/sge_job_JB_L.h +++ b/source/libs/sgeobj/cull/sge_job_JB_L.h @@ -561,7 +561,7 @@ LISTDEF(JB_Type) SGE_LIST(JB_grp_list, ST_Type, CULL_SPOOL) SGE_LIST(JB_joker, VA_Type, CULL_SPOOL) SGE_ULONG(JB_sync_options, CULL_SPOOL) - SGE_ULONG(JB_category_id, CULL_HASH) + SGE_ULONG(JB_category_id, CULL_HASH | CULL_SPOOL) LISTEND NAMEDEF(JBN) 
diff --git a/source/libs/sgeobj/lwdb/ocs_JB_attributes.h b/source/libs/sgeobj/lwdb/ocs_JB_attributes.h index bb06f8bb0..c2924316a 100644 --- a/source/libs/sgeobj/lwdb/ocs_JB_attributes.h +++ b/source/libs/sgeobj/lwdb/ocs_JB_attributes.h @@ -293,7 +293,7 @@ constexpr const int JB_Type[] = { {JB_grp_list, "JB_grp_list", AttributeStatic::LIST, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ {JB_joker, "JB_joker", AttributeStatic::LIST, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ {JB_sync_options, "JB_sync_options", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::NO_HASH, false, true}, \ - {JB_category_id, "JB_category_id", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::UNORDERED_UNIQUE, false, false} \ + {JB_category_id, "JB_category_id", AttributeStatic::UINT32, nullptr, AttributeStatic::NO_POS, AttributeStatic::UNORDERED_UNIQUE, false, true} \ } // end namespace diff --git a/source/libs/sgeobj/ocs_Category.cc b/source/libs/sgeobj/ocs_Category.cc index 984ff81c1..26248e649 100644 --- a/source/libs/sgeobj/ocs_Category.cc +++ b/source/libs/sgeobj/ocs_Category.cc @@ -131,10 +131,3 @@ ocs::Category::build_string(dstring *category_str, lListElem *job, DRETURN_VOID; } - -lListElem * -ocs::Category::create_new(lListElem *job) { - lListElem *category = lCreateElem(CT_Type); - lSetUlong64(category, CT_id, get_next_id()); - return category; -} \ No newline at end of file diff --git a/source/libs/sgeobj/ocs_Category.h b/source/libs/sgeobj/ocs_Category.h index a5ceb19bb..d4b2df87a 100644 --- a/source/libs/sgeobj/ocs_Category.h +++ b/source/libs/sgeobj/ocs_Category.h @@ -28,8 +28,14 @@ namespace ocs { static u_long32 next_id; public: - static u_long32 get_next_id() { - return next_id++; + static u_long32 get_next_id(lList *master_category_list) { + u_long32 id; + + // do not use 0 as ID or IDs that already have been used + do { + id = next_id++; + } while (id == 0 
|| lGetElemUlong(master_category_list, CT_id, id) != nullptr); + return id; } static void build_string(dstring *category_str, lListElem *job, const lList *acl_list, const lList *prj_list, const lList *rqs_list); static lListElem *create_new(lListElem *job); diff --git a/source/libs/uti/sge_log.h b/source/libs/uti/sge_log.h index b4eb5238f..f278f9b05 100644 --- a/source/libs/uti/sge_log.h +++ b/source/libs/uti/sge_log.h @@ -33,6 +33,7 @@ ************************************************************************/ /*___INFO__MARK_END__*/ +#include #include #include @@ -330,6 +331,6 @@ sge_log(u_long32 log_level, const char *msg, const char *file, int line); # define SGE_ASSERT(x) \ if (!(x)) { \ sge_log(LOG_CRIT, MSG_UNREC_ERROR,__FILE__,__LINE__); \ - abort(); \ + assert(x); \ } \ void() diff --git a/source/libs/uti/sge_uidgid.cc b/source/libs/uti/sge_uidgid.cc index 2815b7f5c..c429d24ea 100644 --- a/source/libs/uti/sge_uidgid.cc +++ b/source/libs/uti/sge_uidgid.cc @@ -1347,6 +1347,8 @@ ocs_get_groups(const char *user, gid_t gid, int *amount, ocs_grp_elem_t **grp_ar DRETURN(false); } + DPRINTF("max_groups=%d\n", max_groups); + // allocate buffer for group IDs auto *grp_id_list = reinterpret_cast<gid_t *>(sge_malloc(max_groups * sizeof(gid_t))); if (grp_id_list == nullptr) { @@ -1366,6 +1368,9 @@ ocs_get_groups(const char *user, gid_t gid, int *amount, ocs_grp_elem_t **grp_ar sge_free(&grp_id_list); DRETURN(false); } + + DPRINTF("num_group_ids=%d\n", num_group_ids); + if (num_group_ids == 0) { // success case: user has no supplementary groups (this case probably does not exist) *amount = 0; diff --git a/test/daemons/common/test_common_category.cc b/test/daemons/common/test_common_category.cc index 0322f8d13..6f39150ea 100644 --- a/test/daemons/common/test_common_category.cc +++ b/test/daemons/common/test_common_category.cc @@ -118,7 +118,7 @@ static data_entry_t tests[] = { * result strings **/ static const char *result_category[] = { - nullptr, // 1 + "-", // 1 "-P my_pr", 
"-ckpt my_check", "-ckpt my_check -P my_pr", diff --git a/test/libs/uti/test_uti_uidgid.cc b/test/libs/uti/test_uti_uidgid.cc index dd820c6f1..953b84d4f 100644 --- a/test/libs/uti/test_uti_uidgid.cc +++ b/test/libs/uti/test_uti_uidgid.cc @@ -37,6 +37,8 @@ #include "uti/sge_uidgid.h" +#include + int check_get_buffer_size() { int ret = EXIT_SUCCESS; int size; @@ -98,6 +100,7 @@ int check_supplementary_groups() { } int main(int argc, char *argv[]) { + DENTER_MAIN(TOP_LAYER, "test_uti_uidgid"); int ret = check_get_buffer_size(); if (ret == EXIT_SUCCESS) { ret = check_supplementary_groups(); From 896d1d74677a8d17cdfd857d2ed35c6e4d1bd2ef Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Fri, 18 Apr 2025 20:56:36 +0200 Subject: [PATCH 08/10] BF: CS-1190: Mirror does not process MOD and DEL events for categories correctly --- source/clients/qconf/ocs_qconf_parse.cc | 4 +- source/common/msg_common.h | 7 ++- source/daemons/qmaster/msg_qmaster.h | 3 +- source/daemons/qmaster/ocs_CategoryQmaster.cc | 5 +- source/daemons/qmaster/sge_c_gdi.cc | 3 +- source/libs/mir/sge_mirror.cc | 61 ++++++++++++++++--- source/libs/sgeobj/sge_object.cc | 1 + 7 files changed, 65 insertions(+), 19 deletions(-) diff --git a/source/clients/qconf/ocs_qconf_parse.cc b/source/clients/qconf/ocs_qconf_parse.cc index 09f91e9ce..60066eb5a 100644 --- a/source/clients/qconf/ocs_qconf_parse.cc +++ b/source/clients/qconf/ocs_qconf_parse.cc @@ -6626,7 +6626,7 @@ qconf_is_adminhost(const char *host) { // if host has no permission then exit if (!is_admin_host) { - fprintf(stderr, MSG_ANSWER_DENIEDHOSTXISNOADMINHOST_S, host); + fprintf(stderr,MSG_SGETEXT_NOADMINHOST_S, host); fprintf(stderr, "\n"); sge_exit(1); } @@ -6663,7 +6663,7 @@ qconf_is_manager_on_admin_host(const char *user, const char *host) { } // if host has no permission then exit if (!is_admin_host) { - fprintf(stderr, MSG_ANSWER_DENIEDHOSTXISNOADMINHOST_S, host); + fprintf(stderr, MSG_SGETEXT_NOADMINHOST_S, host); fprintf(stderr, "\n"); sge_exit(1); 
} diff --git a/source/common/msg_common.h b/source/common/msg_common.h index 7538384b0..67a9880d7 100644 --- a/source/common/msg_common.h +++ b/source/common/msg_common.h @@ -283,9 +283,10 @@ #define MSG_ANSWER_CONFIGUNCHANGED _MESSAGE(23204, _("configuration unchanged")) #define MSG_ANSWER_ERRORREADINGTEMPFILE _MESSAGE(23205, _("error reading temp file")) #define MSG_ANSWER_ERRORREADINGCONFIGFROMFILEX_S _MESSAGE(23206, _("error reading configuration from file " SFN)) -#define MSG_ANSWER_DENIEDHOSTXISNOADMINHOST_S _MESSAGE(23207, _("denied: host " SFQ " is no admin host")) -#define MSG_TREE_UNABLETOLACATEXINSHARETREE_S _MESSAGE(23208, _("Unable to locate " SFN " in sharetree")) -#define MSG_OBJ_NOSTREEELEM _MESSAGE(23209, _("no sharetree element")) +#define MSG_SGETEXT_NOADMINHOST_S _MESSAGE(23207, _("denied: host " SFQ " is no admin host")) +#define MSG_SGETEXT_NOSUBMITHOST_S _MESSAGE(23208, _("denied: host " SFQ " is no submit host")) +#define MSG_TREE_UNABLETOLACATEXINSHARETREE_S _MESSAGE(23209, _("Unable to locate " SFN " in sharetree")) +#define MSG_OBJ_NOSTREEELEM _MESSAGE(23210, _("no sharetree element")) #define MSG_STREE_NOVALIDNODEREF_U _MESSAGE(23222, _("found reference to node " sge_uu32 " but no specification")) diff --git a/source/daemons/qmaster/msg_qmaster.h b/source/daemons/qmaster/msg_qmaster.h index 686a57da7..9d2dbbdbd 100644 --- a/source/daemons/qmaster/msg_qmaster.h +++ b/source/daemons/qmaster/msg_qmaster.h @@ -149,8 +149,7 @@ #define MSG_SGETEXT_UNKNOWNOP _MESSAGE(33122, _("unknown operation")) #define MSG_SGETEXT_OPNOIMPFORTARGET_S _MESSAGE(33125, _("operation not implemented for target in " SFN)) -#define MSG_SGETEXT_NOADMINHOST_S _MESSAGE(33126, _("denied: host " SFQ " is no admin host")) -#define MSG_SGETEXT_NOSUBMITHOST_S _MESSAGE(33127, _("denied: host " SFQ " is no submit host")) + #define MSG_SGETEXT_NOSUBMITORADMINHOST_S _MESSAGE(33128, _("denied: host " SFQ " is neither submit nor admin host")) #define MSG_SGETEXT_ALREADYEXISTS_SS 
_MESSAGE(33129, _("" SFN " " SFQ " already exists")) #define MSG_SGETEXT_JOBINFOMESSAGESOUTDATED _MESSAGE(33130, _("Can not get job info messages, scheduler is not available")) diff --git a/source/daemons/qmaster/ocs_CategoryQmaster.cc b/source/daemons/qmaster/ocs_CategoryQmaster.cc index d0b504d16..9842eff37 100644 --- a/source/daemons/qmaster/ocs_CategoryQmaster.cc +++ b/source/daemons/qmaster/ocs_CategoryQmaster.cc @@ -213,6 +213,7 @@ ocs::CategoryQmaster::detach_job(lList **master_category_list, lListElem *job, b // decrease the reference count or remove the category bool is_del = false; u_long32 refcount = lGetUlong(category, CT_refcount); + u_long32 category_id = lGetUlong(category, CT_id); if (refcount > 1) { lSetUlong(category, CT_refcount, refcount - 1); } else { @@ -222,7 +223,7 @@ ocs::CategoryQmaster::detach_job(lList **master_category_list, lListElem *job, b if (send_events) { ev_event category_event = is_del ? sgeE_CATEGORY_DEL : sgeE_CATEGORY_MOD; - sge_add_event(0, category_event, lGetUlong(job, JB_category_id), 0, + sge_add_event(0, category_event, category_id, 0, nullptr, nullptr, nullptr, category, gdi_session); } @@ -331,7 +332,7 @@ ocs::CategoryQmaster::refresh_cat_data_in_job(lList *master_category_list, lList u_long32 category_id = lGetUlong(job, JB_category_id); lListElem *category = lGetElemUlongRW(master_category_list, CT_id, category_id); - DPRINTF("###### category id: %lu (%p)\n", category_id, category); + DPRINTF("###### job / cat_id / ptr: " sge_uu32 " / " sge_uu32 "/ %p\n", lGetUlong(job, JB_job_number), category_id, category); lSetRef(job, JB_category, category); DRETURN_VOID; diff --git a/source/daemons/qmaster/sge_c_gdi.cc b/source/daemons/qmaster/sge_c_gdi.cc index 7ae667359..1c04892be 100644 --- a/source/daemons/qmaster/sge_c_gdi.cc +++ b/source/daemons/qmaster/sge_c_gdi.cc @@ -1469,8 +1469,7 @@ sge_task_check_get_perm_host(ocs::gdi::Packet *packet, ocs::gdi::Task *task) { DENTER(TOP_LAYER); // only external requests need to be 
checked - if (packet->is_intern_request) { - + if (!packet->is_intern_request) { const lList *master_admin_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_ADMINHOST); bool is_admin_host = host_list_locate(master_admin_host_list, packet->host) != nullptr ? true : false; const lList *master_submit_host_list = *ocs::DataStore::get_master_list(SGE_TYPE_SUBMITHOST); diff --git a/source/libs/mir/sge_mirror.cc b/source/libs/mir/sge_mirror.cc index ce5c1870b..823d7e09a 100644 --- a/source/libs/mir/sge_mirror.cc +++ b/source/libs/mir/sge_mirror.cc @@ -118,13 +118,19 @@ generic_update_master_list(sge_evc_class_t *evc, void *client_data); static sge_callback_result -ar_update_master_list(sge_evc_class_t *evc, sge_object_type type, - sge_event_action action, lListElem *event, void *client_data); +ar_update_master_list(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *client_data); static -sge_mirror_error sge_mirror_update_master_list_ar_key(lList **list, const lDescr *list_descr, - int key_nm, u_long32 key, - sge_event_action action, lListElem *event); +sge_mirror_error +sge_mirror_update_master_list_ar_key(lList **list, const lDescr *list_descr, + int key_nm, u_long32 key, sge_event_action action, lListElem *event); + +static sge_callback_result +cat_update_master_list(sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, void *client_data); + +static sge_mirror_error +sge_mirror_update_master_list_cat_key(lList **list, const lDescr *list_descr, + int key_nm, u_long32 key, sge_event_action action, lListElem *event); /* * One entry per event type, this is the basic definition. 
@@ -162,8 +168,8 @@ static const mirror_description dev_mirror_base[SGE_TYPE_ALL] = { { nullptr, generic_update_master_list, nullptr, nullptr }, /*suser*/ { nullptr, generic_update_master_list, nullptr, nullptr }, /*rqs*/ { nullptr, ar_update_master_list, nullptr, nullptr }, /*advance reservation*/ - { nullptr, nullptr, nullptr, nullptr }, /*jobscripts*/ - { nullptr, generic_update_master_list, nullptr, nullptr }, // sgeE_CATEGORY_LIST + { nullptr, nullptr, nullptr, nullptr }, /*jobscripts*/ + { nullptr, cat_update_master_list, nullptr, nullptr }, // sgeE_CATEGORY_LIST }; /*-------------------------*/ @@ -1765,7 +1771,6 @@ sge_mirror_update_master_list(lList **list, const lDescr *list_descr, lListElem DRETURN(SGE_EM_OK); } -static sge_callback_result /****** sge_mirror/ar_update_master_list() ************************************* * NAME * ar_update_master_list() -- update the master advance reservation list @@ -1793,6 +1798,7 @@ static sge_callback_result * NOTES * MT-NOTE: ar_update_master_list() is not MT safe *******************************************************************************/ +static sge_callback_result ar_update_master_list([[maybe_unused]] sge_evc_class_t *evc, sge_object_type type, sge_event_action action, lListElem *event, [[maybe_unused]] void *client_data) { @@ -1808,6 +1814,7 @@ ar_update_master_list([[maybe_unused]] sge_evc_class_t *evc, sge_object_type typ DRETURN(SGE_EMA_OK); } + /****** sge_mirror/sge_mirror_update_master_list_ar_key() ********************** * NAME * sge_mirror_update_master_list_ar_key() -- updates the advance reservation @@ -1858,3 +1865,41 @@ static sge_mirror_error sge_mirror_update_master_list_ar_key(lList **list, const DRETURN(ret); } + +static sge_callback_result +cat_update_master_list([[maybe_unused]] sge_evc_class_t *evc, sge_object_type type, + sge_event_action action, lListElem *event, [[maybe_unused]] void *client_data) +{ + DENTER(TOP_LAYER); + lList **list = ocs::DataStore::get_master_list_rw(type); + const 
lDescr *list_descr = lGetListDescr(lGetList(event, ET_new_version)); + int key_nm = object_type_get_key_nm(type); + u_long32 key = lGetUlong(event, ET_intkey); + + if (sge_mirror_update_master_list_cat_key(list, list_descr, key_nm, key, action, event) != SGE_EM_OK) { + DRETURN(SGE_EMA_FAILURE); + } + DRETURN(SGE_EMA_OK); +} + +static sge_mirror_error +sge_mirror_update_master_list_cat_key(lList **list, const lDescr *list_descr, + int key_nm, u_long32 key, sge_event_action action, lListElem *event) +{ + DENTER(TOP_LAYER); + + sge_mirror_error ret; + if (list != nullptr) { + lListElem *ep = nullptr; + if (key > 0) { + ep = lGetElemUlongRW(*list, key_nm, key); + } + + DSTRING_STATIC(dstr, 32); + ret = sge_mirror_update_master_list(list, list_descr, ep, sge_dstring_sprintf(&dstr, sge_uu32, key), action, event); + } else { + ret = SGE_EM_NOT_INITIALIZED; + } + + DRETURN(ret); +} diff --git a/source/libs/sgeobj/sge_object.cc b/source/libs/sgeobj/sge_object.cc index bda1a2adc..68a77363d 100644 --- a/source/libs/sgeobj/sge_object.cc +++ b/source/libs/sgeobj/sge_object.cc @@ -105,6 +105,7 @@ static object_description object_base[SGE_TYPE_ALL] = { {"RQS", RQS_Type, RQS_name}, {"AR", AR_Type, AR_id}, {"JOBSCRIPT", STU_Type, STU_name}, + {"CATEGORY", CT_Type, CT_id}, }; From b49e31d653b3511c9fa06e079ad532e8407c8336 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Fri, 18 Apr 2025 21:12:00 +0200 Subject: [PATCH 09/10] BF: CS-209: Implement client functionality to show details about categories --- source/clients/common/ocs_client_job.cc | 3 +++ source/clients/qstat/ocs_qstat.cc | 4 ++-- source/libs/spool/flatfile/sge_flatfile.cc | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/source/clients/common/ocs_client_job.cc b/source/clients/common/ocs_client_job.cc index 0dfc3b921..f68a717bb 100644 --- a/source/clients/common/ocs_client_job.cc +++ b/source/clients/common/ocs_client_job.cc @@ -92,6 +92,9 @@ void cull_show_job(const lListElem *job, int flags, bool 
show_binding) { printf("job_number: %s\n", MSG_JOB_UNASSIGNED); } + if (lGetPosViaElem(job, JB_category_id, SGE_NO_ABORT) >= 0) + printf("category_id: " sge_uu32 "\n", lGetUlong(job, JB_category_id)); + if (lGetPosViaElem(job, JB_exec_file, SGE_NO_ABORT) >= 0) if (lGetString(job, JB_exec_file)) printf("exec_file: %s\n", lGetString(job, JB_exec_file)); diff --git a/source/clients/qstat/ocs_qstat.cc b/source/clients/qstat/ocs_qstat.cc index d5a54678d..54a140f83 100644 --- a/source/clients/qstat/ocs_qstat.cc +++ b/source/clients/qstat/ocs_qstat.cc @@ -2041,7 +2041,7 @@ qstat_show_job(lList *jid_list, u_long32 isXML, qstat_env_t *qstat_env) { } } what = lWhat("%T(%I%I%I%I%I%I%I%I%I%I%I%I%I%I%I%I%I->%T%I%I%I->%T%I%I%I%I->%T(%I%I%I%I%I%I%I)" - "%I%I%I%I->%T(%I)%I->%T(%I)%I%I%I%I%I%I%I%I%I%I%I%I%I%I->%T%I%I%I%I%I%I%I%I%I%I%I%I%I%I->%T%I)", + "%I%I%I%I->%T(%I)%I->%T(%I)%I%I%I%I%I%I%I%I%I%I%I%I%I%I->%T%I%I%I%I%I%I%I%I%I%I%I%I%I%I->%T%I%I)", JB_Type, JB_job_number, JB_ar, JB_exec_file, JB_submission_time, JB_submission_command_line, JB_owner, JB_uid, JB_group, JB_gid, JB_account, JB_merge_stderr, JB_mail_list, JB_project, JB_department, JB_notify, JB_job_name, @@ -2071,7 +2071,7 @@ qstat_show_job(lList *jid_list, u_long32 isXML, qstat_env_t *qstat_env) { JB_verify_suitable_queues, JB_soft_wallclock_gmt, JB_hard_wallclock_gmt, JB_override_tickets, JB_version, JB_ja_structure, JB_type, JB_binding, JB_ja_task_concurrency, JB_pty, - JB_grp_list, RN_Type, JB_sync_options); + JB_grp_list, RN_Type, JB_sync_options, JB_category_id); /* get job list */ alp = ocs::gdi::Client::sge_gdi(ocs::gdi::Target::TargetValue::SGE_JB_LIST, ocs::gdi::Command::SGE_GDI_GET, ocs::gdi::SubCommand::SGE_GDI_SUB_NONE, &jlp, where, what); lFreeWhere(&where); diff --git a/source/libs/spool/flatfile/sge_flatfile.cc b/source/libs/spool/flatfile/sge_flatfile.cc index d75b9541d..eca3deca9 100644 --- a/source/libs/spool/flatfile/sge_flatfile.cc +++ b/source/libs/spool/flatfile/sge_flatfile.cc @@ -322,7 +322,7 
@@ const spool_flatfile_instr qconf_cat_list_sfi = nullptr, false, true, - true, + false, false, true, false, From 968e6d7b436786b60a3cc7887e4316d54381b1b1 Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Fri, 18 Apr 2025 23:28:20 +0200 Subject: [PATCH 10/10] EH: CS-210: Document qconf -scat and -scatl --- doc/markdown/man/man1/qconf.md | 6 ++++ doc/markdown/man/man1/sge_types.md | 7 ++++ doc/markdown/man/man5/CMakeLists.txt | 3 +- doc/markdown/man/man5/sge_category.md | 47 +++++++++++++++++++++++++++ source/common/msg_common.h | 7 ++++ source/common/sge_options.cc | 8 +++-- source/common/sge_options.h | 6 ++-- source/common/usage.cc | 14 ++++++-- source/common/usage.h | 1 + 9 files changed, 92 insertions(+), 7 deletions(-) create mode 100644 doc/markdown/man/man5/sge_category.md diff --git a/doc/markdown/man/man1/qconf.md b/doc/markdown/man/man1/qconf.md index 6b67bd982..9c43be74e 100644 --- a/doc/markdown/man/man1/qconf.md +++ b/doc/markdown/man/man1/qconf.md @@ -566,6 +566,12 @@ Display the configuration of the specified calendar. ## -scall Show a list of all calendars currently defined. +## -scat *cat_id* +Display characteristics of the given category ID *cat_id*. IDs of existing categories can be obtained from the output of `-scatl`. + +## -scatl +Show a list of all categories currently defined. (see xxqs_name_sxx_category(5)) + ## -sckpt *ckpt_name* Display the configuration of the specified checkpointing environment. diff --git a/doc/markdown/man/man1/sge_types.md b/doc/markdown/man/man1/sge_types.md index f909bb2f9..bda987d67 100644 --- a/doc/markdown/man/man1/sge_types.md +++ b/doc/markdown/man/man1/sge_types.md @@ -25,6 +25,13 @@ A calendar name is the name of a xxQS_NAMExx calendar described in xxqs_name_sxx calendar_name := object_name +## *cat_id* + +A category id is a unique identifier for a xxQS_NAMExx category. The id is assigned by the system and identifies +the category as long as it exists. 
Category ids of deleted categories may be reused later for other new categories. + + cat_id := 32_bit_integer + ## *ckpt_name* A *ckpt_name* is the name of a xxQS_NAMExx checkpointing interface described in xxqs_name_sxx_checkpoint(5). diff --git a/doc/markdown/man/man5/CMakeLists.txt b/doc/markdown/man/man5/CMakeLists.txt index ac2336039..5a2e2ea30 100644 --- a/doc/markdown/man/man5/CMakeLists.txt +++ b/doc/markdown/man/man5/CMakeLists.txt @@ -22,7 +22,7 @@ # build all man pages from section 5 set(PAGES sge_access_list sge_accounting sge_aliases sge_bootstrap sge_calendar_conf sge_checkpoint sge_complex - sge_conf sge_host_aliases sge_host_conf sge_hostgroup sge_monitoring + sge_category sge_conf sge_host_aliases sge_host_conf sge_hostgroup sge_monitoring sge_pe sge_priority sge_project sge_qhost sge_qstat sge_qselect sge_qtask sge_queue_conf sge_reporting sge_request sge_resource_quota sge_sched_conf sge_share_tree sge_user) build_markdown_man("5" PAGES "0") @@ -34,6 +34,7 @@ add_custom_target(troffman5 ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/sge_accounting.5 ${CMAKE_CURRENT_BINARY_DIR}/sge_aliases.5 ${CMAKE_CURRENT_BINARY_DIR}/sge_bootstrap.5 + ${CMAKE_CURRENT_BINARY_DIR}/sge_category.5 ${CMAKE_CURRENT_BINARY_DIR}/sge_calendar_conf.5 ${CMAKE_CURRENT_BINARY_DIR}/sge_checkpoint.5 ${CMAKE_CURRENT_BINARY_DIR}/sge_complex.5 diff --git a/doc/markdown/man/man5/sge_category.md b/doc/markdown/man/man5/sge_category.md new file mode 100644 index 000000000..44171c68f --- /dev/null +++ b/doc/markdown/man/man5/sge_category.md @@ -0,0 +1,47 @@ +--- +title: sge_category +section: 5 +header: Reference Manual +footer: __RELEASE__ +date: __DATE__ +--- + +# NAME + +xxqs_name_sxx_category - xxQS_NAMExx category format + +# DESCRIPTION + +Categories are automatically managed objects in xxQS_NAMExx. They group jobs that have the same scheduling characteristics. Categories are created by the system and are also automatically adjusted as required. 
+ +Each job in the system belongs to exactly one category, but a category can be shared by several jobs. When the last job of a category leaves the system, the category is deleted. + +Categories are identified by a unique ID, and have a string appended to them, usually reflecting the submit switches used either when the job was submitted or when the job was modified. `qconf -scatl` will show a list of all the categories in the system. The list will be sorted by category ID. It will also show the number of jobs in the category and the category string. `qstat -j <job_id>` will show the category ID of a job. + +Categories are used in the Job Scheduler to group jobs which have the same scheduling characteristics. If a job of a particular category cannot be scheduled, then no other job of that category can be started either. Categories allow the Job Scheduler to find an early cut-off point in the scheduling algorithm to reduce the scheduling overhead. + +Job Submission Verifiers can be used to reduce the number of categories in the system by adjusting the submit switches of submitted jobs. To achieve this, the characteristics of these submitted jobs can be made consistent. For example, storage requests can be rounded up to the nearest multiple of 100MB. This reduces the number of categories in the system and therefore the scheduling overhead. + +# FORMAT + +## id + +Unique category ID. The ID is assigned by the system and identifies the category as long as it exists. Category IDs of deleted categories may be reused later for other new categories. + +## rcount + +Reference count. The number of jobs that are currently assigned to this category. The reference count is incremented when a job is assigned to the category and decremented when the job leaves the system or is modified to belong to a different category. + +## str + +The category string.
The string is assigned by the system and reflects the submit switches or other characteristics of a job that were present either when the job was submitted or when the job was modified. At any point in time one category string is unique in the system. + +Please note that the category strings may be different for different versions of xxQS_NAMExx. They also depend on other configuration parameters such as Resource Quota Sets, Access Lists or Projects. + +# SEE ALSO + +xxqs_name_sxx_intro(1), qconf(1), qsub(1), xxqs_name_sxx_jsv(1), xxqs_name_sxx_qmaster(8) + +# COPYRIGHT + +See xxqs_name_sxx_intro(1) for a full statement of rights and permissions. diff --git a/source/common/msg_common.h b/source/common/msg_common.h index 67a9880d7..6a0e34263 100644 --- a/source/common/msg_common.h +++ b/source/common/msg_common.h @@ -50,6 +50,7 @@ #define MSG_TABLE_EV_POOL "POOL" #define MSG_TABLE_SIZE "SIZE" +#define MSG_GDI_ARGUMENTSYNTAX_QA_CATEGORY_ID "cat_id category ID" #define MSG_GDI_ARGUMENTSYNTAX_OA_ACCOUNT_STRING "account_string account_name" #define MSG_GDI_ARGUMENTSYNTAX_QA_BINDING_STRATEGY_EXP "exp explicit:,[:...]" #define MSG_GDI_ARGUMENTSYNTAX_QA_BINDING_STRATEGY_LIN "lin linear:[:,]" @@ -1063,4 +1064,10 @@ #define MSG_CONTACT_HPC_GRIDWARE _MESSAGE(60725, _("The functionality is available in the commercial version of the product.\nPlease contact HPC-Gridware GmbH (sales@hpc-gridware.com) for further information.")) +#define MSG_GDI_USAGE_scatl_OPT_CATEGORY_NAME "[-scatl]" +#define MSG_GDI_UTEXT_scatl_OPT_CATEGORY_NAME _MESSAGE(60726, _("show list of all categories" )) + +#define MSG_GDI_USAGE_scat_OPT_CATEGORY_NAME "[-scat cat_id]" +#define MSG_GDI_UTEXT_scat_OPT_CATEGORY_NAME _MESSAGE(60727, _("show category" )) + // clang-format on diff --git a/source/common/sge_options.cc b/source/common/sge_options.cc index 858309af5..d0b13a46a 100644 --- a/source/common/sge_options.cc +++ b/source/common/sge_options.cc @@ -510,8 +510,12 @@ unsigned short sge_options[][ALL_OPT + 
1] = /* stl_OPT show thread list */ {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, /* dept_OPT set department of job */ - {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1} - /* + {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1}, + /* scatl_OPT show category list */ +{0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + /* scat_OPT show category */ +{0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} + /* n q q q q q q q q q q q q q q e q q q q n A o a c d h m m r r s s r l s s x e r r r o L n l o e o a o e l e h s o t u e v s d s n L diff --git a/source/common/sge_options.h b/source/common/sge_options.h index 5505150ee..cd8d0a36d 100644 --- a/source/common/sge_options.h +++ b/source/common/sge_options.h @@ -295,8 +295,10 @@ enum { sce_OPT, /* show ce object */ scel_OPT,/* show ce object list */ - stl_OPT, //< show thread list - dept_OPT //< set department of job + stl_OPT, //< show thread list + dept_OPT, //< set department of job + scatl_OPT, //< show category list + scat_OPT, //< show category }; /* macros used in parsing */ diff --git a/source/common/usage.cc b/source/common/usage.cc index 36380c583..0cb707bcd 100644 --- a/source/common/usage.cc +++ b/source/common/usage.cc @@ -207,6 +207,8 @@ static const char* get_argument_syntax(u_long32 prog_number, int nr) return MSG_GDI_ARGUMENTSYNTAX_QA_BINDING_STRATEGY_LIN; case OA_BINDING_STRIDING: return MSG_GDI_ARGUMENTSYNTAX_QA_BINDING_STRATEGY_STR; + case OA_CATEGORY_ID: + return MSG_GDI_ARGUMENTSYNTAX_QA_CATEGORY_ID; default: break; } @@ -1058,14 +1060,22 @@ void sge_usage(u_long32 prog_number, FILE *fp) { } if (VALID_OPT(scal_OPT, prog_number)) { - PRINTITD(MSG_GDI_USAGE_scal_OPT_CALENDAR_NAME, - MSG_GDI_UTEXT_scal_OPT_CALENDAR_NAME); + PRINTITD(MSG_GDI_USAGE_scal_OPT_CALENDAR_NAME, MSG_GDI_UTEXT_scal_OPT_CALENDAR_NAME); } if (VALID_OPT(scall_OPT, prog_number)) { PRINTITD(MSG_GDI_USAGE_scall_OPT, MSG_GDI_UTEXT_scall_OPT); } + if 
(VALID_OPT(scat_OPT, prog_number)) { + PRINTITD(MSG_GDI_USAGE_scat_OPT_CATEGORY_NAME, MSG_GDI_UTEXT_scat_OPT_CATEGORY_NAME); + MARK(OA_CATEGORY_ID); + } + + if (VALID_OPT(scatl_OPT, prog_number)) { + PRINTITD(MSG_GDI_USAGE_scatl_OPT_CATEGORY_NAME, MSG_GDI_UTEXT_scatl_OPT_CATEGORY_NAME); + } + if (VALID_OPT(sckpt_OPT, prog_number)) { PRINTITD(MSG_GDI_USAGE_sckpt_OPT_CKPT_NAME, MSG_GDI_UTEXT_sckpt_OPT_CKPT_NAME); diff --git a/source/common/usage.h b/source/common/usage.h index d55cce888..ce47192a0 100644 --- a/source/common/usage.h +++ b/source/common/usage.h @@ -48,6 +48,7 @@ */ enum { OA_ACCOUNT_STRING, + OA_CATEGORY_ID, OA_COMPLEX_LIST, OA_CONTEXT_LIST, OA_CKPT_SEL,