Skip to content

Commit fbba10c

Browse files
Neural Solution Resource Management (#1060)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com> Co-authored-by: Yi30 <106061964+yiliu30@users.noreply.github.com>
1 parent 1b5337e commit fbba10c

File tree

6 files changed

+280
-16
lines changed

6 files changed

+280
-16
lines changed

.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,3 +2677,4 @@ jJA
26772677
wWLes
26782678
xHKe
26792679
PR
2680+
hostname

neural_solution/backend/cluster.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,54 @@ def free_resource(self, reserved_resource_lst):
8484
"""
8585
self.cursor.execute(sql, (free_resources[node_id], free_resources[node_id], node_id))
8686
self.conn.commit()
87+
# delete nodes with status of remove, some version without RETURNING syntax
88+
self.cursor.execute("SELECT id FROM cluster WHERE status='remove' AND busy_sockets=0")
89+
deleted_ids = self.cursor.fetchall()
90+
deleted_ids = [str(id_tuple[0]) for id_tuple in deleted_ids]
91+
self.cursor.execute("DELETE FROM cluster WHERE status='remove' AND busy_sockets=0")
92+
self.conn.commit()
93+
94+
# remove deleted nodes from socket queue
95+
socket_queue_delete_ids = [socket for socket in self.socket_queue if socket.split()[0] in deleted_ids]
96+
if len(socket_queue_delete_ids) > 0:
97+
logger.info(f"[Cluster] remove node-list {socket_queue_delete_ids} from socket_queue: {self.socket_queue}")
98+
self.socket_queue = [socket for socket in self.socket_queue if socket.split()[0] not in deleted_ids]
8799
logger.info(f"[Cluster] free resource {reserved_resource_lst}, now have free resource {self.socket_queue}")
88100

89101
@synchronized
90102
def get_free_socket(self, num_sockets: int) -> List[str]:
91103
"""Get the free sockets list."""
92104
booked_socket_lst = []
105+
106+
# detect and append new resource
107+
self.cursor.execute(f"SELECT id, name, total_sockets FROM cluster where status = 'join'")
108+
new_node_lst = self.cursor.fetchall()
109+
for index, name, total_sockets in new_node_lst:
110+
sql = """
111+
UPDATE cluster
112+
SET status = ?
113+
WHERE id = ?
114+
"""
115+
self.cursor.execute(sql, ('alive', index))
116+
self.conn.commit()
117+
self.socket_queue += [str(index) + " " + name] * total_sockets
118+
logger.info(f"[Cluster] add new node-id {index} to socket_queue: {self.socket_queue}")
119+
120+
# do not assign nodes with status of remove
121+
# remove to-delete nodes from socket queue
122+
self.cursor.execute("SELECT id FROM cluster WHERE status='remove'")
123+
deleted_ids = self.cursor.fetchall()
124+
deleted_ids = [str(id_tuple[0]) for id_tuple in deleted_ids]
125+
126+
socket_queue_delete_ids = [socket for socket in self.socket_queue if socket.split()[0] in deleted_ids]
127+
if len(socket_queue_delete_ids) > 0:
128+
logger.info(f"[Cluster] remove node-list {socket_queue_delete_ids} from socket_queue: {self.socket_queue}")
129+
self.socket_queue = [socket for socket in self.socket_queue if socket.split()[0] not in deleted_ids]
130+
131+
# delete nodes with status of remove
132+
self.cursor.execute("DELETE FROM cluster WHERE status='remove' AND busy_sockets=0")
133+
self.conn.commit()
134+
93135
if len(self.socket_queue) < num_sockets:
94136
logger.info(f"Can not allocate {num_sockets} sockets, due to only {len(self.socket_queue)} left.")
95137
return 0
@@ -111,6 +153,7 @@ def initial_cluster_from_node_lst(self, node_lst):
111153
self.cursor.execute("drop table if exists cluster ")
112154
self.cursor.execute(
113155
r"create table cluster(id INTEGER PRIMARY KEY AUTOINCREMENT,"
156+
+ "name varchar(100),"
114157
+ "node_info varchar(500),"
115158
+ "status varchar(100),"
116159
+ "free_sockets int,"
@@ -121,9 +164,9 @@ def initial_cluster_from_node_lst(self, node_lst):
121164
for index, node in enumerate(self.node_lst):
122165
self.socket_queue += [str(index + 1) + " " + node.name] * node.num_sockets
123166
self.cursor.execute(
124-
r"insert into cluster(node_info, status, free_sockets, busy_sockets, total_sockets)"
125-
+ "values ('{}', '{}', {}, {}, {})".format(
126-
repr(node).replace("Node", f"Node{index+1}"), "alive", node.num_sockets, 0, node.num_sockets
167+
r"insert into cluster(name, node_info, status, free_sockets, busy_sockets, total_sockets)"
168+
+ "values ('{}', '{}', '{}', {}, {}, {})".format(
169+
node.name, repr(node).replace("Node", f"Node{index+1}"), "alive", node.num_sockets, 0, node.num_sockets
127170
)
128171
)
129172

neural_solution/docs/source/README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
- [Query task status](#query-task-status)
1111
- [Stop service](#stop-service)
1212
- [Inspect logs](#inspect-logs)
13+
- [Manage resource](#manage-resource)
14+
- [Node States](#node-states)
15+
- [Query cluster](#query-cluster)
16+
- [Add node](#add-node)
17+
- [Remove node](#remove-node)
1318

1419
## Install Neural Solution
1520
### Prerequisites
@@ -126,3 +131,40 @@ There are several logs under workspace:
126131

127132
```
128133

134+
## Manage resource
135+
Neural Solution supports cluster management for service maintainers, providing several command-line tools for efficient resource management.
136+
137+
### Node States
138+
139+
Each node in the cluster is in one of three states:
140+
141+
- Alive: Represents a node that is functioning properly and available to handle requests.
142+
- Join: Indicates that a node is in the process of being added to the cluster but has not fully joined yet.
143+
- Remove: Indicates that a node is scheduled to be removed from the cluster.
144+
145+
Below are some commonly used commands and their usage:
146+
147+
### Query cluster
148+
This command is used to query the current status of the cluster. No additional parameters are required; simply enter the following command:
149+
```shell
150+
neural_solution cluster --query
151+
```
152+
### Add node
153+
This command is used to add nodes to the cluster. You can either specify a host file or provide a list of nodes separated by ";". The node format consists of three parts: hostname, number_of_sockets, and cores_per_socket. Here's a breakdown of each part:
154+
155+
- hostname: This refers to the name or IP address of the node that you want to add to the cluster. It identifies the specific machine or server that will be part of the cluster.
156+
157+
- number_of_sockets: This indicates the number of physical CPU sockets available on the node. A socket is a physical component that houses one or more CPU cores. It represents a physical processor unit.
158+
159+
- cores_per_socket: This specifies the number of CPU cores present in each socket. A core is an individual processing unit within a CPU.
160+
161+
For example:
162+
```shell
163+
neural_solution cluster --join "host1 2 20; host2 4 20"
164+
```
165+
### Remove node
166+
This command is used to remove nodes from the cluster based on the IDs obtained from the query. The IDs can be passed as a parameter to the command. For example:
167+
```shell
168+
neural_solution cluster --remove <query_id>
169+
```
170+
Please note that the above commands are just examples and may require additional parameters or configurations based on your specific setup.

neural_solution/examples/custom_models_optimized/tf_example1/README.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ In this example, we show how to quantize a [custom model](https://github.com/int
88
- Demonstrate how to start the Neural Solution Service.
99
- Demonstrate how to prepare an optimization task request and submit it to Neural Solution Service.
1010
- Demonstrate how to query the status of the task and fetch the optimization result.
11+
- Demonstrate how to query and manage the resource of the cluster.
1112

1213
### Requirements
1314
Customizing the model requires preparing the following folders and files.
@@ -48,12 +49,12 @@ neural_solution -h
4849

4950
usage: neural_solution {start,stop,cluster} [-h] [--hostfile HOSTFILE] [--restful_api_port RESTFUL_API_PORT] [--grpc_api_port GRPC_API_PORT]
5051
[--result_monitor_port RESULT_MONITOR_PORT] [--task_monitor_port TASK_MONITOR_PORT] [--api_type API_TYPE]
51-
[--workspace WORKSPACE] [--conda_env CONDA_ENV] [--upload_path UPLOAD_PATH]
52+
[--workspace WORKSPACE] [--conda_env CONDA_ENV] [--upload_path UPLOAD_PATH] [--query] [--join JOIN] [--remove REMOVE]
5253

5354
Neural Solution
5455

5556
positional arguments:
56-
{start,stop} start/stop service
57+
{start,stop,cluster} start/stop/management service
5758

5859
optional arguments:
5960
-h, --help show this help message and exit
@@ -73,6 +74,9 @@ optional arguments:
7374
specify the running environment for the task
7475
--upload_path UPLOAD_PATH
7576
specify the file path for the tasks
77+
--query [cluster parameter] query cluster information
78+
--join JOIN [cluster parameter] add new node into cluster
79+
--remove REMOVE [cluster parameter] remove <node-id> from cluster
7680
```
7781

7882

@@ -155,6 +159,19 @@ When using distributed quantization, the `workers` needs to be set to greater th
155159
# download quantized_model.zip
156160
```
157161

162+
### Manage resource
163+
```shell
164+
# query cluster information
165+
neural_solution cluster --query
166+
167+
# add new node into cluster
168+
# parameter: "<node1> <number_of_sockets> <cores_per_socket>;<node2> <number_of_sockets> <cores_per_socket>"
169+
neural_solution cluster --join "host1 2 20; host2 5 20"
170+
171+
# remove node from cluster according to id
172+
neural_solution cluster --remove <node-id>
173+
```
174+
158175
### Stop the service
159176
```shell
160177
neural_solution stop

neural_solution/examples/hf_models/README.md

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ In this example, we show how to quantize a Hugging Face model with Neural Soluti
66
- Demonstrate how to start the Neural Solution Service.
77
- Demonstrate how to prepare an optimization task request and submit it to Neural Solution Service.
88
- Demonstrate how to query the status of the task and fetch the optimization result.
9+
- Demonstrate how to query and manage the resource of the cluster.
910

1011

1112
### Start the Neural Solution Service
@@ -27,14 +28,14 @@ neural_solution stop
2728
neural_solution -h
2829
# Help output
2930

30-
usage: neural_solution {start,stop} [-h] [--hostfile HOSTFILE] [--restful_api_port RESTFUL_API_PORT] [--grpc_api_port GRPC_API_PORT]
31+
usage: neural_solution {start,stop,cluster} [-h] [--hostfile HOSTFILE] [--restful_api_port RESTFUL_API_PORT] [--grpc_api_port GRPC_API_PORT]
3132
[--result_monitor_port RESULT_MONITOR_PORT] [--task_monitor_port TASK_MONITOR_PORT] [--api_type API_TYPE]
32-
[--workspace WORKSPACE] [--conda_env CONDA_ENV] [--upload_path UPLOAD_PATH]
33+
[--workspace WORKSPACE] [--conda_env CONDA_ENV] [--upload_path UPLOAD_PATH] [--query] [--join JOIN] [--remove REMOVE]
3334

3435
Neural Solution
3536

3637
positional arguments:
37-
{start,stop} start/stop service
38+
{start,stop,cluster} start/stop/management service
3839

3940
optional arguments:
4041
-h, --help show this help message and exit
@@ -54,6 +55,9 @@ optional arguments:
5455
specify the running environment for the task
5556
--upload_path UPLOAD_PATH
5657
specify the file path for the tasks
58+
--query [cluster parameter] query cluster information
59+
--join JOIN [cluster parameter] add new node into cluster
60+
--remove REMOVE [cluster parameter] remove <node-id> from cluster
5761
```
5862

5963

@@ -118,6 +122,19 @@ optional arguments:
118122
``` shell
119123
[user@server tf_example1]$ curl -X GET http://localhost:8000/download/{task_id} --output quantized_model.zip
120124
# download quantized_model.zip
125+
```
126+
### Manage resource
127+
```shell
128+
# query cluster information
129+
neural_solution cluster --query
130+
131+
# add new node into cluster
132+
# parameter: "<node1> <number_of_sockets> <cores_per_socket>;<node2> <number_of_sockets> <cores_per_socket>"
133+
neural_solution cluster --join "host1 2 20; host2 5 20"
134+
135+
# remove node from cluster according to id
136+
neural_solution cluster --remove <node-id>
137+
121138
```
122139
### Stop the service
123140
```shell

0 commit comments

Comments
 (0)