Skip to content

[Dashboard] debug with dashboard #172

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
972 changes: 972 additions & 0 deletions examples/debug_with_dashboard.ipynb

Large diffs are not rendered by default.

Binary file added examples/img/example-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/example-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/example-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/infeasible-task.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/local-memory-usage.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/node-memory-usage.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/img/profiling.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
625 changes: 598 additions & 27 deletions examples/news_recommendation_model.ipynb

Large diffs are not rendered by default.

135 changes: 106 additions & 29 deletions examples/sharded_parameter_server.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,61 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: RAY_DASHBOARD_DEBUG=True\n"
]
}
],
"source": [
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"\n",
"import numpy as np\n",
"import ray\n",
"import time"
"import time\n",
"\n",
"%env RAY_DASHBOARD_DEBUG = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-01-24 14:32:55,735\tINFO resource_spec.py:212 -- Starting Ray with 2.69 GiB memory available for workers and up to 1.36 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n",
"2020-01-24 14:32:56,852\tINFO services.py:501 -- Failed to connect to the redis server, retrying.\n",
"2020-01-24 14:32:57,123\tINFO services.py:1093 -- View the Ray dashboard at \u001b[1m\u001b[32mlocalhost:8268\u001b[39m\u001b[22m\n"
]
},
{
"data": {
"text/plain": [
"{'node_ip_address': '192.168.1.27',\n",
" 'redis_address': '192.168.1.27:16149',\n",
" 'object_store_address': '/tmp/ray/session_2020-01-24_14-32-55_727080_11529/sockets/plasma_store',\n",
" 'raylet_socket_name': '/tmp/ray/session_2020-01-24_14-32-55_727080_11529/sockets/raylet',\n",
" 'webui_url': 'localhost:8268',\n",
" 'session_dir': '/tmp/ray/session_2020-01-24_14-32-55_727080_11529'}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ray.init(num_cpus=30, include_webui=False, ignore_reinit_error=True)"
"ray.init(num_cpus=30, include_webui=True, ignore_reinit_error=True)"
]
},
{
Expand All @@ -57,12 +92,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"dim = 10\n",
"\n",
"@ray.remote\n",
"class ParameterServer(object):\n",
" def __init__(self, dim):\n",
" self.parameters = np.zeros(dim)\n",
Expand All @@ -74,7 +110,7 @@
" self.parameters += update\n",
"\n",
"\n",
"ps = ParameterServer(dim)\n",
"ps = ParameterServer.remote(dim)\n",
"\n",
"assert hasattr(ParameterServer, 'remote'), ('You need to turn ParameterServer into an '\n",
" 'actor (by using the ray.remote keyword).')"
Expand All @@ -91,18 +127,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"def worker(ps, dim, num_iters):\n",
" for _ in range(num_iters):\n",
" # Get the latest parameters.\n",
" parameters = ps.get_parameters()\n",
" parameters = ray.get(ps.get_parameters.remote())\n",
" # Compute an update.\n",
" update = 1e-3 * parameters + np.ones(dim)\n",
" # Update the parameters.\n",
" ps.update_parameters(update)\n",
" ray.get(ps.update_parameters.remote(update))\n",
" # Sleep a little to simulate a real workload.\n",
" time.sleep(0.5)\n",
"\n",
Expand All @@ -112,12 +149,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Start two workers.\n",
"worker_results = [worker(ps, dim, 100) for _ in range(2)]"
"worker_results = [worker.remote(ps, dim, 100) for _ in range(2)]"
]
},
{
Expand All @@ -131,9 +168,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[41.81828509 41.81828509 41.81828509 41.81828509 41.81828509 41.81828509\n",
" 41.81828509 41.81828509 41.81828509 41.81828509]\n"
]
}
],
"source": [
"print(ray.get(ps.get_parameters.remote()))"
]
Expand All @@ -155,10 +201,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"class ParameterServerShard(object):\n",
" def __init__(self, sharded_dim):\n",
" self.parameters = np.zeros(sharded_dim)\n",
Expand All @@ -178,7 +225,7 @@
" 'perfectly divide the total dimension.')\n",
"\n",
"# Start some parameter servers.\n",
"ps_shards = [ParameterServerShard(total_dim // num_shards) for _ in range(num_shards)]\n",
"ps_shards = [ParameterServerShard.remote(total_dim // num_shards) for _ in range(num_shards)]\n",
"\n",
"assert hasattr(ParameterServerShard, 'remote'), ('You need to turn ParameterServerShard into an '\n",
" 'actor (by using the ray.remote keyword).')"
Expand All @@ -200,17 +247,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"@ray.remote\n",
"def worker_task(total_dim, num_iters, *ps_shards):\n",
" # Note that ps_shards are passed in using Python's variable number\n",
" # of arguments feature. We do this because currently actor handles\n",
" # cannot be passed to tasks inside of lists or other objects.\n",
" for _ in range(num_iters):\n",
" # Get the current parameters from each parameter server.\n",
" parameter_shards = [ps.get_parameters() for ps in ps_shards]\n",
" parameter_shards = ray.get([ps.get_parameters.remote() for ps in ps_shards])\n",
" assert all([isinstance(shard, np.ndarray) for shard in parameter_shards]), (\n",
" 'The parameter shards must be numpy arrays. Did you forget to call ray.get?')\n",
" # Concatenate them to form the full parameter vector.\n",
Expand All @@ -224,7 +272,7 @@
" \n",
" # Apply the updates to the relevant parameter server shards.\n",
" for ps, update_shard in zip(ps_shards, update_shards):\n",
" ps.update_parameters(update_shard)\n",
" ray.get(ps.update_parameters.remote(update_shard))\n",
"\n",
"\n",
"# Test that worker_task is implemented correctly. You do not need to change this line.\n",
Expand All @@ -242,18 +290,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This took 1.3368570804595947 seconds.\n",
"This took 1.6423673629760742 seconds.\n",
"This took 3.1816182136535645 seconds.\n"
]
},
{
"ename": "RayTaskError",
"evalue": "\u001b[36mray::__main__.worker_task()\u001b[39m (pid=11559, ip=192.168.1.27)\n File \"python/ray/_raylet.pyx\", line 647, in ray._raylet.execute_task\n File \"<ipython-input-8-fe1d9fc6a19b>\", line 22, in worker_task\nray.exceptions.RayTaskError: \u001b[36mray::ParameterServerShard\u001b[39m (pid=11551, ip=192.168.1.27)\n File \"python/ray/_raylet.pyx\", line 633, in ray._raylet.execute_task\n File \"python/ray/_raylet.pyx\", line 634, in ray._raylet.execute_task\n File \"python/ray/_raylet.pyx\", line 519, in ray._raylet.deserialize_args\nray.exceptions.UnreconstructableError: Object 2ca53902e2591031ffffffff0100008004000000 is lost (either LRU evicted or deleted by user) and cannot be reconstructed. Try increasing the object store memory available with ray.init(object_store_memory=<bytes>) or setting object store limits with ray.remote(object_store_memory=<bytes>). See also: https://ray.readthedocs.io/en/latest/memory-management.html",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRayTaskError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-f71151eb2314>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# duration changes.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mworker_task\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal_dim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mps_shards\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_workers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'This took {} seconds.'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/ray/python/ray/worker.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(object_ids, timeout)\u001b[0m\n\u001b[1;32m 1490\u001b[0m \u001b[0mworker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore_worker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdump_object_store_memory_usage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1491\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRayTaskError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1492\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_instanceof_cause\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1493\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1494\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRayTaskError\u001b[0m: \u001b[36mray::__main__.worker_task()\u001b[39m (pid=11559, ip=192.168.1.27)\n File \"python/ray/_raylet.pyx\", line 647, in ray._raylet.execute_task\n File \"<ipython-input-8-fe1d9fc6a19b>\", line 22, in worker_task\nray.exceptions.RayTaskError: \u001b[36mray::ParameterServerShard\u001b[39m (pid=11551, ip=192.168.1.27)\n File \"python/ray/_raylet.pyx\", line 633, in ray._raylet.execute_task\n File \"python/ray/_raylet.pyx\", line 634, in ray._raylet.execute_task\n File \"python/ray/_raylet.pyx\", line 519, in ray._raylet.deserialize_args\nray.exceptions.UnreconstructableError: Object 2ca53902e2591031ffffffff0100008004000000 is lost (either LRU evicted or deleted by user) and cannot be reconstructed. Try increasing the object store memory available with ray.init(object_store_memory=<bytes>) or setting object store limits with ray.remote(object_store_memory=<bytes>). See also: https://ray.readthedocs.io/en/latest/memory-management.html"
]
}
],
"source": [
"num_workers = 4\n",
"for num_workers in [1, 2, 4, 8]:\n",
"\n",
"# Start some workers. Try changing various quantities and see how the\n",
"# duration changes.\n",
"start = time.time()\n",
"ray.get([worker_task(total_dim, 5, *ps_shards) for _ in range(num_workers)])\n",
"print('This took {} seconds.'.format(time.time() - start))"
" # Start some workers. Try changing various quantities and see how the\n",
" # duration changes.\n",
" start = time.time()\n",
" ray.get([worker_task.remote(total_dim, 5, *ps_shards) for _ in range(num_workers)])\n",
" print('This took {} seconds.'.format(time.time() - start))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -272,7 +349,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.7.4"
}
},
"nbformat": 4,
Expand Down
Loading