Python API hangs when run twice in distributed setting #1789
Closed
Description
I have a problem with the Python API in a distributed setting: when I run training twice, the second run hangs. I've attached a Python test and a Docker image to reproduce the behavior.
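In short, each worker process ends up doing roughly the following (a condensed sketch of the attached test, not a standalone repro; X_part / y_part are placeholders for that worker's half of the data, and the parameter values mirror _build_params below):

import lightgbm

# Simplified per-worker call pattern. A second worker must be running on the
# paired port for fit() to return.
classifier = lightgbm.LGBMClassifier(
    machines="127.0.0.1:12400,127.0.0.1:12401",
    local_listen_port=12400,   # the other worker listens on 12401
    num_machines=2,
    time_out=20,
    tree_learner="data",
)
classifier.fit(X_part, y_part)
classifier.booster_.free_network()  # cleanup before the next run; the next run still hangs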
Environment info
Operating System: Docker container based on the python:3.6-slim image (running on macOS)
CPU/GPU model: CPU
C++/Python/R version: Python package (built from source; see the Dockerfile below)
Example
tests/python_package_test/test_distributed_sklearn.py:
import random
from multiprocessing import Pipe, Process
from unittest import TestCase

import numpy as np
import numpy.testing as npt
from sklearn.datasets import make_blobs

def _fit_local(q):
    import lightgbm

    X, y, weight, params = q.recv()
    classifier = lightgbm.LGBMClassifier(**params)
    classifier.fit(X, y, sample_weight=weight)
    classifier.booster_.free_network()
    q.send(classifier)
    q.close()

def fit_model(X, y, weight=None, time_out=120):
    parent_conn_a, child_conn_a = Pipe()
    parent_conn_b, child_conn_b = Pipe()
    process_a = Process(target=_fit_local, args=(child_conn_a,))
    process_b = Process(target=_fit_local, args=(child_conn_b,))
    # split the data randomly between the two workers
    idx_a = np.random.rand(X.shape[0]) > 0.5
    idx_b = ~idx_a
    X_a, y_a = X[idx_a, :], y[idx_a]
    X_b, y_b = X[idx_b, :], y[idx_b]
    if weight is not None:
        w_a, w_b = weight[idx_a], weight[idx_b]
    else:
        w_a, w_b = None, None
    params_a, params_b = _build_params(time_out)
    process_a.start()
    process_b.start()
    parent_conn_a.send((X_a, y_a, w_a, params_a))
    parent_conn_b.send((X_b, y_b, w_b, params_b))
    model_a = parent_conn_a.recv()
    model_b = parent_conn_b.recv()
    process_a.join(timeout=120)
    process_b.join(timeout=120)
    # import lightgbm
    # model = lightgbm.LGBMClassifier()
    # model.fit(X, y, sample_weight=weight)
    return model_a, model_b

def _build_params(time_out):
    start_port = random.randint(0, 10_000) + 12400
    machines = f"127.0.0.1:{start_port},127.0.0.1:{start_port + 1}"
    params_a = {
        "machines": machines,
        "local_listen_port": start_port,
        "time_out": time_out,
        "num_machines": 2,
        "verbose": 100,
        "tree_learner": "data",
    }
    params_b = {
        "machines": machines,
        "local_listen_port": start_port + 1,
        "time_out": time_out,
        "num_machines": 2,
        "verbose": 100,
        "tree_learner": "data",
    }
    return params_a, params_b

def assert_array_almost_equal(x, y, eps=1e-8):
    npt.assert_equal(x.shape, y.shape)
    assert np.sum(x != y) / np.product(x.shape) <= eps

class TestDistributedSklearn(TestCase):
    def test_run_twice(self):
        X_1, y_1 = make_blobs(n_samples=100, centers=2)
        X_2, y_2 = make_blobs(n_samples=100, centers=2)

        model_a, model_b = fit_model(X_1, y_1, time_out=20)
        self.assertIsNotNone(model_a)
        self.assertIsNotNone(model_b)
        del model_a
        del model_b

        model_a, model_b = fit_model(X_2, y_2, time_out=20)
        self.assertIsNotNone(model_a)
        self.assertIsNotNone(model_b)
        del model_a
        del model_b
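Not required for the repro, but worth noting: a hedged sketch of how _fit_local could be hardened so the network is always freed even when fit() raises, in case incomplete cleanup from a failed run contributes to the hang (hypothetical variant, same LightGBM calls as above):

def _fit_local_safe(q):
    """Hypothetical variant of _fit_local that always releases the network."""
    import lightgbm

    X, y, weight, params = q.recv()
    classifier = lightgbm.LGBMClassifier(**params)
    try:
        classifier.fit(X, y, sample_weight=weight)
        q.send(classifier)
    finally:
        try:
            # free the distributed-training sockets so a later run can rebind
            classifier.booster_.free_network()
        except Exception:
            # booster_ is not available if fit() never completed
            pass
        q.close()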
Dockerfile:
FROM python:3.6-slim
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        cmake \
        libc6-dev \
        make \
        gcc \
        g++ \
        libssl-dev \
        automake \
        libtool \
        net-tools \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
ADD . /app
RUN mkdir /app/build
WORKDIR /app/build
WORKDIR /app/python-package
RUN python setup.py bdist_wheel && pip install dist/*
RUN python -m pip install dask distributed pytest pytest-xdist
WORKDIR /app/tests/python_package_test
ENTRYPOINT [ "/bin/bash", "-c", "pytest ${*}", "--" ]
Steps to reproduce
docker build -t lightgbm-test .
docker run -it lightgbm-test /app/tests/python_package_test/test_distributed_sklearn.py::TestDistributedSklearn::test_run_twice [-s]
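To rule out lingering sockets between the two fit_model calls, it can also help to check whether the chosen listen ports are still bound before the second run starts. A small hypothetical helper using only the standard library (netstat from the net-tools package installed above gives the same information):

import socket

def port_is_free(port, host="127.0.0.1"):
    """Return True if nothing is currently bound to (host, port)."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            sock.bind((host, port))
            return True
        except OSError:
            return False

# _build_params picks two consecutive ports in the 12400+ range, e.g.:
print(port_is_free(12400), port_is_free(12401))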