Skip to content

Commit 50936e4

Browse files
committed
Separate the Data Proxy monkeypatching into a distinct kaggle library, and expose the connected GCP account only as a credentials class that the user can pass into the vanilla BigQuery client. In the future we should remove all monkeypatching and let users use either the vanilla BigQuery client or the Kaggle library.
1 parent da1b91e commit 50936e4

File tree

4 files changed

+80
-58
lines changed

4 files changed

+80
-58
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ RUN pip install --upgrade dask && \
504504

505505
# Add BigQuery client proxy settings
506506
ENV PYTHONUSERBASE "/root/.local"
507+
ADD patches/kaggle.py /root/.local/lib/python3.6/site-packages/kaggle.py
507508
ADD patches/sitecustomize.py /root/.local/lib/python3.6/site-packages/sitecustomize.py
508509

509510
# Set backend for matplotlib

patches/kaggle.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
from google.auth import credentials
3+
from google.cloud import bigquery
4+
from google.cloud.bigquery._http import Connection
5+
6+
7+
class KaggleKernelCredentials(credentials.Credentials):
    """Credentials tied to the OAuth account connected to the Kernel.

    Token refresh is not implemented yet, so every refresh attempt fails.
    """

    def refresh(self, request):
        """Announce the refresh attempt, then fail.

        Args:
            request: Passed in by the caller; not used by this stub.

        Raises:
            NotImplementedError: Always, until the private BigQuery
                integration exists.
        """
        # Once implemented, self.token and self.expiry get populated here.
        print("Calling Kaggle.UserSecrets to refresh token.")
        not_ready = NotImplementedError(
            "Private BigQuery integration is not yet implemented.")
        raise not_ready
15+
16+
17+
class DataProxyConnection(Connection):
    """Custom Connection class used to proxy the BigQuery client to Kaggle's data proxy."""

    # Read from the environment once, when this class body is executed.
    API_BASE_URL = os.getenv("KAGGLE_DATA_PROXY_URL")

    def __init__(self, client):
        """Create the connection and attach the data proxy token header.

        Args:
            client: The client that owns this connection; forwarded to the
                base ``Connection`` constructor.
        """
        super().__init__(client)
        # Rebind a per-instance copy instead of assigning into the inherited
        # mapping in place. NOTE(review): _EXTRA_HEADERS is presumably a
        # class-level dict on the base Connection; writing into it directly
        # would add the proxy token header to every other connection that
        # shares it, including ones that talk to BigQuery directly.
        proxy_headers = dict(self._EXTRA_HEADERS)
        proxy_headers["X-KAGGLE-PROXY-DATA"] = os.getenv("KAGGLE_DATA_PROXY_TOKEN")
        self._EXTRA_HEADERS = proxy_headers
25+
26+
27+
class DataProxyClient(bigquery.client.Client):
    """BigQuery client whose connection is a DataProxyConnection.

    The client is built for the project named by the
    KAGGLE_DATA_PROXY_PROJECT environment variable, using anonymous
    credentials whose ``refresh`` is replaced with a no-op.
    """

    def __init__(self, project=None):
        """Build the proxied client.

        Args:
            project: Must be left unset (or falsy); the project is taken
                from the environment instead.

        Raises:
            Exception: If a truthy ``project`` is supplied.
        """
        if project:
            message = (
                "In order to query a private BigQuery project, please connect a GCP account. "
                "Otherwise do not specify a project to use Kaggle's public dataset BigQuery integration.")
            raise Exception(message)

        proxy_credentials = credentials.AnonymousCredentials()
        # Swap refresh for a no-op that accepts any positional arguments.
        proxy_credentials.refresh = lambda *args: None

        super().__init__(
            project=os.getenv("KAGGLE_DATA_PROXY_PROJECT"),
            credentials=proxy_credentials,
        )
        # Replace the connection the base constructor set up with the proxy one.
        self._connection = DataProxyConnection(self)
39+
40+
def kaggle_bq_client(*args, **kwargs):
    """Create a DataProxyClient, forwarding every argument to its constructor.

    Defined with ``def`` rather than a name-bound lambda so the callable has
    a real name in tracebacks and can carry this docstring.

    Returns:
        A new DataProxyClient instance.
    """
    return DataProxyClient(*args, **kwargs)

patches/sitecustomize.py

Lines changed: 21 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,26 @@
1-
from google.auth import credentials
2-
from google.cloud import bigquery
3-
from google.cloud.bigquery._http import Connection
41
import os
52

6-
7-
class KaggleKernelCredentials(credentials.Credentials):
8-
"""Custom Credentials used to authenticate using the Kernel's connected OAuth account."""
9-
10-
def refresh(self, request):
11-
print("Calling Kaggle.UserSecrets to refresh token.")
12-
# Set self.token and self.expiry here.
13-
raise NotImplementedError("Private BigQuery integration is not yet implemented.")
14-
153
kaggle_proxy_token = os.getenv("KAGGLE_DATA_PROXY_TOKEN")
16-
CONNECTION_BASE_URL = Connection.API_BASE_URL
17-
18-
19-
def monkeypatch_bq(bq_client, *args, **kwargs):
20-
data_proxy_project = os.getenv("KAGGLE_DATA_PROXY_PROJECT")
21-
bq_user_jwt = os.getenv("KAGGLE_BQ_USER_JWT")
22-
specified_project = kwargs.get('project')
23-
# Use Data Proxy if user has specified to use the Kaggle project, or if
24-
# there are no connected GCP accounts (to maintain backwards compatibility).
25-
if bq_user_jwt is None and specified_project and specified_project.lower() != 'kaggle':
26-
raise Exception("In order to query a private BigQuery project, please connect a GCP account. "
27-
"Otherwise specify 'kaggle' as the project to use Kaggle's public dataset BigQuery integration.")
28-
use_data_proxy = (specified_project and specified_project.lower() == 'kaggle') or bq_user_jwt is None
29-
if use_data_proxy:
30-
if data_proxy_project is None or kaggle_proxy_token is None:
31-
# We don't have the data proxy info so leave the bq client unmodified.
32-
return bq_client(*args, **kwargs)
33-
print("Using Kaggle's public dataset BigQuery integration.")
34-
Connection.API_BASE_URL = os.getenv("KAGGLE_DATA_PROXY_URL")
35-
Connection._EXTRA_HEADERS["X-KAGGLE-PROXY-DATA"] = kaggle_proxy_token
36-
anon_credentials = credentials.AnonymousCredentials()
37-
anon_credentials.refresh = lambda *args: None
38-
kwargs['project'] = data_proxy_project
39-
return bq_client(
40-
*args,
41-
credentials=anon_credentials,
42-
**kwargs)
43-
else:
44-
Connection.API_BASE_URL = CONNECTION_BASE_URL
45-
Connection._EXTRA_HEADERS.pop('X-KAGGLE-PROXY-DATA', None)
46-
if kwargs.get('credentials') is not None:
47-
# The user wants to use their own credentials scheme, don't try to interfere.
4+
bq_user_jwt = os.getenv("KAGGLE_BQ_USER_JWT")
5+
if (kaggle_proxy_token or bq_user_jwt):
6+
from google.auth import credentials
7+
from google.cloud import bigquery
8+
from google.cloud.bigquery._http import Connection
9+
from kaggle import kaggle_bq_client
10+
11+
def monkeypatch_bq(bq_client, *args, **kwargs):
12+
data_proxy_project = os.getenv("KAGGLE_DATA_PROXY_PROJECT")
13+
specified_project = kwargs.get('project')
14+
specified_credentials = kwargs.get('credentials')
15+
if specified_project is None and specified_credentials is None:
16+
print("Using Kaggle's public dataset BigQuery integration.")
17+
return kaggle_bq_client(*args, **kwargs)
18+
else:
4819
return bq_client(*args, **kwargs)
49-
print("Using enabled BigQuery integration.")
50-
kwargs['credentials'] = KaggleKernelCredentials()
51-
return bq_client(
52-
*args,
53-
**kwargs)
54-
55-
# Monkey patches BigQuery client creation to use proxy or user.
56-
bq_client = bigquery.Client
57-
bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(bq_client, *args, **kwargs)
5820

21+
# Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
22+
# TODO: Remove monkeypatching altogether and move to using a Kaggle library for Data Proxy and
23+
# KaggleKernelCredentials with vanilla bq client for connected account.
24+
bq_client = bigquery.Client
25+
bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(
26+
bq_client, *args, **kwargs)

tests/test_bigquery.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from google.cloud import bigquery
99
from google.auth.exceptions import DefaultCredentialsError
10+
from kaggle import KaggleKernelCredentials, kaggle_bq_client
1011

1112
HOSTNAME = "127.0.0.1"
1213
PORT = 8000
@@ -41,12 +42,12 @@ def do_GET(s):
4142
self.assertTrue(HTTPHandler.header_found, msg="X-KAGGLE-PROXY-DATA header was missing from the BQ request.")
4243
else:
4344
self.assertFalse(HTTPHandler.called, msg="Fake server was called from the BQ client, but should not have been.")
44-
45-
def test_proxy_kaggle_project(self):
45+
46+
def test_proxy_using_library(self):
4647
env = EnvironmentVarGuard()
4748
env.unset('KAGGLE_BQ_USER_JWT')
4849
with env:
49-
client = bigquery.Client(project='KAGGLE')
50+
client = kaggle_bq_client()
5051
self._test_proxy(client, should_use_proxy=True)
5152

5253
def test_proxy_no_project(self):
@@ -60,8 +61,20 @@ def test_project_with_connected_account(self):
6061
env = EnvironmentVarGuard()
6162
env.set('KAGGLE_BQ_USER_JWT', 'foobar')
6263
with env:
63-
client = bigquery.Client(project='ANOTHER_PROJECT')
64+
client = bigquery.Client(project='ANOTHER_PROJECT', credentials=KaggleKernelCredentials())
6465
self._test_proxy(client, should_use_proxy=False)
66+
67+
def test_simultaneous_clients(self):
68+
env = EnvironmentVarGuard()
69+
env.set('KAGGLE_BQ_USER_JWT', 'foobar')
70+
with env:
71+
proxy_client = bigquery.Client()
72+
self._test_proxy(proxy_client, should_use_proxy=True)
73+
bq_client = bigquery.Client(project='ANOTHER_PROJECT', credentials=KaggleKernelCredentials())
74+
self._test_proxy(bq_client, should_use_proxy=False)
75+
# Verify that proxy client is still going to proxy to ensure global Connection
76+
# isn't being modified.
77+
self._test_proxy(proxy_client, should_use_proxy=True)
6578

6679
def test_no_project_with_connected_account(self):
6780
env = EnvironmentVarGuard()
@@ -70,5 +83,5 @@ def test_no_project_with_connected_account(self):
7083
with self.assertRaises(DefaultCredentialsError):
7184
# TODO(vimota): Handle this case, either default to Kaggle Proxy or use some default project
7285
# by the user or throw a custom exception.
73-
client = bigquery.Client()
86+
client = bigquery.Client(credentials=KaggleKernelCredentials())
7487
self._test_proxy(client, should_use_proxy=False)

0 commit comments

Comments
 (0)