Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support choosing tensorboard version from UI #2690

Merged
merged 37 commits into from
Dec 24, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
556ace1
Support select tensorflow image for tensorboard
dldaisy Nov 29, 2019
f9259bf
modify test for tensorflow version select
dldaisy Nov 29, 2019
8d541fb
delete not available image entry
dldaisy Nov 29, 2019
e0d2f4b
Support tensorflow image selection to run tensorboard
dldaisy Dec 3, 2019
c89a22b
format code with prettier
dldaisy Dec 4, 2019
1bf624a
use HasPrefix instead of regexp
dldaisy Dec 4, 2019
301ec41
delete
dldaisy Dec 6, 2019
07ba4fc
modified tensorboard test
dldaisy Dec 11, 2019
68ae3bf
delete tensorboard
dldaisy Dec 11, 2019
45af753
modify typo
dldaisy Dec 11, 2019
324bc83
test tensorboard
dldaisy Dec 11, 2019
81dab3b
Merge remote-tracking branch 'upstream/master'
dldaisy Dec 11, 2019
9578720
Merge remote-tracking branch 'upstream/master'
dldaisy Dec 11, 2019
48bffbc
tensorboard test
dldaisy Dec 11, 2019
65ff6bb
fuck
dldaisy Dec 11, 2019
df9c9fa
fuck2
dldaisy Dec 11, 2019
9a22416
modify test
dldaisy Dec 17, 2019
d896c4c
merge master
dldaisy Dec 17, 2019
13b5cf6
modify typo in tensorboard hint
dldaisy Dec 17, 2019
c05c1ee
npm run format
dldaisy Dec 18, 2019
6bfb09a
modify tensorboard snapshot
dldaisy Dec 18, 2019
65b1d4a
compatible with previous kfp version. Allow vacant tensorflowImage fi…
dldaisy Dec 19, 2019
e8f6644
add 2 tests for dialog
dldaisy Dec 19, 2019
5bed8d8
modify default tensorflow image to 1.13.2
dldaisy Dec 19, 2019
2900699
merge get version and get tensorboard; let --bind_all support tensorb…
dldaisy Dec 20, 2019
a8308c7
modify reconciler.go
dldaisy Dec 23, 2019
d5a2e15
reconciler rollback
dldaisy Dec 23, 2019
5c5687a
modify corresponding test for --bind_all
dldaisy Dec 23, 2019
0042e1f
modify requested chances 12/23
dldaisy Dec 23, 2019
a6374ae
formControl sorted alphabetically
dldaisy Dec 23, 2019
e3ee9f8
select sorted alphabetically
dldaisy Dec 23, 2019
8801bb2
modify details from PR request 12/24
dldaisy Dec 24, 2019
4bf953a
moidfy format
dldaisy Dec 24, 2019
34e26ed
modify details 12/23
dldaisy Dec 24, 2019
fec3711
modify snapshot
dldaisy Dec 24, 2019
04048ac
retest
dldaisy Dec 24, 2019
f0e1790
retest
dldaisy Dec 24, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion backend/src/crd/controller/viewer/reconciler/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ package reconciler
import (
"context"
"fmt"
"regexp"

"github.com/golang/glog"
viewerV1beta1 "github.com/kubeflow/pipelines/backend/src/crd/pkg/apis/viewer/v1beta1"
Expand All @@ -42,7 +43,7 @@ import (

const viewerTargetPort = 6006

const defaultTensorflowImage = "tensorflow/tensorflow:1.13.2"
const defaultTensorflowImage = "tensorflow/tensorflow:1.14.0"

// Reconciler implements reconcile.Reconciler for the Viewer CRD.
type Reconciler struct {
Expand Down Expand Up @@ -180,6 +181,12 @@ func setPodSpecForTensorboard(view *viewerV1beta1.Viewer, s *corev1.PodSpec) {
// when https://github.com/kubeflow/pipelines/issues/2514 is done
// "--bind_all",
}

matched, _ := regexp.MatchString(`tensorflow/tensorflow:2.`, view.Spec.TensorboardSpec.TensorflowImage)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: you are not using any regex features, just using string.startsWith is simpler

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review and suggestion! I will re-format the code today.

if matched {
c.Args = append(c.Args, "--bind_all")
}

c.Ports = []corev1.ContainerPort{
corev1.ContainerPort{ContainerPort: viewerTargetPort},
}
Expand Down
41 changes: 30 additions & 11 deletions frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 11 additions & 11 deletions frontend/server/k8s-helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ if (isInCluster) {
k8sV1CustomObjectClient = kc.makeApiClient(Custom_objectsApi);
}

function getNameOfViewerResource(logdir: string): string {
function getNameOfViewerResource(logdir: string, tfversion: string): string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some comments on what the viewer resource id is determined? E.g., the id depends on logdir and/or tfversion?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The id depends only on logdir for now. I'll remove the unused parameters.

// TODO: find some hash function with shorter resulting message.
return 'viewer-' + crypto.SHA1(logdir);
}
Expand All @@ -80,11 +80,11 @@ function getNameOfViewerResource(logdir: string): string {
* Create Tensorboard instance via CRD with the given logdir if there is no
* existing Tensorboard instance.
*/
export async function newTensorboardInstance(logdir: string, podTemplateSpec: Object = defaultPodTemplateSpec): Promise<void> {
export async function newTensorboardInstance(logdir: string, tfversion: string, podTemplateSpec: Object = defaultPodTemplateSpec): Promise<void> {
if (!k8sV1CustomObjectClient) {
throw new Error('Cannot access kubernetes Custom Object API');
}
const currentPod = await getTensorboardInstance(logdir);
const currentPod = await getTensorboardInstance(logdir, tfversion);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if there's a current pod with a different version?
Your steps below would fail because you are creating another viewer with the same name.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some discussion with Jing, we find we are facing two choices:

  1. Allow user to run more than one version at the same time, so with need to distinguish pods name by version. To implement this way, there might be several tensorboard deployments running at background, and the frontend will display which one will be opened.
  2. Another choice is only allow one version of tensorboard running at the same time. Because users mainly hope to choose a version that can be used correctly, this implementation might be functionally enough while keep the page concise. To implement this way, the "logdir" would be an unique identifier of the tensorboard folder. It's not until the user delete the previous version that he can open a new version, within the same logdir.

Maybe we can discuss later to decide which one to choose!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2 sgtm
Do you have a preference?

if (currentPod) {
return;
}
Expand All @@ -93,15 +93,14 @@ export async function newTensorboardInstance(logdir: string, podTemplateSpec: Ob
apiVersion: viewerGroup + '/' + viewerVersion,
kind: 'Viewer',
metadata: {
name: getNameOfViewerResource(logdir),
name: getNameOfViewerResource(logdir, tfversion),
namespace: namespace,
},
spec: {
type: 'tensorboard',
tensorboardSpec: {
logDir: logdir,
// TODO(jingzhang36): tensorflow image version read from input textbox.
tensorflowImage: 'tensorflow/tensorflow:1.13.2',
tensorflowImage: 'tensorflow/tensorflow:'+tfversion
},
podTemplateSpec
}
Expand All @@ -114,21 +113,22 @@ export async function newTensorboardInstance(logdir: string, podTemplateSpec: Ob
* Finds a running Tensorboard instance created via CRD with the given logdir
* and returns its dns address.
*/
export async function getTensorboardInstance(logdir: string): Promise<string> {
export async function getTensorboardInstance(logdir: string, tfversion: string): Promise<string> {
if (!k8sV1CustomObjectClient) {
throw new Error('Cannot access kubernetes Custom Object API');
}

return await (k8sV1CustomObjectClient.getNamespacedCustomObject(
viewerGroup, viewerVersion, namespace, viewerPlural,
getNameOfViewerResource(logdir))).then(
getNameOfViewerResource(logdir, tfversion))).then(
// Viewer CRD pod has tensorboard instance running at port 6006 while
// viewer CRD service has tensorboard instance running at port 80. Since
// we return service address here (instead of pod address), so use 80.
(viewer: any) => (
viewer && viewer.body &&
viewer.body.spec.tensorboardSpec.logDir == logdir &&
viewer.body.spec.type == 'tensorboard') ?
viewer.body.spec.type == 'tensorboard' &&
viewer.body.spec.tensorboardSpec.tensorflowImage == 'tensorflow/tensorflow:'+tfversion) ?
`http://${viewer.body.metadata.name}-service.${namespace}.svc.cluster.local:80/tensorboard/${viewer.body.metadata.name}/` : '',
// No existing custom object with the given name, i.e., no existing
// tensorboard instance.
Expand All @@ -140,14 +140,14 @@ export async function getTensorboardInstance(logdir: string): Promise<string> {
* Polls every second for a running Tensorboard instance with the given logdir,
* and returns the address of one if found, or rejects if a timeout expires.
*/
export function waitForTensorboardInstance(logdir: string, timeout: number): Promise<string> {
export function waitForTensorboardInstance(logdir: string, tfversion: string, timeout: number): Promise<string> {
const start = Date.now();
return new Promise((resolve, reject) => {
setInterval(async () => {
if (Date.now() - start > timeout) {
reject('Timed out waiting for tensorboard');
}
const tensorboardAddress = await getTensorboardInstance(logdir);
const tensorboardAddress = await getTensorboardInstance(logdir, tfversion);
if (tensorboardAddress) {
resolve(encodeURIComponent(tensorboardAddress));
}
Expand Down
Loading