Skip to content

Commit

Permalink
fix(backend): prevent seg fault if workflow manifest is deleted. Fixes
Browse files Browse the repository at this point in the history
…kubeflow#4389 (kubeflow#4439)

Fixes kubeflow#4389 (partially).

When the workflow manifest file is deleted from s3 due to the retention policy, we were
getting this segmentation fault in the next createRun attempt for that pipeline:

```
I0831 06:36:53.916141       1 interceptor.go:29] /api.RunService/CreateRun handler starting
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x148 pc=0x156e140]

goroutine 183 [running]:
github.com/kubeflow/pipelines/backend/src/common/util.(*Workflow).VerifyParameters(0xc000010610, 0xc00036b6b0, 0x0, 0xc00036b6b0)
        backend/src/common/util/workflow.go:66 +0x90
github.com/kubeflow/pipelines/backend/src/apiserver/resource.(*ResourceManager).CreateRun(0xc00088b5e0, 0xc00088b880, 0xc0009c3c50, 0xc000010450, 0x1)
        backend/src/apiserver/resource/resource_manager.go:326 +0x27c
github.com/kubeflow/pipelines/backend/src/apiserver/server.(*RunServer).CreateRun(0xc0000b8718, 0x1e7bc20, 0xc0009c3c50, 0xc0009c3c80, 0xc0000b8718, 0x2ddc6e9, 0xc00014e070)
        backend/src/apiserver/server/run_server.go:43 +0xce
github.com/kubeflow/pipelines/backend/api/go_client._RunService_CreateRun_Handler.func1(0x1e7bc20, 0xc0009c3c50, 0x1aa80e0, 0xc0009c3c80, 0xc0008cbb40, 0x1, 0x1, 0x7f9e4d6466d0)
        bazel-out/k8-opt/bin/backend/api/linux_amd64_stripped/go_client_go_proto%/github.com/kubeflow/pipelines/backend/api/go_client/run.pb.go:1399 +0x86
main.apiServerInterceptor(0x1e7bc20, 0xc0009c3c50, 0x1aa80e0, 0xc0009c3c80, 0xc000778ca0, 0xc000778cc0, 0xc0004dcbd0, 0x4e7bba, 0x1a98e00, 0xc0009c3c50)
        backend/src/apiserver/interceptor.go:30 +0xf8
github.com/kubeflow/pipelines/backend/api/go_client._RunService_CreateRun_Handler(0x1ac4a20, 0xc0000b8718, 0x1e7bc20, 0xc0009c3c50, 0xc0009c6e40, 0x1c6bd70, 0x1e7bc20, 0xc0009c3c50, 0xc0004321c0, 0x66)
        bazel-out/k8-opt/bin/backend/api/linux_amd64_stripped/go_client_go_proto%/github.com/kubeflow/pipelines/backend/api/go_client/run.pb.go:1401 +0x158
google.golang.org/grpc.(*Server).processUnaryRPC(0xc00064eb00, 0x1ea2840, 0xc00061cd80, 0xc00046c700, 0xc00071ab70, 0x2e14040, 0x0, 0x0, 0x0)
        external/org_golang_google_grpc/server.go:995 +0x466
google.golang.org/grpc.(*Server).handleStream(0xc00064eb00, 0x1ea2840, 0xc00061cd80, 0xc00046c700, 0x0)
        external/org_golang_google_grpc/server.go:1275 +0xda6
google.golang.org/grpc.(*Server).serveStreams.func1.1(0xc0004e9084, 0xc00064eb00, 0x1ea2840, 0xc00061cd80, 0xc00046c700)
        external/org_golang_google_grpc/server.go:710 +0x9f
created by google.golang.org/grpc.(*Server).serveStreams.func1
        external/org_golang_google_grpc/server.go:708 +0xa1
```

It was same in CreateJob calls.

Scenario described in kubeflow#4389 also seems causing the same issue.

With this PR, we aim not to have the segmentation fault at least, because in
our case it's expected that manifest files will be deleted after some time due
to the retention policy.

Other problems about right pipeline version picking described in issue kubeflow#4389
still need to be addressed.
  • Loading branch information
ekesken authored and Bobgy committed Sep 4, 2020
1 parent 4d3d89e commit 9c0bacb
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
10 changes: 10 additions & 0 deletions backend/src/apiserver/resource/resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,11 @@ func (r *ResourceManager) CreateRun(apiRun *api.Run) (*model.RunDetail, error) {
return nil, util.NewInternalServerError(err,
"Failed to unmarshal workflow spec manifest. Workflow bytes: %s", string(workflowSpecManifestBytes))
}
if workflow.Workflow == nil {
return nil, util.Wrap(
util.NewResourceNotFoundError("WorkflowSpecManifest", apiRun.GetName()),
"Failed to fetch workflow spec manifest.")
}

parameters := toParametersMap(apiRun.GetPipelineSpec().GetParameters())
// Verify no additional parameter provided
Expand Down Expand Up @@ -571,6 +576,11 @@ func (r *ResourceManager) CreateJob(apiJob *api.Job) (*model.Job, error) {
return nil, util.NewInternalServerError(err,
"Failed to unmarshal workflow spec manifest. Workflow bytes: %s", string(workflowSpecManifestBytes))
}
if workflow.Workflow == nil {
return nil, util.Wrap(
util.NewResourceNotFoundError("WorkflowSpecManifest", apiJob.GetName()),
"Failed to fetch workflow spec manifest.")
}

// Verify no additional parameter provided
err = workflow.VerifyParameters(toParametersMap(apiJob.GetPipelineSpec().GetParameters()))
Expand Down
37 changes: 37 additions & 0 deletions backend/src/apiserver/resource/resource_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,24 @@ func TestCreateRun_InvalidWorkflowSpec(t *testing.T) {
assert.Contains(t, err.Error(), "Failed to unmarshal workflow spec manifest")
}

func TestCreateRun_NullWorkflowSpec(t *testing.T) {
store := NewFakeClientManagerOrFatal(util.NewFakeTimeForEpoch())
defer store.Close()
manager := NewResourceManager(store)
apiRun := &api.Run{
Name: "run1",
PipelineSpec: &api.PipelineSpec{
WorkflowManifest: "null", // this situation occurs for real when the manifest file disappears from object store in some way due to retention policy or manual deletion.
Parameters: []*api.Parameter{
{Name: "param1", Value: "world"},
},
},
}
_, err := manager.CreateRun(apiRun)
assert.NotNil(t, err)
assert.Contains(t, err.Error(), "Failed to fetch workflow spec manifest.: ResourceNotFoundError: WorkflowSpecManifest run1 not found.")
}

func TestCreateRun_OverrideParametersError(t *testing.T) {
store := NewFakeClientManagerOrFatal(util.NewFakeTimeForEpoch())
defer store.Close()
Expand Down Expand Up @@ -1131,6 +1149,25 @@ func TestCreateJob_InvalidWorkflowSpec(t *testing.T) {
assert.Contains(t, err.Error(), "Failed to unmarshal workflow spec manifest")
}

func TestCreateJob_NullWorkflowSpec(t *testing.T) {
store := NewFakeClientManagerOrFatal(util.NewFakeTimeForEpoch())
defer store.Close()
manager := NewResourceManager(store)
job := &api.Job{
Name: "pp 1",
Enabled: true,
PipelineSpec: &api.PipelineSpec{
WorkflowManifest: string("null"), // this situation occurs for real when the manifest file disappears from object store in some way due to retention policy or manual deletion.
Parameters: []*api.Parameter{
{Name: "param1", Value: "world"},
},
},
}
_, err := manager.CreateJob(job)
assert.NotNil(t, err)
assert.Contains(t, err.Error(), "Failed to fetch workflow spec manifest.: ResourceNotFoundError: WorkflowSpecManifest pp 1 not found.")
}

func TestCreateJob_ExtraInputParameterError(t *testing.T) {
store, manager, p := initWithPipeline(t)
defer store.Close()
Expand Down

0 comments on commit 9c0bacb

Please sign in to comment.