Skip to content

Simplify the configuration loading #437

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 88 additions & 44 deletions cadctl/cmd/investigate/investigate.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ package investigate
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"

cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
"github.com/openshift/configuration-anomaly-detection/cadctl/config"
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
Expand All @@ -41,39 +40,79 @@ var InvestigateCmd = &cobra.Command{
Use: "investigate",
SilenceUsage: true,
Short: "Filter for and investigate supported alerts",
RunE: run,
PreRunE: func(cmd *cobra.Command, _ []string) error {
var err error
var c config.Config

c, err = config.BuildConfig(cmd)
if err != nil {
return fmt.Errorf("failed to build config: %w", err)
}

// Initialize the logger here because it depends on user input
logging.RawLogger = logging.InitLogger(c.LogLevel)
logging.RawLogger = logging.WithPipelineName(logging.RawLogger, c.PipelineName)

// configure the RunE command using a wrapper to
// append the configuration into the command
cmd.RunE = func(cmd *cobra.Command, args []string) error {
if err := run(c, cmd, args); err != nil {
logging.Error(err)
}
return err
}

return nil
},
RunE: func(_ *cobra.Command, _ []string) error {
// DO NOT REMOVE
// this ensures RunE work as expected
// RunE is set in the PreRunE function
// this works because the RunE is always called after PreRunE
return nil
},
Comment on lines +43 to +73
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason we need this vs just loading the config & initialize the logger in in run()? It seems like an unnecessary hack to use RunE, maybe I'm missing something.

}

var (
logLevelFlag = ""
payloadPath = "./payload.json"
)
var payloadPath = "./payload.json"

const pagerdutyTitlePrefix = "[CAD Investigated]"

func init() {
InvestigateCmd.Flags().StringVarP(&payloadPath, "payload-path", "p", payloadPath, "the path to the payload")
InvestigateCmd.Flags().StringVarP(&logging.LogLevelString, "log-level", "l", "", "the log level [debug,info,warn,error,fatal], default = info")

err := InvestigateCmd.MarkFlagRequired("payload-path")
if err != nil {
logging.Warn("Could not mark flag 'payload-path' as required")
}
}

func run(cmd *cobra.Command, _ []string) error {
// early init of logger for logs before clusterID is known
if cmd.Flags().Changed("log-level") {
flagValue, _ := cmd.Flags().GetString("log-level")
logging.RawLogger = logging.InitLogger(flagValue, "")
func run(c config.Config, _ *cobra.Command, _ []string) error {
var err error

if c.PrometheusPushGateway == "" {
logging.Warn("metrics disabled, set env 'CAD_PROMETHEUS_PUSHGATEWAY' to push metrics")
}

err = metrics.Push(c.PrometheusPushGateway)
Comment on lines +90 to +96
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need the loose var err error here, this would help with readability.

Suggested change
var err error
if c.PrometheusPushGateway == "" {
logging.Warn("metrics disabled, set env 'CAD_PROMETHEUS_PUSHGATEWAY' to push metrics")
}
err = metrics.Push(c.PrometheusPushGateway)
if c.PrometheusPushGateway == "" {
logging.Warn("metrics disabled, set env 'CAD_PROMETHEUS_PUSHGATEWAY' to push metrics")
}
err := metrics.Push(c.PrometheusPushGateway)

See google golang styleguide on error handling: https://google.github.io/styleguide/go/guide#maintainability

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would there be any metrics yet at this point in the code? The old logic pushed metrics after they were set (post run logic), now we push before we run anything IIUC.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think another issue is that we first check if the PushGateway is empty, but the proceed to push metrics to it, even if it is empty.

if err != nil {
logging.Warnf("failed to push metrics: %v", err)
}

// early init of logger for logs before clusterID is known
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a leftover? I don't see the relation to the code around it.

Suggested change
// early init of logger for logs before clusterID is known

payload, err := os.ReadFile(payloadPath)
if err != nil {
return fmt.Errorf("failed to read webhook payload: %w", err)
}
logging.Info("Running CAD with webhook payload:", string(payload))

pdClient, err := pagerduty.GetPDClient(payload)
if c.PagerDutyToken == "" {
return fmt.Errorf("missing PagerDuty token")
}
if c.PagerDutySilentPolicy == "" {
return fmt.Errorf("missing PagerDuty silent policy")
}
Comment on lines +108 to +113
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the intent behind checking this again here?

We already checked that we loaded something while building the config. If we want to check for completeness/format, this should ideally also go to the config building to allow us to exit as early as possible once we detect we're missing something required.


pdClient, err := pagerduty.GetPDClient(payload, c.PagerDutyToken, c.PagerDutySilentPolicy)
if err != nil {
return fmt.Errorf("could not initialize pagerduty client: %w", err)
}
Expand All @@ -88,9 +127,7 @@ func run(cmd *cobra.Command, _ []string) error {
}
}()

experimentalEnabledVar := os.Getenv("CAD_EXPERIMENTAL_ENABLED")
cadExperimentalEnabled, _ := strconv.ParseBool(experimentalEnabledVar)
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), cadExperimentalEnabled)
alertInvestigation := investigations.GetInvestigation(pdClient.GetTitle(), c.Experimental)

// Escalate all unsupported alerts
if alertInvestigation == nil {
Expand All @@ -111,8 +148,11 @@ func run(cmd *cobra.Command, _ []string) error {
return err
}

ocmClient, err := GetOCMClient()
if err != nil {
var ocmClient *ocm.SdkClient
var ocmErr error

ocmClient, ocmErr = ocm.NewFromClientKeyPair(c.CadOcmURL, c.CadOcmClientID, c.CadOcmClientSecret)
if ocmErr != nil {
return fmt.Errorf("could not initialize ocm client: %w", err)
}
Comment on lines +151 to 157
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the intent behind the extra/explicit variables?

Suggested change
var ocmClient *ocm.SdkClient
var ocmErr error
ocmClient, ocmErr = ocm.NewFromClientKeyPair(c.CadOcmURL, c.CadOcmClientID, c.CadOcmClientSecret)
if ocmErr != nil {
return fmt.Errorf("could not initialize ocm client: %w", err)
}
ocmClient, err := ocm.NewFromClientKeyPair(c.CadOcmURL, c.CadOcmClientID, c.CadOcmClientSecret)
if ocmErr != nil {
return fmt.Errorf("could not initialize ocm client: %w", err)
}


Expand All @@ -129,13 +169,8 @@ func run(cmd *cobra.Command, _ []string) error {
// For installing clusters, externalID can be empty.
internalClusterID := cluster.ID()

// re-initialize logger for the internal-cluster-id context
// if log-level flag is set, take priority over env + default
if cmd.Flags().Changed("log-level") {
logging.RawLogger = logging.InitLogger(logLevelFlag, internalClusterID)
} else {
logging.RawLogger = logging.InitLogger(logging.LogLevelString, internalClusterID)
}
// add the internal-cluster-id context to the logger
logging.RawLogger = logging.WithClusterID(logging.RawLogger, internalClusterID)

requiresInvestigation, err := clusterRequiresInvestigation(cluster, pdClient, ocmClient)
if err != nil || !requiresInvestigation {
Expand All @@ -147,7 +182,23 @@ func run(cmd *cobra.Command, _ []string) error {
return fmt.Errorf("could not retrieve Cluster Deployment for %s: %w", internalClusterID, err)
}

customerAwsClient, err := managedcloud.CreateCustomerAWSClient(cluster, ocmClient)
if c.BackplaneURL == "" {
return fmt.Errorf("missing backplane URL")
}
if c.BackplaneProxyURL == "" {
logging.Warn("missing backplane proxy URL, using default")
}
if c.BackplaneInitialARN == "" {
return fmt.Errorf("missing backplane initial ARN")
}
Comment on lines +185 to +193
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit late to check these. Would we not have caught this when loading the config at the start?

What's the default for BackplaneProxyURL?

customerAwsClient, err := managedcloud.CreateCustomerAWSClient(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit (can be in a follow-up - non blocking): ideally,when we get to a point where we have soooooo many parameters, we should cover this with a struct.

cluster,
ocmClient,
c.BackplaneURL,
c.BackplaneProxyURL,
c.BackplaneInitialARN,
c.AWSProxy,
)
if err != nil {
ccamResources := &investigation.Resources{Name: "ccam", Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil, AdditionalResources: map[string]interface{}{"error": err}}
inv := ccam.Investigation{}
Expand All @@ -156,7 +207,16 @@ func run(cmd *cobra.Command, _ []string) error {
return err
}

investigationResources = &investigation.Resources{Name: alertInvestigation.Name(), Cluster: cluster, ClusterDeployment: clusterDeployment, AwsClient: customerAwsClient, OcmClient: ocmClient, PdClient: pdClient, Notes: nil}
investigationResources = &investigation.Resources{
Name: alertInvestigation.Name(),
BackplaneURL: c.BackplaneURL,
Cluster: cluster,
ClusterDeployment: clusterDeployment,
AwsClient: customerAwsClient,
OcmClient: ocmClient,
PdClient: pdClient,
Notes: nil,
}

logging.Infof("Starting investigation for %s", alertInvestigation.Name())
result, err := alertInvestigation.Run(investigationResources)
Expand Down Expand Up @@ -191,22 +251,6 @@ func handleCADFailure(err error, resources *investigation.Resources, pdClient *p
}
}

// GetOCMClient will retrieve the OcmClient from the 'ocm' package
func GetOCMClient() (*ocm.SdkClient, error) {
cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")

_, err := os.Stat(cadOcmFilePath)
if os.IsNotExist(err) {
configDir, err := os.UserConfigDir()
if err != nil {
return nil, err
}
cadOcmFilePath = filepath.Join(configDir, "/ocm/ocm.json")
}

return ocm.New(cadOcmFilePath)
}

// Checks pre-requisites for a cluster investigation:
// - the cluster's state is supported by CAD for an investigation (= not uninstalling)
// - the cloud provider is supported by CAD (cluster is AWS)
Expand Down
11 changes: 4 additions & 7 deletions cadctl/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ package cmd

import (
investigate "github.com/openshift/configuration-anomaly-detection/cadctl/cmd/investigate"
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
"github.com/openshift/configuration-anomaly-detection/pkg/metrics"
"github.com/spf13/cobra"
)

Expand All @@ -31,14 +29,13 @@ var rootCmd = &cobra.Command{

// Execute adds all child commands to the root command and sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() {
func Execute() error {
err := rootCmd.Execute()
metrics.Push()
if err != nil {
logging.Fatal(err)
}

return err
}

func init() {
rootCmd.AddCommand(investigate.InvestigateCmd)
rootCmd.PersistentFlags().StringP("loglevel", "l", "info", "Log level to use. One of: debug, info, warn, error, fatal, panic")
}
Loading