Skip to content

Reworking logs #1778

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ image_metrics_server: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/met
image_inferentia: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/inferentia:latest
image_neuron_rtd: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/neuron-rtd:latest
image_nvidia: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/nvidia:latest
image_fluentd: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/fluentd:latest
image_fluent_bit: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/fluent_bit:latest
image_statsd: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/statsd:latest
image_istio_proxy: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/istio-proxy:latest
image_istio_pilot: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/istio-pilot:latest
Expand Down
2 changes: 1 addition & 1 deletion build/images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ non_dev_images_cluster=(
"operator"
"istio-proxy"
"istio-pilot"
"fluent-bit"
)
non_dev_images_aws=(
# includes non_dev_images_cluster
Expand All @@ -76,7 +77,6 @@ non_dev_images_aws=(
"inferentia"
"neuron-rtd"
"nvidia"
"fluentd"
"statsd"
)
non_dev_images_gcp=(
Expand Down
4 changes: 2 additions & 2 deletions cli/cluster/logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ import (
"os/signal"
"strings"

"github.com/cortexlabs/cortex/cli/lib/routines"
"github.com/cortexlabs/cortex/pkg/consts"
"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/exit"
"github.com/cortexlabs/cortex/pkg/lib/json"
"github.com/cortexlabs/cortex/pkg/lib/routines"
"github.com/cortexlabs/cortex/pkg/operator/schema"
"github.com/gorilla/websocket"
)
Expand Down Expand Up @@ -120,7 +120,7 @@ func handleConnection(connection *websocket.Conn, done chan struct{}) {
if err != nil {
exit.Error(ErrorOperatorSocketRead(err))
}
fmt.Println(string(message))
fmt.Print(string(message))
}
}, false)
}
Expand Down
1 change: 1 addition & 0 deletions cli/cmd/cluster_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli
gkeClusterConfig := containerpb.Cluster{
Name: clusterConfig.ClusterName,
InitialClusterVersion: "1.17",
LoggingService: "none",
NodePools: []*containerpb.NodePool{
{
Name: "ng-cortex-operator",
Expand Down
10 changes: 5 additions & 5 deletions cli/cmd/lib_cluster_config_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,10 +383,10 @@ func setConfigFieldsFromCached(userClusterConfig *clusterconfig.Config, cachedCl
}
userClusterConfig.ImageNvidia = cachedClusterConfig.ImageNvidia

if s.Obj(cachedClusterConfig.ImageFluentd) != s.Obj(userClusterConfig.ImageFluentd) {
return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageFluentdKey, cachedClusterConfig.ImageFluentd)
if s.Obj(cachedClusterConfig.ImageFluentBit) != s.Obj(userClusterConfig.ImageFluentBit) {
return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageFluentBitKey, cachedClusterConfig.ImageFluentBit)
}
userClusterConfig.ImageFluentd = cachedClusterConfig.ImageFluentd
userClusterConfig.ImageFluentBit = cachedClusterConfig.ImageFluentBit

if s.Obj(cachedClusterConfig.ImageStatsd) != s.Obj(userClusterConfig.ImageStatsd) {
return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageStatsdKey, cachedClusterConfig.ImageStatsd)
Expand Down Expand Up @@ -691,8 +691,8 @@ func clusterConfigConfirmationStr(clusterConfig clusterconfig.Config, awsCreds A
if clusterConfig.ImageNvidia != defaultConfig.ImageNvidia {
items.Add(clusterconfig.ImageNvidiaUserKey, clusterConfig.ImageNvidia)
}
if clusterConfig.ImageFluentd != defaultConfig.ImageFluentd {
items.Add(clusterconfig.ImageFluentdUserKey, clusterConfig.ImageFluentd)
if clusterConfig.ImageFluentBit != defaultConfig.ImageFluentBit {
items.Add(clusterconfig.ImageFluentBitUserKey, clusterConfig.ImageFluentBit)
}
if clusterConfig.ImageStatsd != defaultConfig.ImageStatsd {
items.Add(clusterconfig.ImageStatsdUserKey, clusterConfig.ImageStatsd)
Expand Down
2 changes: 1 addition & 1 deletion cli/cmd/lib_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ import (
"syscall"
"time"

"github.com/cortexlabs/cortex/cli/lib/routines"
"github.com/cortexlabs/cortex/pkg/consts"
"github.com/cortexlabs/cortex/pkg/lib/archive"
"github.com/cortexlabs/cortex/pkg/lib/docker"
"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/exit"
"github.com/cortexlabs/cortex/pkg/lib/files"
"github.com/cortexlabs/cortex/pkg/lib/routines"
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
"github.com/cortexlabs/yaml"
dockertypes "github.com/docker/docker/api/types"
Expand Down
61 changes: 28 additions & 33 deletions cli/cmd/logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,27 @@ limitations under the License.
package cmd

import (
"fmt"
"net/http"
"net/url"

"github.com/cortexlabs/cortex/cli/cluster"
"github.com/cortexlabs/cortex/pkg/lib/console"
"github.com/cortexlabs/cortex/pkg/lib/exit"
"github.com/cortexlabs/cortex/pkg/lib/prompt"
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
"github.com/cortexlabs/cortex/pkg/types"
"github.com/spf13/cobra"
)

var _flagLogsEnv string
var (
_flagLogsEnv string
_flagLogsDisallowPrompt bool
)

func logsInit() {
_logsCmd.Flags().SortFlags = false
_logsCmd.Flags().StringVarP(&_flagLogsEnv, "env", "e", "", "environment to use")
_logsCmd.Flags().BoolVarP(&_flagLogsDisallowPrompt, "yes", "y", false, "skip prompts")
}

var _logsCmd = &cobra.Command{
Use: "logs API_NAME [JOB_ID]",
Short: "stream logs from an api",
Short: "stream logs from a single replica of an api or a single worker for a job",
Args: cobra.RangeArgs(1, 2),
Run: func(cmd *cobra.Command, args []string) {
envName, err := getEnvFromFlag(_flagLogsEnv)
Expand All @@ -59,43 +58,39 @@ var _logsCmd = &cobra.Command{
exit.Error(err)
}

operatorConfig := MustGetOperatorConfig(env.Name)
apiName := args[0]
if env.Provider == types.AWSProviderType {
if len(args) == 1 {
err := cluster.StreamLogs(MustGetOperatorConfig(env.Name), apiName)
if err != nil {
exit.Error(err)
}

if len(args) == 1 {
apiResponse, err := cluster.GetAPI(operatorConfig, apiName)
if err != nil {
exit.Error(err)
}
if len(args) == 2 {
err := cluster.StreamJobLogs(MustGetOperatorConfig(env.Name), apiName, args[1])
if err != nil {
exit.Error(err)
}

if apiResponse[0].Status.Requested > 1 && !_flagLogsDisallowPrompt {
prompt.YesOrExit("logs from a single random replica will be streamed\n\nfor aggregated logs please visit your cloud provider's logging dashboard; see https://docs.cortex.dev for details", "", "")
}
}

if env.Provider == types.GCPProviderType {
gcpLogsResponse, err := cluster.GetGCPLogsURL(MustGetOperatorConfig(env.Name), apiName)
err = cluster.StreamLogs(operatorConfig, apiName)
if err != nil {
exit.Error(err)
}

gcpReq, err := http.NewRequest("GET", "https://console.cloud.google.com/logs/query", nil)
}
if len(args) == 2 {
jobResponse, err := cluster.GetJob(operatorConfig, apiName, args[1])
if err != nil {
exit.Error(err)
}
query := ""
for q, v := range gcpLogsResponse.QueryParams {
query += fmt.Sprintf("%s=\"%s\"\n", q, v)

if jobResponse.JobStatus.Job.Workers > 1 && !_flagLogsDisallowPrompt {
prompt.YesOrExit("logs from a single random worker will be streamed\n\nfor aggregated logs please visit your cloud provider's logging dashboard; see https://docs.cortex.dev for details", "", "")
}
queryValues := make(url.Values)
queryValues.Add("query", query)
gcpReq.URL.RawQuery = queryValues.Encode()

gcpLogsURL := gcpReq.URL.String()
consoleOutput := console.Bold(fmt.Sprintf("visit the following link to view logs for api %s: ", apiName)) + gcpLogsURL
fmt.Println(consoleOutput)
err = cluster.StreamJobLogs(operatorConfig, apiName, args[1])
if err != nil {
exit.Error(err)
}
}

},
}
File renamed without changes.
11 changes: 5 additions & 6 deletions dev/versions.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,11 @@ Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not s
1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml` (e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.16.5/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`

## Fluentd
## FluentBit

1. Find the latest release on [Dockerhub](https://hub.docker.com/r/fluent/fluentd-kubernetes-daemonset/)
1. Update the base image version in `images/fluentd/Dockerfile`
1. Update record-modifier in `images/fluentd/Dockerfile` to the latest version [here](https://github.com/repeatedly/fluent-plugin-record-modifier/blob/master/VERSION)
1. Update `fluentd.yaml` as necessary (make sure to maintain all Cortex environment variables)
1. Find the latest release on [Dockerhub](https://hub.docker.com/r/amazon/aws-for-fluent-bit/tags?page=1&ordering=last_updated)
1. Update the base image version in `images/fluent-bit/Dockerfile`
1. Update `fluent-bit.yaml` as necessary (make sure to maintain all Cortex environment variables)

## Statsd

Expand All @@ -258,7 +257,7 @@ Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not s
## kubectl

1. Find the latest release [here](https://storage.googleapis.com/kubernetes-release/release/stable.txt)
1. Update the version in `images/manager/Dockerfile`
1. Update the version in `images/manager/Dockerfile` and `images/operator/Dockerfile`
1. Update your local version and alert developers
* Linux:
1. `curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl`
Expand Down
3 changes: 2 additions & 1 deletion docs/clients/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,14 @@ Flags:
## logs

```text
stream logs from an api
stream logs from a single replica of an api or a single worker for a job

Usage:
cortex logs API_NAME [JOB_ID] [flags]

Flags:
-e, --env string environment to use
-y, --yes skip prompts
-h, --help help for logs
```

Expand Down
2 changes: 1 addition & 1 deletion docs/clusters/aws/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ image_metrics_server: quay.io/cortexlabs/metrics-server:master
image_inferentia: quay.io/cortexlabs/inferentia:master
image_neuron_rtd: quay.io/cortexlabs/neuron-rtd:master
image_nvidia: quay.io/cortexlabs/nvidia:master
image_fluentd: quay.io/cortexlabs/fluentd:master
image_fluent_bit: quay.io/cortexlabs/fluent-bit:master
image_statsd: quay.io/cortexlabs/statsd:master
image_istio_proxy: quay.io/cortexlabs/istio-proxy:master
image_istio_pilot: quay.io/cortexlabs/istio-pilot:master
Expand Down
26 changes: 26 additions & 0 deletions docs/clusters/aws/logs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Logs

By default, logs will be pushed to [CloudWatch](https://us-west-2.console.aws.amazon.com/cloudwatch/home) using fluent-bit. A log group with the same name as your cluster will be created to store your logs. API logs are tagged with labels to help with log aggregation and filtering. Below are some sample CloudWatch Log Insight queries:

RealtimeAPI:

```text
fields @timestamp, log
| filter labels.apiName="<INSERT API NAME>"
| filter labels.apiKind="RealtimeAPI"
| sort @timestamp asc
| limit 1000
```

BatchAPI:

```text
fields @timestamp, log
| filter labels.apiName="<INSERT API NAME>"
| filter labels.jobID="<INSERT JOB ID>"
| filter labels.apiKind="BatchAPI"
| sort @timestamp asc
| limit 1000
```

Please make sure to select the log group for your cluster and adjust the time range accordingly before running the queries.
14 changes: 14 additions & 0 deletions docs/clusters/gcp/logs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Logs

By default, logs will be pushed to [StackDriver](https://console.cloud.google.com/logs/query) using fluent-bit. API logs are tagged with labels to help with log aggregation and filtering.

Example query for a RealtimeAPI:

```text
resource.type="k8s_container"
resource.labels.cluster_name="<INSERT CLUSTER NAME>"
jsonPayload.labels.apiKind="RealtimeAPI"
jsonPayload.labels.apiName="<INSERT API NAME>"
```

Please make sure to navigate to the project containing your cluster and adjust the time range accordingly before running queries.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ require (
github.com/ugorji/go/codec v1.2.1
github.com/xlab/treeprint v1.0.0
github.com/xtgo/uuid v0.0.0-20140804021211-a0b114877d4c // indirect
go.uber.org/zap v1.10.0
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb // indirect
golang.org/x/oauth2 v0.0.0-20201203001011-0b49973bad19
golang.org/x/sys v0.0.0-20201204225414-ed752295db88 // indirect
Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -513,8 +513,11 @@ go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.5 h1:dntmOdLpSpHlVqbW5Eay97DelsZHe+55D+xC6i0dDS0=
go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk=
go.uber.org/atomic v1.4.0 h1:cxzIVoETapQEqDhQu3QfnvXAV4AlzcvUCxkVUFw3+EU=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/multierr v1.1.0 h1:HoEmRHQPVSqub6w2z2d2EOVs2fjyFRGyofhKuyDq0QI=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/zap v1.10.0 h1:ORx85nbTijNz8ljznvCMR1ZBIPKFn3jQrag10X2AsuM=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
Expand Down
1 change: 1 addition & 0 deletions images/fluent-bit/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FROM amazon/aws-for-fluent-bit:2.10.1
2 changes: 0 additions & 2 deletions images/fluentd/Dockerfile

This file was deleted.

6 changes: 6 additions & 0 deletions images/operator/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
FROM golang:1.14.7 as builder

RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && \
mv ./kubectl /tmp/kubectl

COPY go.mod go.sum /go/src/github.com/cortexlabs/cortex/
WORKDIR /go/src/github.com/cortexlabs/cortex
RUN go mod download
Expand All @@ -14,6 +17,9 @@ RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux go build -installsuffix cgo -o opera

FROM alpine:3.12

COPY --from=builder /tmp/kubectl /usr/local/bin/kubectl
RUN chmod +x /usr/local/bin/kubectl

RUN apk --no-cache add ca-certificates bash

COPY --from=builder /go/src/github.com/cortexlabs/cortex/pkg/operator/operator /root/
Expand Down
11 changes: 7 additions & 4 deletions images/request-monitor/request-monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,13 @@ func main() {
logLevelZap = zapcore.ErrorLevel
}

encoderConfig := zap.NewProductionEncoderConfig()
encoderConfig.MessageKey = "message"

logger, err = zap.Config{
Level: zap.NewAtomicLevelAt(logLevelZap),
Encoding: "console",
EncoderConfig: zap.NewProductionEncoderConfig(),
Encoding: "json",
EncoderConfig: encoderConfig,
OutputPaths: []string{"stdout"},
ErrorOutputPaths: []string{"stderr"},
}.Build()
Expand All @@ -109,7 +112,7 @@ func main() {
if _, err := os.Stat("/mnt/workspace/api_readiness.txt"); err == nil {
break
} else if os.IsNotExist(err) {
logger.Info("waiting for replica to be ready ...")
logger.Debug("waiting for replica to be ready ...")
time.Sleep(_tickInterval)
} else {
logger.Error("error encountered while looking for /mnt/workspace/api_readiness.txt") // unexpected
Expand Down Expand Up @@ -164,7 +167,7 @@ func publishStats(apiName string, counter *Counter, client *cloudwatch.CloudWatc

total /= float64(len(requestCounts))
}
logger.Info(fmt.Sprintf("recorded %.2f in-flight requests on replica", total))
logger.Debug(fmt.Sprintf("recorded %.2f in-flight requests on replica", total))
curTime := time.Now()
metricData := cloudwatch.PutMetricDataInput{
Namespace: aws.String(clusterName),
Expand Down
8 changes: 7 additions & 1 deletion manager/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ function cluster_up_aws() {
echo "✓"

echo -n "○ configuring logging "
envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 > /workspace/fluent-bit.yaml
kubectl apply -f /workspace/fluent-bit.yaml >/dev/null
echo "✓"

echo -n "○ configuring metrics "
Expand Down Expand Up @@ -112,6 +113,11 @@ function cluster_up_gcp() {
kubectl apply -f /workspace/apis.yaml >/dev/null
echo "✓"

echo -n "○ configuring logging "
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 > /workspace/fluent-bit.yaml
kubectl apply -f /workspace/fluent-bit.yaml >/dev/null
echo "✓"

if [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
echo -n "○ configuring gpu support "
envsubst < manifests/nvidia_gcp.yaml | kubectl apply -f - >/dev/null
Expand Down
Loading