From 737ae20f2e71ba83662440589739f60b2aea1a2b Mon Sep 17 00:00:00 2001 From: Jiaxun Song <31323106+songjiaxun@users.noreply.github.com> Date: Fri, 18 Jun 2021 13:54:26 -0700 Subject: [PATCH] update diagnosis scripts to support AKS RP (#231) update diagnosis scripts to support AKS RP, improve documentation --- diagnosis/README.md | 60 +++-- diagnosis/azs-collect-windows-logs.ps1 | 9 +- diagnosis/collectlogs.sh | 187 +++++++++++---- diagnosis/error-messages/etcd-bad-cert.cse | 1 - diagnosis/error-messages/etcd-bad-cert.status | 22 -- diagnosis/error-messages/spn-bad-secret.log | 1 - diagnosis/error-messages/spn-missing.log | 1 - diagnosis/error-messages/spn-not-in-sub.log | 1 - diagnosis/error-messages/spn-permissions.log | 1 - diagnosis/getkuberneteslogs.sh | 67 ++++-- diagnosis/hosts.sh | 6 - diagnosis/k8sCollectLogsCi.sh | 225 ------------------ diagnosis/scripts/updateconfiguration.sh | 153 ------------ 13 files changed, 240 insertions(+), 494 deletions(-) delete mode 100644 diagnosis/error-messages/etcd-bad-cert.cse delete mode 100644 diagnosis/error-messages/etcd-bad-cert.status delete mode 100644 diagnosis/error-messages/spn-bad-secret.log delete mode 100644 diagnosis/error-messages/spn-missing.log delete mode 100644 diagnosis/error-messages/spn-not-in-sub.log delete mode 100644 diagnosis/error-messages/spn-permissions.log delete mode 100644 diagnosis/k8sCollectLogsCi.sh delete mode 100644 diagnosis/scripts/updateconfiguration.sh diff --git a/diagnosis/README.md b/diagnosis/README.md index beb4a00..6d98dd6 100644 --- a/diagnosis/README.md +++ b/diagnosis/README.md @@ -1,12 +1,20 @@ -# Troubleshooting AKS Engine on Azure Stack +# Troubleshooting AKS Cluster Issues on Azure Stack -This short [guide](https://github.com/Azure/aks-engine/blob/master/docs/howto/troubleshooting.md) from Azure's AKS Engine team has a good high level explanation of how AKS Engine interacts with the Azure Resource Manager (ARM) and lists common reasons that can cause AKS Engine commands 
to fail. That guide applies to Azure Stack as well as it ships with its own ARM instance. If you are facing a problem that is not part of this guide, then you will need extra information to figure out the root cause. +## Introduction +In order to troubleshoot some AKS cluster issues, you may need to collect logs directly from the cluster nodes. Typically, without this script, you would need to connect to each node in the cluster, locate and download the logs manually. -Typically, to collect logs from servers you manage, you have to start a remote session using SSH and browse for relevant log files. The scripts in this directory are aim to simplify the collection of relevant logs from your Kubernetes cluster. Just download/unzip the latest [release](https://github.com/msazurestackworkloads/azurestack-gallery/releases/tag/diagnosis-v0.1.2) and execute script `getkuberneteslogs.sh`. +The scripts in this directory aim to simplify the collection of relevant logs from your Kubernetes cluster. The script will automatically create a snapshot of the cluster, and connect to each node to collect logs. In addition, the script can, optionally, upload the collected logs to a storage account. -> Before you execute `getkuberneteslogs.sh`, make sure that you can login to your Azure Stack instance using `Azure CLI`. Follow this [article](https://docs.microsoft.com/azure-stack/user/azure-stack-version-profiles-azurecli2) to learn how to configure Azure CLI to manage your Azure Stack cloud. +This tool is mainly designed for the Microsoft support team to collect comprehensive cluster logs. For self-diagnosis purposes, please see [`az aks kollect`](https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest#az_aks_kollect) command and [aks-periscope](https://github.com/Azure/aks-periscope) application. 
-The logs retrieved by `getkuberneteslogs.sh` are the following: +## Requirements +- A machine that has access to your Kubernetes cluster, or the same machine you used to deploy your cluster. For Windows machine, install [Git Bash](https://gitforwindows.org/) in order to run bash scripts. +- `Azure CLI` installed on the machine where the script will be run. Make sure that you can login to your Azure Stack environment using `Azure CLI` from the machine. Follow this [article](https://docs.microsoft.com/azure-stack/user/azure-stack-version-profiles-azurecli2) to learn how to install and configure Azure CLI to manage your Azure Stack cloud. +- Switch to the subscription where the Kubernetes cluster is deployed, by using `az account set --subscription `. +- Download the latest [release](https://github.com/msazurestackworkloads/azurestack-gallery/releases) of the script into your machine and extract the scripts. + +## Logs +This script automates the process of gathering the following logs: - Log files in directory `/var/log/azure/` - Log files in directory `/var/log/kubeaudit` (kube audit logs) @@ -18,8 +26,10 @@ The logs retrieved by `getkuberneteslogs.sh` are the following: - kubelet status and journal - etcd status and journal - docker status and journal +- containerd status and journal - kube-system snapshot - Azure CNI config files +- kubelet config files Some additional logs are retrieved for Windows nodes: @@ -31,21 +41,25 @@ Some additional logs are retrieved for Windows nodes: - ETW events for Hyper-V - Azure CNI config files -## Required Parameters - -`-u, --user` - The administrator username for the cluster VMs - -`-i, --identity-file` - RSA private key tied to the public key used to create the Kubernetes cluster (usually named 'id_rsa') - -`-g, --resource-group` - Kubernetes cluster resource group - -## Optional Parameters - -`--disable-host-key-checking` - Sets SSH's `StrictHostKeyChecking` option to `no` while the script executes.
Only use in a safe environment. - -`--upload-logs` - Persists retrieved logs in an Azure Stack storage account. Logs can be found in `KubernetesLogs` resource group. - -`--api-model` - Persists apimodel.json file in an Azure Stack Storage account. - Upload apimodel.json file to storage account happens when `--upload-logs` parameter is also provided. - -`-h, --help` - Print script usage +## Parameters +| Parameter | Description | Required | Example | +|-----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------------------------------------------| +| -h, --help | Print command usage. | no | | +| -u,--user | The administrator username for the cluster VMs. | yes | azureuser (default value) | +| -i, --identity-file | SA private key tied to the public key used to create the Kubernetes cluster (sometimes named 'id_rsa'). | yes | /rsa.pem (Putty)
~/.ssh/id_rsa (SSH) | +| -g, --resource-group | Kubernetes cluster resource group. For the clusters created by AKS Service, the managed resource group name follows pattern 'MC_RESOURCEGROUP_CLUSTERNAME_LOCATION'. | yes | k8sresourcegroup
MC_AKSRP_k8scluster1_redmond | +| -n, --user-namespace | Collect logs from containers in the specified namespaces. If not sepcified, logs from ALL namespaces are collected. | no | monitoring | +| --upload-logs | Persists retrieved logs in an Azure Stack Hub storage account. Logs can be found in KubernetesLogs resource group. | no | | +| --api-model | Persists apimodel.json file in an Azure Stack Hub Storage account. Upload apimodel.json file to storage account happens when --upload-logs parameter is also provided. | no | ./apimodel.json | +| --disable-host-key-checking | Sets SSH's StrictHostKeyChecking option to "no" while the script executes. Only use in a safe environment. | no | | + +## Examples +```bash +az account set --subscription +# cd to the directory where the scripts are in. +./getkuberneteslogs.sh -u azureuser -i private.key.1.pem -g k8s-rg +./getkuberneteslogs.sh -u azureuser -i ~/.ssh/id_rsa -g k8s-rg --disable-host-key-checking +./getkuberneteslogs.sh -u azureuser -i ~/.ssh/id_rsa -g k8s-rg -n default -n monitoring +./getkuberneteslogs.sh -u azureuser -i ~/.ssh/id_rsa -g k8s-rg --upload-logs --api-model clusterDefinition.json +./getkuberneteslogs.sh -u azureuser -i ~/.ssh/id_rsa -g k8s-rg --upload-logs +``` \ No newline at end of file diff --git a/diagnosis/azs-collect-windows-logs.ps1 b/diagnosis/azs-collect-windows-logs.ps1 index 34245e2..b4c794c 100644 --- a/diagnosis/azs-collect-windows-logs.ps1 +++ b/diagnosis/azs-collect-windows-logs.ps1 @@ -1,6 +1,6 @@ $ProgressPreference = "SilentlyContinue" -$lockedFiles = "kubelet.err.log", "kubelet.log", "kubeproxy.log", "kubeproxy.err.log", "azure-vnet-telemetry.log", "azure-vnet.log", "network-interfaces.json", "interfaces.json" +$lockedFiles = "kubelet.err.log", "kubelet.log", "kubeproxy.log", "kubeproxy.err.log", "azure-vnet-telemetry.log", "azure-vnet.log", "network-interfaces.json", "interfaces.json", "azure-vnet-ipam.log", "windowsnodereset.log", "csi-proxy.log", "csi-proxy.err.log" $timeStamp = 
get-date -format 'yyyyMMdd-hhmmss' $zipName = "win_log_$env:computername.zip" @@ -56,7 +56,10 @@ if (-not (Test-Path 'c:\k\debug\collectlogs.ps1')) { & 'c:\k\debug\collectlogs.ps1' | write-Host $netLogs = Get-ChildItem (Get-ChildItem -Path c:\k\debug -Directory | Sort-Object LastWriteTime -Descending | Select-Object -First 1).FullName | Select-Object -ExpandProperty FullName $paths += $netLogs -$paths += "c:\AzureData\CustomDataSetupScript.log" +$setupLog = "c:\AzureData\CustomDataSetupScript.log" +if (Test-Path $setupLog) { + $paths += $setupLog +} Write-Host "Collecting containerd hyperv logs" if ((Test-Path "$Env:ProgramFiles\containerd\diag.ps1") -And (Test-Path "$Env:ProgramFiles\containerd\ContainerPlatform.wprp")) { @@ -75,5 +78,5 @@ else { Write-Host "Compressing all logs to $zipName" $paths | Format-Table FullName, Length -AutoSize Compress-Archive -LiteralPath $paths -DestinationPath $zipName -Remove-Item -Path $paths +Remove-Item -Path $paths -ErrorAction SilentlyContinue Get-ChildItem $zipName # this puts a FileInfo on the pipeline so that another script can get it on the pipeline \ No newline at end of file diff --git a/diagnosis/collectlogs.sh b/diagnosis/collectlogs.sh index d2c9fbe..b43ec93 100755 --- a/diagnosis/collectlogs.sh +++ b/diagnosis/collectlogs.sh @@ -1,9 +1,23 @@ #!/bin/bash +collectContainerdMetadata() +{ + CONTAINERD_VERSION=$(containerd --version | xargs) + CONTAINERD_LOG_FILE=${LOGDIRECTORY}/daemons/k8s-containerd.log + + echo "== BEGIN HEADER ==" > ${CONTAINERD_LOG_FILE} + echo "Type: Daemon" >> ${CONTAINERD_LOG_FILE} + echo "TenantId: ${TENANT_ID}" >> ${CONTAINERD_LOG_FILE} + echo "Name: containerd" >> ${CONTAINERD_LOG_FILE} + echo "Version: ${CONTAINERD_VERSION}" >> ${CONTAINERD_LOG_FILE} + echo "SubscriptionID: ${SUB_ID}" >> ${CONTAINERD_LOG_FILE} + echo "ResourceGroup: ${RESOURCE_GROUP}" >> ${CONTAINERD_LOG_FILE} + echo "== END HEADER ==" >> ${CONTAINERD_LOG_FILE} +} + collectKubeletMetadata() { - KUBELET_REPOSITORY=$(docker 
images --format '{{.Repository}}' | grep hyperkube) - KUBELET_TAG=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep hyperkube | cut -d ":" -f 2) + KUBELET_VERSION=$(kubelet --version | xargs) KUBELET_VERBOSITY=$(grep -e '--v=[0-9]' -oh /etc/systemd/system/kubelet.service | grep -e '[0-9]' -oh /etc/systemd/system/kubelet.service | head -n 1) KUBELET_LOG_FILE=${LOGDIRECTORY}/daemons/k8s-kubelet.log @@ -11,17 +25,16 @@ collectKubeletMetadata() echo "Type: Daemon" >> ${KUBELET_LOG_FILE} echo "TenantId: ${TENANT_ID}" >> ${KUBELET_LOG_FILE} echo "Name: kubelet" >> ${KUBELET_LOG_FILE} - echo "Version: ${KUBELET_TAG}" >> ${KUBELET_LOG_FILE} + echo "Version: ${KUBELET_VERSION}" >> ${KUBELET_LOG_FILE} echo "Verbosity: ${KUBELET_VERBOSITY}" >> ${KUBELET_LOG_FILE} - echo "Image: ${KUBELET_REPOSITORY}" >> ${KUBELET_LOG_FILE} echo "SubscriptionID: ${SUB_ID}" >> ${KUBELET_LOG_FILE} echo "ResourceGroup: ${RESOURCE_GROUP}" >> ${KUBELET_LOG_FILE} echo "== END HEADER ==" >> ${KUBELET_LOG_FILE} } -collectMobyMetadata() +collectDockerMetadata() { - DOCKER_VERSION=$(docker version | grep -A 20 "Server:" | grep "Version:" | head -n 1 | cut -d ":" -f 2 | xargs) + DOCKER_VERSION=$(sudo docker version | grep -A 20 "Server:" | grep "Version:" | head -n 1 | cut -d ":" -f 2 | xargs) DOCKER_LOG_FILE=${LOGDIRECTORY}/daemons/k8s-docker.log echo "== BEGIN HEADER ==" > ${DOCKER_LOG_FILE} @@ -54,11 +67,9 @@ collectContainerMetadata() local cid=$1 local pname=$2 local cname=$3 - - CVERBOSITY=$(docker inspect ${cid} | grep -e "--v=[0-9]" -oh | grep -e [0-9] -oh | head -n 1) - IMAGE_SHA=$(docker inspect ${cid} | grep Image | grep -e "sha256:[[:alnum:]]*" -oh | head -n 1 | cut -d ':' -f 2) - IMAGE=$(docker image inspect ${IMAGE_SHA} | jq -r '.[] | .RepoTags | @tsv' | xargs) - CLOG_FILE=${LOGDIRECTORY}/containers/k8s-${pname}-${cname}.log + local image=$4 + + CLOG_FILE=${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.log echo "== BEGIN HEADER ==" > ${CLOG_FILE} echo "Type: Container" >> 
${CLOG_FILE} @@ -66,8 +77,7 @@ collectContainerMetadata() echo "Name: ${cname}" >> ${CLOG_FILE} echo "Hostname: ${HOSTNAME}" >> ${CLOG_FILE} echo "ContainerID: ${cid}" >> ${CLOG_FILE} - echo "Image: ${IMAGE}" >> ${CLOG_FILE} - echo "Verbosity: ${CVERBOSITY}" >> ${CLOG_FILE} + echo "Image: ${image}" >> ${CLOG_FILE} echo "SubscriptionID: ${SUB_ID}" >> ${CLOG_FILE} echo "ResourceGroup: ${RESOURCE_GROUP}" >> ${CLOG_FILE} echo "== END HEADER ==" >> ${CLOG_FILE} @@ -108,6 +118,57 @@ collectCloudProviderJson() { fi } +collectKubeletConfigFiles() { + echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting Kubelet config files" + if sudo [ -f /etc/kubernetes/kubeadm-config.yaml ]; then + sudo cp /etc/kubernetes/kubeadm-config.yaml ${LOGDIRECTORY}/etc/kubernetes/kubeadm-config.yaml + fi + + kubeletFolder="${LOGDIRECTORY}/etc/kubernetes/kubelet" + mkdir -p ${kubeletFolder} + if sudo [ -f /etc/default/kubelet ]; then + sudo cp /etc/default/kubelet ${kubeletFolder} + fi + if sudo [ -f /etc/systemd/system/kubelet.service ]; then + sudo cp /etc/systemd/system/kubelet.service ${kubeletFolder} + fi + if sudo [ -f /etc/kubernetes/kubelet.conf ]; then + sudo cp /etc/kubernetes/kubelet.conf ${kubeletFolder} + fi + if sudo [ -f /var/lib/kubelet/config.yaml ]; then + sudo cp /var/lib/kubelet/config.yaml ${kubeletFolder} + fi + if sudo [ -f /var/lib/kubelet/kubeadm-flags.env ]; then + sudo cp /var/lib/kubelet/kubeadm-flags.env ${kubeletFolder} + fi + + containerdFolder="${LOGDIRECTORY}/etc/containerd" + mkdir -p ${containerdFolder} + if sudo [ -f /etc/systemd/system/containerd.service ]; then + sudo cp /etc/systemd/system/containerd.service ${containerdFolder} + fi + if sudo [ -f /etc/containerd/config.toml ]; then + sudo cp /etc/containerd/config.toml ${containerdFolder} + fi + if sudo [ -f /etc/containerd/kubenet_template.conf ]; then + sudo cp /etc/containerd/kubenet_template.conf ${containerdFolder} + fi + + mkdir -p ${LOGDIRECTORY}/etc/cni + if sudo [ -d /etc/cni/net.d ]; then + 
sudo cp -r /etc/cni/net.d ${LOGDIRECTORY}/etc/cni + fi + + sysctlFolder="${LOGDIRECTORY}/etc/sysctl" + mkdir -p ${sysctlFolder} + if sudo [ -f /etc/sysctl.d/11-containerd.conf ]; then + sudo cp /etc/sysctl.d/11-containerd.conf ${sysctlFolder} + fi + if sudo [ -f /etc/sysctl.d/999-sysctl-aks.conf ]; then + sudo cp /etc/sysctl.d/999-sysctl-aks.conf ${sysctlFolder} + fi +} + checkNetworking() { local DIR=${LOGDIRECTORY}/network mkdir -p ${DIR} @@ -132,11 +193,13 @@ do sudo cp "$f" ${LOGDIRECTORY}/var/log/k8s-"${f%}" || : done -cd /var/log/kubeaudit -for f in *.log -do - sudo cp "$f" ${LOGDIRECTORY}/var/log/k8s-"${f%}" || : -done +if [ -d /var/log/kubeaudit ]; then + cd /var/log/kubeaudit + for f in *.log + do + sudo cp "$f" ${LOGDIRECTORY}/var/log/k8s-"${f%}" || : + done +fi sudo cp /var/log/waagent.log ${LOGDIRECTORY}/var/log/k8s-waagent.log || : @@ -155,7 +218,7 @@ sudo cp /etc/kubernetes/addons/* ${LOGDIRECTORY}/etc/kubernetes/addons 2>/dev/nu test $# -gt 0 && NAMESPACES=$@ test -z "${NAMESPACES}" && echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting logs from pods in all namespaces" -test -n "${NAMESPACES}" && echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting logs from pods in these namespaces: $NAMESPACES" +test -n "${NAMESPACES}" && NAMESPACES="kube-system${NAMESPACES}" && echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting logs from pods in these namespaces: $NAMESPACES" mkdir -p ${LOGDIRECTORY}/containers TENANT_ID=$(sudo jq -r '.tenantId' /etc/kubernetes/azure.json) @@ -167,49 +230,87 @@ then TENANT_ID=$(sudo jq -r '.serviceManagementEndpoint' /etc/kubernetes/azurestackcloud.json | cut -d / -f4) fi -for cid in $(docker ps -a -q --no-trunc) -do - cns=$(docker inspect --format='{{ index .Config.Labels "io.kubernetes.pod.namespace" }}' ${cid}) - - # if NAMESPACES not set, then collect everything - if [ -z "${NAMESPACES}" ] || (echo $NAMESPACES | grep -qw $cns); - then - # Ignore the Pause container - if docker inspect --format='{{ 
.Config.Image }}' ${cid} | grep -q -v pause; +if systemctl is-active --quiet docker; then + for cid in $(sudo docker ps -a -q --no-trunc) + do + cns=$(sudo docker inspect --format='{{ index .Config.Labels "io.kubernetes.pod.namespace" }}' ${cid}) + + # if NAMESPACES not set, then collect everything + if [ -z "${NAMESPACES}" ] || (echo $NAMESPACES | grep -qw $cns); then - pname=$(docker inspect --format='{{ index .Config.Labels "io.kubernetes.pod.name" }}' ${cid}) - cname=$(docker inspect --format='{{ index .Config.Labels "io.kubernetes.container.name" }}' ${cid}) - clog=$(docker inspect --format='{{ .LogPath }}' ${cid}) - - collectContainerMetadata ${cid} ${pname} ${cname} - sudo docker inspect ${cid} &> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.json - sudo cat $clog >> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.log + image_sha=$(sudo docker inspect ${cid} | jq -r '.[].Image' | grep -e "sha256:[[:alnum:]]*" -oh | head -n 1 | cut -d ':' -f 2) + image=$(sudo docker image inspect ${image_sha} | jq -r '.[] | .RepoTags | @tsv' | xargs) + # Ignore the Pause container + if echo ${image} | grep -q -v pause; + then + pname=$(sudo docker inspect --format='{{ index .Config.Labels "io.kubernetes.pod.name" }}' ${cid}) + cname=$(sudo docker inspect --format='{{ index .Config.Labels "io.kubernetes.container.name" }}' ${cid}) + clog=$(sudo docker inspect --format='{{ .LogPath }}' ${cid}) + + if [ -z "${pname}" ]; then pname=unknown; fi + if [ -z "${cname}" ]; then cname=unknown; fi + + collectContainerMetadata ${cid} ${pname} ${cname} ${image} + sudo docker inspect ${cid} &> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.json + sudo cat $clog >> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.log + fi fi - fi -done + done +fi + +if command -v crictl &> /dev/null; then + for cid in $(sudo crictl ps -a -q --no-trunc) + do + cinfo=$(sudo crictl inspect ${cid}) + cns=$(echo ${cinfo} | jq -r '.status.labels."io.kubernetes.pod.namespace"') + + 
# if NAMESPACES not set, then collect everything + if [ -z "${NAMESPACES}" ] || (echo $NAMESPACES | grep -qw $cns); + then + image=$(echo ${cinfo} | jq -r '.status.image.image') + # Ignore the Pause container + if echo ${image} | grep -q -v pause; + then + pname=$(echo ${cinfo} | jq -r '.status.labels."io.kubernetes.pod.name"') + cname=$(echo ${cinfo} | jq -r '.status.labels."io.kubernetes.container.name"') + clog=$(echo ${cinfo} | jq -r '.status.logPath') + + collectContainerMetadata ${cid} ${pname} ${cname} ${image} + echo ${cinfo} &> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.json + sudo cat $clog >> ${LOGDIRECTORY}/containers/k8s-${pname}-${cname}-${cid}.log + fi + fi + done +fi -test -n "${NAMESPACES}" && echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting daemon logs" +echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Collecting daemon logs" mkdir -p ${LOGDIRECTORY}/daemons # TODO use --until --since --lines to limit size if systemctl list-units | grep -q kubelet.service; then collectKubeletMetadata - sudo journalctl -n 10000 --utc -o short-iso -u kubelet &>> ${LOGDIRECTORY}/daemons/k8s-kubelet.log + sudo journalctl -n 10000 --utc -o short-iso -r -u kubelet &>> ${LOGDIRECTORY}/daemons/k8s-kubelet.log fi if systemctl list-units | grep -q etcd.service; then collectEtcdMetadata - sudo journalctl -n 10000 --utc -o short-iso -u etcd &>> ${LOGDIRECTORY}/daemons/k8s-etcd.log + sudo journalctl -n 10000 --utc -o short-iso -r -u etcd &>> ${LOGDIRECTORY}/daemons/k8s-etcd.log fi if systemctl list-units | grep -q docker.service; then - collectMobyMetadata - sudo journalctl -n 10000 --utc -o short-iso -u docker &>> ${LOGDIRECTORY}/daemons/k8s-docker.log + collectDockerMetadata + sudo journalctl -n 10000 --utc -o short-iso -r -u docker &>> ${LOGDIRECTORY}/daemons/k8s-docker.log +fi + +if systemctl list-units | grep -q containerd.service; then + collectContainerdMetadata + sudo journalctl -n 10000 --utc -o short-iso -r -u containerd &>> 
${LOGDIRECTORY}/daemons/k8s-containerd.log fi collectCloudProviderJson +collectKubeletConfigFiles -test -n "${NAMESPACES}" && echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Basic networking test" +echo "[$(date +%Y%m%d%H%M%S)][INFO][$HOSTNAME] Basic networking test" checkNetworking compressLogsDirectory diff --git a/diagnosis/error-messages/etcd-bad-cert.cse b/diagnosis/error-messages/etcd-bad-cert.cse deleted file mode 100644 index 43597cf..0000000 --- a/diagnosis/error-messages/etcd-bad-cert.cse +++ /dev/null @@ -1 +0,0 @@ -{"code":"DeploymentFailed","message":"At least one resource deployment operation failed. Please list deployment operations for details. Please see https://aka.ms/arm-debug for usage details.","details":[{"code":"Conflict","message":"{\r\n \"status\": \"Failed\",\r\n \"error\": {\r\n \"code\": \"ResourceDeploymentFailure\",\r\n \"message\": \"The resource operation completed with terminal provisioning state 'Failed'.\",\r\n \"details\": [\r\n {\r\n \"code\": \"VMExtensionProvisioningError\",\r\n \"message\": \"VM has reported a failure when processing extension 'cse-master-0'. Error message: Enable failed: failed to execute command: command terminated with exit status=14\\n[stdout]\\n\\n[stderr]\\nConnection to k8s.gcr.io 443 port [tcp/https] Succeeded!\\nConnection to gcr.io 443 port [tcp/https] Succeeded!\\nConnection to docker.io 443 port [tcp/https] Succeeded!\\n\"\r\n }\r\n ]\r\n }\r\n}"},{"code":"Conflict","message":"{\r\n \"status\": \"Failed\",\r\n \"error\": {\r\n \"code\": \"ResourceDeploymentFailure\",\r\n \"message\": \"The resource operation completed with terminal provisioning state 'Failed'.\",\r\n \"details\": [\r\n {\r\n \"code\": \"VMExtensionProvisioningError\",\r\n \"message\": \"VM has reported a failure when processing extension 'cse-master-1'. 
Error message: Enable failed: failed to execute command: command terminated with exit status=14\\n[stdout]\\n\\n[stderr]\\nConnection to k8s.gcr.io 443 port [tcp/https] Succeeded!\\nConnection to gcr.io 443 port [tcp/https] Succeeded!\\nConnection to docker.io 443 port [tcp/https] Succeeded!\\n\"\r\n }\r\n ]\r\n }\r\n}"},{"code":"Conflict","message":"{\r\n \"status\": \"Failed\",\r\n \"error\": {\r\n \"code\": \"ResourceDeploymentFailure\",\r\n \"message\": \"The resource operation completed with terminal provisioning state 'Failed'.\",\r\n \"details\": [\r\n {\r\n \"code\": \"VMExtensionProvisioningError\",\r\n \"message\": \"VM has reported a failure when processing extension 'cse-master-2'. Error message: Enable failed: failed to execute command: command terminated with exit status=14\\n[stdout]\\n\\n[stderr]\\nConnection to k8s.gcr.io 443 port [tcp/https] Succeeded!\\nConnection to gcr.io 443 port [tcp/https] Succeeded!\\nConnection to docker.io 443 port [tcp/https] Succeeded!\\n\"\r\n }\r\n ]\r\n }\r\n}"}]} \ No newline at end of file diff --git a/diagnosis/error-messages/etcd-bad-cert.status b/diagnosis/error-messages/etcd-bad-cert.status deleted file mode 100644 index f71685e..0000000 --- a/diagnosis/error-messages/etcd-bad-cert.status +++ /dev/null @@ -1,22 +0,0 @@ -● etcd.service - etcd - highly-available key value store - Loaded: loaded (/etc/systemd/system/etcd.service; disabled; vendor preset: enabled) - Active: activating (start) since Sat 2019-03-09 10:46:13 UTC; 29s ago - Docs: https://github.com/coreos/etcd - man:etcd - Main PID: 17795 (etcd) - Tasks: 8 - Memory: 36.3M - CPU: 21.105s - CGroup: /system.slice/etcd.service - └─17795 /usr/bin/etcd --name k8s-master-39889462-0 --peer-client-cert-auth --peer-trusted-ca-file=/etc/kubernetes/certs/ca.crt --peer-cert-file=/etc/kubernetes/certs/etcdpeer0.crt --peer-key-file=/etc/kubernetes/certs/etcdpeer0.key --initial-advertise-peer-urls https://10.100.0.5:2380 --listen-peer-urls https://10.100.0.5:2380 
--client-cert-auth --trusted-ca-file=/etc/kubernetes/certs/ca.crt --cert-file=/etc/kubernetes/certs/etcdserver.crt --key-file=/etc/kubernetes/certs/etcdserver.key --advertise-client-urls https://10.100.0.5:2379 --listen-client-urls https://10.100.0.5:2379,https://127.0.0.1:2379 --initial-cluster-token k8s-etcd-cluster --initial-cluster k8s-master-39889462-0=https://10.100.0.5:2380,k8s-master-39889462-1=https://10.100.0.6:2380,k8s-master-39889462-2=https://10.100.0.7:2380 --data-dir /var/lib/etcddisk --initial-cluster-state new - -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: rejected connection from "10.100.0.7:44068" (error "remote error: tls: bad certificate", ServerName "") -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: rejected connection from "10.100.0.6:53512" (error "remote error: tls: bad certificate", ServerName "") -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: rejected connection from "10.100.0.6:53514" (error "remote error: tls: bad certificate", ServerName "") -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: rejected connection from "10.100.0.7:44072" (error "remote error: tls: bad certificate", ServerName "") -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: cc50aad0ab64cb41 is starting a new election at term 2357 -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: cc50aad0ab64cb41 became candidate at term 2358 -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: cc50aad0ab64cb41 received MsgVoteResp from cc50aad0ab64cb41 at term 2358 -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: cc50aad0ab64cb41 [logterm: 1, index: 3] sent MsgVote request to 8e5cbfde980e12f8 at term 2358 -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: cc50aad0ab64cb41 [logterm: 1, index: 3] sent MsgVote request to d34a714a2de724aa at term 2358 -Mar 09 10:46:43 k8s-master-39889462-0 etcd[17795]: rejected connection from "10.100.0.7:44076" (error "remote error: tls: bad certificate", ServerName "") diff --git a/diagnosis/error-messages/spn-bad-secret.log 
b/diagnosis/error-messages/spn-bad-secret.log deleted file mode 100644 index 345af99..0000000 --- a/diagnosis/error-messages/spn-bad-secret.log +++ /dev/null @@ -1 +0,0 @@ -FATA[0000] failed to load apimodel: failed to get client: azure.BearerAuthorizer#WithAuthorization: Failed to refresh the Token for request to https://management.local.azurestack.external/subscriptions/XXXX/providers?%24top=100&api-version=2018-02-01: StatusCode=401 -- Original Error: adal: Refresh request failed. Status Code = '401'. Response body: {"error":"invalid_client","error_description":"AADSTS7000215: Invalid client secret is provided.\r\nTrace ID: XXXX\r\nCorrelation ID: XXXX\r\nTimestamp: XXXX","error_codes":[7000215],"timestamp":"XXXX","trace_id":"XXXX","correlation_id":"XXXX"} \ No newline at end of file diff --git a/diagnosis/error-messages/spn-missing.log b/diagnosis/error-messages/spn-missing.log deleted file mode 100644 index 85ba5b3..0000000 --- a/diagnosis/error-messages/spn-missing.log +++ /dev/null @@ -1 +0,0 @@ -FATA[0000] failed to load apimodel: failed to get client: azure.BearerAuthorizer#WithAuthorization: Failed to refresh the Token for request to https://management.local.azurestack.external/subscriptions/XXXX/providers?%24top=100&api-version=2018-02-01: StatusCode=400 -- Original Error: adal: Refresh request failed. Status Code = '400'. Response body: {"error":"unauthorized_client","error_description":"AADSTS700016: Application with identifier 'XXXX' was not found in the directory 'XXXX'. This can happen if the application has not been installed by the administrator of the tenant or consented to by any user in the tenant. 
You may have sent your authentication request to the wrong tenant.\r\nTrace ID: XXXX\r\nCorrelation ID: XXXX\r\nTimestamp: XXXX","error_codes":[700016],"timestamp":"XXXX","trace_id":"XXXX","correlation_id":"XXXX","error_uri":"https://login.microsoftonline.com/error?code=700016"} \ No newline at end of file diff --git a/diagnosis/error-messages/spn-not-in-sub.log b/diagnosis/error-messages/spn-not-in-sub.log deleted file mode 100644 index 16e46e8..0000000 --- a/diagnosis/error-messages/spn-not-in-sub.log +++ /dev/null @@ -1 +0,0 @@ -ERROR: Get Token request returned http error: 400 and server response: {"error":"unauthorized_client","error_description":"AADSTS700016: Application with identifier 'XXX' was not found in the directory 'XXX'. This can happen if the application has not been installed by the administrator of the tenant or consented to by any user in the tenant. You may have sent your authentication request to the wrong tenant.\r\nTrace ID: XXX\r\nCorrelation ID: XXX\r\nTimestamp: XXX","error_codes":[700016],"timestamp":"XXX","trace_id":"XXX","correlation_id":"XXX","error_uri":"https://login.microsoftonline.com/error?code=700016"} \ No newline at end of file diff --git a/diagnosis/error-messages/spn-permissions.log b/diagnosis/error-messages/spn-permissions.log deleted file mode 100644 index 527acad..0000000 --- a/diagnosis/error-messages/spn-permissions.log +++ /dev/null @@ -1 +0,0 @@ -FATA[0000] failed to load apimodel: failed to get client: resources.ProvidersClient#List: Failure responding to request: StatusCode=403 -- Original Error: autorest/azure: Service returned an error. Status=403 Code="AuthorizationFailed" Message="The client 'XXXX' with object id 'XXXX' does not have authorization to perform action 'Microsoft.Resources/subscriptions/providers/read' over scope '/subscriptions/XXXX'." 
\ No newline at end of file diff --git a/diagnosis/getkuberneteslogs.sh b/diagnosis/getkuberneteslogs.sh index c361c7c..6ccf5cc 100755 --- a/diagnosis/getkuberneteslogs.sh +++ b/diagnosis/getkuberneteslogs.sh @@ -72,9 +72,13 @@ ensureResourceGroup() ensureStorageAccount() { - SA_NAME="${RESOURCE_GROUP}" + SA_NAME="k8slogs$(date +%Y%m%d%H%M%S)" echo "[$(date +%Y%m%d%H%M%S)][INFO] Ensuring storage account: ${SA_NAME}" + az storage account show --name ${SA_NAME} --resource-group ${SA_RESOURCE_GROUP} 1> /dev/null 2> /dev/null + if [ $? -eq 0 ]; then + return + fi az storage account create --name ${SA_NAME} --resource-group ${SA_RESOURCE_GROUP} --location ${LOCATION} --sku Premium_LRS --https-only true 1> /dev/null if [ $? -ne 0 ]; then echo "[$(date +%Y%m%d%H%M%S)][ERR] Error ensuring storage account ${SA_NAME}" @@ -84,7 +88,7 @@ ensureStorageAccount() ensureStorageAccountContainer() { - SA_CONTAINER="kuberneteslogs" + SA_CONTAINER=$(echo "${RESOURCE_GROUP}" | sed 's/[_-]//g' | sed -e 's/\(.*\)/\L\1/') echo "$(date +%Y%m%d%H%M%S)][INFO] Ensuring storage account container: ${SA_CONTAINER}" az storage container create --name ${SA_CONTAINER} --account-name ${SA_NAME} @@ -107,12 +111,15 @@ uploadLogs() processHost() { host=$1 + + hostName=$(ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "hostname") + hostName=$(echo ${hostName} | sed 's/[[:space:]]*$//') - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing host ${host}" + echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing host ${hostName}" scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" collectlogs.sh ${USER}@${host}:/home/${USER}/collectlogs.sh ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "sudo chmod 744 collectlogs.sh; ./collectlogs.sh ${NAMESPACES};" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:/home/${USER}/${host}.zip ${LOGFILEFOLDER}/${host}.zip - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "rm -f collectlogs.sh ${host}.zip" + scp ${SCP_FLAGS} -o 
ProxyCommand="${PROXY_CMD}" ${USER}@${host}:/home/${USER}/${hostName}.zip ${LOGFILEFOLDER}/${hostName}.zip + ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "rm -f collectlogs.sh ${hostName}.zip" } processDvmHost() @@ -132,11 +139,17 @@ processWindowsHost() { host=$1 - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing windows-host ${host}" + # It has to store the hostname in a file first to avoid whitespace from Windows cmd output. + ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} 'powershell; $env:COMPUTERNAME > %HOMEPATH%\hostname.txt' + scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:'%HOMEPATH%/hostname.txt' hostname.txt + hostName=$(cat hostname.txt | sed 's/[[:space:]]*$//') + rm hostname.txt -f + + echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing windows-host ${hostName}" scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" azs-collect-windows-logs.ps1 ${USER}@${host}:"C:/k/debug/azs-collect-windows-logs.ps1" ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "powershell; Start-Process PowerShell -Verb RunAs; C:/k/debug/azs-collect-windows-logs.ps1" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:"C:/Users/${USER}/win_log_${host}.zip" ${LOGFILEFOLDER}/"win_log_${host}.zip" - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "powershell; rm C:/k/debug/azs-collect-windows-logs.ps1; rm C:/Users/${USER}/win_log_${host}.zip" + scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:"C:/Users/${USER}/win_log_${hostName}.zip" ${LOGFILEFOLDER}/"win_log_${hostName}.zip" + ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "powershell; rm C:/k/debug/azs-collect-windows-logs.ps1; rm C:/Users/${USER}/win_log_${hostName}.zip" } printUsage() @@ -169,7 +182,7 @@ then printUsage fi -NAMESPACES="kube-system" +NAMESPACES="" UPLOAD_LOGS="false" # Handle named parameters @@ -188,6 +201,10 @@ do RESOURCE_GROUP="$2" shift 2 ;; + -n|--user-namespace ) + 
NAMESPACES="${NAMESPACES}, $2" + shift 2 + ;; --api-model) API_MODEL="$2" shift 2 @@ -284,7 +301,7 @@ then fi # CLUSTER NODES -MASTER_IP=$(az network public-ip list -g ${RESOURCE_GROUP} --query "[?contains(name, 'k8s-master')].{ip:ipAddress}" --output tsv) +MASTER_IP=$(az network public-ip list -g ${RESOURCE_GROUP} --query "[?contains(name, 'k8s-master') || contains(name, 'aks-master')].{ip:ipAddress}" --output tsv) if [ $? -ne 0 ]; then echo "[$(date +%Y%m%d%H%M%S)][ERR] Error fetching the master nodes' load balancer IP" exit 1 @@ -300,16 +317,16 @@ then scp ${SCP_FLAGS} ${USER}@${MASTER_IP}:/home/${USER}/cluster-snapshot.zip ${LOGFILEFOLDER}/cluster-snapshot.zip ssh ${SSH_FLAGS} ${USER}@${MASTER_IP} "sudo rm -f cluster-snapshot.zip hosts.sh" - CLUSTER_NODES=$(az vm list -g ${RESOURCE_GROUP} --show-details --query "[?storageProfile.osDisk.osType=='Linux'].{Name:name}" --output tsv | sed 's/[[:blank:]]*$//') + LINUX_NODES=$(az vm list -g ${RESOURCE_GROUP} --query "[?storageProfile.osDisk.osType=='Linux' && tags != null && contains(tags.orchestrator, 'Kubernetes')].{Name:name}" --output tsv | sed 's/[[:blank:]]*$//') PROXY_CMD="ssh -i ${IDENTITYFILE} ${KNOWN_HOSTS_OPTIONS} ${USER}@${MASTER_IP} -W %h:%p" - for host in ${CLUSTER_NODES} + for host in ${LINUX_NODES} do processHost ${host} done #Get Windoews nodes log if Windows nodes exist - WINDOWS_NODES=$(az vm list -g ${RESOURCE_GROUP} --show-details --query "[?storageProfile.osDisk.osType=='Windows'].{Name:name}" --output tsv | sed 's/[[:blank:]]*$//') + WINDOWS_NODES=$(az vm list -g ${RESOURCE_GROUP} --query "[?storageProfile.osDisk.osType=='Windows' && tags != null && contains(tags.orchestrator, 'Kubernetes')].{Name:name}" --output tsv | sed 's/[[:blank:]]*$//') if [ -n "$WINDOWS_NODES" ] then @@ -319,6 +336,27 @@ then done fi + # Search VMSS nodes + VMSS_LIST=$(az vmss list -g ${RESOURCE_GROUP} --query "[?tags != null && contains(tags.orchestrator, 'Kubernetes')].{name:name, 
osType:virtualMachineProfile.storageProfile.osDisk.osType}") + for VMSS in $(echo "${VMSS_LIST}" | jq -c '.[]'); do + VMSS_NAME=$(echo ${VMSS} | jq -r '.name') + OS_TYPE=$(echo ${VMSS} | jq -r '.osType') + VMSS_NODES=$(az network nic list -g ${RESOURCE_GROUP} --query "[?name=='${VMSS_NAME}'].{ip:ipConfigurations[0].privateIpAddress}" --output tsv | sed 's/[[:blank:]]*$//') + + if [ "$OS_TYPE" == "Linux" ] + then + for host in ${VMSS_NODES} + do + processHost ${host} + done + else + for host in ${VMSS_NODES} + do + processWindowsHost ${host} + done + fi + done + fi mkdir -p $LOGFILEFOLDER/resources @@ -326,7 +364,7 @@ az network vnet list -g ${RESOURCE_GROUP} > ${LOGFILEFOLDER}/resources/vnets.jso # UPLOAD if [ "$UPLOAD_LOGS" == "true" ]; then - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing logs" + echo "[$(date +%Y%m%d%H%M%S)][INFO] Uploading logs to storage account..." createSADirectories copyLogsToSADirectory ensureResourceGroup @@ -334,6 +372,7 @@ if [ "$UPLOAD_LOGS" == "true" ]; then ensureStorageAccountContainer uploadLogs deleteSADirectory + echo "[$(date +%Y%m%d%H%M%S)][INFO] The logs are uploaded to resource group: ${SA_RESOURCE_GROUP}, storage account: ${SA_NAME}, container: ${SA_CONTAINER}." 
fi echo "[$(date +%Y%m%d%H%M%S)][INFO] Logs can be found here: $LOGFILEFOLDER" diff --git a/diagnosis/hosts.sh b/diagnosis/hosts.sh index 433d294..10b60e3 100644 --- a/diagnosis/hosts.sh +++ b/diagnosis/hosts.sh @@ -11,9 +11,3 @@ cp ${TMP}/*.json ${TMP}/kube-system/*.json ${LOGDIRECTORY} (cd $TMP && zip -q -r ~/${WD}.zip ${WD}) sudo rm -f -r $TMP - -echo "[$(date +%Y%m%d%H%M%S)][INFO] Getting Linux nodes information" -kubectl get nodes -l kubernetes.io/os=linux -o jsonpath='{.items[*].metadata.name}' > linux_nodes.txt - -echo "[$(date +%Y%m%d%H%M%S)][INFO] Getting Windows nodes information" -kubectl get nodes -l kubernetes.io/os=windows -o jsonpath='{.items[*].metadata.name}' > windows_nodes.txt \ No newline at end of file diff --git a/diagnosis/k8sCollectLogsCi.sh b/diagnosis/k8sCollectLogsCi.sh deleted file mode 100644 index 6f1950f..0000000 --- a/diagnosis/k8sCollectLogsCi.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash -x - -validateKeys() -{ - host=$1 - flags=$2 - - ssh ${flags} ${USER}@${host} "exit" - - if [ $? 
-ne 0 ]; then - echo "[$(date +%Y%m%d%H%M%S)][ERR] Error connecting to host ${host}" - exit 1 - fi -} - -processHost() -{ - host=$1 - - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing host ${host}" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" collectlogs.sh ${USER}@${host}:/home/${USER}/collectlogs.sh - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "sudo chmod 744 collectlogs.sh; ./collectlogs.sh ${NAMESPACES};" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:/home/${USER}/${host}.zip ${LOGFILEFOLDER}/${host}.zip - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "rm -f collectlogs.sh ${host}.zip" -} - -processWindowsHost() -{ - host=$1 - - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing windows-host ${host}" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" azs-collect-windows-logs.ps1 ${USER}@${host}:"C:/k/debug/azs-collect-windows-logs.ps1" - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "powershell; Start-Process PowerShell -Verb RunAs; C:/k/debug/azs-collect-windows-logs.ps1" - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host}:"C:/Users/${USER}/win_log_${host}.zip" ${LOGFILEFOLDER}/"win_log_${host}.zip" - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${host} "powershell; rm C:/k/debug/azs-collect-windows-logs.ps1; rm C:/Users/${USER}/win_log_${host}.zip" -} - -processDvmHost() -{ - host=$1 - dvm_name=$2 - echo "[$(date +%Y%m%d%H%M%S)][INFO] Processing dvm-host ${host}" - scp ${SCP_FLAGS} collectlogs.sh ${USER}@${host}:/home/${USER}/collectlogs.sh - ssh ${SSH_FLAGS} ${USER}@${host} "sudo chmod 744 collectlogs.sh; ./collectlogs.sh ${NAMESPACES};" - scp ${SCP_FLAGS} ${USER}@${host}:/home/${USER}/${dvm_name}.zip ${LOGFILEFOLDER}/${dvm_name}.zip - ssh ${SSH_FLAGS} ${USER}@${host} "rm -f collectlogs.sh ${dvm_name}.zip" -} - -printUsage() -{ - echo "$0 collects diagnostics from Kubernetes clusters provisioned by AKS Engine" - echo "" - echo "Usage:" - echo " $0 [flags]" - 
echo "" - echo "Flags:" - echo " -u, --user The administrator username for the cluster VMs" - echo " -i, --identity-file RSA private key tied to the public key used to create the Kubernetes cluster (usually named 'id_rsa')" - echo " -g, --resource-group Kubernetes cluster resource group" - echo " -n, --user-namespace Collect logs from containers in the specified namespaces (kube-system logs are always collected)" - echo " --api-model AKS Engine Kubernetes cluster definition json file" - echo " --all-namespaces Collect logs from containers in all namespaces. It overrides --user-namespace" - echo " --disable-host-key-checking Sets SSH's StrictHostKeyChecking option to \"no\" while the script executes. Only use in a safe environment." - echo " -h, --help Print script usage" - echo "" - echo "Examples:" - echo " $0 -u azureuser -i ~/.ssh/id_rsa -g k8s-rg --disable-host-key-checking" - echo " $0 -u azureuser -i ~/.ssh/id_rsa -g k8s-rg -n default -n monitoring" - - exit 1 -} - -if [ "$#" -eq 0 ] -then - printUsage -fi - -NAMESPACES="kube-system" -ALLNAMESPACES=1 - -# Handle named parameters -while [[ "$#" -gt 0 ]] -do - case $1 in - -i|--identity-file) - IDENTITYFILE="$2" - shift 2 - ;; - -u|--user) - USER="$2" - shift 2 - ;; - -g|--resource-group) - RESOURCE_GROUP="$2" - shift 2 - ;; - --dvm-ip) - DVM_HOST="$2" - shift 2 - ;; - --dvm-name) - DVM_NAME="$2" - shift 2 - ;; - --master-ip) - MASTER_IP="$2" - shift 2 - ;; - -n|--user-namespace) - NAMESPACES="$NAMESPACES $2" - shift 2 - ;; - --all-namespaces) - ALLNAMESPACES=0 - shift - ;; - --disable-host-key-checking) - KNOWN_HOSTS_OPTIONS='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR' - shift - ;; - -h|--help) - printUsage - ;; - *) - echo "" - echo "[ERR] Unexpected flag $1" - printUsage - ;; - esac -done - -# Validate input -if [ -z "$USER" ] -then - echo "" - echo "[ERR] --user is required" - printUsage -fi - -if [ -z "$IDENTITYFILE" ] -then - echo "" - echo "[ERR] --identity-file is 
required" - printUsage -fi - -if [ ! -f $IDENTITYFILE ] -then - echo "" - echo "[ERR] identity-file $IDENTITYFILE not found" - printUsage - exit 1 -else - cat $IDENTITYFILE | grep -q "BEGIN \(RSA\|OPENSSH\) PRIVATE KEY" \ - || { echo "Provided identity file $IDENTITYFILE is not a RSA Private Key file."; echo "A RSA private key starts with '-----BEGIN [RSA|OPENSSH] PRIVATE KEY-----''"; exit 1; } -fi - -test $ALLNAMESPACES -eq 0 && unset NAMESPACES - -# Print user input -echo "" -echo "user: $USER" -echo "identity-file: $IDENTITYFILE" -echo "resource-group: $RESOURCE_GROUP" -echo "namespaces: ${NAMESPACES:-all}" -echo "" - -NOW=`date +%Y%m%d%H%M%S` -LOGFILEFOLDER="_output/${RESOURCE_GROUP}-${NOW}" -mkdir -p $LOGFILEFOLDER - -SSH_FLAGS="-q -t -i ${IDENTITYFILE} ${KNOWN_HOSTS_OPTIONS}" -SCP_FLAGS="-q -i ${IDENTITYFILE} ${KNOWN_HOSTS_OPTIONS}" - - -if [ -n "$DVM_HOST" ] -then - echo "[$(date +%Y%m%d%H%M%S)][INFO] Checking connectivity with DVM host" - validateKeys ${DVM_HOST} "${SSH_FLAGS}" - processDvmHost ${DVM_HOST} ${DVM_NAME} -fi - -if [ -n "$MASTER_IP" ] -then - echo "[$(date +%Y%m%d%H%M%S)][INFO] Checking connectivity with master node" - validateKeys ${MASTER_IP} "${SSH_FLAGS}" - - scp ${SCP_FLAGS} hosts.sh ${USER}@${MASTER_IP}:/home/${USER}/hosts.sh - ssh ${SSH_FLAGS} ${USER}@${MASTER_IP} "sudo chmod 744 hosts.sh; ./hosts.sh" - scp ${SCP_FLAGS} ${USER}@${MASTER_IP}:/home/${USER}/cluster-snapshot.zip ${LOGFILEFOLDER}/cluster-snapshot.zip - scp ${SCP_FLAGS} ${USER}@${MASTER_IP}:/home/${USER}/*_nodes.txt ${LOGFILEFOLDER}/. - ssh ${SSH_FLAGS} ${USER}@${MASTER_IP} "sudo rm -f cluster-snapshot.zip hosts.sh *_nodes.txt" - - if [ ! -f ${LOGFILEFOLDER}/linux_nodes.txt ] - then - echo "Linux nodes not present" - else - PROXY_CMD="ssh -i ${IDENTITYFILE} ${KNOWN_HOSTS_OPTIONS} ${USER}@${MASTER_IP} -W %h:%p" - - INPUT_FILE=${LOGFILEFOLDER}/linux_nodes.txt - LINUX_NODES=$(<$INPUT_FILE) - - for host in ${LINUX_NODES} - do - processHost ${host} - done - fi - - if [ ! 
-f ${LOGFILEFOLDER}/windows_nodes.txt ] - then - echo "Windows nodes not present" - else - PROXY_CMD="ssh -i ${IDENTITYFILE} ${KNOWN_HOSTS_OPTIONS} ${USER}@${MASTER_IP} -W %h:%p" - - INPUT_FILE=${LOGFILEFOLDER}/windows_nodes.txt - WINDOWS_NODES=$(<$INPUT_FILE) - - for host in ${WINDOWS_NODES} - do - processHost ${host} - done - fi -fi - -echo "[$(date +%Y%m%d%H%M%S)][INFO] Done with k8s log collection, logs can be found here: $LOGFILEFOLDER" diff --git a/diagnosis/scripts/updateconfiguration.sh b/diagnosis/scripts/updateconfiguration.sh deleted file mode 100644 index c02af39..0000000 --- a/diagnosis/scripts/updateconfiguration.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash -e - -agentScript() { - cat < agent.sh -#!/bin/bash -e - -# cloud provider -sudo cp /etc/kubernetes/azure.json /etc/kubernetes/azure.bak -sudo jq '.cloudProviderRatelimit = true | .cloudProviderRateLimitQPS = 3 | .cloudProviderRateLimitBucket = 10 | .cloudProviderBackoffRetries = 1 | .cloudProviderBackoffDuration = 30' /etc/kubernetes/azure.bak | sudo dd status=none of=/etc/kubernetes/azure.json - -# kubelet -sudo cp /etc/default/kubelet /etc/default/kubelet.bak -sudo sed -i -e 's/--node-status-update-frequency=[0-9]*[a-z]/--node-status-update-frequency=1m/' /etc/default/kubelet - -# restart -sudo systemctl daemon-reload -sudo systemctl restart kubelet - -echo "=> azure.json updates" -sudo grep -E 'cloudProviderRatelimit"|cloudProviderRateLimitQPS"|cloudProviderRateLimitBucket"|cloudProviderBackoffRetries"|cloudProviderBackoffDuration"' /etc/kubernetes/azure.json -echo "=> kubelet restarted" -systemctl status kubelet --no-pager -l -EOF -if [ ! 
-f agent.sh ]; then - echo "[ERR] Error generating script: agent.sh" - return 1 -fi -} - -masterScript() { - cat < master.sh -#!/bin/bash -e - -# cloud provider -sudo cp /etc/kubernetes/azure.json /etc/kubernetes/azure.bak -sudo jq '.cloudProviderRatelimit = false | .cloudProviderRateLimitQPS = 3 | .cloudProviderRateLimitBucket = 10 | .cloudProviderBackoffRetries = 1 | .cloudProviderBackoffDuration = 30' /etc/kubernetes/azure.bak | sudo dd status=none of=/etc/kubernetes/azure.json - -# kubelet -sudo cp /etc/default/kubelet /etc/default/kubelet.bak -sudo sed -i -e 's/--node-status-update-frequency=[0-9]*[a-z]/--node-status-update-frequency=1m/' /etc/default/kubelet - -# kube-controller-manager -sudo cp /etc/kubernetes/manifests/kube-controller-manager.yaml /etc/kubernetes/manifests/kube-controller-manager.bak -sudo sed -i -e 's/--route-reconciliation-period=[0-9]*[a-z]/route-reconciliation-period=1m/' /etc/kubernetes/manifests/kube-controller-manager.yaml -sudo sed -i -e 's/--node-monitor-grace-period=[0-9]*[a-z]/--node-monitor-grace-period=5m/' /etc/kubernetes/manifests/kube-controller-manager.yaml -sudo sed -i -e 's/--pod-eviction-timeout=[0-9]*[a-z]/--pod-eviction-timeout=5m/' /etc/kubernetes/manifests/kube-controller-manager.yaml - -# restart -sudo systemctl daemon-reload -sudo systemctl restart kubelet - -echo "=> controller-manager updates" -grep -o -E 'route-reconciliation-period=[0-9a-zA-Z]*' /etc/kubernetes/manifests/kube-controller-manager.yaml -grep -o -E 'node-monitor-grace-period=[0-9a-zA-Z]*' /etc/kubernetes/manifests/kube-controller-manager.yaml -grep -o -E 'pod-eviction-timeout=[0-9a-zA-Z]*' /etc/kubernetes/manifests/kube-controller-manager.yaml -echo "=> azure.json updates" -sudo grep -E 'cloudProviderRatelimit"|cloudProviderRateLimitQPS"|cloudProviderRateLimitBucket"|cloudProviderBackoffRetries"|cloudProviderBackoffDuration"' /etc/kubernetes/azure.json -echo "=> kubelet restarted" -systemctl status kubelet --no-pager -l -EOF -if [ ! 
-f master.sh ]; then - echo "[ERR] Error generating script: master.sh" - return 1 -fi -} - -processHost() { - HOST=$1 - SCRIPT=$2 - - if [[ "$HOST" == "$HOSTNAME" ]]; then - sudo chmod +x ${SCRIPT}; - ./${SCRIPT}; - else - KNOWN_HOSTS_OPTIONS='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR' - PROXY_CMD="ssh ${KNOWN_HOSTS_OPTIONS} ${USER}@${HOSTNAME} -W %h:%p" - SSH_FLAGS="-q -t ${KNOWN_HOSTS_OPTIONS}" - SCP_FLAGS="-q ${KNOWN_HOSTS_OPTIONS}" - - scp ${SCP_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${SCRIPT} ${USER}@${HOST}:${SCRIPT} - ssh ${SSH_FLAGS} -o ProxyCommand="${PROXY_CMD}" ${USER}@${HOST} "sudo chmod +x ${SCRIPT}; ./${SCRIPT}; rm ${SCRIPT};" - fi -} - -printUsage() -{ - echo "$0 updates Kubernetes clusters configuration on agent and master nodes." - echo "Usage:" - echo " $0 [flags]" - echo "" - echo "Flags:" - echo " --agents Update configuration on all Kubernetes agent nodes" - echo " --masters Update configuration on all Kubernetes master nodes" - echo "" - echo "Examples:" - echo " $0 --agents" - echo " $0 --masters" - echo " $0 --masters --agents" - exit 1 -} - -if [ "$#" -eq 0 ] -then - printUsage -fi - -DOMASTERS=1 -DOAGENTS=1 -while [[ "$#" -gt 0 ]] -do - case $1 in - --masters) - DOMASTERS=0 - shift - ;; - --agents) - DOAGENTS=0 - shift - ;; - -h|--help) - printUsage - ;; - *) - echo "" - echo "[ERR] Unexpected flag $1" - printUsage - ;; - esac -done - -## NODES -if [ $DOAGENTS -eq 0 ]; then - agentScript - AGENTS=$(kubectl get nodes -o custom-columns=CONTAINER:.metadata.name | tail -n +2 | grep -v k8s-master | xargs) - for AGENT in ${AGENTS}; do - echo "" - echo "==> PROCESSING AGENT $AGENT" - processHost ${AGENT} agent.sh - done - rm agent.sh -fi - -# MASTERS -if [ $DOMASTERS -eq 0 ]; then - masterScript - MASTERS=$(kubectl get nodes -o custom-columns=CONTAINER:.metadata.name | tail -n +2 | grep k8s-master | xargs) - for MASTER in ${MASTERS}; do - echo "" - echo "==> PROCESSING MASTER $MASTER" - processHost ${MASTER} 
master.sh - done - rm master.sh -fi