Skip to content

Commit

Permalink
MGMT-3108: Adding bootstrap-in-place installation support
Browse files Browse the repository at this point in the history
  • Loading branch information
tsorya authored and osherdp committed Dec 30, 2020
1 parent 81683e9 commit 4364824
Show file tree
Hide file tree
Showing 16 changed files with 536 additions and 72 deletions.
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ destroy_all_nodes_from_namespaces:
destroy_all_nodes:
skipper run $(SKIPPER_PARAMS) 'discovery-infra/delete_nodes.py --delete-all'

# Deploy a bootstrap-in-place cluster: after _test_setup, run the _deploy_nodes
# target through skipper with the --bootstrap-in-place flag, one master, no
# workers and namespace index 0.
# NOTE(review): $(SKIPPER_PARAMS) is passed both before and after the target
# name - confirm the second occurrence is intentional.
deploy_ibip: _test_setup
	skipper make $(SKIPPER_PARAMS) _deploy_nodes $(SKIPPER_PARAMS) ADDITIONAL_PARAMS="'--bootstrap-in-place'" NUM_WORKERS=0 NUM_MASTERS=1 NAMESPACE_INDEX=0

redeploy_nodes: destroy_nodes deploy_nodes

Expand Down Expand Up @@ -360,4 +362,4 @@ _test_setup:
cp -p discovery-infra/test_infra/tools/tf_network_pool.json /tmp/tf_network_pool.json

_test_parallel: $(REPORTS) _test_setup
python3 -m pytest -n $(or ${TEST_WORKERS_NUM}, '2') $(or ${TEST},discovery-infra/tests) -k $(or ${TEST_FUNC},'') -m $(or ${TEST_MARKER},'') --verbose -s --junit-xml=$(REPORTS)/unittest.xml
python3 -m pytest -n $(or ${TEST_WORKERS_NUM}, '2') $(or ${TEST},discovery-infra/tests) -k $(or ${TEST_FUNC},'') -m $(or ${TEST_MARKER},'') --verbose -s --junit-xml=$(REPORTS)/unittest.xml
171 changes: 171 additions & 0 deletions discovery-infra/bootstrap_in_place.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import os
import shutil
import shlex
import logging
import yaml

import waiting

from oc_utils import get_operators_status
from download_logs import download_must_gather
from test_infra import utils, consts
from test_infra.tools.assets import NetworkAssets
from test_infra.controllers.node_controllers.ssh import SshConnection
from test_infra.controllers.node_controllers.terraform_controller import TerraformController

# Directory holding build artifacts; the installer binary, the downloaded live
# ISO and the embedded ISO are all placed under it by the functions below.
BUILD_DIR = "build"
# File name of the install config, both in RESOURCES_DIR (template) and IBIP_DIR.
INSTALL_CONFIG_FILE_NAME = "install-config.yaml"
# Directory passed to openshift-install via --dir; recreated on every run by
# setup_files_and_folders().
IBIP_DIR = os.path.join(BUILD_DIR, "ibip")
# Static resources shipped with the repository (install-config template,
# SNO manifest, sosreport script).
RESOURCES_DIR = os.path.join("discovery-infra", "resources")
# Working copy of the install config that fill_install_config() edits in place.
INSTALL_CONFIG = os.path.join(IBIP_DIR, INSTALL_CONFIG_FILE_NAME)
# Path of the openshift-install binary invoked by installer_generate().
INSTALLER_BINARY = os.path.join(BUILD_DIR, "openshift-install")
# File name of the ISO produced by embed() with the ignition embedded.
EMBED_IMAGE_NAME = "installer-SNO-image.iso"
# Kubeconfig used to query operator status and to run must-gather.
# Presumably written by openshift-install under <dir>/auth - TODO confirm.
KUBE_CONFIG = os.path.join(IBIP_DIR, "auth", "kubeconfig")
# Output directory for must-gather data.
MUST_GATHER_DIR = os.path.join(IBIP_DIR, "must-gather")
# Local script uploaded to the host by gather_sosreport_data().
SOSREPORT_SCRIPT = os.path.join(RESOURCES_DIR, "man_sosreport.sh")
# Private SSH key path used to connect to the host (relative path).
SSH_KEY = os.path.join("ssh_key", "key")


def installer_generate():
    """Run openshift-install to create manifests and then ignition configs.

    Both installer invocations operate on IBIP_DIR. Between them the SNO
    manifest from RESOURCES_DIR is copied into IBIP_DIR's "openshift"
    directory so that it is present when the ignition configs are generated.
    """
    dir_flag = f"--dir={IBIP_DIR}"
    sno_manifest = f"{RESOURCES_DIR}/sno_manifest.yaml"
    manifests_target = os.path.join(IBIP_DIR, "openshift")

    logging.info("Installer generate manifests")
    utils.run_command(f"{INSTALLER_BINARY} create manifests {dir_flag}")

    logging.info("Installer generate ignitions")
    # TODO delete
    shutil.copy(sno_manifest, manifests_target)
    utils.run_command(f"{INSTALLER_BINARY} create ignition-configs {dir_flag}")


def download_live_image(download_path, rhcos_version=None):
    """Download the RHCOS live ISO to *download_path* unless it already exists.

    The image is fetched into ``<download_path>.part`` and only renamed to
    *download_path* after curl has finished. Together with curl's ``--fail``
    flag this keeps an HTTP error page or an interrupted transfer from being
    left at *download_path*, where the "already exists" shortcut would reuse
    it on every later run.

    Args:
        download_path: Destination path of the ISO file.
        rhcos_version: RHCOS build to download. Defaults to the RHCOS_VERSION
            environment variable, and to "46.82.202009222340-0" when that is
            not set.

    Raises:
        OSError: if curl did not produce the downloaded file (in addition to
            whatever utils.run_command raises itself).
    """
    if os.path.exists(download_path):
        logging.info("Image %s already exists, skipping download", download_path)
        return

    logging.info("Downloading iso to %s", download_path)
    rhcos_version = rhcos_version or os.getenv('RHCOS_VERSION', "46.82.202009222340-0")
    partial_path = f"{download_path}.part"
    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # server's error document as if it were the image.
    utils.run_command(f"curl https://releases-art-rhcos.svc.ci.openshift.org/art/storage/releases/rhcos-4.6/"
                      f"{rhcos_version}/x86_64/rhcos-{rhcos_version}-live.x86_64.iso --retry 5 --fail "
                      f"-o {partial_path}")
    # Publish the image under its final name only once the download is complete.
    os.replace(partial_path, download_path)


def embed(image_name, ignition_file, embed_image_name):
    """Embed an ignition file into a live ISO using the coreos-installer container.

    The container runs with the current directory mounted at /data, reads
    ``BUILD_DIR/image_name`` and ``IBIP_DIR/ignition_file`` and writes the
    result to ``BUILD_DIR/embed_image_name``. A previous output with that
    name is removed first. The result is then moved into
    consts.BASE_IMAGE_FOLDER.

    Returns:
        Path of the embedded ISO inside consts.BASE_IMAGE_FOLDER.
    """
    logging.info("Embed ignition %s to iso %s", ignition_file, image_name)

    output_iso = os.path.join(BUILD_DIR, embed_image_name)
    if os.path.exists(output_iso):
        os.remove(output_iso)

    container_flags = shlex.split("--privileged --rm -v /dev:/dev -v /run/udev:/run/udev -v .:/data -w /data")
    installer_args = (
        f"iso ignition embed {BUILD_DIR}/{image_name} "
        f"-f --ignition-file /data/{IBIP_DIR}/{ignition_file} -o /data/{output_iso}"
    )
    utils.run_container(
        "coreos-installer",
        "quay.io/coreos/coreos-installer:release",
        container_flags,
        installer_args,
    )

    final_path = os.path.join(consts.BASE_IMAGE_FOLDER, embed_image_name)
    shutil.move(output_iso, final_path)
    return final_path


def fill_install_config(pull_secret, ssh_pub_key, net_asset, cluster_name):
    """Rewrite INSTALL_CONFIG in place with the cluster-specific values.

    Sets the pull secret, the SSH public key, the cluster name and the CIDR
    of the first machine network (taken from ``net_asset["machine_cidr"]``).
    str_presenter is registered with yaml before the file is processed so
    it is in effect when the config is dumped.
    """
    yaml.add_representer(str, str_presenter)

    with open(INSTALL_CONFIG, "r") as config_in:
        install_config = yaml.safe_load(config_in)

    install_config["pullSecret"] = pull_secret
    install_config["sshKey"] = ssh_pub_key
    install_config["metadata"]["name"] = cluster_name
    machine_cidr = net_asset["machine_cidr"]
    first_machine_network = install_config["networking"]["machineNetwork"][0]
    first_machine_network["cidr"] = machine_cidr

    with open(INSTALL_CONFIG, "w") as config_out:
        yaml.dump(install_config, config_out)


def setup_files_and_folders(args, net_asset, cluster_name):
    """Prepare the working directories and a filled-in install config.

    The base image folder is (re)created without forcing, IBIP_DIR is always
    recreated, the install-config template is copied from RESOURCES_DIR into
    IBIP_DIR and then filled from ``args.pull_secret``, ``args.ssh_key``,
    *net_asset* and *cluster_name*.
    """
    logging.info("Creating needed files and folders")

    utils.recreate_folder(consts.BASE_IMAGE_FOLDER, force_recreate=False)
    utils.recreate_folder(IBIP_DIR, with_chmod=False, force_recreate=True)

    config_template = os.path.join(RESOURCES_DIR, INSTALL_CONFIG_FILE_NAME)
    shutil.copy(config_template, IBIP_DIR)

    fill_install_config(
        pull_secret=args.pull_secret,
        ssh_pub_key=args.ssh_key,
        net_asset=net_asset,
        cluster_name=cluster_name,
    )


def str_presenter(dumper, data):
    """YAML representer for str: emit strings containing "ssh-rsa" in '|' style.

    Every other string is represented with the dumper's default scalar style.
    """
    # The presence of "ssh-rsa" is used as the marker for a multiline string.
    extra = {"style": '|'} if "ssh-rsa" in data else {}
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, **extra)


def create_controller(net_asset):
    """Build the TerraformController for a one-master, zero-worker cluster.

    The controller is created in bootstrap-in-place mode for the cluster
    named "test-infra-cluster", using *net_asset* for networking.
    """
    controller_settings = {
        "cluster_name": "test-infra-cluster",
        "num_masters": 1,
        "num_workers": 0,
        "master_memory": 32 * 1024,  # 32GB of RAM
        "master_vcpu": 12,
        "net_asset": net_asset,
        "iso_download_path": "<TBD>",  # will be set later on
        "bootstrap_in_place": True,
    }
    return TerraformController(**controller_settings)


def all_operators_up():
    """Return True only when operators were found and every one of them is up.

    Operator states come from get_operators_status(KUBE_CONFIG). An empty
    result counts as "not up yet".
    """
    operator_states = get_operators_status(KUBE_CONFIG)
    if not operator_states:
        logging.debug("No operator has been found currently...")
        return False

    still_down = [name for name, is_up in operator_states.items() if not is_up]
    if still_down:
        logging.debug("Following operators are still down: %s", ", ".join(still_down))
        return False

    return True


def gather_sosreport_data(host_ip):
    """Run the sosreport script on the host and fetch the resulting archive.

    Over an SSH connection to *host_ip* (authenticated with SSH_KEY) the
    script is uploaded, made executable and run with sudo; then
    /tmp/sosreport.tar.bz2 is downloaded into IBIP_DIR.
    """
    remote_script = "/tmp/man_sosreport.sh"
    remote_commands = (f"chmod a+x {remote_script}", f"sudo {remote_script}")

    with SshConnection(ip=host_ip, private_ssh_key_path=SSH_KEY) as ssh:
        ssh.upload_file(SOSREPORT_SCRIPT, remote_script)
        for remote_command in remote_commands:
            ssh.execute(remote_command)
        ssh.download_file("/tmp/sosreport.tar.bz2", IBIP_DIR)


def _collect_diagnostics(host_ip, best_effort):
    """Gather sosreport data (when the host IP is known) and must-gather data.

    Args:
        host_ip: IP of the installed host, or None when it was never resolved;
            the sosreport step is skipped in that case.
        best_effort: When True, a failing step is logged and the remaining
            steps still run. When False, the first failure propagates.
    """
    def _must_gather():
        utils.recreate_folder(MUST_GATHER_DIR)
        download_must_gather(KUBE_CONFIG, MUST_GATHER_DIR)

    steps = []
    if host_ip is not None:
        steps.append(("Gathering sosreport data from host...",
                      lambda: gather_sosreport_data(host_ip)))
    steps.append(("Gathering information via must-gather...", _must_gather))

    for message, step in steps:
        logging.info(message)
        try:
            step()
        except Exception:
            if not best_effort:
                raise
            logging.exception("Diagnostics step failed, continuing: %s", message)


def execute_ibip_flow(args):
    """Run the whole bootstrap-in-place installation flow.

    Steps: validate the OPENSHIFT_INSTALL_RELEASE_IMAGE environment variable,
    prepare files and folders, extract the installer and generate ignition
    configs, download the live ISO and embed bootstrap.ign into it, start the
    node from that image, configure /etc/hosts and wait up to one hour for
    all operators to come up. Diagnostics (sosreport, must-gather) are
    collected afterwards whether or not the installation succeeded.

    When the installation itself fails, diagnostics are collected on a
    best-effort basis so that a failure while gathering logs cannot replace
    the original installation error, which is always re-raised.

    Args:
        args: Parsed arguments; ``pull_secret`` and ``ssh_key`` are read.

    Raises:
        ValueError: if OPENSHIFT_INSTALL_RELEASE_IMAGE is not set.
    """
    host_ip = None
    try:
        openshift_release_image = os.getenv('OPENSHIFT_INSTALL_RELEASE_IMAGE')
        if not openshift_release_image:
            raise ValueError("os env OPENSHIFT_INSTALL_RELEASE_IMAGE must be provided")

        net_asset = NetworkAssets().get()
        controller = create_controller(net_asset)
        setup_files_and_folders(args, net_asset, controller.cluster_name)

        utils.extract_installer(openshift_release_image, BUILD_DIR)
        installer_generate()

        download_live_image(f"{BUILD_DIR}/installer-image.iso")
        image_path = embed("installer-image.iso", "bootstrap.ign", EMBED_IMAGE_NAME)

        logging.info("Starting nodes...")
        controller.image_path = image_path
        controller.start_all_nodes()
        logging.info("Nodes started!")

        logging.info("Configuring /etc/hosts...")
        # The single master's address doubles as the API VIP.
        host_ip = controller.master_ips[0][0]
        utils.config_etc_hosts(cluster_name=controller.cluster_name,
                               base_dns_domain=controller.cluster_domain,
                               api_vip=host_ip)

        logging.info("Waiting for installation to complete...")
        waiting.wait(all_operators_up,
                     sleep_seconds=20,
                     timeout_seconds=60 * 60,
                     waiting_for="all operators to get up")
        logging.info("Installation completed successfully!")

    except BaseException:
        # Keep the real failure: log-gathering problems are only logged here.
        _collect_diagnostics(host_ip, best_effort=True)
        raise
    else:
        # Success path keeps the original contract: gathering errors propagate.
        _collect_diagnostics(host_ip, best_effort=False)
31 changes: 30 additions & 1 deletion discovery-infra/oc_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import urllib3
import json
import urllib3
import subprocess

from kubernetes.config.kube_config import load_kube_config
from kubernetes.config.kube_config import Configuration
Expand Down Expand Up @@ -168,3 +169,31 @@ def _load_resource_config_dict(resource):
'kubectl.kubernetes.io/last-applied-configuration'
]
return json.loads(raw)


def get_operators_status(kubeconfig):
    """Return the availability of every cluster operator.

    Runs ``oc get clusteroperators -o json`` with the given kubeconfig and
    maps each operator name to True when its "Available" condition has
    status "True". Operators with no status, no conditions or no
    "Available" condition are reported as False.

    Args:
        kubeconfig: Path of the kubeconfig file handed to ``oc``.

    Returns:
        dict mapping operator name to bool; empty when ``oc`` exits non-zero.
    """
    command = ["/usr/local/bin/oc",
               "--kubeconfig", kubeconfig,
               "get", "clusteroperators", "-o", "json"]

    response = subprocess.run(command, stdout=subprocess.PIPE)
    if response.returncode != 0:
        return {}

    output = json.loads(response.stdout)
    statuses = {}

    for item in output["items"]:
        name = item["metadata"]["name"]
        # A freshly created operator may have no (or a null) status/conditions
        # yet; treat it as "not available" instead of failing the whole poll.
        conditions = (item.get("status") or {}).get("conditions") or []
        available = next(
            (condition["status"] for condition in conditions if condition["type"] == "Available"),
            None,
        )
        statuses[name] = available == "True"

    return statuses
31 changes: 31 additions & 0 deletions discovery-infra/resources/install-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Template install config for the bootstrap-in-place flow: one master replica,
# zero worker replicas, platform "none".
# fill_install_config() in bootstrap_in_place.py overwrites pullSecret, sshKey,
# metadata.name and networking.machineNetwork[0].cidr before use.
apiVersion: v1
baseDomain: redhat.com
compute:
- architecture: amd64
  hyperthreading: Enabled
  name: worker
  platform: {}
  replicas: 0
controlPlane:
  architecture: amd64
  hyperthreading: Enabled
  name: master
  platform: {}
  replicas: 1
metadata:
  creationTimestamp: null
  # Placeholder; replaced with the controller's cluster name.
  name: test-infra-cluster
networking:
  clusterNetwork:
  - cidr: 10.128.0.0/14
    hostPrefix: 23
  machineNetwork:
  # Placeholder; replaced with the network asset's machine_cidr.
  - cidr: 192.168.126.0/24
  networkType: OpenShiftSDN
  serviceNetwork:
  - 172.30.0.0/16
platform:
  none: {}
publish: External
# Placeholders; replaced with the real pull secret and SSH public key.
pullSecret: '{}'
sshKey: ""
123 changes: 123 additions & 0 deletions discovery-infra/resources/man_sosreport.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/bin/bash
# Manual sosreport: collects system, network, storage, kernel, hardware, kdump,
# container and log information into /tmp/sosreport and packs it into
# /tmp/sosreport.tar.bz2. Output of each command goes to a file named after it;
# failures of plain file copies are appended to ./error_log.
export LANG=C

# If this script hangs, un-comment the below two entries and note the command that the script hangs on. Then comment out that command and re-run the script.
# set -x
# set -o verbose

[[ -d /tmp/sosreport ]] && rm -rf /tmp/sosreport
# Abort if the work directory cannot be set up: everything below writes relative
# paths and the final tar packs "./", so continuing would collect (and archive)
# whatever directory the script happened to be started from.
mkdir /tmp/sosreport && cd /tmp/sosreport && mkdir -p var/log etc/lvm etc/sysconfig network storage sos_commands/networking \
    || { echo "Failed to prepare /tmp/sosreport" >&2; exit 1; }

echo -e "Gathering system information..."
hostname &> hostname
cp -a /etc/redhat-release ./etc/ 2>> error_log
uptime &> uptime

echo -e "Gathering application information..."
chkconfig --list &> chkconfig
top -bn1 &> top_bn1
service --status-all &> service_status_all
date &> date
ps auxww &> ps_auxww
ps -elf &> ps_-elf
rpm -qa --last &> rpm-qa
echo -e "Running 'rpm -Va'. This may take a moment."
rpm -Va &> rpm-Va

echo -e "Gathering memory information..."
free -m &> free
vmstat 1 10 &> vmstat

echo -e "Gathering network information..."
ifconfig &> ./network/ifconfig
netstat -s &>./network/netstat_-s
netstat -agn &> ./network/netstat_-agn
netstat -neopa &> ./network/netstat_-neopa
route -n &> ./network/route_-n
for i in $(ls /etc/sysconfig/network-scripts/{ifcfg,route,rule}-*) ; do echo -e "$i\n----------------------------------"; cat $i;echo " "; done &> ./sos_commands/networking/ifcfg-files
for i in $(ifconfig | grep "^[a-z]" | cut -f 1 -d " "); do echo -e "$i\n-------------------------" ; ethtool $i; ethtool -k $i; ethtool -S $i; ethtool -i $i;echo -e "\n" ; done &> ./sos_commands/networking/ethtool.out
cp /etc/sysconfig/network ./sos_commands/networking/ 2>> error_log
cp /etc/sysconfig/network-scripts/ifcfg-* ./sos_commands/networking/ 2>> error_log
cp /etc/sysconfig/network-scripts/route-* ./sos_commands/networking/ 2>> error_log
cat /proc/net/bonding/bond* &> ./sos_commands/networking/proc-net-bonding-bond 2>> error_log
iptables --list --line-numbers &> ./sos_commands/networking/iptables_--list_--line-numbers
ip route show table all &> ./sos_commands/networking/ip_route_show_table_all
ip link &> ./sos_commands/networking/ip_link

echo -e "Gathering Storage/Filesystem information..."
df -l &> df
fdisk -l &> fdisk
parted -l &> parted
cp -a /etc/fstab ./etc/ 2>> error_log
cp -a /etc/lvm/lvm.conf ./etc/lvm/ 2>> error_log
cp -a /etc/lvm/backup/ ./etc/lvm/ 2>> error_log
cp -a /etc/lvm/archive/ ./etc/lvm/ 2>> error_log
cp -a /etc/multipath.conf ./etc/ 2>> error_log
cat /proc/mounts &> mount
iostat -tkx 1 10 &> iostat_-tkx_1_10
parted -l &> storage/parted_-l
vgdisplay -v &> storage/vgdisplay
lvdisplay &> storage/lvdisplay
pvdisplay &> storage/pvdisplay
pvs -a -v &> storage/pvs
vgs -v &> storage/vgs
lvs -o +devices &> storage/lvs
multipath -v4 -ll &> storage/multipath_ll
pvscan -vvvv &> storage/pvscan
vgscan -vvvv &> storage/vgscan
lvscan -vvvv &> storage/lvscan
lsblk &> storage/lsblk
lsblk -t &> storage/lsblk_t
dmsetup info -C &> storage/dmsetup_info_c
dmsetup status &> storage/dmsetup_status
dmsetup table &> storage/dmsetup_table
ls -lahR /dev &> storage/dev

echo -e "Gathering kernel information..."
cp -a /etc/security/limits.conf ./etc/ 2>> error_log
cp -a /etc/sysctl.conf ./etc/ 2>> error_log
ulimit -a &> ulimit
cat /proc/slabinfo &> slabinfo
cat /proc/interrupts &> interrupts
cat /proc/iomem &> iomem
cat /proc/ioports &> ioports
slabtop -o &> slabtop_-o
uname -a &> uname
sysctl -a &> sysctl_-a
lsmod &> lsmod
cp -a /etc/modprobe.conf ./etc/ 2>> error_log
cp -a /etc/sysconfig/* ./etc/sysconfig/ 2>> error_log
# Redirect stdout first and then duplicate stderr onto it; the reverse order
# ("2>&1 >> modinfo") leaves stderr on the terminal instead of in the file.
for MOD in `lsmod | grep -v "Used by"| awk '{ print $1 }'`; do modinfo $MOD >> modinfo 2>&1; done;
ipcs -a &> ipcs_-a
ipcs -s | awk '/^0x/ {print $2}' | while read semid; do ipcs -s -i $semid; done &> ipcs_-s_verbose
sar -A &> sar_-A
cp -a /var/log/dmesg dmesg 2>> error_log
dmesg &> dmesg_now

echo -e "Gathering hardware information..."
dmidecode &> dmidecode
lspci -vvv &> lspci_-vvv
lspci &> lspci
cat /proc/meminfo &> meminfo
cat /proc/cpuinfo &> cpuinfo

echo -e "Gathering kdump information..."
cp -a /etc/kdump.conf ./etc/ 2>> error_log
ls -laR /var/crash &> ls-lar-var-crash
ls -1 /var/crash | while read n; do mkdir -p var/crash/${n}; cp -a /var/crash/${n}/vmcore-dmesg* var/crash/${n}/ 2>> error_log; done

echo -e "Gathering container related information..."
mkdir container
# Fall back to docker when podman is not available. A shell function is used
# because aliases are not expanded in non-interactive bash scripts, so
# 'alias podman="docker"' would never take effect here.
if ! command -v podman &> /dev/null; then
    podman() { docker "$@"; }
fi
podman ps &> container/ps
podman image list &> container/image_list
podman ps | awk '$1!="CONTAINER" {print $1}' | while read id; do podman inspect $id &> container/inspect_${id}; done

echo -e "Gathering logs..."
cp -a /var/log/{containers*,message*,secure*,boot*,cron*,yum*,Xorg*,sa,rhsm,audit,dmesg} ./var/log/ 2>> error_log
cp -a /etc/*syslog.conf ./etc/ 2>> error_log

echo -e "Compressing files..."
tar -cjf /tmp/sosreport.tar.bz2 ./

echo -e "Script complete."
Loading

0 comments on commit 4364824

Please sign in to comment.