From 44f5006d5b9035929d0575a89c01f0f906c44663 Mon Sep 17 00:00:00 2001 From: hyokeun Date: Mon, 4 Dec 2017 20:12:37 +0900 Subject: [PATCH] Make OBS worker online if it's not up Problem: Sometimes the docker container fails to come up. Fix: Check the status of all VMs and run the docker start command via docker-compose. Change-Id: Iaa20fa53b44ab3f265d16db4de4e7ed6bffc1cae --- job_control_ondemand_slaves.groovy | 60 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/job_control_ondemand_slaves.groovy b/job_control_ondemand_slaves.groovy index 7d00649..1a17e06 100644 --- a/job_control_ondemand_slaves.groovy +++ b/job_control_ondemand_slaves.groovy @@ -2,7 +2,7 @@ import hudson.model.* import jenkins.model.* import groovy.json.JsonSlurper -def execute_command(cmd, args, verbose=false) { +def execute_command(cmd, args, verbose=false, return_stdout=false) { if (!cmd.toString()?.trim()) { cmd = "python "+System.getenv("JENKINS_HOME")+"/jenkins-scripts/common/aws_ec2.py" } @@ -20,6 +20,10 @@ def execute_command(cmd, args, verbose=false) { println "ERR:\n" + err.toString() println "<<<< END CMD >>>>\n" } + if (return_stdout == true) { + def rs = "\n" + out.toString() + return rs + } def HashMap ret_items def ret_err = err.toString() @@ -106,6 +110,8 @@ echo "ONDEMAND SCAILING" } String userdata() { return this.make_init_script() } Integer instance_base() { conf.EC2_WORKER_INSTANCE_BASE.toInteger() } + boolean health_check_enabled() { return false } + String get_docker_compose_path() { return "/root/docker-compose.yml" } } public class WorkerConf_JENKINS_IMAGER extends WorkerConf { @@ -133,7 +139,7 @@ public class WorkerConf_OBS_WORKER_NORMAL extends WorkerConf { String get_remote_ssh_pub_key() { conf.EC2_WORKER_OBS_NORMAL_REMOTE_SSH_PUB_KEY } String make_init_script() { def hostname_prefix = "obsnw" - def docker_compose_file = "/root/docker-compose.yml" + def docker_compose_file = this.get_docker_compose_path() def String fileContents = 
new File(this.get_remote_ssh_pub_key()).text def backend02_num = conf.EC2_WORKER_OBS_NORMAL_BACKEND02_NUM def instance_base = this.instance_base() @@ -164,6 +170,7 @@ chown -R ${this.remote_user()}:${this.remote_user()} ${this.remote_fs()} Integer executors_per_slave() { conf.EC2_WORKER_OBS_NORMAL_NUMBER_OF_EXECUTORS.toInteger() } Integer max_slaves() { conf.EC2_WORKER_OBS_NORMAL_INSTANCE_CAP_STR.toInteger() } String tag_source() { conf.EC2_WORKER_OBS_NORMAL_TAG_SOURCE } + boolean health_check_enabled() { return true } } class WorkerConf_OBS_WORKER_POWER extends WorkerConf_OBS_WORKER_NORMAL { @@ -276,6 +283,49 @@ def get_worker_conf(purpose) { return worker_conf } +def check_healthy_status(worker_conf, vm_list) { + + if (worker_conf.health_check_enabled() != true) { + return + } + def inst_for_check = [] + + vm_list.each { k, v -> + if (v["state"] != "terminated" && v["state"] != "shutting-down") { + v["tags"].each { tv -> + if (tv["Key"] == "slot") { + inst_for_check.add(vm_list[k]) + } + } + } + } + + def sendStatus = inst_for_check.each { inst -> + println "Healthy Checking " + inst["private_ip_address"] + " " + inst["instance_id"] + def ssh_username = worker_conf.remote_user() + def ssh_hostname = "${inst["private_ip_address"]}" + def ssh_known_hosts_file = System.getProperty("user.home") + "/.ssh/known_hosts" + def ssh_known_hosts_file_my = ssh_known_hosts_file + ".obs_scailing" + def ssh_options = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=" + ssh_known_hosts_file_my + def docker_command = "sudo docker exec obs_worker rcobsworker status" + // Remove out-of-date known hosts + execute_command('cp', ssh_known_hosts_file + ' ' + ssh_known_hosts_file_my) + execute_command('chmod', '0777 ' + ssh_known_hosts_file_my) + execute_command('ssh-keygen', '-f ' + ssh_known_hosts_file_my + ' -R ' + ssh_hostname, verbose=true) + def ret_val = execute_command("ssh", ssh_options + " -X " + ssh_username + "@" + ssh_hostname + " " + docker_command, + verbose=false, 
return_stdout=true) + if (!ret_val.contains(" Active: active (running)") || !ret_val.contains("Checking for obsworker: ..running")) { + println 'HEALTHY:FAIL for ' + ssh_hostname + '\nRestarting docker...' + docker_command = "sudo docker-compose -f /root/docker-compose.yml up -d" + execute_command("ssh", ssh_options + " -X " + ssh_username + "@" + ssh_hostname + " " + docker_command, + verbose=false, return_stdout=true) + } + } + + //TODO: Lets run multi-threaded code +} + + //TODO: FIXME: e = { filepath -> evaluate(new File(System.getenv("JENKINS_HOME") + "/init.groovy.d/" + filepath)) @@ -402,7 +452,8 @@ def worker_ondemand_create_request(worker_conf, Integer num_requested_executors) def free_slots = [] def allocated_slots = [] for (i = worker_conf.instance_base(); i <= worker_conf.instance_base()+worker_conf.max_slaves(); i++) { free_slots.add(String.format("%03d", i)) } - get_aws_status(worker_conf).each { k, v -> + def current_aws_status_list = get_aws_status(worker_conf) + current_aws_status_list.each { k, v -> println k println v['tags'] if (v['state'] == 'terminated' || v['state'] == 'shutting-down') { @@ -442,6 +493,9 @@ def worker_ondemand_create_request(worker_conf, Integer num_requested_executors) } println "\"TitleDisplay\": \"Create(${created_inst.size()}/${num_requested_executors})\"" } + + // Make VMs online if its not ready + check_healthy_status(worker_conf, current_aws_status_list) } def worker_ondemand_revoke_request_wo_nodes(worker_conf) { -- 2.7.4