Make OBS worker online if it's not up 15/162615/1
authorhyokeun <hyokeun.jeon@samsung.com>
Mon, 4 Dec 2017 11:12:37 +0000 (20:12 +0900)
committerhyokeun <hyokeun.jeon@samsung.com>
Mon, 4 Dec 2017 11:12:39 +0000 (20:12 +0900)
Problem: Sometimes the docker container fails to come up.
Fix: Check the status of all VMs and run the docker start command via docker-compose.

Change-Id: Iaa20fa53b44ab3f265d16db4de4e7ed6bffc1cae

job_control_ondemand_slaves.groovy

index 7d00649..1a17e06 100644 (file)
@@ -2,7 +2,7 @@ import hudson.model.*
 import jenkins.model.*
 import groovy.json.JsonSlurper
 
-def execute_command(cmd, args, verbose=false) {
+def execute_command(cmd, args, verbose=false, return_stdout=false) {
     if (!cmd.toString()?.trim()) {
         cmd = "python "+System.getenv("JENKINS_HOME")+"/jenkins-scripts/common/aws_ec2.py"
     }
@@ -20,6 +20,10 @@ def execute_command(cmd, args, verbose=false) {
         println "ERR:\n" + err.toString()
         println "<<<< END CMD >>>>\n"
     }
+    if (return_stdout == true) {
+        def rs = "\n" + out.toString()
+        return rs
+    }
 
     def HashMap ret_items
     def ret_err = err.toString()
@@ -106,6 +110,8 @@ echo "ONDEMAND SCAILING"
     }
     String userdata() { return this.make_init_script() }
     Integer instance_base() { conf.EC2_WORKER_INSTANCE_BASE.toInteger() }
+    boolean health_check_enabled() { return false }
+    String get_docker_compose_path() { return "/root/docker-compose.yml" }
 }
 
 public class WorkerConf_JENKINS_IMAGER extends WorkerConf {
@@ -133,7 +139,7 @@ public class WorkerConf_OBS_WORKER_NORMAL extends WorkerConf {
     String get_remote_ssh_pub_key() { conf.EC2_WORKER_OBS_NORMAL_REMOTE_SSH_PUB_KEY }
     String make_init_script() {
         def hostname_prefix = "obsnw"
-        def docker_compose_file = "/root/docker-compose.yml"
+        def docker_compose_file = this.get_docker_compose_path()
         def String fileContents = new File(this.get_remote_ssh_pub_key()).text
         def backend02_num = conf.EC2_WORKER_OBS_NORMAL_BACKEND02_NUM
         def instance_base = this.instance_base()
@@ -164,6 +170,7 @@ chown -R ${this.remote_user()}:${this.remote_user()} ${this.remote_fs()}
     Integer executors_per_slave() { conf.EC2_WORKER_OBS_NORMAL_NUMBER_OF_EXECUTORS.toInteger() }
     Integer max_slaves() { conf.EC2_WORKER_OBS_NORMAL_INSTANCE_CAP_STR.toInteger() }
     String tag_source() { conf.EC2_WORKER_OBS_NORMAL_TAG_SOURCE }
+    boolean health_check_enabled() { return true }
 }
 
 class WorkerConf_OBS_WORKER_POWER extends WorkerConf_OBS_WORKER_NORMAL {
@@ -276,6 +283,49 @@ def get_worker_conf(purpose) {
     return worker_conf
 }
 
+def check_healthy_status(worker_conf, vm_list) {
+    // For each live VM tagged "slot": check obsworker over SSH, run docker-compose up if unhealthy.
+    if (worker_conf.health_check_enabled() != true) {
+        return
+    }
+    def inst_for_check = []
+    vm_list.each { k, v ->
+        if (v["state"] != "terminated" && v["state"] != "shutting-down") {
+            v["tags"].each { tv ->
+                if (tv["Key"] == "slot") {
+                    inst_for_check.add(vm_list[k])
+                }
+            }
+        }
+    }
+
+    inst_for_check.each { inst ->
+        println "Healthy Checking " + inst["private_ip_address"] + " " + inst["instance_id"]
+        def ssh_username = worker_conf.remote_user()
+        def ssh_hostname = "${inst["private_ip_address"]}"
+        def ssh_known_hosts_file = System.getProperty("user.home") + "/.ssh/known_hosts"
+        def ssh_known_hosts_file_my = ssh_known_hosts_file + ".obs_scailing"
+        def ssh_options = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=" + ssh_known_hosts_file_my
+        def ssh_prefix = ssh_options + " -X " + ssh_username + "@" + ssh_hostname + " "
+        def docker_command = "sudo docker exec obs_worker rcobsworker status"
+        // Work on a private copy of known_hosts and drop any out-of-date key for this host
+        execute_command('cp', ssh_known_hosts_file + ' ' + ssh_known_hosts_file_my)
+        execute_command('chmod', '0777 ' + ssh_known_hosts_file_my)
+        // Positional args: Groovy treats `verbose=true` in a call as an assignment, not a named argument
+        execute_command('ssh-keygen', '-f ' + ssh_known_hosts_file_my + ' -R ' + ssh_hostname, true)
+        def ret_val = execute_command("ssh", ssh_prefix + docker_command, false, true)
+        if (!ret_val.contains("   Active: active (running)") || !ret_val.contains("Checking for obsworker: ..running")) {
+            println 'HEALTHY:FAIL for ' + ssh_hostname + '\nRestarting docker...'
+            // Ask the worker configuration for the compose file instead of hard-coding the path
+            docker_command = "sudo docker-compose -f " + worker_conf.get_docker_compose_path() + " up -d"
+            execute_command("ssh", ssh_prefix + docker_command, false, true)
+        }
+    }
+
+    //TODO: Lets run multi-threaded code
+}
+
+
 //TODO: FIXME:
 e = { filepath ->
     evaluate(new File(System.getenv("JENKINS_HOME") + "/init.groovy.d/" + filepath))
@@ -402,7 +452,8 @@ def worker_ondemand_create_request(worker_conf, Integer num_requested_executors)
     def free_slots = []
     def allocated_slots = []
     for (i = worker_conf.instance_base(); i <= worker_conf.instance_base()+worker_conf.max_slaves(); i++) { free_slots.add(String.format("%03d", i)) }
-    get_aws_status(worker_conf).each { k, v ->
+    def current_aws_status_list = get_aws_status(worker_conf)
+    current_aws_status_list.each { k, v ->
         println k
         println v['tags']
         if (v['state'] == 'terminated' || v['state'] == 'shutting-down') {
@@ -442,6 +493,9 @@ def worker_ondemand_create_request(worker_conf, Integer num_requested_executors)
         }
         println "\"TitleDisplay\": \"Create(${created_inst.size()}/${num_requested_executors})\""
     }
+
+    // Bring VMs online if they are not ready
+    check_healthy_status(worker_conf, current_aws_status_list)
 }
 
 def worker_ondemand_revoke_request_wo_nodes(worker_conf) {