drivers/gpu/drm/i915/selftests/intel_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "../i915_selftest.h"
28 #include "i915_random.h"
29 #include "igt_flush_test.h"
30 #include "igt_wedge_me.h"
31
32 #include "mock_context.h"
33 #include "mock_drm.h"
34
35 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
36
37 struct hang {
38         struct drm_i915_private *i915;
39         struct drm_i915_gem_object *hws; /* HWS page the spinner writes its seqno into */
40         struct drm_i915_gem_object *obj; /* batch buffer holding the spinning loop */
41         struct i915_gem_context *ctx;
42         u32 *seqno; /* CPU map of hws */
43         u32 *batch; /* CPU map of obj */
44 };
45
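/*
 * hang_init() sets up the reusable pieces for a "hanging" batch: a private
 * kernel context, a one page HWS object (CPU mapped as h->seqno) that the
 * spinner writes its seqno into, and a one page batch object (CPU mapped as
 * h->batch) that will hold the self-referencing loop.
 */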
46 static int hang_init(struct hang *h, struct drm_i915_private *i915)
47 {
48         void *vaddr;
49         int err;
50
51         memset(h, 0, sizeof(*h));
52         h->i915 = i915;
53
54         h->ctx = kernel_context(i915);
55         if (IS_ERR(h->ctx))
56                 return PTR_ERR(h->ctx);
57
58         h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
59         if (IS_ERR(h->hws)) {
60                 err = PTR_ERR(h->hws);
61                 goto err_ctx;
62         }
63
64         h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
65         if (IS_ERR(h->obj)) {
66                 err = PTR_ERR(h->obj);
67                 goto err_hws;
68         }
69
70         i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
71         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
72         if (IS_ERR(vaddr)) {
73                 err = PTR_ERR(vaddr);
74                 goto err_obj;
75         }
76         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
77
78         vaddr = i915_gem_object_pin_map(h->obj,
79                                         HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
80         if (IS_ERR(vaddr)) {
81                 err = PTR_ERR(vaddr);
82                 goto err_unpin_hws;
83         }
84         h->batch = vaddr;
85
86         return 0;
87
88 err_unpin_hws:
89         i915_gem_object_unpin_map(h->hws);
90 err_obj:
91         i915_gem_object_put(h->obj);
92 err_hws:
93         i915_gem_object_put(h->hws);
94 err_ctx:
95         kernel_context_close(h->ctx);
96         return err;
97 }
98
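/*
 * Each context gets its own dword slot in the HWS page, indexed by
 * fence.context folded into the page; hws_seqno() below reads the same slot
 * back on the CPU.
 */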
99 static u64 hws_address(const struct i915_vma *hws,
100                        const struct i915_request *rq)
101 {
102         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
103 }
104
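/*
 * Build a batch that writes the request's seqno to its HWS slot and then
 * branches back to its own start, spinning forever. The MI_ARB_CHECKs allow
 * arbitration, and the trailing MI_BATCH_BUFFER_END is only reached once the
 * CPU overwrites the loop (see hang_fini() and the sanitycheck) or the
 * request is cancelled by a reset.
 */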
105 static int emit_recurse_batch(struct hang *h,
106                               struct i915_request *rq)
107 {
108         struct drm_i915_private *i915 = h->i915;
109         struct i915_address_space *vm =
110                 rq->gem_context->ppgtt ?
111                 &rq->gem_context->ppgtt->vm :
112                 &i915->ggtt.vm;
113         struct i915_vma *hws, *vma;
114         unsigned int flags;
115         u32 *batch;
116         int err;
117
118         vma = i915_vma_instance(h->obj, vm, NULL);
119         if (IS_ERR(vma))
120                 return PTR_ERR(vma);
121
122         hws = i915_vma_instance(h->hws, vm, NULL);
123         if (IS_ERR(hws))
124                 return PTR_ERR(hws);
125
126         err = i915_vma_pin(vma, 0, 0, PIN_USER);
127         if (err)
128                 return err;
129
130         err = i915_vma_pin(hws, 0, 0, PIN_USER);
131         if (err)
132                 goto unpin_vma;
133
134         err = i915_vma_move_to_active(vma, rq, 0);
135         if (err)
136                 goto unpin_hws;
137
138         if (!i915_gem_object_has_active_reference(vma->obj)) {
139                 i915_gem_object_get(vma->obj);
140                 i915_gem_object_set_active_reference(vma->obj);
141         }
142
143         err = i915_vma_move_to_active(hws, rq, 0);
144         if (err)
145                 goto unpin_hws;
146
147         if (!i915_gem_object_has_active_reference(hws->obj)) {
148                 i915_gem_object_get(hws->obj);
149                 i915_gem_object_set_active_reference(hws->obj);
150         }
151
152         batch = h->batch;
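        /*
         * The MI_STORE_DWORD_IMM and MI_BATCH_BUFFER_START encodings differ
         * per generation (64-bit vs 32-bit addresses, ppgtt vs ggtt); on
         * pre-gen6 the batch is run as a privileged dispatch from the GGTT,
         * hence the I915_DISPATCH_SECURE flag below.
         */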
153         if (INTEL_GEN(i915) >= 8) {
154                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
155                 *batch++ = lower_32_bits(hws_address(hws, rq));
156                 *batch++ = upper_32_bits(hws_address(hws, rq));
157                 *batch++ = rq->fence.seqno;
158                 *batch++ = MI_ARB_CHECK;
159
160                 memset(batch, 0, 1024);
161                 batch += 1024 / sizeof(*batch);
162
163                 *batch++ = MI_ARB_CHECK;
164                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
165                 *batch++ = lower_32_bits(vma->node.start);
166                 *batch++ = upper_32_bits(vma->node.start);
167         } else if (INTEL_GEN(i915) >= 6) {
168                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
169                 *batch++ = 0;
170                 *batch++ = lower_32_bits(hws_address(hws, rq));
171                 *batch++ = rq->fence.seqno;
172                 *batch++ = MI_ARB_CHECK;
173
174                 memset(batch, 0, 1024);
175                 batch += 1024 / sizeof(*batch);
176
177                 *batch++ = MI_ARB_CHECK;
178                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
179                 *batch++ = lower_32_bits(vma->node.start);
180         } else if (INTEL_GEN(i915) >= 4) {
181                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
182                 *batch++ = 0;
183                 *batch++ = lower_32_bits(hws_address(hws, rq));
184                 *batch++ = rq->fence.seqno;
185                 *batch++ = MI_ARB_CHECK;
186
187                 memset(batch, 0, 1024);
188                 batch += 1024 / sizeof(*batch);
189
190                 *batch++ = MI_ARB_CHECK;
191                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
192                 *batch++ = lower_32_bits(vma->node.start);
193         } else {
194                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
195                 *batch++ = lower_32_bits(hws_address(hws, rq));
196                 *batch++ = rq->fence.seqno;
197                 *batch++ = MI_ARB_CHECK;
198
199                 memset(batch, 0, 1024);
200                 batch += 1024 / sizeof(*batch);
201
202                 *batch++ = MI_ARB_CHECK;
203                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
204                 *batch++ = lower_32_bits(vma->node.start);
205         }
206         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
207         i915_gem_chipset_flush(h->i915);
208
209         flags = 0;
210         if (INTEL_GEN(vm->i915) <= 5)
211                 flags |= I915_DISPATCH_SECURE;
212
213         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
214
215 unpin_hws:
216         i915_vma_unpin(hws);
217 unpin_vma:
218         i915_vma_unpin(vma);
219         return err;
220 }
221
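/*
 * Allocate a request carrying the recursive batch. If the previous batch
 * object is still active (an earlier spinner has not been cancelled yet),
 * switch to a fresh object first so we never rewrite a batch the GPU may
 * still be executing.
 */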
222 static struct i915_request *
223 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
224 {
225         struct i915_request *rq;
226         int err;
227
228         if (i915_gem_object_is_active(h->obj)) {
229                 struct drm_i915_gem_object *obj;
230                 void *vaddr;
231
232                 obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
233                 if (IS_ERR(obj))
234                         return ERR_CAST(obj);
235
236                 vaddr = i915_gem_object_pin_map(obj,
237                                                 HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
238                 if (IS_ERR(vaddr)) {
239                         i915_gem_object_put(obj);
240                         return ERR_CAST(vaddr);
241                 }
242
243                 i915_gem_object_unpin_map(h->obj);
244                 i915_gem_object_put(h->obj);
245
246                 h->obj = obj;
247                 h->batch = vaddr;
248         }
249
250         rq = i915_request_alloc(engine, h->ctx);
251         if (IS_ERR(rq))
252                 return rq;
253
254         err = emit_recurse_batch(h, rq);
255         if (err) {
256                 i915_request_add(rq);
257                 return ERR_PTR(err);
258         }
259
260         return rq;
261 }
262
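/* Read back the seqno the spinner wrote into this request's HWS slot. */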
263 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
264 {
265         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
266 }
267
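/*
 * Terminate any spinner still running by overwriting the start of its loop
 * with MI_BATCH_BUFFER_END, then release the objects and context.
 */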
268 static void hang_fini(struct hang *h)
269 {
270         *h->batch = MI_BATCH_BUFFER_END;
271         i915_gem_chipset_flush(h->i915);
272
273         i915_gem_object_unpin_map(h->obj);
274         i915_gem_object_put(h->obj);
275
276         i915_gem_object_unpin_map(h->hws);
277         i915_gem_object_put(h->hws);
278
279         kernel_context_close(h->ctx);
280
281         igt_flush_test(h->i915, I915_WAIT_LOCKED);
282 }
283
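/*
 * Returns true once the spinner has written its seqno, i.e. the request is
 * actually executing on the GPU: first a short 10us busy-wait, then a
 * sleeping wait of up to a second.
 */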
284 static bool wait_until_running(struct hang *h, struct i915_request *rq)
285 {
286         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
287                                                rq->fence.seqno),
288                              10) &&
289                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
290                                             rq->fence.seqno),
291                           1000));
292 }
293
294 static int igt_hang_sanitycheck(void *arg)
295 {
296         struct drm_i915_private *i915 = arg;
297         struct i915_request *rq;
298         struct intel_engine_cs *engine;
299         enum intel_engine_id id;
300         struct hang h;
301         int err;
302
303         /* Basic check that we can execute our hanging batch */
304
305         mutex_lock(&i915->drm.struct_mutex);
306         err = hang_init(&h, i915);
307         if (err)
308                 goto unlock;
309
310         for_each_engine(engine, i915, id) {
311                 long timeout;
312
313                 if (!intel_engine_can_store_dword(engine))
314                         continue;
315
316                 rq = hang_create_request(&h, engine);
317                 if (IS_ERR(rq)) {
318                         err = PTR_ERR(rq);
319                         pr_err("Failed to create request for %s, err=%d\n",
320                                engine->name, err);
321                         goto fini;
322                 }
323
324                 i915_request_get(rq);
325
326                 *h.batch = MI_BATCH_BUFFER_END;
327                 i915_gem_chipset_flush(i915);
328
329                 i915_request_add(rq);
330
331                 timeout = i915_request_wait(rq,
332                                             I915_WAIT_LOCKED,
333                                             MAX_SCHEDULE_TIMEOUT);
334                 i915_request_put(rq);
335
336                 if (timeout < 0) {
337                         err = timeout;
338                         pr_err("Wait for request failed on %s, err=%d\n",
339                                engine->name, err);
340                         goto fini;
341                 }
342         }
343
344 fini:
345         hang_fini(&h);
346 unlock:
347         mutex_unlock(&i915->drm.struct_mutex);
348         return err;
349 }
350
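/*
 * Serialise with any reset in progress: claim I915_RESET_BACKOFF and every
 * per-engine reset bit so nothing else can reset the GPU underneath the
 * test, mirroring the locking used by the real reset path.
 */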
351 static void global_reset_lock(struct drm_i915_private *i915)
352 {
353         struct intel_engine_cs *engine;
354         enum intel_engine_id id;
355
356         pr_debug("%s: current gpu_error=%08lx\n",
357                  __func__, i915->gpu_error.flags);
358
359         while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
360                 wait_event(i915->gpu_error.reset_queue,
361                            !test_bit(I915_RESET_BACKOFF,
362                                      &i915->gpu_error.flags));
363
364         for_each_engine(engine, i915, id) {
365                 while (test_and_set_bit(I915_RESET_ENGINE + id,
366                                         &i915->gpu_error.flags))
367                         wait_on_bit(&i915->gpu_error.flags,
368                                     I915_RESET_ENGINE + id,
369                                     TASK_UNINTERRUPTIBLE);
370         }
371 }
372
373 static void global_reset_unlock(struct drm_i915_private *i915)
374 {
375         struct intel_engine_cs *engine;
376         enum intel_engine_id id;
377
378         for_each_engine(engine, i915, id)
379                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
380
381         clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
382         wake_up_all(&i915->gpu_error.reset_queue);
383 }
384
385 static int igt_global_reset(void *arg)
386 {
387         struct drm_i915_private *i915 = arg;
388         unsigned int reset_count;
389         int err = 0;
390
391         /* Check that we can issue a global GPU reset */
392
393         global_reset_lock(i915);
394         set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
395
396         mutex_lock(&i915->drm.struct_mutex);
397         reset_count = i915_reset_count(&i915->gpu_error);
398
399         i915_reset(i915, ALL_ENGINES, NULL);
400
401         if (i915_reset_count(&i915->gpu_error) == reset_count) {
402                 pr_err("No GPU reset recorded!\n");
403                 err = -EINVAL;
404         }
405         mutex_unlock(&i915->drm.struct_mutex);
406
407         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
408         global_reset_unlock(i915);
409
410         if (i915_terminally_wedged(&i915->gpu_error))
411                 err = -EIO;
412
413         return err;
414 }
415
416 static bool wait_for_idle(struct intel_engine_cs *engine)
417 {
418         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
419 }
420
421 static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
422 {
423         struct intel_engine_cs *engine;
424         enum intel_engine_id id;
425         struct hang h;
426         int err = 0;
427
428         /* Check that we can issue an engine reset on an idle or active engine (a no-op when idle) */
429
430         if (!intel_has_reset_engine(i915))
431                 return 0;
432
433         if (active) {
434                 mutex_lock(&i915->drm.struct_mutex);
435                 err = hang_init(&h, i915);
436                 mutex_unlock(&i915->drm.struct_mutex);
437                 if (err)
438                         return err;
439         }
440
441         for_each_engine(engine, i915, id) {
442                 unsigned int reset_count, reset_engine_count;
443                 IGT_TIMEOUT(end_time);
444
445                 if (active && !intel_engine_can_store_dword(engine))
446                         continue;
447
448                 if (!wait_for_idle(engine)) {
449                         pr_err("%s failed to idle before reset\n",
450                                engine->name);
451                         err = -EIO;
452                         break;
453                 }
454
455                 reset_count = i915_reset_count(&i915->gpu_error);
456                 reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
457                                                              engine);
458
459                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
460                 do {
461                         u32 seqno = intel_engine_get_seqno(engine);
462
463                         if (active) {
464                                 struct i915_request *rq;
465
466                                 mutex_lock(&i915->drm.struct_mutex);
467                                 rq = hang_create_request(&h, engine);
468                                 if (IS_ERR(rq)) {
469                                         err = PTR_ERR(rq);
470                                         mutex_unlock(&i915->drm.struct_mutex);
471                                         break;
472                                 }
473
474                                 i915_request_get(rq);
475                                 i915_request_add(rq);
476                                 mutex_unlock(&i915->drm.struct_mutex);
477
478                                 if (!wait_until_running(&h, rq)) {
479                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
480
481                                         pr_err("%s: Failed to start request %x, at %x\n",
482                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
483                                         intel_engine_dump(engine, &p,
484                                                           "%s\n", engine->name);
485
486                                         i915_request_put(rq);
487                                         err = -EIO;
488                                         break;
489                                 }
490
491                                 GEM_BUG_ON(!rq->global_seqno);
492                                 seqno = rq->global_seqno - 1;
493                                 i915_request_put(rq);
494                         }
495
496                         err = i915_reset_engine(engine, NULL);
497                         if (err) {
498                                 pr_err("i915_reset_engine failed\n");
499                                 break;
500                         }
501
502                         if (i915_reset_count(&i915->gpu_error) != reset_count) {
503                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
504                                 err = -EINVAL;
505                                 break;
506                         }
507
508                         reset_engine_count += active;
509                         if (i915_reset_engine_count(&i915->gpu_error, engine) !=
510                             reset_engine_count) {
511                                 pr_err("%s engine reset %srecorded!\n",
512                                        engine->name, active ? "not " : "");
513                                 err = -EINVAL;
514                                 break;
515                         }
516
517                         if (!wait_for_idle(engine)) {
518                                 struct drm_printer p =
519                                         drm_info_printer(i915->drm.dev);
520
521                                 pr_err("%s failed to idle after reset\n",
522                                        engine->name);
523                                 intel_engine_dump(engine, &p,
524                                                   "%s\n", engine->name);
525
526                                 err = -EIO;
527                                 break;
528                         }
529                 } while (time_before(jiffies, end_time));
530                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
531
532                 if (err)
533                         break;
534
535                 err = igt_flush_test(i915, 0);
536                 if (err)
537                         break;
538         }
539
540         if (i915_terminally_wedged(&i915->gpu_error))
541                 err = -EIO;
542
543         if (active) {
544                 mutex_lock(&i915->drm.struct_mutex);
545                 hang_fini(&h);
546                 mutex_unlock(&i915->drm.struct_mutex);
547         }
548
549         return err;
550 }
551
552 static int igt_reset_idle_engine(void *arg)
553 {
554         return __igt_reset_engine(arg, false);
555 }
556
557 static int igt_reset_active_engine(void *arg)
558 {
559         return __igt_reset_engine(arg, true);
560 }
561
562 struct active_engine {
563         struct task_struct *task;
564         struct intel_engine_cs *engine;
565         unsigned long resets;
566         unsigned int flags;
567 };
568
569 #define TEST_ACTIVE     BIT(0)
570 #define TEST_OTHERS     BIT(1)
571 #define TEST_SELF       BIT(2)
572 #define TEST_PRIORITY   BIT(3)
573
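/*
 * Wait up to 5s for a background request to complete; if it does not, wedge
 * the device so the selftest fails with -EIO instead of hanging forever.
 */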
574 static int active_request_put(struct i915_request *rq)
575 {
576         int err = 0;
577
578         if (!rq)
579                 return 0;
580
581         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
582                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
583                           rq->engine->name,
584                           rq->fence.context,
585                           rq->fence.seqno,
586                           i915_request_global_seqno(rq));
587                 GEM_TRACE_DUMP();
588
589                 i915_gem_set_wedged(rq->i915);
590                 err = -EIO;
591         }
592
593         i915_request_put(rq);
594
595         return err;
596 }
597
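/*
 * kthread body used to keep another engine busy while resets are injected:
 * it cycles through ARRAY_SIZE(rq) contexts, keeping a rolling window of
 * requests in flight (optionally with randomised priorities) until asked to
 * stop.
 */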
598 static int active_engine(void *data)
599 {
600         I915_RND_STATE(prng);
601         struct active_engine *arg = data;
602         struct intel_engine_cs *engine = arg->engine;
603         struct i915_request *rq[8] = {};
604         struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
605         struct drm_file *file;
606         unsigned long count = 0;
607         int err = 0;
608
609         file = mock_file(engine->i915);
610         if (IS_ERR(file))
611                 return PTR_ERR(file);
612
613         for (count = 0; count < ARRAY_SIZE(ctx); count++) {
614                 mutex_lock(&engine->i915->drm.struct_mutex);
615                 ctx[count] = live_context(engine->i915, file);
616                 mutex_unlock(&engine->i915->drm.struct_mutex);
617                 if (IS_ERR(ctx[count])) {
618                         err = PTR_ERR(ctx[count]);
619                         while (--count)
620                                 i915_gem_context_put(ctx[count]);
621                         goto err_file;
622                 }
623         }
624
625         while (!kthread_should_stop()) {
626                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
627                 struct i915_request *old = rq[idx];
628                 struct i915_request *new;
629
630                 mutex_lock(&engine->i915->drm.struct_mutex);
631                 new = i915_request_alloc(engine, ctx[idx]);
632                 if (IS_ERR(new)) {
633                         mutex_unlock(&engine->i915->drm.struct_mutex);
634                         err = PTR_ERR(new);
635                         break;
636                 }
637
638                 if (arg->flags & TEST_PRIORITY)
639                         ctx[idx]->sched.priority =
640                                 i915_prandom_u32_max_state(512, &prng);
641
642                 rq[idx] = i915_request_get(new);
643                 i915_request_add(new);
644                 mutex_unlock(&engine->i915->drm.struct_mutex);
645
646                 err = active_request_put(old);
647                 if (err)
648                         break;
649
650                 cond_resched();
651         }
652
653         for (count = 0; count < ARRAY_SIZE(rq); count++) {
654                 int err__ = active_request_put(rq[count]);
655
656                 /* Keep the first error */
657                 if (!err)
658                         err = err__;
659         }
660
661 err_file:
662         mock_file_free(engine->i915, file);
663         return err;
664 }
665
666 static int __igt_reset_engines(struct drm_i915_private *i915,
667                                const char *test_name,
668                                unsigned int flags)
669 {
670         struct intel_engine_cs *engine, *other;
671         enum intel_engine_id id, tmp;
672         struct hang h;
673         int err = 0;
674
675         /* Check that issuing a reset on one engine does not interfere
676          * with any other engine.
677          */
678
679         if (!intel_has_reset_engine(i915))
680                 return 0;
681
682         if (flags & TEST_ACTIVE) {
683                 mutex_lock(&i915->drm.struct_mutex);
684                 err = hang_init(&h, i915);
685                 mutex_unlock(&i915->drm.struct_mutex);
686                 if (err)
687                         return err;
688
689                 if (flags & TEST_PRIORITY)
690                         h.ctx->sched.priority = 1024;
691         }
692
693         for_each_engine(engine, i915, id) {
694                 struct active_engine threads[I915_NUM_ENGINES] = {};
695                 unsigned long global = i915_reset_count(&i915->gpu_error);
696                 unsigned long count = 0, reported;
697                 IGT_TIMEOUT(end_time);
698
699                 if (flags & TEST_ACTIVE &&
700                     !intel_engine_can_store_dword(engine))
701                         continue;
702
703                 if (!wait_for_idle(engine)) {
704                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
705                                engine->name, test_name);
706                         err = -EIO;
707                         break;
708                 }
709
710                 memset(threads, 0, sizeof(threads));
711                 for_each_engine(other, i915, tmp) {
712                         struct task_struct *tsk;
713
714                         threads[tmp].resets =
715                                 i915_reset_engine_count(&i915->gpu_error,
716                                                         other);
717
718                         if (!(flags & TEST_OTHERS))
719                                 continue;
720
721                         if (other == engine && !(flags & TEST_SELF))
722                                 continue;
723
724                         threads[tmp].engine = other;
725                         threads[tmp].flags = flags;
726
727                         tsk = kthread_run(active_engine, &threads[tmp],
728                                           "igt/%s", other->name);
729                         if (IS_ERR(tsk)) {
730                                 err = PTR_ERR(tsk);
731                                 goto unwind;
732                         }
733
734                         threads[tmp].task = tsk;
735                         get_task_struct(tsk);
736                 }
737
738                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
739                 do {
740                         u32 seqno = intel_engine_get_seqno(engine);
741                         struct i915_request *rq = NULL;
742
743                         if (flags & TEST_ACTIVE) {
744                                 mutex_lock(&i915->drm.struct_mutex);
745                                 rq = hang_create_request(&h, engine);
746                                 if (IS_ERR(rq)) {
747                                         err = PTR_ERR(rq);
748                                         mutex_unlock(&i915->drm.struct_mutex);
749                                         break;
750                                 }
751
752                                 i915_request_get(rq);
753                                 i915_request_add(rq);
754                                 mutex_unlock(&i915->drm.struct_mutex);
755
756                                 if (!wait_until_running(&h, rq)) {
757                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
758
759                                         pr_err("%s: Failed to start request %x, at %x\n",
760                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
761                                         intel_engine_dump(engine, &p,
762                                                           "%s\n", engine->name);
763
764                                         i915_request_put(rq);
765                                         err = -EIO;
766                                         break;
767                                 }
768
769                                 GEM_BUG_ON(!rq->global_seqno);
770                                 seqno = rq->global_seqno - 1;
771                         }
772
773                         err = i915_reset_engine(engine, NULL);
774                         if (err) {
775                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
776                                        engine->name, test_name, err);
777                                 break;
778                         }
779
780                         count++;
781
782                         if (rq) {
783                                 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
784                                 i915_request_put(rq);
785                         }
786
787                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
788                                 struct drm_printer p =
789                                         drm_info_printer(i915->drm.dev);
790
791                                 pr_err("i915_reset_engine(%s:%s):"
792                                        " failed to idle after reset\n",
793                                        engine->name, test_name);
794                                 intel_engine_dump(engine, &p,
795                                                   "%s\n", engine->name);
796
797                                 err = -EIO;
798                                 break;
799                         }
800                 } while (time_before(jiffies, end_time));
801                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
802                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
803                         engine->name, test_name, count);
804
805                 reported = i915_reset_engine_count(&i915->gpu_error, engine);
806                 reported -= threads[engine->id].resets;
807                 if (reported != (flags & TEST_ACTIVE ? count : 0)) {
808                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
809                                engine->name, test_name, count, reported,
810                                (flags & TEST_ACTIVE ? count : 0));
811                         if (!err)
812                                 err = -EINVAL;
813                 }
814
815 unwind:
816                 for_each_engine(other, i915, tmp) {
817                         int ret;
818
819                         if (!threads[tmp].task)
820                                 continue;
821
822                         ret = kthread_stop(threads[tmp].task);
823                         if (ret) {
824                                 pr_err("kthread for other engine %s failed, err=%d\n",
825                                        other->name, ret);
826                                 if (!err)
827                                         err = ret;
828                         }
829                         put_task_struct(threads[tmp].task);
830
831                         if (other != engine &&
832                             threads[tmp].resets !=
833                             i915_reset_engine_count(&i915->gpu_error, other)) {
834                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
835                                        other->name,
836                                        i915_reset_engine_count(&i915->gpu_error,
837                                                                other) -
838                                        threads[tmp].resets);
839                                 if (!err)
840                                         err = -EINVAL;
841                         }
842                 }
843
844                 if (global != i915_reset_count(&i915->gpu_error)) {
845                         pr_err("Global reset (count=%ld)!\n",
846                                i915_reset_count(&i915->gpu_error) - global);
847                         if (!err)
848                                 err = -EINVAL;
849                 }
850
851                 if (err)
852                         break;
853
854                 err = igt_flush_test(i915, 0);
855                 if (err)
856                         break;
857         }
858
859         if (i915_terminally_wedged(&i915->gpu_error))
860                 err = -EIO;
861
862         if (flags & TEST_ACTIVE) {
863                 mutex_lock(&i915->drm.struct_mutex);
864                 hang_fini(&h);
865                 mutex_unlock(&i915->drm.struct_mutex);
866         }
867
868         return err;
869 }
870
871 static int igt_reset_engines(void *arg)
872 {
873         static const struct {
874                 const char *name;
875                 unsigned int flags;
876         } phases[] = {
877                 { "idle", 0 },
878                 { "active", TEST_ACTIVE },
879                 { "others-idle", TEST_OTHERS },
880                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
881                 {
882                         "others-priority",
883                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
884                 },
885                 {
886                         "self-priority",
887                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
888                 },
889                 { }
890         };
891         struct drm_i915_private *i915 = arg;
892         typeof(*phases) *p;
893         int err;
894
895         for (p = phases; p->name; p++) {
896                 if (p->flags & TEST_PRIORITY) {
897                         if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
898                                 continue;
899                 }
900
901                 err = __igt_reset_engines(arg, p->name, p->flags);
902                 if (err)
903                         return err;
904         }
905
906         return 0;
907 }
908
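/*
 * Pretend hangcheck has declared the given engines hung: record the stalled
 * mask, raise I915_RESET_HANDOFF and wake the waiters so the reset handler
 * runs. Returns the reset count sampled beforehand so callers can check it
 * advanced.
 */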
909 static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
910 {
911         struct i915_gpu_error *error = &rq->i915->gpu_error;
912         u32 reset_count = i915_reset_count(error);
913
914         error->stalled_mask = mask;
915
916         /* set_bit() must be after we have set up the backchannel (mask) */
917         smp_mb__before_atomic();
918         set_bit(I915_RESET_HANDOFF, &error->flags);
919
920         wake_up_all(&error->wait_queue);
921
922         return reset_count;
923 }
924
925 static int igt_reset_wait(void *arg)
926 {
927         struct drm_i915_private *i915 = arg;
928         struct i915_request *rq;
929         unsigned int reset_count;
930         struct hang h;
931         long timeout;
932         int err;
933
934         if (!intel_engine_can_store_dword(i915->engine[RCS]))
935                 return 0;
936
937         /* Check that we detect a stuck waiter and issue a reset */
938
939         global_reset_lock(i915);
940
941         mutex_lock(&i915->drm.struct_mutex);
942         err = hang_init(&h, i915);
943         if (err)
944                 goto unlock;
945
946         rq = hang_create_request(&h, i915->engine[RCS]);
947         if (IS_ERR(rq)) {
948                 err = PTR_ERR(rq);
949                 goto fini;
950         }
951
952         i915_request_get(rq);
953         i915_request_add(rq);
954
955         if (!wait_until_running(&h, rq)) {
956                 struct drm_printer p = drm_info_printer(i915->drm.dev);
957
958                 pr_err("%s: Failed to start request %x, at %x\n",
959                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
960                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
961
962                 i915_gem_set_wedged(i915);
963
964                 err = -EIO;
965                 goto out_rq;
966         }
967
968         reset_count = fake_hangcheck(rq, ALL_ENGINES);
969
970         timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
971         if (timeout < 0) {
972                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
973                        timeout);
974                 err = timeout;
975                 goto out_rq;
976         }
977
978         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
979         if (i915_reset_count(&i915->gpu_error) == reset_count) {
980                 pr_err("No GPU reset recorded!\n");
981                 err = -EINVAL;
982                 goto out_rq;
983         }
984
985 out_rq:
986         i915_request_put(rq);
987 fini:
988         hang_fini(&h);
989 unlock:
990         mutex_unlock(&i915->drm.struct_mutex);
991         global_reset_unlock(i915);
992
993         if (i915_terminally_wedged(&i915->gpu_error))
994                 return -EIO;
995
996         return err;
997 }
998
999 struct evict_vma {
1000         struct completion completion;
1001         struct i915_vma *vma;
1002 };
1003
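/*
 * Thread that tries to evict the node under the hanging batch; the eviction
 * can only make progress once the reset cancels the request keeping the vma
 * active.
 */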
1004 static int evict_vma(void *data)
1005 {
1006         struct evict_vma *arg = data;
1007         struct i915_address_space *vm = arg->vma->vm;
1008         struct drm_i915_private *i915 = vm->i915;
1009         struct drm_mm_node evict = arg->vma->node;
1010         int err;
1011
1012         complete(&arg->completion);
1013
1014         mutex_lock(&i915->drm.struct_mutex);
1015         err = i915_gem_evict_for_node(vm, &evict, 0);
1016         mutex_unlock(&i915->drm.struct_mutex);
1017
1018         return err;
1019 }
1020
1021 static int evict_fence(void *data)
1022 {
1023         struct evict_vma *arg = data;
1024         struct drm_i915_private *i915 = arg->vma->vm->i915;
1025         int err;
1026
1027         complete(&arg->completion);
1028
1029         mutex_lock(&i915->drm.struct_mutex);
1030
1031         /* Mark the fence register as dirty to force the mmio update. */
1032         err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1033         if (err) {
1034                 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1035                 goto out_unlock;
1036         }
1037
1038         err = i915_vma_pin_fence(arg->vma);
1039         if (err) {
1040                 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1041                 goto out_unlock;
1042         }
1043
1044         i915_vma_unpin_fence(arg->vma);
1045
1046 out_unlock:
1047         mutex_unlock(&i915->drm.struct_mutex);
1048
1049         return err;
1050 }
1051
1052 static int __igt_reset_evict_vma(struct drm_i915_private *i915,
1053                                  struct i915_address_space *vm,
1054                                  int (*fn)(void *),
1055                                  unsigned int flags)
1056 {
1057         struct drm_i915_gem_object *obj;
1058         struct task_struct *tsk = NULL;
1059         struct i915_request *rq;
1060         struct evict_vma arg;
1061         struct hang h;
1062         int err;
1063
1064         if (!intel_engine_can_store_dword(i915->engine[RCS]))
1065                 return 0;
1066
1067         /* Check that we can recover an unbind stuck on a hanging request */
1068
1069         global_reset_lock(i915);
1070
1071         mutex_lock(&i915->drm.struct_mutex);
1072         err = hang_init(&h, i915);
1073         if (err)
1074                 goto unlock;
1075
1076         obj = i915_gem_object_create_internal(i915, SZ_1M);
1077         if (IS_ERR(obj)) {
1078                 err = PTR_ERR(obj);
1079                 goto fini;
1080         }
1081
1082         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1083                 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1084                 if (err) {
1085                         pr_err("Invalid X-tiling settings; err:%d\n", err);
1086                         goto out_obj;
1087                 }
1088         }
1089
1090         arg.vma = i915_vma_instance(obj, vm, NULL);
1091         if (IS_ERR(arg.vma)) {
1092                 err = PTR_ERR(arg.vma);
1093                 goto out_obj;
1094         }
1095
1096         rq = hang_create_request(&h, i915->engine[RCS]);
1097         if (IS_ERR(rq)) {
1098                 err = PTR_ERR(rq);
1099                 goto out_obj;
1100         }
1101
1102         err = i915_vma_pin(arg.vma, 0, 0,
1103                            i915_vma_is_ggtt(arg.vma) ?
1104                            PIN_GLOBAL | PIN_MAPPABLE :
1105                            PIN_USER);
1106         if (err) {
1107                 i915_request_add(rq);
1108                 goto out_obj;
1109         }
1110
1111         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1112                 err = i915_vma_pin_fence(arg.vma);
1113                 if (err) {
1114                         pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1115                         i915_vma_unpin(arg.vma);
1116                         i915_request_add(rq);
1117                         goto out_obj;
1118                 }
1119         }
1120
1121         err = i915_vma_move_to_active(arg.vma, rq, flags);
1122
1123         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1124                 i915_vma_unpin_fence(arg.vma);
1125         i915_vma_unpin(arg.vma);
1126
1127         i915_request_get(rq);
1128         i915_request_add(rq);
1129         if (err)
1130                 goto out_rq;
1131
1132         mutex_unlock(&i915->drm.struct_mutex);
1133
1134         if (!wait_until_running(&h, rq)) {
1135                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1136
1137                 pr_err("%s: Failed to start request %x, at %x\n",
1138                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1139                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1140
1141                 i915_gem_set_wedged(i915);
1142                 goto out_reset;
1143         }
1144
1145         init_completion(&arg.completion);
1146
1147         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1148         if (IS_ERR(tsk)) {
1149                 err = PTR_ERR(tsk);
1150                 tsk = NULL;
1151                 goto out_reset;
1152         }
1153
1154         wait_for_completion(&arg.completion);
1155
1156         if (wait_for(waitqueue_active(&rq->execute), 10)) {
1157                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1158
1159                 pr_err("igt/evict_vma kthread did not wait\n");
1160                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1161
1162                 i915_gem_set_wedged(i915);
1163                 goto out_reset;
1164         }
1165
1166 out_reset:
1167         fake_hangcheck(rq, intel_engine_flag(rq->engine));
1168
1169         if (tsk) {
1170                 struct igt_wedge_me w;
1171
1172                 /* The reset, even indirectly, should take less than 10ms. */
1173                 igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
1174                         err = kthread_stop(tsk);
1175         }
1176
1177         mutex_lock(&i915->drm.struct_mutex);
1178 out_rq:
1179         i915_request_put(rq);
1180 out_obj:
1181         i915_gem_object_put(obj);
1182 fini:
1183         hang_fini(&h);
1184 unlock:
1185         mutex_unlock(&i915->drm.struct_mutex);
1186         global_reset_unlock(i915);
1187
1188         if (i915_terminally_wedged(&i915->gpu_error))
1189                 return -EIO;
1190
1191         return err;
1192 }
1193
1194 static int igt_reset_evict_ggtt(void *arg)
1195 {
1196         struct drm_i915_private *i915 = arg;
1197
1198         return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1199                                      evict_vma, EXEC_OBJECT_WRITE);
1200 }
1201
1202 static int igt_reset_evict_ppgtt(void *arg)
1203 {
1204         struct drm_i915_private *i915 = arg;
1205         struct i915_gem_context *ctx;
1206         struct drm_file *file;
1207         int err;
1208
1209         file = mock_file(i915);
1210         if (IS_ERR(file))
1211                 return PTR_ERR(file);
1212
1213         mutex_lock(&i915->drm.struct_mutex);
1214         ctx = live_context(i915, file);
1215         mutex_unlock(&i915->drm.struct_mutex);
1216         if (IS_ERR(ctx)) {
1217                 err = PTR_ERR(ctx);
1218                 goto out;
1219         }
1220
1221         err = 0;
1222         if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
1223                 err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
1224                                             evict_vma, EXEC_OBJECT_WRITE);
1225
1226 out:
1227         mock_file_free(i915, file);
1228         return err;
1229 }
1230
1231 static int igt_reset_evict_fence(void *arg)
1232 {
1233         struct drm_i915_private *i915 = arg;
1234
1235         return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1236                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1237 }
1238
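/* Check that every engine other than the one under test has gone idle. */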
1239 static int wait_for_others(struct drm_i915_private *i915,
1240                            struct intel_engine_cs *exclude)
1241 {
1242         struct intel_engine_cs *engine;
1243         enum intel_engine_id id;
1244
1245         for_each_engine(engine, i915, id) {
1246                 if (engine == exclude)
1247                         continue;
1248
1249                 if (!wait_for_idle(engine))
1250                         return -EIO;
1251         }
1252
1253         return 0;
1254 }
1255
1256 static int igt_reset_queue(void *arg)
1257 {
1258         struct drm_i915_private *i915 = arg;
1259         struct intel_engine_cs *engine;
1260         enum intel_engine_id id;
1261         struct hang h;
1262         int err;
1263
1264         /* Check that we replay pending requests following a hang */
1265
1266         global_reset_lock(i915);
1267
1268         mutex_lock(&i915->drm.struct_mutex);
1269         err = hang_init(&h, i915);
1270         if (err)
1271                 goto unlock;
1272
1273         for_each_engine(engine, i915, id) {
1274                 struct i915_request *prev;
1275                 IGT_TIMEOUT(end_time);
1276                 unsigned int count;
1277
1278                 if (!intel_engine_can_store_dword(engine))
1279                         continue;
1280
1281                 prev = hang_create_request(&h, engine);
1282                 if (IS_ERR(prev)) {
1283                         err = PTR_ERR(prev);
1284                         goto fini;
1285                 }
1286
1287                 i915_request_get(prev);
1288                 i915_request_add(prev);
1289
1290                 count = 0;
1291                 do {
1292                         struct i915_request *rq;
1293                         unsigned int reset_count;
1294
1295                         rq = hang_create_request(&h, engine);
1296                         if (IS_ERR(rq)) {
1297                                 err = PTR_ERR(rq);
1298                                 goto fini;
1299                         }
1300
1301                         i915_request_get(rq);
1302                         i915_request_add(rq);
1303
1304                         /*
1305                          * XXX We don't handle resetting the kernel context
1306                          * very well. If we trigger a device reset twice in
1307                          * quick succession while the kernel context is
1308                          * executing, we may end up skipping the breadcrumb.
1309                          * This is really only a problem for the selftest as
1310                          * normally there is a large interlude between resets
1311                          * (hangcheck), or we focus on resetting just one
1312                          * engine and so avoid repeatedly resetting innocents.
1313                          */
1314                         err = wait_for_others(i915, engine);
1315                         if (err) {
1316                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1317                                        __func__, engine->name);
1318                                 i915_request_put(rq);
1319                                 i915_request_put(prev);
1320
1321                                 GEM_TRACE_DUMP();
1322                                 i915_gem_set_wedged(i915);
1323                                 goto fini;
1324                         }
1325
1326                         if (!wait_until_running(&h, prev)) {
1327                                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1328
1329                                 pr_err("%s(%s): Failed to start request %x, at %x\n",
1330                                        __func__, engine->name,
1331                                        prev->fence.seqno, hws_seqno(&h, prev));
1332                                 intel_engine_dump(engine, &p,
1333                                                   "%s\n", engine->name);
1334
1335                                 i915_request_put(rq);
1336                                 i915_request_put(prev);
1337
1338                                 i915_gem_set_wedged(i915);
1339
1340                                 err = -EIO;
1341                                 goto fini;
1342                         }
1343
1344                         reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
1345
1346                         i915_reset(i915, ENGINE_MASK(id), NULL);
1347
1348                         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
1349                                             &i915->gpu_error.flags));
1350
1351                         if (prev->fence.error != -EIO) {
1352                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1353                                        prev->fence.error);
1354                                 i915_request_put(rq);
1355                                 i915_request_put(prev);
1356                                 err = -EINVAL;
1357                                 goto fini;
1358                         }
1359
1360                         if (rq->fence.error) {
1361                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1362                                        rq->fence.error);
1363                                 i915_request_put(rq);
1364                                 i915_request_put(prev);
1365                                 err = -EINVAL;
1366                                 goto fini;
1367                         }
1368
1369                         if (i915_reset_count(&i915->gpu_error) == reset_count) {
1370                                 pr_err("No GPU reset recorded!\n");
1371                                 i915_request_put(rq);
1372                                 i915_request_put(prev);
1373                                 err = -EINVAL;
1374                                 goto fini;
1375                         }
1376
1377                         i915_request_put(prev);
1378                         prev = rq;
1379                         count++;
1380                 } while (time_before(jiffies, end_time));
1381                 pr_info("%s: Completed %d resets\n", engine->name, count);
1382
1383                 *h.batch = MI_BATCH_BUFFER_END;
1384                 i915_gem_chipset_flush(i915);
1385
1386                 i915_request_put(prev);
1387
1388                 err = igt_flush_test(i915, I915_WAIT_LOCKED);
1389                 if (err)
1390                         break;
1391         }
1392
1393 fini:
1394         hang_fini(&h);
1395 unlock:
1396         mutex_unlock(&i915->drm.struct_mutex);
1397         global_reset_unlock(i915);
1398
1399         if (i915_terminally_wedged(&i915->gpu_error))
1400                 return -EIO;
1401
1402         return err;
1403 }
1404
1405 static int igt_handle_error(void *arg)
1406 {
1407         struct drm_i915_private *i915 = arg;
1408         struct intel_engine_cs *engine = i915->engine[RCS];
1409         struct hang h;
1410         struct i915_request *rq;
1411         struct i915_gpu_state *error;
1412         int err;
1413
1414         /* Check that we can issue a global GPU and engine reset */
1415
1416         if (!intel_has_reset_engine(i915))
1417                 return 0;
1418
1419         if (!engine || !intel_engine_can_store_dword(engine))
1420                 return 0;
1421
1422         mutex_lock(&i915->drm.struct_mutex);
1423
1424         err = hang_init(&h, i915);
1425         if (err)
1426                 goto err_unlock;
1427
1428         rq = hang_create_request(&h, engine);
1429         if (IS_ERR(rq)) {
1430                 err = PTR_ERR(rq);
1431                 goto err_fini;
1432         }
1433
1434         i915_request_get(rq);
1435         i915_request_add(rq);
1436
1437         if (!wait_until_running(&h, rq)) {
1438                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1439
1440                 pr_err("%s: Failed to start request %x, at %x\n",
1441                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1442                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1443
1444                 i915_gem_set_wedged(i915);
1445
1446                 err = -EIO;
1447                 goto err_request;
1448         }
1449
1450         mutex_unlock(&i915->drm.struct_mutex);
1451
1452         /* Temporarily disable error capture */
1453         error = xchg(&i915->gpu_error.first_error, (void *)-1);
1454
1455         i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);
1456
1457         xchg(&i915->gpu_error.first_error, error);
1458
1459         mutex_lock(&i915->drm.struct_mutex);
1460
1461         if (rq->fence.error != -EIO) {
1462                 pr_err("Guilty request not identified!\n");
1463                 err = -EINVAL;
1464                 goto err_request;
1465         }
1466
1467 err_request:
1468         i915_request_put(rq);
1469 err_fini:
1470         hang_fini(&h);
1471 err_unlock:
1472         mutex_unlock(&i915->drm.struct_mutex);
1473         return err;
1474 }
1475
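/*
 * Entry point for the live hangcheck/reset selftests. Skips silently when
 * the device cannot be reset, bails out early if it is already terminally
 * wedged, and disables periodic hangcheck for the duration so only the
 * resets injected by the tests occur.
 */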
1476 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1477 {
1478         static const struct i915_subtest tests[] = {
1479                 SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1480                 SUBTEST(igt_hang_sanitycheck),
1481                 SUBTEST(igt_reset_idle_engine),
1482                 SUBTEST(igt_reset_active_engine),
1483                 SUBTEST(igt_reset_engines),
1484                 SUBTEST(igt_reset_queue),
1485                 SUBTEST(igt_reset_wait),
1486                 SUBTEST(igt_reset_evict_ggtt),
1487                 SUBTEST(igt_reset_evict_ppgtt),
1488                 SUBTEST(igt_reset_evict_fence),
1489                 SUBTEST(igt_handle_error),
1490         };
1491         bool saved_hangcheck;
1492         int err;
1493
1494         if (!intel_has_gpu_reset(i915))
1495                 return 0;
1496
1497         if (i915_terminally_wedged(&i915->gpu_error))
1498                 return -EIO; /* we're long past hope of a successful reset */
1499
1500         intel_runtime_pm_get(i915);
1501         saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1502
1503         err = i915_subtests(tests, i915);
1504
1505         mutex_lock(&i915->drm.struct_mutex);
1506         igt_flush_test(i915, I915_WAIT_LOCKED);
1507         mutex_unlock(&i915->drm.struct_mutex);
1508
1509         i915_modparams.enable_hangcheck = saved_hangcheck;
1510         intel_runtime_pm_put(i915);
1511
1512         return err;
1513 }