Define the architecture for compressed dump format
dump.c
/*
 * QEMU dump
 *
 * Copyright Fujitsu, Corp. 2011, 2012
 *
 * Authors:
 *     Wen Congyang <wency@cn.fujitsu.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "elf.h"
#include "cpu.h"
#include "exec/cpu-all.h"
#include "exec/hwaddr.h"
#include "monitor/monitor.h"
#include "sysemu/kvm.h"
#include "sysemu/dump.h"
#include "sysemu/sysemu.h"
#include "sysemu/memory_mapping.h"
#include "sysemu/cpus.h"
#include "qapi/error.h"
#include "qmp-commands.h"

#include <zlib.h>
#ifdef CONFIG_LZO
#include <lzo/lzo1x.h>
#endif
#ifdef CONFIG_SNAPPY
#include <snappy-c.h>
#endif
#ifndef ELF_MACHINE_UNAME
#define ELF_MACHINE_UNAME "Unknown"
#endif

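/*
 * The ELF file is produced in the byte order requested by the target
 * (dump_info.d_endian); these helpers convert a host-endian value to
 * that encoding.
 */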
static uint16_t cpu_convert_to_target16(uint16_t val, int endian)
{
    if (endian == ELFDATA2LSB) {
        val = cpu_to_le16(val);
    } else {
        val = cpu_to_be16(val);
    }

    return val;
}

static uint32_t cpu_convert_to_target32(uint32_t val, int endian)
{
    if (endian == ELFDATA2LSB) {
        val = cpu_to_le32(val);
    } else {
        val = cpu_to_be32(val);
    }

    return val;
}

static uint64_t cpu_convert_to_target64(uint64_t val, int endian)
{
    if (endian == ELFDATA2LSB) {
        val = cpu_to_le64(val);
    } else {
        val = cpu_to_be64(val);
    }

    return val;
}

typedef struct DumpState {
    GuestPhysBlockList guest_phys_blocks;
    ArchDumpInfo dump_info;
    MemoryMappingList list;
    uint16_t phdr_num;
    uint32_t sh_info;
    bool have_section;
    bool resume;
    ssize_t note_size;
    hwaddr memory_offset;
    int fd;

    GuestPhysBlock *next_block;
    ram_addr_t start;
    bool has_filter;
    int64_t begin;
    int64_t length;
    Error **errp;

    uint8_t *note_buf;          /* buffer for notes */
    size_t note_buf_offset;     /* the writing place in note_buf */
    uint32_t nr_cpus;           /* number of guest CPUs */
    size_t page_size;           /* guest's page size */
    uint32_t page_shift;        /* guest's page shift */
    uint64_t max_mapnr;         /* highest guest physical page frame number */
    size_t len_dump_bitmap;     /* the size of the place used to store
                                   dump_bitmap in vmcore */
    off_t offset_dump_bitmap;   /* offset of dump_bitmap part in vmcore */
    off_t offset_page;          /* offset of page part in vmcore */
    size_t num_dumpable;        /* number of pages that can be dumped */
    uint32_t flag_compress;     /* indicate the compression format */
} DumpState;

static int dump_cleanup(DumpState *s)
{
    int ret = 0;

    guest_phys_blocks_free(&s->guest_phys_blocks);
    memory_mapping_list_free(&s->list);
    if (s->fd != -1) {
        close(s->fd);
    }
    if (s->resume) {
        vm_start();
    }

    return ret;
}

static void dump_error(DumpState *s, const char *reason)
{
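    /* the reason text is currently discarded; callers surface failure
     * through s->errp / QERR_IO_ERROR in qmp_dump_guest_memory() */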
    dump_cleanup(s);
}

static int fd_write_vmcore(const void *buf, size_t size, void *opaque)
{
    DumpState *s = opaque;
    size_t written_size;

    written_size = qemu_write_full(s->fd, buf, size);
    if (written_size != size) {
        return -1;
    }

    return 0;
}

static int write_elf64_header(DumpState *s)
{
    Elf64_Ehdr elf_header;
    int ret;
    int endian = s->dump_info.d_endian;

    memset(&elf_header, 0, sizeof(Elf64_Ehdr));
    memcpy(&elf_header, ELFMAG, SELFMAG);
    elf_header.e_ident[EI_CLASS] = ELFCLASS64;
    elf_header.e_ident[EI_DATA] = s->dump_info.d_endian;
    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
                                                   endian);
    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
    elf_header.e_phoff = cpu_convert_to_target64(sizeof(Elf64_Ehdr), endian);
    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf64_Phdr),
                                                     endian);
    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
    if (s->have_section) {
        uint64_t shoff = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * s->sh_info;

        elf_header.e_shoff = cpu_convert_to_target64(shoff, endian);
        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf64_Shdr),
                                                         endian);
        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
    }

    ret = fd_write_vmcore(&elf_header, sizeof(elf_header), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write elf header.\n");
        return -1;
    }

    return 0;
}

static int write_elf32_header(DumpState *s)
{
    Elf32_Ehdr elf_header;
    int ret;
    int endian = s->dump_info.d_endian;

    memset(&elf_header, 0, sizeof(Elf32_Ehdr));
    memcpy(&elf_header, ELFMAG, SELFMAG);
    elf_header.e_ident[EI_CLASS] = ELFCLASS32;
    elf_header.e_ident[EI_DATA] = endian;
    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
                                                   endian);
    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
    elf_header.e_phoff = cpu_convert_to_target32(sizeof(Elf32_Ehdr), endian);
    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf32_Phdr),
                                                     endian);
    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
    if (s->have_section) {
        uint32_t shoff = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr) * s->sh_info;

        elf_header.e_shoff = cpu_convert_to_target32(shoff, endian);
        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf32_Shdr),
                                                         endian);
        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
    }

    ret = fd_write_vmcore(&elf_header, sizeof(elf_header), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write elf header.\n");
        return -1;
    }

    return 0;
}

static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
                            int phdr_index, hwaddr offset,
                            hwaddr filesz)
{
    Elf64_Phdr phdr;
    int ret;
    int endian = s->dump_info.d_endian;

    memset(&phdr, 0, sizeof(Elf64_Phdr));
    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
    phdr.p_offset = cpu_convert_to_target64(offset, endian);
    phdr.p_paddr = cpu_convert_to_target64(memory_mapping->phys_addr, endian);
    phdr.p_filesz = cpu_convert_to_target64(filesz, endian);
    phdr.p_memsz = cpu_convert_to_target64(memory_mapping->length, endian);
    phdr.p_vaddr = cpu_convert_to_target64(memory_mapping->virt_addr, endian);

    assert(memory_mapping->length >= filesz);

    ret = fd_write_vmcore(&phdr, sizeof(Elf64_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
        return -1;
    }

    return 0;
}

static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
                            int phdr_index, hwaddr offset,
                            hwaddr filesz)
{
    Elf32_Phdr phdr;
    int ret;
    int endian = s->dump_info.d_endian;

    memset(&phdr, 0, sizeof(Elf32_Phdr));
    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
    phdr.p_offset = cpu_convert_to_target32(offset, endian);
    phdr.p_paddr = cpu_convert_to_target32(memory_mapping->phys_addr, endian);
    phdr.p_filesz = cpu_convert_to_target32(filesz, endian);
    phdr.p_memsz = cpu_convert_to_target32(memory_mapping->length, endian);
    phdr.p_vaddr = cpu_convert_to_target32(memory_mapping->virt_addr, endian);

    assert(memory_mapping->length >= filesz);

    ret = fd_write_vmcore(&phdr, sizeof(Elf32_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
        return -1;
    }

    return 0;
}

static int write_elf64_note(DumpState *s)
{
    Elf64_Phdr phdr;
    int endian = s->dump_info.d_endian;
    hwaddr begin = s->memory_offset - s->note_size;
    int ret;

    memset(&phdr, 0, sizeof(Elf64_Phdr));
    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
    phdr.p_offset = cpu_convert_to_target64(begin, endian);
    phdr.p_paddr = 0;
    phdr.p_filesz = cpu_convert_to_target64(s->note_size, endian);
    phdr.p_memsz = cpu_convert_to_target64(s->note_size, endian);
    phdr.p_vaddr = 0;

    ret = fd_write_vmcore(&phdr, sizeof(Elf64_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
        return -1;
    }

    return 0;
}

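/* CPU ids in the ELF notes are 1-based: cpu_index 0 becomes 1 */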
static inline int cpu_index(CPUState *cpu)
{
    return cpu->cpu_index + 1;
}

static int write_elf64_notes(WriteCoreDumpFunction f, DumpState *s)
{
    CPUState *cpu;
    int ret;
    int id;

    CPU_FOREACH(cpu) {
        id = cpu_index(cpu);
        ret = cpu_write_elf64_note(f, cpu, id, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to write elf notes.\n");
            return -1;
        }
    }

    CPU_FOREACH(cpu) {
        ret = cpu_write_elf64_qemunote(f, cpu, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to write CPU status.\n");
            return -1;
        }
    }

    return 0;
}

static int write_elf32_note(DumpState *s)
{
    hwaddr begin = s->memory_offset - s->note_size;
    Elf32_Phdr phdr;
    int endian = s->dump_info.d_endian;
    int ret;

    memset(&phdr, 0, sizeof(Elf32_Phdr));
    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
    phdr.p_offset = cpu_convert_to_target32(begin, endian);
    phdr.p_paddr = 0;
    phdr.p_filesz = cpu_convert_to_target32(s->note_size, endian);
    phdr.p_memsz = cpu_convert_to_target32(s->note_size, endian);
    phdr.p_vaddr = 0;

    ret = fd_write_vmcore(&phdr, sizeof(Elf32_Phdr), s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write program header table.\n");
        return -1;
    }

    return 0;
}

static int write_elf32_notes(WriteCoreDumpFunction f, DumpState *s)
{
    CPUState *cpu;
    int ret;
    int id;

    CPU_FOREACH(cpu) {
        id = cpu_index(cpu);
        ret = cpu_write_elf32_note(f, cpu, id, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to write elf notes.\n");
            return -1;
        }
    }

    CPU_FOREACH(cpu) {
        ret = cpu_write_elf32_qemunote(f, cpu, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to write CPU status.\n");
            return -1;
        }
    }

    return 0;
}

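/*
 * write the single section header. "type" selects its size: 0 writes an
 * Elf32_Shdr, anything else an Elf64_Shdr. Only sh_info is filled in; it
 * carries the real program header count when e_phnum is PN_XNUM.
 */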
static int write_elf_section(DumpState *s, int type)
{
    Elf32_Shdr shdr32;
    Elf64_Shdr shdr64;
    int endian = s->dump_info.d_endian;
    int shdr_size;
    void *shdr;
    int ret;

    if (type == 0) {
        shdr_size = sizeof(Elf32_Shdr);
        memset(&shdr32, 0, shdr_size);
        shdr32.sh_info = cpu_convert_to_target32(s->sh_info, endian);
        shdr = &shdr32;
    } else {
        shdr_size = sizeof(Elf64_Shdr);
        memset(&shdr64, 0, shdr_size);
        shdr64.sh_info = cpu_convert_to_target32(s->sh_info, endian);
        shdr = &shdr64;
    }

    /* shdr is already a pointer; passing &shdr would dump the pointer
     * variable itself instead of the section header */
    ret = fd_write_vmcore(shdr, shdr_size, s);
    if (ret < 0) {
        dump_error(s, "dump: failed to write section header table.\n");
        return -1;
    }

    return 0;
}

static int write_data(DumpState *s, void *buf, int length)
{
    int ret;

    ret = fd_write_vmcore(buf, length, s);
    if (ret < 0) {
        dump_error(s, "dump: failed to save memory.\n");
        return -1;
    }

    return 0;
}

/* write the memory to vmcore. 1 page per I/O. */
static int write_memory(DumpState *s, GuestPhysBlock *block, ram_addr_t start,
                        int64_t size)
{
    int64_t i;
    int ret;

    for (i = 0; i < size / TARGET_PAGE_SIZE; i++) {
        ret = write_data(s, block->host_addr + start + i * TARGET_PAGE_SIZE,
                         TARGET_PAGE_SIZE);
        if (ret < 0) {
            return ret;
        }
    }

    if ((size % TARGET_PAGE_SIZE) != 0) {
        ret = write_data(s, block->host_addr + start + i * TARGET_PAGE_SIZE,
                         size % TARGET_PAGE_SIZE);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

/* get the memory's offset and size in the vmcore */
static void get_offset_range(hwaddr phys_addr,
                             ram_addr_t mapping_length,
                             DumpState *s,
                             hwaddr *p_offset,
                             hwaddr *p_filesz)
{
    GuestPhysBlock *block;
    hwaddr offset = s->memory_offset;
    int64_t size_in_block, start;

    /* When the memory is not stored into vmcore, offset will be -1 */
    *p_offset = -1;
    *p_filesz = 0;

    if (s->has_filter) {
        if (phys_addr < s->begin || phys_addr >= s->begin + s->length) {
            return;
        }
    }

    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
        if (s->has_filter) {
            if (block->target_start >= s->begin + s->length ||
                block->target_end <= s->begin) {
                /* This block is out of the range */
                continue;
            }

            if (s->begin <= block->target_start) {
                start = block->target_start;
            } else {
                start = s->begin;
            }

            size_in_block = block->target_end - start;
            if (s->begin + s->length < block->target_end) {
                size_in_block -= block->target_end - (s->begin + s->length);
            }
        } else {
            start = block->target_start;
            size_in_block = block->target_end - block->target_start;
        }

        if (phys_addr >= start && phys_addr < start + size_in_block) {
            *p_offset = phys_addr - start + offset;

            /* The offset range mapped from the vmcore file must not spill over
             * the GuestPhysBlock, clamp it. The rest of the mapping will be
             * zero-filled in memory at load time; see
             * <http://refspecs.linuxbase.org/elf/gabi4+/ch5.pheader.html>.
             */
            *p_filesz = phys_addr + mapping_length <= start + size_in_block ?
                        mapping_length :
                        size_in_block - (phys_addr - start);
            return;
        }

        offset += size_in_block;
    }
}

static int write_elf_loads(DumpState *s)
{
    hwaddr offset, filesz;
    MemoryMapping *memory_mapping;
    uint32_t phdr_index = 1;
    int ret;
    uint32_t max_index;

    if (s->have_section) {
        max_index = s->sh_info;
    } else {
        max_index = s->phdr_num;
    }

    QTAILQ_FOREACH(memory_mapping, &s->list.head, next) {
        get_offset_range(memory_mapping->phys_addr,
                         memory_mapping->length,
                         s, &offset, &filesz);
        if (s->dump_info.d_class == ELFCLASS64) {
            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset,
                                   filesz);
        } else {
            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset,
                                   filesz);
        }

        if (ret < 0) {
            return -1;
        }

        if (phdr_index >= max_index) {
            break;
        }
    }

    return 0;
}

/* write elf header, PT_NOTE and elf note to vmcore. */
static int dump_begin(DumpState *s)
{
    int ret;

    /*
     * the vmcore's format is:
     *   --------------
     *   |  elf header |
     *   --------------
     *   |  PT_NOTE    |
     *   --------------
     *   |  PT_LOAD    |
     *   --------------
     *   |  ......     |
     *   --------------
     *   |  PT_LOAD    |
     *   --------------
     *   |  sec_hdr    |
     *   --------------
     *   |  elf note   |
     *   --------------
     *   |  memory     |
     *   --------------
     *
     * we only know where the memory is saved after we write elf note into
     * vmcore.
     */

    /* write elf header to vmcore */
    if (s->dump_info.d_class == ELFCLASS64) {
        ret = write_elf64_header(s);
    } else {
        ret = write_elf32_header(s);
    }
    if (ret < 0) {
        return -1;
    }

    if (s->dump_info.d_class == ELFCLASS64) {
        /* write PT_NOTE to vmcore */
        if (write_elf64_note(s) < 0) {
            return -1;
        }

        /* write all PT_LOAD to vmcore */
        if (write_elf_loads(s) < 0) {
            return -1;
        }

        /* write section to vmcore */
        if (s->have_section) {
            if (write_elf_section(s, 1) < 0) {
                return -1;
            }
        }

        /* write notes to vmcore */
        if (write_elf64_notes(fd_write_vmcore, s) < 0) {
            return -1;
        }

    } else {
        /* write PT_NOTE to vmcore */
        if (write_elf32_note(s) < 0) {
            return -1;
        }

        /* write all PT_LOAD to vmcore */
        if (write_elf_loads(s) < 0) {
            return -1;
        }

        /* write section to vmcore */
        if (s->have_section) {
            if (write_elf_section(s, 0) < 0) {
                return -1;
            }
        }

        /* write notes to vmcore */
        if (write_elf32_notes(fd_write_vmcore, s) < 0) {
            return -1;
        }
    }

    return 0;
}

/* the dump is finished: release resources and resume the VM if needed */
static int dump_completed(DumpState *s)
{
    dump_cleanup(s);
    return 0;
}

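/*
 * advance the iteration to the next block that intersects the filter range
 * (if any); returns 1 when there are no more blocks, else 0 after updating
 * s->next_block and s->start.
 */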
static int get_next_block(DumpState *s, GuestPhysBlock *block)
{
    while (1) {
        block = QTAILQ_NEXT(block, next);
        if (!block) {
            /* no more blocks */
            return 1;
        }

        s->start = 0;
        s->next_block = block;
        if (s->has_filter) {
            if (block->target_start >= s->begin + s->length ||
                block->target_end <= s->begin) {
                /* This block is out of the range */
                continue;
            }

            if (s->begin > block->target_start) {
                s->start = s->begin - block->target_start;
            }
        }

        return 0;
    }
}

/* write all memory to vmcore */
static int dump_iterate(DumpState *s)
{
    GuestPhysBlock *block;
    int64_t size;
    int ret;

    while (1) {
        block = s->next_block;

        size = block->target_end - block->target_start;
        if (s->has_filter) {
            size -= s->start;
            if (s->begin + s->length < block->target_end) {
                size -= block->target_end - (s->begin + s->length);
            }
        }
        ret = write_memory(s, block, s->start, size);
        if (ret == -1) {
            return ret;
        }

        ret = get_next_block(s, block);
        if (ret == 1) {
            dump_completed(s);
            return 0;
        }
    }
}

static int create_vmcore(DumpState *s)
{
    int ret;

    ret = dump_begin(s);
    if (ret < 0) {
        return -1;
    }

    ret = dump_iterate(s);
    if (ret < 0) {
        return -1;
    }

    return 0;
}

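/*
 * The kdump-compressed dump is emitted in makedumpfile's "flat" format:
 * a signature header, then a stream of (offset, size)-tagged data chunks
 * written by write_buffer(), terminated by an END_FLAG_FLAT_HEADER record.
 */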
static int write_start_flat_header(int fd)
{
    uint8_t *buf;
    MakedumpfileHeader mh;
    size_t written_size;
    int ret = 0;

    memset(&mh, 0, sizeof(mh));
    strncpy(mh.signature, MAKEDUMPFILE_SIGNATURE,
            strlen(MAKEDUMPFILE_SIGNATURE));

    mh.type = cpu_to_be64(TYPE_FLAT_HEADER);
    mh.version = cpu_to_be64(VERSION_FLAT_HEADER);

    buf = g_malloc0(MAX_SIZE_MDF_HEADER);
    memcpy(buf, &mh, sizeof(mh));

    written_size = qemu_write_full(fd, buf, MAX_SIZE_MDF_HEADER);
    if (written_size != MAX_SIZE_MDF_HEADER) {
        ret = -1;
    }

    g_free(buf);
    return ret;
}

static int write_end_flat_header(int fd)
{
    MakedumpfileDataHeader mdh;
    size_t written_size;

    mdh.offset = END_FLAG_FLAT_HEADER;
    mdh.buf_size = END_FLAG_FLAT_HEADER;

    written_size = qemu_write_full(fd, &mdh, sizeof(mdh));
    if (written_size != sizeof(mdh)) {
        return -1;
    }

    return 0;
}

static int write_buffer(int fd, off_t offset, const void *buf, size_t size)
{
    size_t written_size;
    MakedumpfileDataHeader mdh;

    mdh.offset = cpu_to_be64(offset);
    mdh.buf_size = cpu_to_be64(size);

    written_size = qemu_write_full(fd, &mdh, sizeof(mdh));
    if (written_size != sizeof(mdh)) {
        return -1;
    }

    written_size = qemu_write_full(fd, buf, size);
    if (written_size != size) {
        return -1;
    }

    return 0;
}

static int buf_write_note(const void *buf, size_t size, void *opaque)
{
    DumpState *s = opaque;

    /* note_buf is not large enough */
    if (s->note_buf_offset + size > s->note_size) {
        return -1;
    }

    memcpy(s->note_buf + s->note_buf_offset, buf, size);

    s->note_buf_offset += size;

    return 0;
}

/* write common header, sub header and elf note to vmcore */
static int create_header32(DumpState *s)
{
    int ret = 0;
    DiskDumpHeader32 *dh = NULL;
    KdumpSubHeader32 *kh = NULL;
    size_t size;
    int endian = s->dump_info.d_endian;
    uint32_t block_size;
    uint32_t sub_hdr_size;
    uint32_t bitmap_blocks;
    uint32_t status = 0;
    uint64_t offset_note;

    /* write common header, using version 6 of the kdump-compressed format */
    size = sizeof(DiskDumpHeader32);
    dh = g_malloc0(size);

    strncpy(dh->signature, KDUMP_SIGNATURE, strlen(KDUMP_SIGNATURE));
    dh->header_version = cpu_convert_to_target32(6, endian);
    block_size = s->page_size;
    dh->block_size = cpu_convert_to_target32(block_size, endian);
    sub_hdr_size = sizeof(struct KdumpSubHeader32) + s->note_size;
    sub_hdr_size = DIV_ROUND_UP(sub_hdr_size, block_size);
    dh->sub_hdr_size = cpu_convert_to_target32(sub_hdr_size, endian);
    /* dh->max_mapnr may be truncated, full 64bit is in kh.max_mapnr_64 */
    dh->max_mapnr = cpu_convert_to_target32(MIN(s->max_mapnr, UINT_MAX),
                                            endian);
    dh->nr_cpus = cpu_convert_to_target32(s->nr_cpus, endian);
    bitmap_blocks = DIV_ROUND_UP(s->len_dump_bitmap, block_size) * 2;
    dh->bitmap_blocks = cpu_convert_to_target32(bitmap_blocks, endian);
    strncpy(dh->utsname.machine, ELF_MACHINE_UNAME, sizeof(dh->utsname.machine));

    if (s->flag_compress & DUMP_DH_COMPRESSED_ZLIB) {
        status |= DUMP_DH_COMPRESSED_ZLIB;
    }
#ifdef CONFIG_LZO
    if (s->flag_compress & DUMP_DH_COMPRESSED_LZO) {
        status |= DUMP_DH_COMPRESSED_LZO;
    }
#endif
#ifdef CONFIG_SNAPPY
    if (s->flag_compress & DUMP_DH_COMPRESSED_SNAPPY) {
        status |= DUMP_DH_COMPRESSED_SNAPPY;
    }
#endif
    dh->status = cpu_convert_to_target32(status, endian);

    if (write_buffer(s->fd, 0, dh, size) < 0) {
        dump_error(s, "dump: failed to write disk dump header.\n");
        ret = -1;
        goto out;
    }

    /* write sub header */
    size = sizeof(KdumpSubHeader32);
    kh = g_malloc0(size);

    /* 64bit max_mapnr_64 */
    kh->max_mapnr_64 = cpu_convert_to_target64(s->max_mapnr, endian);
    kh->phys_base = cpu_convert_to_target32(PHYS_BASE, endian);
    kh->dump_level = cpu_convert_to_target32(DUMP_LEVEL, endian);

    offset_note = DISKDUMP_HEADER_BLOCKS * block_size + size;
    kh->offset_note = cpu_convert_to_target64(offset_note, endian);
    kh->note_size = cpu_convert_to_target32(s->note_size, endian);

    if (write_buffer(s->fd, DISKDUMP_HEADER_BLOCKS *
                     block_size, kh, size) < 0) {
        dump_error(s, "dump: failed to write kdump sub header.\n");
        ret = -1;
        goto out;
    }

    /* write note */
    s->note_buf = g_malloc0(s->note_size);
    s->note_buf_offset = 0;

    /* use s->note_buf to store notes temporarily */
    if (write_elf32_notes(buf_write_note, s) < 0) {
        ret = -1;
        goto out;
    }

    if (write_buffer(s->fd, offset_note, s->note_buf,
                     s->note_size) < 0) {
        dump_error(s, "dump: failed to write notes");
        ret = -1;
        goto out;
    }

    /* get offset of dump_bitmap */
    s->offset_dump_bitmap = (DISKDUMP_HEADER_BLOCKS + sub_hdr_size) *
                             block_size;

    /* get offset of page */
    s->offset_page = (DISKDUMP_HEADER_BLOCKS + sub_hdr_size + bitmap_blocks) *
                     block_size;

out:
    g_free(dh);
    g_free(kh);
    g_free(s->note_buf);

    return ret;
}

/* write common header, sub header and elf note to vmcore */
static int create_header64(DumpState *s)
{
    int ret = 0;
    DiskDumpHeader64 *dh = NULL;
    KdumpSubHeader64 *kh = NULL;
    size_t size;
    int endian = s->dump_info.d_endian;
    uint32_t block_size;
    uint32_t sub_hdr_size;
    uint32_t bitmap_blocks;
    uint32_t status = 0;
    uint64_t offset_note;

    /* write common header, using version 6 of the kdump-compressed format */
    size = sizeof(DiskDumpHeader64);
    dh = g_malloc0(size);

    strncpy(dh->signature, KDUMP_SIGNATURE, strlen(KDUMP_SIGNATURE));
    dh->header_version = cpu_convert_to_target32(6, endian);
    block_size = s->page_size;
    dh->block_size = cpu_convert_to_target32(block_size, endian);
    sub_hdr_size = sizeof(struct KdumpSubHeader64) + s->note_size;
    sub_hdr_size = DIV_ROUND_UP(sub_hdr_size, block_size);
    dh->sub_hdr_size = cpu_convert_to_target32(sub_hdr_size, endian);
    /* dh->max_mapnr may be truncated, full 64bit is in kh.max_mapnr_64 */
    dh->max_mapnr = cpu_convert_to_target32(MIN(s->max_mapnr, UINT_MAX),
                                            endian);
    dh->nr_cpus = cpu_convert_to_target32(s->nr_cpus, endian);
    bitmap_blocks = DIV_ROUND_UP(s->len_dump_bitmap, block_size) * 2;
    dh->bitmap_blocks = cpu_convert_to_target32(bitmap_blocks, endian);
    strncpy(dh->utsname.machine, ELF_MACHINE_UNAME, sizeof(dh->utsname.machine));

    if (s->flag_compress & DUMP_DH_COMPRESSED_ZLIB) {
        status |= DUMP_DH_COMPRESSED_ZLIB;
    }
#ifdef CONFIG_LZO
    if (s->flag_compress & DUMP_DH_COMPRESSED_LZO) {
        status |= DUMP_DH_COMPRESSED_LZO;
    }
#endif
#ifdef CONFIG_SNAPPY
    if (s->flag_compress & DUMP_DH_COMPRESSED_SNAPPY) {
        status |= DUMP_DH_COMPRESSED_SNAPPY;
    }
#endif
    dh->status = cpu_convert_to_target32(status, endian);

    if (write_buffer(s->fd, 0, dh, size) < 0) {
        dump_error(s, "dump: failed to write disk dump header.\n");
        ret = -1;
        goto out;
    }

    /* write sub header */
    size = sizeof(KdumpSubHeader64);
    kh = g_malloc0(size);

    /* 64bit max_mapnr_64 */
    kh->max_mapnr_64 = cpu_convert_to_target64(s->max_mapnr, endian);
    kh->phys_base = cpu_convert_to_target64(PHYS_BASE, endian);
    kh->dump_level = cpu_convert_to_target32(DUMP_LEVEL, endian);

    offset_note = DISKDUMP_HEADER_BLOCKS * block_size + size;
    kh->offset_note = cpu_convert_to_target64(offset_note, endian);
    kh->note_size = cpu_convert_to_target64(s->note_size, endian);

    if (write_buffer(s->fd, DISKDUMP_HEADER_BLOCKS *
                     block_size, kh, size) < 0) {
        dump_error(s, "dump: failed to write kdump sub header.\n");
        ret = -1;
        goto out;
    }

    /* write note */
    s->note_buf = g_malloc0(s->note_size);
    s->note_buf_offset = 0;

    /* use s->note_buf to store notes temporarily */
    if (write_elf64_notes(buf_write_note, s) < 0) {
        ret = -1;
        goto out;
    }

    if (write_buffer(s->fd, offset_note, s->note_buf,
                     s->note_size) < 0) {
        dump_error(s, "dump: failed to write notes");
        ret = -1;
        goto out;
    }

    /* get offset of dump_bitmap */
    s->offset_dump_bitmap = (DISKDUMP_HEADER_BLOCKS + sub_hdr_size) *
                             block_size;

    /* get offset of page */
    s->offset_page = (DISKDUMP_HEADER_BLOCKS + sub_hdr_size + bitmap_blocks) *
                     block_size;

out:
    g_free(dh);
    g_free(kh);
    g_free(s->note_buf);

    return ret;
}

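/* EM_386 is the only 32-bit machine handled here; every other target gets
 * the 64-bit disk dump header */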
static int write_dump_header(DumpState *s)
{
    if (s->dump_info.d_machine == EM_386) {
        return create_header32(s);
    } else {
        return create_header64(s);
    }
}

/*
 * set dump_bitmap sequentially. Bits before last_pfn must not be rewritten,
 * so to set the very first bit, pass last_pfn == pfn == 0.
 * set_dump_bitmap always leaves the most recently set bit un-synced; setting
 * bit (last_pfn + sizeof(buf) * 8) to 0 flushes the content of buf into the
 * vmcore, i.e. it synchronizes the un-synced bits into the vmcore.
 */
static int set_dump_bitmap(uint64_t last_pfn, uint64_t pfn, bool value,
                           uint8_t *buf, DumpState *s)
{
    off_t old_offset, new_offset;
    off_t offset_bitmap1, offset_bitmap2;
    uint32_t byte, bit;

    /* bits before last_pfn must not be touched again */
    assert(last_pfn <= pfn);

    /*
     * if the bit to be set is not cached in buf, flush the data in buf
     * to the vmcore first.
     * making new_offset bigger than old_offset also syncs the remaining
     * data into the vmcore.
     */
    old_offset = BUFSIZE_BITMAP * (last_pfn / PFN_BUFBITMAP);
    new_offset = BUFSIZE_BITMAP * (pfn / PFN_BUFBITMAP);

    while (old_offset < new_offset) {
        /* calculate the offset and write dump_bitmap */
        offset_bitmap1 = s->offset_dump_bitmap + old_offset;
        if (write_buffer(s->fd, offset_bitmap1, buf,
                         BUFSIZE_BITMAP) < 0) {
            return -1;
        }

        /* dump level 1 is chosen, so 1st and 2nd bitmap are same */
        offset_bitmap2 = s->offset_dump_bitmap + s->len_dump_bitmap +
                         old_offset;
        if (write_buffer(s->fd, offset_bitmap2, buf,
                         BUFSIZE_BITMAP) < 0) {
            return -1;
        }

        memset(buf, 0, BUFSIZE_BITMAP);
        old_offset += BUFSIZE_BITMAP;
    }

    /* get the exact place of the bit in the buf, and set it */
    byte = (pfn % PFN_BUFBITMAP) / CHAR_BIT;
    bit = (pfn % PFN_BUFBITMAP) % CHAR_BIT;
    if (value) {
        buf[byte] |= 1u << bit;
    } else {
        buf[byte] &= ~(1u << bit);
    }

    return 0;
}

/*
 * examine every page and return the page frame number and the address of the
 * page. bufptr can be NULL. note: the blocks here are supposed to reflect
 * guest-phys blocks, so block->target_start and block->target_end should be
 * integral multiples of the target page size.
 */
static bool get_next_page(GuestPhysBlock **blockptr, uint64_t *pfnptr,
                          uint8_t **bufptr, DumpState *s)
{
    GuestPhysBlock *block = *blockptr;
    hwaddr addr;
    uint8_t *buf;

    /* block == NULL means the start of the iteration */
    if (!block) {
        block = QTAILQ_FIRST(&s->guest_phys_blocks.head);
        *blockptr = block;
        assert(block->target_start % s->page_size == 0);
        assert(block->target_end % s->page_size == 0);
        *pfnptr = paddr_to_pfn(block->target_start, s->page_shift);
        if (bufptr) {
            *bufptr = block->host_addr;
        }
        return true;
    }

    *pfnptr = *pfnptr + 1;
    addr = pfn_to_paddr(*pfnptr, s->page_shift);

    if ((addr >= block->target_start) &&
        (addr + s->page_size <= block->target_end)) {
        buf = block->host_addr + (addr - block->target_start);
    } else {
        /* the next page is in the next block */
        block = QTAILQ_NEXT(block, next);
        *blockptr = block;
        if (!block) {
            return false;
        }
        assert(block->target_start % s->page_size == 0);
        assert(block->target_end % s->page_size == 0);
        *pfnptr = paddr_to_pfn(block->target_start, s->page_shift);
        buf = block->host_addr;
    }

    if (bufptr) {
        *bufptr = buf;
    }

    return true;
}

static int write_dump_bitmap(DumpState *s)
{
    int ret = 0;
    uint64_t last_pfn, pfn;
    void *dump_bitmap_buf;
    size_t num_dumpable;
    GuestPhysBlock *block_iter = NULL;

    /* dump_bitmap_buf is used to store dump_bitmap temporarily */
    dump_bitmap_buf = g_malloc0(BUFSIZE_BITMAP);

    num_dumpable = 0;
    last_pfn = 0;

    /*
     * examine memory page by page, and set the bit in dump_bitmap that
     * corresponds to each existing page.
     */
    while (get_next_page(&block_iter, &pfn, NULL, s)) {
        ret = set_dump_bitmap(last_pfn, pfn, true, dump_bitmap_buf, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to set dump_bitmap.\n");
            ret = -1;
            goto out;
        }

        last_pfn = pfn;
        num_dumpable++;
    }

    /*
     * set_dump_bitmap will always leave the recently set bit un-synced. Here
     * we set bit last_pfn + PFN_BUFBITMAP to 0, so the set but un-synced bits
     * are flushed into the vmcore.
     */
    if (num_dumpable > 0) {
        ret = set_dump_bitmap(last_pfn, last_pfn + PFN_BUFBITMAP, false,
                              dump_bitmap_buf, s);
        if (ret < 0) {
            dump_error(s, "dump: failed to sync dump_bitmap.\n");
            ret = -1;
            goto out;
        }
    }

    /* number of dumpable pages that will be dumped later */
    s->num_dumpable = num_dumpable;

out:
    g_free(dump_bitmap_buf);

    return ret;
}

static void prepare_data_cache(DataCache *data_cache, DumpState *s,
                               off_t offset)
{
    data_cache->fd = s->fd;
    data_cache->data_size = 0;
    data_cache->buf_size = BUFSIZE_DATA_CACHE;
    data_cache->buf = g_malloc0(BUFSIZE_DATA_CACHE);
    data_cache->offset = offset;
}

static int write_cache(DataCache *dc, const void *buf, size_t size,
                       bool flag_sync)
{
    /*
     * dc->buf_size must not be less than size, otherwise the cache will
     * never be large enough
     */
    assert(size <= dc->buf_size);

    /*
     * if flag_sync is set, synchronize the data in dc->buf into the vmcore.
     * otherwise check if there is enough space to cache the data in buf; if
     * not, write the data in dc->buf to dc->fd and reset dc->buf
     */
    if ((!flag_sync && dc->data_size + size > dc->buf_size) ||
        (flag_sync && dc->data_size > 0)) {
        if (write_buffer(dc->fd, dc->offset, dc->buf, dc->data_size) < 0) {
            return -1;
        }

        dc->offset += dc->data_size;
        dc->data_size = 0;
    }

    if (!flag_sync) {
        memcpy(dc->buf + dc->data_size, buf, size);
        dc->data_size += size;
    }

    return 0;
}

static void free_data_cache(DataCache *data_cache)
{
    g_free(data_cache->buf);
}

static size_t get_len_buf_out(size_t page_size, uint32_t flag_compress)
{
    size_t len_buf_out_zlib, len_buf_out_lzo, len_buf_out_snappy;
    size_t len_buf_out;

    /* init buf_out */
    len_buf_out_zlib = len_buf_out_lzo = len_buf_out_snappy = 0;

    /* buf size for zlib */
    len_buf_out_zlib = compressBound(page_size);

    /* buf size for lzo */
#ifdef CONFIG_LZO
    if (flag_compress & DUMP_DH_COMPRESSED_LZO) {
        if (lzo_init() != LZO_E_OK) {
            /* return 0 to indicate lzo is unavailable */
            return 0;
        }
    }

    /*
     * LZO will expand incompressible data by a little amount. please check the
     * following URL to see the expansion calculation:
     * http://www.oberhumer.com/opensource/lzo/lzofaq.php
     */
    len_buf_out_lzo = page_size + page_size / 16 + 64 + 3;
#endif

#ifdef CONFIG_SNAPPY
    /* buf size for snappy */
    len_buf_out_snappy = snappy_max_compressed_length(page_size);
#endif

    /* pick the largest size, so the buffer can hold the output of any
     * compression format */
    len_buf_out = MAX(len_buf_out_zlib,
                      MAX(len_buf_out_lzo, len_buf_out_snappy));

    return len_buf_out;
}

/*
 * check if the page is all 0
 */
static inline bool is_zero_page(const uint8_t *buf, size_t page_size)
{
    return buffer_is_zero(buf, page_size);
}

static int write_dump_pages(DumpState *s)
{
    int ret = 0;
    DataCache page_desc, page_data;
    size_t len_buf_out, size_out;
#ifdef CONFIG_LZO
    lzo_bytep wrkmem = NULL;
#endif
    uint8_t *buf_out = NULL;
    off_t offset_desc, offset_data;
    PageDescriptor pd, pd_zero;
    uint8_t *buf;
    int endian = s->dump_info.d_endian;
    GuestPhysBlock *block_iter = NULL;
    uint64_t pfn_iter;

    /* get offset of page_desc and page_data in dump file */
    offset_desc = s->offset_page;
    offset_data = offset_desc + sizeof(PageDescriptor) * s->num_dumpable;

    prepare_data_cache(&page_desc, s, offset_desc);
    prepare_data_cache(&page_data, s, offset_data);

    /* prepare buffer to store compressed data */
    len_buf_out = get_len_buf_out(s->page_size, s->flag_compress);
    if (len_buf_out == 0) {
        dump_error(s, "dump: failed to get length of output buffer.\n");
        goto out;
    }

#ifdef CONFIG_LZO
    wrkmem = g_malloc(LZO1X_1_MEM_COMPRESS);
#endif

    buf_out = g_malloc(len_buf_out);

    /*
     * init zero page's page_desc and page_data, because every zero page
     * uses the same page_data
     */
    pd_zero.size = cpu_convert_to_target32(s->page_size, endian);
    pd_zero.flags = cpu_convert_to_target32(0, endian);
    pd_zero.offset = cpu_convert_to_target64(offset_data, endian);
    pd_zero.page_flags = cpu_convert_to_target64(0, endian);
    buf = g_malloc0(s->page_size);
    ret = write_cache(&page_data, buf, s->page_size, false);
    g_free(buf);
    if (ret < 0) {
        dump_error(s, "dump: failed to write page data(zero page).\n");
        goto out;
    }

    offset_data += s->page_size;

    /*
     * dump memory to vmcore page by page. all zero pages share the single
     * page_data entry written above, in the first page of the page section
     */
    while (get_next_page(&block_iter, &pfn_iter, &buf, s)) {
        /* check zero page */
        if (is_zero_page(buf, s->page_size)) {
            ret = write_cache(&page_desc, &pd_zero, sizeof(PageDescriptor),
                              false);
            if (ret < 0) {
                dump_error(s, "dump: failed to write page desc.\n");
                goto out;
            }
        } else {
            /*
             * not a zero page, then:
             * 1. compress the page
             * 2. write the compressed page into the cache of page_data
             * 3. get the page desc of the compressed page and write it into
             *    the cache of page_desc
             *
             * only one compression format will be used here, because
             * s->flag_compress is set. But when compression fails to work,
             * we fall back to saving in plaintext.
             */
             size_out = len_buf_out;
             if ((s->flag_compress & DUMP_DH_COMPRESSED_ZLIB) &&
                    (compress2(buf_out, (uLongf *)&size_out, buf, s->page_size,
                    Z_BEST_SPEED) == Z_OK) && (size_out < s->page_size)) {
                pd.flags = cpu_convert_to_target32(DUMP_DH_COMPRESSED_ZLIB,
                                                   endian);
                pd.size  = cpu_convert_to_target32(size_out, endian);

                ret = write_cache(&page_data, buf_out, size_out, false);
                if (ret < 0) {
                    dump_error(s, "dump: failed to write page data.\n");
                    goto out;
                }
#ifdef CONFIG_LZO
            } else if ((s->flag_compress & DUMP_DH_COMPRESSED_LZO) &&
                    (lzo1x_1_compress(buf, s->page_size, buf_out,
                    (lzo_uint *)&size_out, wrkmem) == LZO_E_OK) &&
                    (size_out < s->page_size)) {
                pd.flags = cpu_convert_to_target32(DUMP_DH_COMPRESSED_LZO,
                                                   endian);
                pd.size  = cpu_convert_to_target32(size_out, endian);

                ret = write_cache(&page_data, buf_out, size_out, false);
                if (ret < 0) {
                    dump_error(s, "dump: failed to write page data.\n");
                    goto out;
                }
#endif
#ifdef CONFIG_SNAPPY
            } else if ((s->flag_compress & DUMP_DH_COMPRESSED_SNAPPY) &&
                    (snappy_compress((char *)buf, s->page_size,
                    (char *)buf_out, &size_out) == SNAPPY_OK) &&
                    (size_out < s->page_size)) {
                pd.flags = cpu_convert_to_target32(
                                        DUMP_DH_COMPRESSED_SNAPPY, endian);
                pd.size  = cpu_convert_to_target32(size_out, endian);

                ret = write_cache(&page_data, buf_out, size_out, false);
                if (ret < 0) {
                    dump_error(s, "dump: failed to write page data.\n");
                    goto out;
                }
#endif
            } else {
                /*
                 * fall back to saving in plaintext; size_out must be
                 * reset to s->page_size
                 */
                pd.flags = cpu_convert_to_target32(0, endian);
                size_out = s->page_size;
                pd.size = cpu_convert_to_target32(size_out, endian);

                ret = write_cache(&page_data, buf, s->page_size, false);
                if (ret < 0) {
                    dump_error(s, "dump: failed to write page data.\n");
                    goto out;
                }
            }

            /* get and write page desc here */
            pd.page_flags = cpu_convert_to_target64(0, endian);
            pd.offset = cpu_convert_to_target64(offset_data, endian);
            offset_data += size_out;

            ret = write_cache(&page_desc, &pd, sizeof(PageDescriptor), false);
            if (ret < 0) {
                dump_error(s, "dump: failed to write page desc.\n");
                goto out;
            }
        }
    }

    ret = write_cache(&page_desc, NULL, 0, true);
    if (ret < 0) {
        dump_error(s, "dump: failed to sync cache for page_desc.\n");
        goto out;
    }
    ret = write_cache(&page_data, NULL, 0, true);
    if (ret < 0) {
        dump_error(s, "dump: failed to sync cache for page_data.\n");
        goto out;
    }

out:
    free_data_cache(&page_desc);
    free_data_cache(&page_data);

#ifdef CONFIG_LZO
    g_free(wrkmem);
#endif

    g_free(buf_out);

    return ret;
}

static int create_kdump_vmcore(DumpState *s)
{
    int ret;

    /*
     * the kdump-compressed format is:
     *                                               File offset
     *  +------------------------------------------+ 0x0
     *  |    main header (struct disk_dump_header) |
     *  |------------------------------------------+ block 1
     *  |    sub header (struct kdump_sub_header)  |
     *  |------------------------------------------+ block 2
     *  |            1st-dump_bitmap               |
     *  |------------------------------------------+ block 2 + X blocks
     *  |            2nd-dump_bitmap               | (aligned by block)
     *  |------------------------------------------+ block 2 + 2 * X blocks
     *  |  page desc for pfn 0 (struct page_desc)  | (aligned by block)
     *  |  page desc for pfn 1 (struct page_desc)  |
     *  |                    :                     |
     *  |------------------------------------------| (not aligned by block)
     *  |         page data (pfn 0)                |
     *  |         page data (pfn 1)                |
     *  |                    :                     |
     *  +------------------------------------------+
     */
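
    /*
     * note: the layout above describes logical file offsets; on disk each
     * region is wrapped in flat-format (offset, size) records by
     * write_buffer(), so a reassembly step (e.g. makedumpfile -R) can
     * reconstruct the kdump file proper.
     */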

    ret = write_start_flat_header(s->fd);
    if (ret < 0) {
        dump_error(s, "dump: failed to write start flat header.\n");
        return -1;
    }

    ret = write_dump_header(s);
    if (ret < 0) {
        return -1;
    }

    ret = write_dump_bitmap(s);
    if (ret < 0) {
        return -1;
    }

    ret = write_dump_pages(s);
    if (ret < 0) {
        return -1;
    }

    ret = write_end_flat_header(s->fd);
    if (ret < 0) {
        dump_error(s, "dump: failed to write end flat header.\n");
        return -1;
    }

    dump_completed(s);

    return 0;
}

static ram_addr_t get_start_block(DumpState *s)
{
    GuestPhysBlock *block;

    if (!s->has_filter) {
        s->next_block = QTAILQ_FIRST(&s->guest_phys_blocks.head);
        return 0;
    }

    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
        if (block->target_start >= s->begin + s->length ||
            block->target_end <= s->begin) {
            /* This block is out of the range */
            continue;
        }

        s->next_block = block;
        if (s->begin > block->target_start) {
            s->start = s->begin - block->target_start;
        } else {
            s->start = 0;
        }
        return s->start;
    }

    return -1;
}

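/* max_mapnr is derived from the end of the last guest-phys block, which is
 * assumed to have the highest target_end */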
static void get_max_mapnr(DumpState *s)
{
    GuestPhysBlock *last_block;

    last_block = QTAILQ_LAST(&s->guest_phys_blocks.head, GuestPhysBlockHead);
    s->max_mapnr = paddr_to_pfn(last_block->target_end, s->page_shift);
}

static int dump_init(DumpState *s, int fd, bool has_format,
                     DumpGuestMemoryFormat format, bool paging, bool has_filter,
                     int64_t begin, int64_t length, Error **errp)
{
    CPUState *cpu;
    int nr_cpus;
    Error *err = NULL;
    int ret;

    /* the kdump-compressed format conflicts with paging and filtering */
    if (has_format && format != DUMP_GUEST_MEMORY_FORMAT_ELF) {
        assert(!paging && !has_filter);
    }

    if (runstate_is_running()) {
        vm_stop(RUN_STATE_SAVE_VM);
        s->resume = true;
    } else {
        s->resume = false;
    }

    /* If we use KVM, we should synchronize the registers before we get dump
     * info or physmap info.
     */
    cpu_synchronize_all_states();
    nr_cpus = 0;
    CPU_FOREACH(cpu) {
        nr_cpus++;
    }

    s->errp = errp;
    s->fd = fd;
    s->has_filter = has_filter;
    s->begin = begin;
    s->length = length;

    guest_phys_blocks_init(&s->guest_phys_blocks);
    guest_phys_blocks_append(&s->guest_phys_blocks);

    s->start = get_start_block(s);
    if (s->start == -1) {
        error_set(errp, QERR_INVALID_PARAMETER, "begin");
        goto cleanup;
    }

    /* get dump info: endian, class and architecture.
     * If the target architecture is not supported, cpu_get_dump_info() will
     * return -1.
     */
    ret = cpu_get_dump_info(&s->dump_info, &s->guest_phys_blocks);
    if (ret < 0) {
        error_set(errp, QERR_UNSUPPORTED);
        goto cleanup;
    }

    s->note_size = cpu_get_note_size(s->dump_info.d_class,
                                     s->dump_info.d_machine, nr_cpus);
    if (s->note_size < 0) {
        error_set(errp, QERR_UNSUPPORTED);
        goto cleanup;
    }

    /* get memory mapping */
    memory_mapping_list_init(&s->list);
    if (paging) {
        qemu_get_guest_memory_mapping(&s->list, &s->guest_phys_blocks, &err);
        if (err != NULL) {
            error_propagate(errp, err);
            goto cleanup;
        }
    } else {
        qemu_get_guest_simple_memory_mapping(&s->list, &s->guest_phys_blocks);
    }

    s->nr_cpus = nr_cpus;
    s->page_size = TARGET_PAGE_SIZE;
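    /* page_shift is log2(page_size): ffs() returns the 1-based index of the
     * lowest set bit */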
    s->page_shift = ffs(s->page_size) - 1;

    get_max_mapnr(s);

    uint64_t tmp;
    tmp = DIV_ROUND_UP(DIV_ROUND_UP(s->max_mapnr, CHAR_BIT), s->page_size);
    s->len_dump_bitmap = tmp * s->page_size;

    /* init for kdump-compressed format */
    if (has_format && format != DUMP_GUEST_MEMORY_FORMAT_ELF) {
        switch (format) {
        case DUMP_GUEST_MEMORY_FORMAT_KDUMP_ZLIB:
            s->flag_compress = DUMP_DH_COMPRESSED_ZLIB;
            break;

        case DUMP_GUEST_MEMORY_FORMAT_KDUMP_LZO:
            s->flag_compress = DUMP_DH_COMPRESSED_LZO;
            break;

        case DUMP_GUEST_MEMORY_FORMAT_KDUMP_SNAPPY:
            s->flag_compress = DUMP_DH_COMPRESSED_SNAPPY;
            break;

        default:
            s->flag_compress = 0;
        }

        return 0;
    }

    if (s->has_filter) {
        memory_mapping_filter(&s->list, s->begin, s->length);
    }

    /*
     * calculate phdr_num
     *
     * the type of ehdr->e_phnum is uint16_t, so we should avoid overflow
     */
    s->phdr_num = 1; /* PT_NOTE */
    if (s->list.num < UINT16_MAX - 2) {
        s->phdr_num += s->list.num;
        s->have_section = false;
    } else {
        s->have_section = true;
        s->phdr_num = PN_XNUM;
        s->sh_info = 1; /* PT_NOTE */

        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
        if (s->list.num <= UINT32_MAX - 1) {
            s->sh_info += s->list.num;
        } else {
            s->sh_info = UINT32_MAX;
        }
    }

    if (s->dump_info.d_class == ELFCLASS64) {
        if (s->have_section) {
            s->memory_offset = sizeof(Elf64_Ehdr) +
                               sizeof(Elf64_Phdr) * s->sh_info +
                               sizeof(Elf64_Shdr) + s->note_size;
        } else {
            s->memory_offset = sizeof(Elf64_Ehdr) +
                               sizeof(Elf64_Phdr) * s->phdr_num + s->note_size;
        }
    } else {
        if (s->have_section) {
            s->memory_offset = sizeof(Elf32_Ehdr) +
                               sizeof(Elf32_Phdr) * s->sh_info +
                               sizeof(Elf32_Shdr) + s->note_size;
        } else {
            s->memory_offset = sizeof(Elf32_Ehdr) +
                               sizeof(Elf32_Phdr) * s->phdr_num + s->note_size;
        }
    }

    return 0;

cleanup:
    guest_phys_blocks_free(&s->guest_phys_blocks);

    if (s->resume) {
        vm_start();
    }

    return -1;
}

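/*
 * QMP entry point. Example invocation (a compressed dump to a local file):
 *
 * { "execute": "dump-guest-memory",
 *   "arguments": { "paging": false,
 *                  "protocol": "file:/tmp/vmcore",
 *                  "format": "kdump-zlib" } }
 */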
void qmp_dump_guest_memory(bool paging, const char *file, bool has_begin,
                           int64_t begin, bool has_length,
                           int64_t length, bool has_format,
                           DumpGuestMemoryFormat format, Error **errp)
{
    const char *p;
    int fd = -1;
    DumpState *s;
    int ret;

    /*
     * the kdump-compressed format needs the whole memory dumped, so paging
     * or filtering is not supported here.
     */
    if ((has_format && format != DUMP_GUEST_MEMORY_FORMAT_ELF) &&
        (paging || has_begin || has_length)) {
        error_setg(errp, "kdump-compressed format doesn't support paging or "
                         "filter");
        return;
    }
    if (has_begin && !has_length) {
        error_set(errp, QERR_MISSING_PARAMETER, "length");
        return;
    }
    if (!has_begin && has_length) {
        error_set(errp, QERR_MISSING_PARAMETER, "begin");
        return;
    }

    /* check whether lzo/snappy is supported */
#ifndef CONFIG_LZO
    if (has_format && format == DUMP_GUEST_MEMORY_FORMAT_KDUMP_LZO) {
        error_setg(errp, "kdump-lzo is not available now");
        return;
    }
#endif

#ifndef CONFIG_SNAPPY
    if (has_format && format == DUMP_GUEST_MEMORY_FORMAT_KDUMP_SNAPPY) {
        error_setg(errp, "kdump-snappy is not available now");
        return;
    }
#endif

#if !defined(WIN32)
    if (strstart(file, "fd:", &p)) {
        fd = monitor_get_fd(cur_mon, p, errp);
        if (fd == -1) {
            return;
        }
    }
#endif

    if (strstart(file, "file:", &p)) {
        fd = qemu_open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);
        if (fd < 0) {
            error_setg_file_open(errp, errno, p);
            return;
        }
    }

    if (fd == -1) {
        error_set(errp, QERR_INVALID_PARAMETER, "protocol");
        return;
    }

    s = g_malloc0(sizeof(DumpState));

    ret = dump_init(s, fd, has_format, format, paging, has_begin,
                    begin, length, errp);
    if (ret < 0) {
        g_free(s);
        return;
    }

    if (has_format && format != DUMP_GUEST_MEMORY_FORMAT_ELF) {
        if (create_kdump_vmcore(s) < 0 && !error_is_set(s->errp)) {
            error_set(errp, QERR_IO_ERROR);
        }
    } else {
        if (create_vmcore(s) < 0 && !error_is_set(s->errp)) {
            error_set(errp, QERR_IO_ERROR);
        }
    }

    g_free(s);
}