From: Pavel Machek The following patch is designed to fix a problem in the current implementation of swsusp in mainline kernels. Namely, swsusp uses an array of page backup entries (aka pagedir) to store pointers to memory pages that must be saved during suspend and restored during resume. Unfortunately, the pagedir has to be located in a contiguous chunk of memory and it sometimes turns out that an 8-order or even 9-order allocation is needed for this purpose. It sometimes is impossible to get such an allocation and swsusp may fail during either suspend or resume due to the lack of memory, although theoretically there is enough free memory for it to succeed. Moreover, swsusp is more likely to fail for this reason during resume, which means that it may fail during resume after a successful suspend (this actually has happened for some people, including me :-)) and this, potentially, may lead to the loss of data. The problem is fixed by replacing the pagedir with a linklist so that high-order memory allocations are avoided (the patches make swsusp use only 0-order allocations). Unfortunately this means that it's necessary to change assembly routines used to restore the image after it's been loaded from swap so that they walk the list instead of walking the array. This patch makes swsusp allocate only individual pages during resume. it contains the necessary changes to the assembly routines etc. for i386 and x86-64. Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/kernel/asm-offsets.c | 6 25-akpm/arch/i386/power/swsusp.S | 23 +- 25-akpm/arch/x86_64/kernel/asm-offsets.c | 2 25-akpm/arch/x86_64/kernel/suspend_asm.S | 17 - 25-akpm/kernel/power/swsusp.c | 337 +++++++++++++++++++++---------- 5 files changed, 263 insertions(+), 122 deletions(-) diff -puN arch/i386/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume arch/i386/kernel/asm-offsets.c --- 25/arch/i386/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume 2005-03-09 16:43:10.000000000 -0800 +++ 25-akpm/arch/i386/kernel/asm-offsets.c 2005-03-09 16:43:10.000000000 -0800 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "sigframe.h" #include @@ -56,6 +57,11 @@ void foo(void) OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + BLANK(); + + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); /* Offset from the sysenter stack to tss.esp0 */ DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - diff -puN arch/i386/power/swsusp.S~swsusp-use-non-contiguous-memory-on-resume arch/i386/power/swsusp.S --- 25/arch/i386/power/swsusp.S~swsusp-use-non-contiguous-memory-on-resume 2005-03-09 16:43:10.000000000 -0800 +++ 25-akpm/arch/i386/power/swsusp.S 2005-03-09 16:43:10.000000000 -0800 @@ -12,6 +12,7 @@ #include #include #include +#include .text @@ -28,28 +29,28 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(swsusp_arch_resume) - movl $swsusp_pg_dir-__PAGE_OFFSET,%ecx - movl %ecx,%cr3 + movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx + movl %ecx, %cr3 - movl pagedir_nosave, %ebx - xorl %eax, %eax - xorl %edx, %edx + movl pagedir_nosave, %edx .p2align 4,,7 copy_loop: - movl 4(%ebx,%edx),%edi - movl (%ebx,%edx),%esi + testl %edx, %edx + jz done + + movl pbe_address(%edx), %esi + movl pbe_orig_address(%edx), %edi movl $1024, %ecx rep movsl - incl %eax - addl $16, %edx - cmpl nr_copy_pages,%eax - jb copy_loop + movl pbe_next(%edx), %edx + jmp copy_loop .p2align 4,,7 +done: movl saved_context_esp, %esp movl saved_context_ebp, %ebp movl saved_context_ebx, %ebx diff -puN arch/x86_64/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume arch/x86_64/kernel/asm-offsets.c --- 25/arch/x86_64/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume 2005-03-09 16:43:10.000000000 -0800 +++ 25-akpm/arch/x86_64/kernel/asm-offsets.c 2005-03-09 16:43:10.000000000 -0800 @@ -62,8 +62,8 @@ int main(void) offsetof (struct rt_sigframe32, uc.uc_mcontext)); BLANK(); #endif - DEFINE(SIZEOF_PBE, sizeof(struct pbe)); DEFINE(pbe_address, offsetof(struct pbe, address)); DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); + DEFINE(pbe_next, offsetof(struct pbe, next)); return 0; } diff -puN arch/x86_64/kernel/suspend_asm.S~swsusp-use-non-contiguous-memory-on-resume arch/x86_64/kernel/suspend_asm.S --- 25/arch/x86_64/kernel/suspend_asm.S~swsusp-use-non-contiguous-memory-on-resume 2005-03-09 16:43:10.000000000 -0800 +++ 25-akpm/arch/x86_64/kernel/suspend_asm.S 2005-03-09 16:43:10.000000000 -0800 @@ -54,16 +54,10 @@ ENTRY(swsusp_arch_resume) movq %rax, %cr4; # turn PGE back on movq pagedir_nosave(%rip), %rdx - /* compute the limit */ - movl nr_copy_pages(%rip), %eax - testl %eax, %eax - jz done - movq %rdx,%r8 - movl $SIZEOF_PBE,%r9d - mul %r9 # with rax, clobbers rdx - movq %r8, %rdx - addq %r8, %rax loop: + testq %rdx, %rdx + jz done + /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi movq pbe_orig_address(%rdx), %rdi @@ -72,9 +66,8 @@ loop: movsq /* progress to the next pbe */ - addq $SIZEOF_PBE, %rdx - cmpq %rax, %rdx - jb loop + movq pbe_next(%rdx), %rdx + jmp loop done: movl $24, %eax movl %eax, %ds diff -puN kernel/power/swsusp.c~swsusp-use-non-contiguous-memory-on-resume kernel/power/swsusp.c --- 25/kernel/power/swsusp.c~swsusp-use-non-contiguous-memory-on-resume 2005-03-09 16:43:10.000000000 -0800 +++ 25-akpm/kernel/power/swsusp.c 2005-03-09 16:43:10.000000000 -0800 @@ -621,6 +621,46 @@ static inline void free_pagedir(struct p } /** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +/** * alloc_pagedir - Allocate the page directory. * * First, determine exactly how many pages we need and @@ -636,7 +676,7 @@ static inline void free_pagedir(struct p static struct pbe * alloc_pagedir(unsigned nr_pages) { unsigned num; - struct pbe *pblist, *pbe, *p; + struct pbe *pblist, *pbe; if (!nr_pages) return NULL; @@ -645,21 +685,13 @@ static struct pbe * alloc_pagedir(unsign pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; pbe = pbe->next, num += PBES_PER_PAGE) { - p = pbe; pbe += PB_PAGE_SKIP; - do - p->next = p + 1; - while (p++ < pbe); pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); } - if (pbe) { - for (num -= PBES_PER_PAGE - 1, p = pbe; num < nr_pages; p++, num++) - p->next = p + 1; - } else { /* get_zeroed_page() failed */ + if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); pblist = NULL; } - pr_debug("alloc_pagedir(): allocated %d PBEs\n", num); return pblist; } @@ -766,6 +798,7 @@ static int swsusp_alloc(void) printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return -ENOMEM; } + create_pbe_list(pagedir_save, nr_copy_pages); pagedir_nosave = pagedir_save; if ((error = alloc_image_pages())) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); @@ -917,44 +950,104 @@ static int __init does_collide_order(uns return 0; } -/* - * We check here that pagedir & pages it points to won't collide with pages - * where we're going to restore from the loaded pages later +/** + * On resume, for storing the PBE list and the image, + * we can only use memory pages that do not conflict with the pages + * which had been used before suspend. + * + * We don't know which pages are usable until we allocate them. + * + * Allocated but unusable (ie eaten) memory pages are linked together + * to create a list, so that we can free them easily + * + * We could have used a type other than (void *) + * for this purpose, but ... */ -static int __init check_pagedir(void) +static void **eaten_memory = NULL; + +static inline void eat_page(void *page) { - int i; + void **c; - for(i=0; i < nr_copy_pages; i++) { - unsigned long addr; + c = eaten_memory; + eaten_memory = page; + *eaten_memory = c; +} - do { - addr = get_zeroed_page(GFP_ATOMIC); - if(!addr) - return -ENOMEM; - } while (does_collide_order(addr, 0)); +static unsigned long __init get_usable_page(unsigned gfp_mask) +{ + unsigned long m; - (pagedir_nosave+i)->address = addr; + m = get_zeroed_page(gfp_mask); + while (does_collide_order(m, 0)) { + eat_page((void *)m); + m = get_zeroed_page(gfp_mask); + if (!m) + break; } - return 0; + return m; } -static int __init swsusp_pagedir_relocate(void) +static void __init free_eaten_memory(void) { - /* - * We have to avoid recursion (not to overflow kernel stack), - * and that's why code looks pretty cryptic + unsigned long m; + void **c; + int i = 0; + + c = eaten_memory; + while (c) { + m = (unsigned long)c; + c = *c; + free_page(m); + i++; + } + eaten_memory = NULL; + pr_debug("swsusp: %d unused pages freed\n", i); +} + +/** + * check_pagedir - We ensure here that pages that the PBEs point to + * won't collide with pages where we're going to restore from the loaded + * pages later + */ + +static int __init check_pagedir(struct pbe *pblist) +{ + struct pbe *p; + + /* This is necessary, so that we can free allocated pages + * in case of failure */ - suspend_pagedir_t *old_pagedir = pagedir_nosave; - void **eaten_memory = NULL; - void **c = eaten_memory, *m, *f; - int ret = 0; + for_each_pbe (p, pblist) + p->address = 0UL; + + for_each_pbe (p, pblist) { + p->address = get_usable_page(GFP_ATOMIC); + if (!p->address) + return -ENOMEM; + } + return 0; +} + +/** + * swsusp_pagedir_relocate - It is possible, that some memory pages + * occupied by the list of PBEs collide with pages where we're going to + * restore from the loaded pages later. We relocate them here. + */ + +static struct pbe * __init swsusp_pagedir_relocate(struct pbe *pblist) +{ struct zone *zone; - int i; - struct pbe *p; unsigned long zone_pfn; + struct pbe *pbpage, *tail, *p; + void *m; + int rel = 0, error = 0; + + if (!pblist) /* a sanity check */ + return NULL; - printk("Relocating pagedir "); + pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", + swsusp_info.pagedir_pages); /* Set page flags */ @@ -964,45 +1057,52 @@ static int __init swsusp_pagedir_relocat zone->zone_start_pfn)); } - /* Clear orig address */ + /* Clear orig addresses */ - for(i = 0, p = pagedir_nosave; i < nr_copy_pages; i++, p++) { + for_each_pbe (p, pblist) ClearPageNosaveFree(virt_to_page(p->orig_address)); - } - if (!does_collide_order((unsigned long)old_pagedir, pagedir_order)) { - printk("not necessary\n"); - return check_pagedir(); - } + tail = pblist + PB_PAGE_SKIP; - while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) { - if (!does_collide_order((unsigned long)m, pagedir_order)) - break; - eaten_memory = m; - printk( "." ); - *eaten_memory = c; - c = eaten_memory; - } + /* Relocate colliding pages */ - if (!m) { - printk("out of memory\n"); - ret = -ENOMEM; - } else { - pagedir_nosave = - memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order); + for_each_pb_page (pbpage, pblist) { + if (does_collide_order((unsigned long)pbpage, 0)) { + m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); + if (!m) { + error = -ENOMEM; + break; + } + memcpy(m, (void *)pbpage, PAGE_SIZE); + if (pbpage == pblist) + pblist = (struct pbe *)m; + else + tail->next = (struct pbe *)m; + + eat_page((void *)pbpage); + pbpage = (struct pbe *)m; + + /* We have to link the PBEs again */ + + for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) + if (p->next) /* needed to save the end */ + p->next = p + 1; + + rel++; + } + tail = pbpage + PB_PAGE_SKIP; } - c = eaten_memory; - while (c) { - printk(":"); - f = c; - c = *c; - free_pages((unsigned long)f, pagedir_order); + if (error) { + printk("\nswsusp: Out of memory\n\n"); + free_pagedir(pblist); + free_eaten_memory(); + pblist = NULL; } - if (ret) - return ret; - printk("|\n"); - return check_pagedir(); + else + printk("swsusp: Relocated %d pages\n", rel); + + return pblist; } /** @@ -1147,76 +1247,117 @@ static int __init check_sig(void) } /** - * swsusp_read_data - Read image pages from swap. + * data_read - Read image pages from swap. * * You do not need to check for overlaps, check_pagedir() * already did that. */ -static int __init data_read(void) +static int __init data_read(struct pbe *pblist) { struct pbe * p; - int error; - int i; - int mod = nr_copy_pages / 100; + int error = 0; + int i = 0; + int mod = swsusp_info.image_pages / 100; if (!mod) mod = 1; - if ((error = swsusp_pagedir_relocate())) - return error; + printk("swsusp: Reading image data (%lu pages): ", + swsusp_info.image_pages); + + for_each_pbe (p, pblist) { + if (!(i % mod)) + printk("\b\b\b\b%3d%%", i / mod); - printk( "Reading image data (%d pages): ", nr_copy_pages ); - for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) { - if (!(i%mod)) - printk( "\b\b\b\b%3d%%", i / mod ); error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); + if (error) + return error; + + i++; } - printk(" %d done.\n",i); + printk("\b\b\b\bdone\n"); return error; - } extern dev_t __init name_to_dev_t(const char *line); -static int __init read_pagedir(void) +/** + * read_pagedir - Read page backup list pages from swap + */ + +static int __init read_pagedir(struct pbe *pblist) { - unsigned long addr; - int i, n = swsusp_info.pagedir_pages; - int error = 0; + struct pbe *pbpage, *p; + unsigned i = 0; + int error; - addr = __get_free_pages(GFP_ATOMIC, pagedir_order); - if (!addr) - return -ENOMEM; - pagedir_nosave = (struct pbe *)addr; + if (!pblist) + return -EFAULT; + + printk("swsusp: Reading pagedir (%lu pages)\n", + swsusp_info.pagedir_pages); - pr_debug("swsusp: Reading pagedir (%d Pages)\n",n); + for_each_pb_page (pbpage, pblist) { + unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); - for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) { - unsigned long offset = swp_offset(swsusp_info.pagedir[i]); - if (offset) - error = bio_read_page(offset, (void *)addr); - else - error = -EFAULT; + error = -EFAULT; + if (offset) { + p = (pbpage + PB_PAGE_SKIP)->next; + error = bio_read_page(offset, (void *)pbpage); + (pbpage + PB_PAGE_SKIP)->next = p; + } + if (error) + break; } + if (error) - free_pages((unsigned long)pagedir_nosave, pagedir_order); + free_page((unsigned long)pblist); + + BUG_ON(i != swsusp_info.pagedir_pages); + return error; } + static int __init read_suspend_image(void) { int error = 0; + struct pbe *p; if ((error = check_sig())) return error; + if ((error = check_header())) return error; - if ((error = read_pagedir())) + + if (!(p = alloc_pagedir(nr_copy_pages))) + return -ENOMEM; + + if ((error = read_pagedir(p))) return error; - if ((error = data_read())) - free_pages((unsigned long)pagedir_nosave, pagedir_order); + + create_pbe_list(p, nr_copy_pages); + + if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) + return -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + + error = check_pagedir(pagedir_nosave); + free_eaten_memory(); + if (!error) + error = data_read(pagedir_nosave); + + if (error) { /* We fail cleanly */ + for_each_pbe (p, pagedir_nosave) + if (p->address) { + free_page(p->address); + p->address = 0UL; + } + free_pagedir(pagedir_nosave); + } return error; } _