Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP1:GA
xen.8005
59df63b2-x86-limit-linear-page-table-use-to-1-l...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 59df63b2-x86-limit-linear-page-table-use-to-1-level.patch of Package xen.8005
# Commit 6987fc7558bdbab8119eabf026e3cdad1053f0e5 # Date 2017-10-12 14:44:34 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> x86: limit linear page table use to a single level That's the only way that they're meant to be used. Without such a restriction arbitrarily long chains of same-level page tables can be built, tearing down of which may then cause arbitrarily deep recursion, causing a stack overflow. To facilitate this restriction, a counter is being introduced to track both the number of same-level entries in a page table as well as the number of uses of a page table in another same-level one (counting into positive and negative direction respectively, utilizing the fact that both counts can't be non-zero at the same time). Note that the added accounting introduces a restriction on the number of times a page can be used in other same-level page tables - more than 32k of such uses are no longer possible. Note also that some put_page_and_type[_preemptible]() calls are replaced with open-coded equivalents. This seemed preferable to adding "parent_table" to the matrix of functions. Note further that cross-domain same-level page table references are no longer permitted (they probably never should have been). This is XSA-240. 
Reported-by: Jann Horn <jannh@google.com> Signed-off-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -985,6 +985,7 @@ int arch_set_info_guest( case -EINTR: rc = -EAGAIN; case -EAGAIN: + v->arch.old_guest_ptpg = NULL; v->arch.old_guest_table = pagetable_get_page(v->arch.guest_table); v->arch.guest_table = pagetable_null(); --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -674,6 +674,61 @@ static void put_data_page( put_page(page); } +static bool_t inc_linear_entries(struct page_info *pg) +{ + typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; + + do { + /* + * The check below checks for the "linear use" count being non-zero + * as well as overflow. Signed integer overflow is undefined behavior + * according to the C spec. However, as long as linear_pt_count is + * smaller in size than 'int', the arithmetic operation of the + * increment below won't overflow; rather the result will be truncated + * when stored. Ensure that this is always true. + */ + BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); + oc = nc++; + if ( nc <= 0 ) + return 0; + nc = cmpxchg(&pg->linear_pt_count, oc, nc); + } while ( oc != nc ); + + return 1; +} + +static void dec_linear_entries(struct page_info *pg) +{ + typeof(pg->linear_pt_count) oc; + + oc = arch_fetch_and_add(&pg->linear_pt_count, -1); + ASSERT(oc > 0); +} + +static bool_t inc_linear_uses(struct page_info *pg) +{ + typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; + + do { + /* See the respective comment in inc_linear_entries(). 
*/ + BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); + oc = nc--; + if ( nc >= 0 ) + return 0; + nc = cmpxchg(&pg->linear_pt_count, oc, nc); + } while ( oc != nc ); + + return 1; +} + +static void dec_linear_uses(struct page_info *pg) +{ + typeof(pg->linear_pt_count) oc; + + oc = arch_fetch_and_add(&pg->linear_pt_count, 1); + ASSERT(oc < 0); +} + /* * We allow root tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: @@ -703,15 +758,35 @@ get_##level##_linear_pagetable( \ if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \ { \ + struct page_info *ptpg = mfn_to_page(pde_pfn); \ + \ + /* Make sure the page table belongs to the correct domain. */ \ + if ( unlikely(page_get_owner(ptpg) != d) ) \ + return 0; \ + \ /* Make sure the mapped frame belongs to the correct domain. */ \ if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \ return 0; \ \ /* \ - * Ensure that the mapped frame is an already-validated page table. \ + * Ensure that the mapped frame is an already-validated page table \ + * and is not itself having linear entries, as well as that the \ + * containing page table is not iself in use as a linear page table \ + * elsewhere. \ * If so, atomically increment the count (checking for overflow). 
\ */ \ page = mfn_to_page(pfn); \ + if ( !inc_linear_entries(ptpg) ) \ + { \ + put_page(page); \ + return 0; \ + } \ + if ( !inc_linear_uses(page) ) \ + { \ + dec_linear_entries(ptpg); \ + put_page(page); \ + return 0; \ + } \ y = page->u.inuse.type_info; \ do { \ x = y; \ @@ -719,6 +794,8 @@ get_##level##_linear_pagetable( unlikely((x & (PGT_type_mask|PGT_validated)) != \ (PGT_##level##_page_table|PGT_validated)) ) \ { \ + dec_linear_uses(page); \ + dec_linear_entries(ptpg); \ put_page(page); \ return 0; \ } \ @@ -1098,6 +1175,9 @@ get_page_from_l4e( l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \ } while ( 0 ) +static int _put_page_type(struct page_info *page, bool_t preemptible, + struct page_info *ptpg); + void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) { unsigned long pfn = l1e_get_pfn(l1e); @@ -1167,17 +1247,22 @@ static int put_page_from_l2e(l2_pgentry_ if ( l2e_get_flags(l2e) & _PAGE_PSE ) put_superpage(l2e_get_pfn(l2e)); else - put_page_and_type(l2e_get_page(l2e)); + { + struct page_info *pg = l2e_get_page(l2e); + int rc = _put_page_type(pg, 0, mfn_to_page(pfn)); + + ASSERT(!rc); + put_page(pg); + } return 0; } -static int __put_page_type(struct page_info *, int preemptible); - static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, int partial, bool_t defer) { struct page_info *pg; + int rc; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) ) return 1; @@ -1200,21 +1285,28 @@ static int put_page_from_l3e(l3_pgentry_ if ( unlikely(partial > 0) ) { ASSERT(!defer); - return __put_page_type(pg, 1); + return _put_page_type(pg, 1, mfn_to_page(pfn)); } if ( defer ) { + current->arch.old_guest_ptpg = mfn_to_page(pfn); current->arch.old_guest_table = pg; return 0; } - return put_page_and_type_preemptible(pg); + rc = _put_page_type(pg, 1, mfn_to_page(pfn)); + if ( likely(!rc) ) + put_page(pg); + + return rc; } static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, int partial, bool_t 
defer) { + int rc = 1; + if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) { @@ -1223,18 +1315,22 @@ static int put_page_from_l4e(l4_pgentry_ if ( unlikely(partial > 0) ) { ASSERT(!defer); - return __put_page_type(pg, 1); + return _put_page_type(pg, 1, mfn_to_page(pfn)); } if ( defer ) { + current->arch.old_guest_ptpg = mfn_to_page(pfn); current->arch.old_guest_table = pg; return 0; } - return put_page_and_type_preemptible(pg); + rc = _put_page_type(pg, 1, mfn_to_page(pfn)); + if ( likely(!rc) ) + put_page(pg); } - return 1; + + return rc; } static int alloc_l1_table(struct page_info *page) @@ -1432,6 +1528,7 @@ static int alloc_l3_table(struct page_in { page->nr_validated_ptes = i; page->partial_pte = 0; + current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; } while ( i-- > 0 ) @@ -1497,6 +1594,7 @@ static int alloc_l4_table(struct page_in { if ( current->arch.old_guest_table ) page->nr_validated_ptes++; + current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; } } @@ -2229,14 +2327,20 @@ int free_page_type(struct page_info *pag } -static int __put_final_page_type( - struct page_info *page, unsigned long type, int preemptible) +static int _put_final_page_type(struct page_info *page, unsigned long type, + bool_t preemptible, struct page_info *ptpg) { int rc = free_page_type(page, type, preemptible); /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { + if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) ) + { + dec_linear_uses(page); + dec_linear_entries(ptpg); + } + ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying); /* * Record TLB information for flush later. 
We do not stamp page tables * when running in shadow mode: @@ -2272,8 +2376,8 @@ static int __put_final_page_type( } -static int __put_page_type(struct page_info *page, - int preemptible) +static int _put_page_type(struct page_info *page, bool_t preemptible, + struct page_info *ptpg) { unsigned long nx, x, y = page->u.inuse.type_info; int rc = 0; @@ -2300,12 +2404,28 @@ static int __put_page_type(struct page_i x, nx)) != x) ) continue; /* We cleared the 'valid bit' so we do the clean up. */ - rc = __put_final_page_type(page, x, preemptible); + rc = _put_final_page_type(page, x, preemptible, ptpg); + ptpg = NULL; if ( x & PGT_partial ) put_page(page); break; } + if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) + { + /* + * page_set_tlbflush_timestamp() accesses the same union + * linear_pt_count lives in. Unvalidated page table pages, + * however, should occur during domain destruction only + * anyway. Updating of linear_pt_count luckily is not + * necessary anymore for a dying domain. + */ + ASSERT(page_get_owner(page)->is_dying); + ASSERT(page->linear_pt_count < 0); + ASSERT(ptpg->linear_pt_count > 0); + ptpg = NULL; + } + /* * Record TLB information for flush later. 
We do not stamp page * tables when running in shadow mode: @@ -2325,6 +2445,13 @@ static int __put_page_type(struct page_i return -EINTR; } + if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) + { + ASSERT(!rc); + dec_linear_uses(page); + dec_linear_entries(ptpg); + } + return rc; } @@ -2459,6 +2586,7 @@ static int __get_page_type(struct page_i page->nr_validated_ptes = 0; page->partial_pte = 0; } + page->linear_pt_count = 0; rc = alloc_page_type(page, type, preemptible); } @@ -2470,7 +2598,7 @@ static int __get_page_type(struct page_i void put_page_type(struct page_info *page) { - int rc = __put_page_type(page, 0); + int rc = _put_page_type(page, 0, NULL); ASSERT(rc == 0); (void)rc; } @@ -2486,7 +2614,7 @@ int get_page_type(struct page_info *page int put_page_type_preemptible(struct page_info *page) { - return __put_page_type(page, 1); + return _put_page_type(page, 1, NULL); } int get_page_type_preemptible(struct page_info *page, unsigned long type) @@ -2692,11 +2820,14 @@ int put_old_guest_table(struct vcpu *v) if ( !v->arch.old_guest_table ) return 0; - switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) ) + switch ( rc = _put_page_type(v->arch.old_guest_table, 1, + v->arch.old_guest_ptpg) ) { case -EINTR: case -EAGAIN: return -EAGAIN; + case 0: + put_page(v->arch.old_guest_table); } v->arch.old_guest_table = NULL; @@ -2850,6 +2981,7 @@ int new_guest_cr3(unsigned long mfn) case -EINTR: rc = -EAGAIN; case -EAGAIN: + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; break; default: @@ -3095,7 +3227,10 @@ long do_mmuext_op( if ( type == PGT_l1_page_table ) put_page_and_type(page); else + { + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + } } } @@ -3128,6 +3263,7 @@ long do_mmuext_op( { case -EINTR: case -EAGAIN: + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; rc = 0; break; @@ -3205,6 +3341,7 @@ long do_mmuext_op( case -EINTR: rc = -EAGAIN; case -EAGAIN: + 
curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; okay = 0; break; --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -441,6 +441,8 @@ struct arch_vcpu pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ pagetable_t guest_table; /* (MFN) guest notion of cr3 */ struct page_info *old_guest_table; /* partially destructed pagetable */ + struct page_info *old_guest_ptpg; /* containing page table of the */ + /* former, if any */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -119,11 +119,11 @@ struct page_info u32 tlbflush_timestamp; /* - * When PGT_partial is true then this field is valid and indicates - * that PTEs in the range [0, @nr_validated_ptes) have been validated. - * An extra page reference must be acquired (or not dropped) whenever - * PGT_partial gets set, and it must be dropped when the flag gets - * cleared. This is so that a get() leaving a page in partially + * When PGT_partial is true then the first two fields are valid and + * indicate that PTEs in the range [0, @nr_validated_ptes) have been + * validated. An extra page reference must be acquired (or not dropped) + * whenever PGT_partial gets set, and it must be dropped when the flag + * gets cleared. This is so that a get() leaving a page in partially * validated state (where the caller would drop the reference acquired * due to the getting of the type [apparently] failing [-EAGAIN]) * would not accidentally result in a page left with zero general @@ -147,10 +147,18 @@ struct page_info * put_page_from_lNe() (due to the apparent failure), and hence it * must be dropped when the put operation is resumed (and completes), * but it must not be acquired if picking up the page for validation. 
+ * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page + * table has, + * - by a negative value, in how many same-level page tables a page is + * in use. */ struct { - u16 nr_validated_ptes; - s8 partial_pte; + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; + u16 :16 - PAGETABLE_ORDER - 1 - 2; + s16 partial_pte:2; + s16 linear_pt_count; }; /* @@ -201,6 +209,9 @@ struct page_info #define PGT_count_width PG_shift(9) #define PGT_count_mask ((1UL<<PGT_count_width)-1) +/* Are the 'type mask' bits identical? */ +#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask)) + /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated PG_shift(1) #define PGC_allocated PG_mask(1, 1) --- a/xen/include/asm-x86/system.h +++ b/xen/include/asm-x86/system.h @@ -118,6 +118,52 @@ static always_inline unsigned long __cmp }) /* + * Undefined symbol to cause link failure if a wrong size is used with + * arch_fetch_and_add(). + */ +extern unsigned long __bad_fetch_and_add_size(void); + +static always_inline unsigned long __xadd( + volatile void *ptr, unsigned long v, int size) +{ + switch ( size ) + { + case 1: + asm volatile ( "lock; xaddb %b0,%1" + : "+r" (v), "+m" (*__xg(ptr)) + :: "memory"); + return v; + case 2: + asm volatile ( "lock; xaddw %w0,%1" + : "+r" (v), "+m" (*__xg(ptr)) + :: "memory"); + return v; + case 4: + asm volatile ( "lock; xaddl %k0,%1" + : "+r" (v), "+m" (*__xg(ptr)) + :: "memory"); + return v; + case 8: + asm volatile ( "lock; xaddq %q0,%1" + : "+r" (v), "+m" (*__xg(ptr)) + :: "memory"); + + return v; + default: + return __bad_fetch_and_add_size(); + } +} + +/* + * Atomically add @v to the 1, 2, 4, or 8 byte value at @ptr. Returns + * the previous value. + * + * This is a full memory barrier. 
+ */ +#define arch_fetch_and_add(ptr, v) \ + ((typeof(*(ptr)))__xadd(ptr, (typeof(*(ptr)))(v), sizeof(*(ptr)))) + +/* * Both Intel and AMD agree that, from a programmer's viewpoint: * Loads cannot be reordered relative to other loads. * Stores cannot be reordered relative to other stores.
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor