diff options
author | Jarkko Sakkinen | 2022-07-01 03:03:07 +0300 |
---|---|---|
committer | Jarkko Sakkinen | 2022-07-01 03:03:07 +0300 |
commit | ab9a6d0381d57850e05514b725c2eb237a5a8307 (patch) | |
tree | 6038240232070fbd059bbad4de20e48758410e6c | |
parent | 0814a72fd293a15a24ec4c96e4f7dfc75446cddb (diff) | |
download | aur-ab9a6d0381d57850e05514b725c2eb237a5a8307.tar.gz |
feat: add SNP patches
Replace the existing patch files with a single patch file that includes
the SGX2 and SNP patches.
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@iki.fi>
37 files changed, 13623 insertions, 5365 deletions
diff --git a/0001-x86-sgx-Disconnect-backing-page-references-from-dirt.patch b/0001-x86-sgx-Disconnect-backing-page-references-from-dirt.patch deleted file mode 100644 index 4d9839913e31..000000000000 --- a/0001-x86-sgx-Disconnect-backing-page-references-from-dirt.patch +++ /dev/null @@ -1,167 +0,0 @@ -From 16c0d19cdf8ea458d7388593e7f9537bd545f3a7 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Thu, 12 May 2022 14:50:57 -0700 -Subject: [PATCH 01/36] x86/sgx: Disconnect backing page references from dirty - status - -SGX uses shmem backing storage to store encrypted enclave pages -and their crypto metadata when enclave pages are moved out of -enclave memory. Two shmem backing storage pages are associated with -each enclave page - one backing page to contain the encrypted -enclave page data and one backing page (shared by a few -enclave pages) to contain the crypto metadata used by the -processor to verify the enclave page when it is loaded back into -the enclave. - -sgx_encl_put_backing() is used to release references to the -backing storage and, optionally, mark both backing store pages -as dirty. - -Managing references and dirty status together in this way results -in both backing store pages marked as dirty, even if only one of -the backing store pages are changed. - -Additionally, waiting until the page reference is dropped to set -the page dirty risks a race with the page fault handler that -may load outdated data into the enclave when a page is faulted -right after it is reclaimed. - -Consider what happens if the reclaimer writes a page to the backing -store and the page is immediately faulted back, before the reclaimer -is able to set the dirty bit of the page: - -sgx_reclaim_pages() { sgx_vma_fault() { - ... - sgx_encl_get_backing(); - ... ... - sgx_reclaimer_write() { - mutex_lock(&encl->lock); - /* Write data to backing store */ - mutex_unlock(&encl->lock); - } - mutex_lock(&encl->lock); - __sgx_encl_eldu() { - ... 
- /* - * Enclave backing store - * page not released - * nor marked dirty - - * contents may not be - * up to date. - */ - sgx_encl_get_backing(); - ... - /* - * Enclave data restored - * from backing store - * and PCMD pages that - * are not up to date. - * ENCLS[ELDU] faults - * because of MAC or PCMD - * checking failure. - */ - sgx_encl_put_backing(); - } - ... - /* set page dirty */ - sgx_encl_put_backing(); - ... - mutex_unlock(&encl->lock); -} } - -Remove the option to sgx_encl_put_backing() to set the backing -pages as dirty and set the needed pages as dirty right after -receiving important data while enclave mutex is held. This ensures that -the page fault handler can get up to date data from a page and prepares -the code for a following change where only one of the backing pages -need to be marked as dirty. - -Cc: stable@vger.kernel.org -Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") -Suggested-by: Dave Hansen <dave.hansen@linux.intel.com> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> -Link: https://lore.kernel.org/linux-sgx/8922e48f-6646-c7cc-6393-7c78dcf23d23@intel.com/ ---- - arch/x86/kernel/cpu/sgx/encl.c | 10 ++-------- - arch/x86/kernel/cpu/sgx/encl.h | 2 +- - arch/x86/kernel/cpu/sgx/main.c | 6 ++++-- - 3 files changed, 7 insertions(+), 11 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 7c63a1911fae..398695a20605 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -94,7 +94,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - kunmap_atomic(pcmd_page); - kunmap_atomic((void *)(unsigned long)pginfo.contents); - -- sgx_encl_put_backing(&b, false); -+ sgx_encl_put_backing(&b); - - sgx_encl_truncate_backing_page(encl, page_index); - -@@ -645,15 +645,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - /** - * sgx_encl_put_backing() - Unpin the backing 
storage - * @backing: data for accessing backing storage for the page -- * @do_write: mark pages dirty - */ --void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) -+void sgx_encl_put_backing(struct sgx_backing *backing) - { -- if (do_write) { -- set_page_dirty(backing->pcmd); -- set_page_dirty(backing->contents); -- } -- - put_page(backing->pcmd); - put_page(backing->contents); - } -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index fec43ca65065..d44e7372151f 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref); - int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); - int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); --void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); -+void sgx_encl_put_backing(struct sgx_backing *backing); - int sgx_encl_test_and_clear_young(struct mm_struct *mm, - struct sgx_encl_page *page); - -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index 8e4bc6453d26..e71df40a4f38 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, - backing->pcmd_offset; - - ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); -+ set_page_dirty(backing->pcmd); -+ set_page_dirty(backing->contents); - - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - - backing->pcmd_offset)); -@@ -320,7 +322,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, - sgx_encl_free_epc_page(encl->secs.epc_page); - encl->secs.epc_page = NULL; - -- sgx_encl_put_backing(&secs_backing, true); -+ sgx_encl_put_backing(&secs_backing); - } - - out: -@@ -411,7 +413,7 @@ static void sgx_reclaim_pages(void) - - encl_page = epc_page->owner; - sgx_reclaimer_write(epc_page, &backing[i]); -- 
sgx_encl_put_backing(&backing[i], true); -+ sgx_encl_put_backing(&backing[i]); - - kref_put(&encl_page->encl->refcount, sgx_encl_release); - epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; --- -2.36.1 - diff --git a/0002-x86-sgx-Mark-PCMD-page-as-dirty-when-modifying-conte.patch b/0002-x86-sgx-Mark-PCMD-page-as-dirty-when-modifying-conte.patch deleted file mode 100644 index b1dabf98c596..000000000000 --- a/0002-x86-sgx-Mark-PCMD-page-as-dirty-when-modifying-conte.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 29ebcc3dd06578be9cb3d59007e9466cb336f618 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Thu, 12 May 2022 14:50:58 -0700 -Subject: [PATCH 02/36] x86/sgx: Mark PCMD page as dirty when modifying - contents - -Recent commit 08999b2489b4 ("x86/sgx: Free backing memory -after faulting the enclave page") expanded __sgx_encl_eldu() -to clear an enclave page's PCMD (Paging Crypto MetaData) -from the PCMD page in the backing store after the enclave -page is restored to the enclave. - -Since the PCMD page in the backing store is modified the page -should be marked as dirty to ensure the modified data is retained. - -Cc: stable@vger.kernel.org -Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 398695a20605..5104a428b72c 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -84,6 +84,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - } - - memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); -+ set_page_dirty(b.pcmd); - - /* - * The area for the PCMD in the page was zeroed above. 
Check if the --- -2.36.1 - diff --git a/0003-x86-sgx-Obtain-backing-storage-page-with-enclave-mut.patch b/0003-x86-sgx-Obtain-backing-storage-page-with-enclave-mut.patch deleted file mode 100644 index 622b25abd6b5..000000000000 --- a/0003-x86-sgx-Obtain-backing-storage-page-with-enclave-mut.patch +++ /dev/null @@ -1,130 +0,0 @@ -From a455a2c6f55a696cfe170f3f62c1390760dab06c Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Thu, 12 May 2022 14:50:59 -0700 -Subject: [PATCH 03/36] x86/sgx: Obtain backing storage page with enclave mutex - held - -Haitao reported encountering a WARN triggered by the ENCLS[ELDU] -instruction faulting with a #GP. - -The WARN is encountered when the reclaimer evicts a range of -pages from the enclave when the same pages are faulted back -right away. - -The SGX backing storage is accessed on two paths: when there -are insufficient free pages in the EPC the reclaimer works -to move enclave pages to the backing storage and as enclaves -access pages that have been moved to the backing storage -they are retrieved from there as part of page fault handling. - -An oversubscribed SGX system will often run the reclaimer and -page fault handler concurrently and needs to ensure that the -backing store is accessed safely between the reclaimer and -the page fault handler. This is not the case because the -reclaimer accesses the backing store without the enclave mutex -while the page fault handler accesses the backing store with -the enclave mutex. - -Consider the scenario where a page is faulted while a page sharing -a PCMD page with the faulted page is being reclaimed. The -consequence is a race between the reclaimer and page fault -handler, the reclaimer attempting to access a PCMD at the -same time it is truncated by the page fault handler. This -could result in lost PCMD data. Data may still be -lost if the reclaimer wins the race, this is addressed in -the following patch. 
- -The reclaimer accesses pages from the backing storage without -holding the enclave mutex and runs the risk of concurrently -accessing the backing storage with the page fault handler that -does access the backing storage with the enclave mutex held. - -In the scenario below a PCMD page is truncated from the backing -store after all its pages have been loaded in to the enclave -at the same time the PCMD page is loaded from the backing store -when one of its pages are reclaimed: - -sgx_reclaim_pages() { sgx_vma_fault() { - ... - mutex_lock(&encl->lock); - ... - __sgx_encl_eldu() { - ... - if (pcmd_page_empty) { -/* - * EPC page being reclaimed /* - * shares a PCMD page with an * PCMD page truncated - * enclave page that is being * while requested from - * faulted in. * reclaimer. - */ */ -sgx_encl_get_backing() <----------> sgx_encl_truncate_backing_page() - } - mutex_unlock(&encl->lock); -} } - -In this scenario there is a race between the reclaimer and the page fault -handler when the reclaimer attempts to get access to the same PCMD page -that is being truncated. This could result in the reclaimer writing to -the PCMD page that is then truncated, causing the PCMD data to be lost, -or in a new PCMD page being allocated. The lost PCMD data may still occur -after protecting the backing store access with the mutex - this is fixed -in the next patch. By ensuring the backing store is accessed with the mutex -held the enclave page state can be made accurate with the -SGX_ENCL_PAGE_BEING_RECLAIMED flag accurately reflecting that a page -is in the process of being reclaimed. - -Consistently protect the reclaimer's backing store access with the -enclave's mutex to ensure that it can safely run concurrently with the -page fault handler. 
- -Cc: stable@vger.kernel.org -Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") -Reported-by: Haitao Huang <haitao.huang@intel.com> -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/main.c | 9 ++++++--- - 1 file changed, 6 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index e71df40a4f38..ab4ec54bbdd9 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -310,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, - sgx_encl_ewb(epc_page, backing); - encl_page->epc_page = NULL; - encl->secs_child_cnt--; -+ sgx_encl_put_backing(backing); - - if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { - ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), -@@ -381,11 +382,14 @@ static void sgx_reclaim_pages(void) - goto skip; - - page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); -+ -+ mutex_lock(&encl_page->encl->lock); - ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); -- if (ret) -+ if (ret) { -+ mutex_unlock(&encl_page->encl->lock); - goto skip; -+ } - -- mutex_lock(&encl_page->encl->lock); - encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; - mutex_unlock(&encl_page->encl->lock); - continue; -@@ -413,7 +417,6 @@ static void sgx_reclaim_pages(void) - - encl_page = epc_page->owner; - sgx_reclaimer_write(epc_page, &backing[i]); -- sgx_encl_put_backing(&backing[i]); - - kref_put(&encl_page->encl->refcount, sgx_encl_release); - epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; --- -2.36.1 - diff --git a/0004-x86-sgx-Fix-race-between-reclaimer-and-page-fault-ha.patch b/0004-x86-sgx-Fix-race-between-reclaimer-and-page-fault-ha.patch deleted file mode 100644 index 85cc8359b731..000000000000 --- 
a/0004-x86-sgx-Fix-race-between-reclaimer-and-page-fault-ha.patch +++ /dev/null @@ -1,255 +0,0 @@ -From c5894032c2cbbc1836785f2d0150bc34305760ae Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Thu, 12 May 2022 14:51:00 -0700 -Subject: [PATCH 04/36] x86/sgx: Fix race between reclaimer and page fault - handler - -Haitao reported encountering a WARN triggered by the ENCLS[ELDU] -instruction faulting with a #GP. - -The WARN is encountered when the reclaimer evicts a range of -pages from the enclave when the same pages are faulted back right away. - -Consider two enclave pages (ENCLAVE_A and ENCLAVE_B) -sharing a PCMD page (PCMD_AB). ENCLAVE_A is in the -enclave memory and ENCLAVE_B is in the backing store. PCMD_AB contains -just one entry, that of ENCLAVE_B. - -Scenario proceeds where ENCLAVE_A is being evicted from the enclave -while ENCLAVE_B is faulted in. - -sgx_reclaim_pages() { - - ... - - /* - * Reclaim ENCLAVE_A - */ - mutex_lock(&encl->lock); - /* - * Get a reference to ENCLAVE_A's - * shmem page where enclave page - * encrypted data will be stored - * as well as a reference to the - * enclave page's PCMD data page, - * PCMD_AB. - * Release mutex before writing - * any data to the shmem pages. - */ - sgx_encl_get_backing(...); - encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; - mutex_unlock(&encl->lock); - - /* - * Fault ENCLAVE_B - */ - - sgx_vma_fault() { - - mutex_lock(&encl->lock); - /* - * Get reference to - * ENCLAVE_B's shmem page - * as well as PCMD_AB. - */ - sgx_encl_get_backing(...) - /* - * Load page back into - * enclave via ELDU. - */ - /* - * Release reference to - * ENCLAVE_B' shmem page and - * PCMD_AB. - */ - sgx_encl_put_backing(...); - /* - * PCMD_AB is found empty so - * it and ENCLAVE_B's shmem page - * are truncated. - */ - /* Truncate ENCLAVE_B backing page */ - sgx_encl_truncate_backing_page(); - /* Truncate PCMD_AB */ - sgx_encl_truncate_backing_page(); - - mutex_unlock(&encl->lock); - - ... 
- } - mutex_lock(&encl->lock); - encl_page->desc &= - ~SGX_ENCL_PAGE_BEING_RECLAIMED; - /* - * Write encrypted contents of - * ENCLAVE_A to ENCLAVE_A shmem - * page and its PCMD data to - * PCMD_AB. - */ - sgx_encl_put_backing(...) - - /* - * Reference to PCMD_AB is - * dropped and it is truncated. - * ENCLAVE_A's PCMD data is lost. - */ - mutex_unlock(&encl->lock); -} - -What happens next depends on whether it is ENCLAVE_A being faulted -in or ENCLAVE_B being evicted - but both end up with ENCLS[ELDU] faulting -with a #GP. - -If ENCLAVE_A is faulted then at the time sgx_encl_get_backing() is called -a new PCMD page is allocated and providing the empty PCMD data for -ENCLAVE_A would cause ENCLS[ELDU] to #GP - -If ENCLAVE_B is evicted first then a new PCMD_AB would be allocated by the -reclaimer but later when ENCLAVE_A is faulted the ENCLS[ELDU] instruction -would #GP during its checks of the PCMD value and the WARN would be -encountered. - -Noting that the reclaimer sets SGX_ENCL_PAGE_BEING_RECLAIMED at the time -it obtains a reference to the backing store pages of an enclave page it -is in the process of reclaiming, fix the race by only truncating the PCMD -page after ensuring that no page sharing the PCMD page is in the process -of being reclaimed. 
- -Cc: stable@vger.kernel.org -Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") -Reported-by: Haitao Huang <haitao.huang@intel.com> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 94 +++++++++++++++++++++++++++++++++- - 1 file changed, 93 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 5104a428b72c..243f3bd78145 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -12,6 +12,92 @@ - #include "encls.h" - #include "sgx.h" - -+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) -+/* -+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to -+ * determine the page index associated with the first PCMD entry -+ * within a PCMD page. -+ */ -+#define PCMD_FIRST_MASK GENMASK(4, 0) -+ -+/** -+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with -+ * a PCMD page is in process of being reclaimed. -+ * @encl: Enclave to which PCMD page belongs -+ * @start_addr: Address of enclave page using first entry within the PCMD page -+ * -+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is -+ * stored. The PCMD data of a reclaimed enclave page contains enough -+ * information for the processor to verify the page at the time -+ * it is loaded back into the Enclave Page Cache (EPC). -+ * -+ * The backing storage to which enclave pages are reclaimed is laid out as -+ * follows: -+ * Encrypted enclave pages:SECS page:PCMD pages -+ * -+ * Each PCMD page contains the PCMD metadata of -+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. -+ * -+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the -+ * process of getting data (and thus soon being non-empty). (b) is tested with -+ * a check if an enclave page sharing the PCMD page is in the process of being -+ * reclaimed. 
-+ * -+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it -+ * intends to reclaim that enclave page - it means that the PCMD page -+ * associated with that enclave page is about to get some data and thus -+ * even if the PCMD page is empty, it should not be truncated. -+ * -+ * Context: Enclave mutex (&sgx_encl->lock) must be held. -+ * Return: 1 if the reclaimer is about to write to the PCMD page -+ * 0 if the reclaimer has no intention to write to the PCMD page -+ */ -+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, -+ unsigned long start_addr) -+{ -+ int reclaimed = 0; -+ int i; -+ -+ /* -+ * PCMD_FIRST_MASK is based on number of PCMD entries within -+ * PCMD page being 32. -+ */ -+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32); -+ -+ for (i = 0; i < PCMDS_PER_PAGE; i++) { -+ struct sgx_encl_page *entry; -+ unsigned long addr; -+ -+ addr = start_addr + i * PAGE_SIZE; -+ -+ /* -+ * Stop when reaching the SECS page - it does not -+ * have a page_array entry and its reclaim is -+ * started and completed with enclave mutex held so -+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED -+ * flag. -+ */ -+ if (addr == encl->base + encl->size) -+ break; -+ -+ entry = xa_load(&encl->page_array, PFN_DOWN(addr)); -+ if (!entry) -+ continue; -+ -+ /* -+ * VA page slot ID uses same bit as the flag so it is important -+ * to ensure that the page is not already in backing store. -+ */ -+ if (entry->epc_page && -+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { -+ reclaimed = 1; -+ break; -+ } -+ } -+ -+ return reclaimed; -+} -+ - /* - * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's - * follow right after the EPC data in the backing storage. 
In addition to the -@@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; - struct sgx_encl *encl = encl_page->encl; - pgoff_t page_index, page_pcmd_off; -+ unsigned long pcmd_first_page; - struct sgx_pageinfo pginfo; - struct sgx_backing b; - bool pcmd_page_empty; -@@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - else - page_index = PFN_DOWN(encl->size); - -+ /* -+ * Address of enclave page using the first entry within the PCMD page. -+ */ -+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; -+ - page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); - - ret = sgx_encl_get_backing(encl, page_index, &b); -@@ -99,7 +191,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - - sgx_encl_truncate_backing_page(encl, page_index); - -- if (pcmd_page_empty) -+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) - sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); - - return ret; --- -2.36.1 - diff --git a/0005-x86-sgx-Ensure-no-data-in-PCMD-page-after-truncate.patch b/0005-x86-sgx-Ensure-no-data-in-PCMD-page-after-truncate.patch deleted file mode 100644 index 64d68a2b75ba..000000000000 --- a/0005-x86-sgx-Ensure-no-data-in-PCMD-page-after-truncate.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 5e9c8f738e0c7feee2121685e15e653cdbb998e9 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Thu, 12 May 2022 14:51:01 -0700 -Subject: [PATCH 05/36] x86/sgx: Ensure no data in PCMD page after truncate - -A PCMD (Paging Crypto MetaData) page contains the PCMD -structures of enclave pages that have been encrypted and -moved to the shmem backing store. When all enclave pages -sharing a PCMD page are loaded in the enclave, there is no -need for the PCMD page and it can be truncated from the -backing store. 
- -A few issues appeared around the truncation of PCMD pages. The -known issues have been addressed but the PCMD handling code could -be made more robust by loudly complaining if any new issue appears -in this area. - -Add a check that will complain with a warning if the PCMD page is not -actually empty after it has been truncated. There should never be data -in the PCMD page at this point since it is was just checked to be empty -and truncated with enclave mutex held and is updated with the -enclave mutex held. - -Suggested-by: Dave Hansen <dave.hansen@linux.intel.com> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 10 +++++++++- - 1 file changed, 9 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 243f3bd78145..3c24e6124d95 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -187,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, - kunmap_atomic(pcmd_page); - kunmap_atomic((void *)(unsigned long)pginfo.contents); - -+ get_page(b.pcmd); - sgx_encl_put_backing(&b); - - sgx_encl_truncate_backing_page(encl, page_index); - -- if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) -+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { - sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); -+ pcmd_page = kmap_atomic(b.pcmd); -+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) -+ pr_warn("PCMD page not empty after truncate.\n"); -+ kunmap_atomic(pcmd_page); -+ } -+ -+ put_page(b.pcmd); - - return ret; - } --- -2.36.1 - diff --git a/0006-x86-sgx-Add-short-descriptions-to-ENCLS-wrappers.patch b/0006-x86-sgx-Add-short-descriptions-to-ENCLS-wrappers.patch deleted file mode 100644 index 6c46a0df4fc4..000000000000 --- a/0006-x86-sgx-Add-short-descriptions-to-ENCLS-wrappers.patch +++ /dev/null @@ -1,109 +0,0 @@ 
-From 911e551ff47ec3e18b776b9d86b4c2e64cd675be Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:37 -0700 -Subject: [PATCH 06/36] x86/sgx: Add short descriptions to ENCLS wrappers - -The SGX ENCLS instruction uses EAX to specify an SGX function and -may require additional registers, depending on the SGX function. -ENCLS invokes the specified privileged SGX function for managing -and debugging enclaves. Macros are used to wrap the ENCLS -functionality and several wrappers are used to wrap the macros to -make the different SGX functions accessible in the code. - -The wrappers of the supported SGX functions are cryptic. Add short -descriptions of each as a comment. - -Suggested-by: Dave Hansen <dave.hansen@linux.intel.com> -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encls.h | 15 +++++++++++++++ - 1 file changed, 15 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h -index fa04a73daf9c..0e22fa8f77c5 100644 ---- a/arch/x86/kernel/cpu/sgx/encls.h -+++ b/arch/x86/kernel/cpu/sgx/encls.h -@@ -136,57 +136,71 @@ static inline bool encls_failed(int ret) - ret; \ - }) - -+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ - static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) - { - return __encls_2(ECREATE, pginfo, secs); - } - -+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ - static inline int __eextend(void *secs, void *addr) - { - return __encls_2(EEXTEND, secs, addr); - } - -+/* -+ * Associate an EPC page to an enclave either as a REG or TCS page -+ * populated with the provided data. -+ */ - static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) - { - return __encls_2(EADD, pginfo, addr); - } - -+/* Finalize enclave build, initialize enclave for user code execution. 
*/ - static inline int __einit(void *sigstruct, void *token, void *secs) - { - return __encls_ret_3(EINIT, sigstruct, secs, token); - } - -+/* Disassociate EPC page from its enclave and mark it as unused. */ - static inline int __eremove(void *addr) - { - return __encls_ret_1(EREMOVE, addr); - } - -+/* Copy data to an EPC page belonging to a debug enclave. */ - static inline int __edbgwr(void *addr, unsigned long *data) - { - return __encls_2(EDGBWR, *data, addr); - } - -+/* Copy data from an EPC page belonging to a debug enclave. */ - static inline int __edbgrd(void *addr, unsigned long *data) - { - return __encls_1_1(EDGBRD, *data, addr); - } - -+/* Track that software has completed the required TLB address clears. */ - static inline int __etrack(void *addr) - { - return __encls_ret_1(ETRACK, addr); - } - -+/* Load, verify, and unblock an EPC page. */ - static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, - void *va) - { - return __encls_ret_3(ELDU, pginfo, addr, va); - } - -+/* Make EPC page inaccessible to enclave, ready to be written to memory. */ - static inline int __eblock(void *addr) - { - return __encls_ret_1(EBLOCK, addr); - } - -+/* Initialize an EPC page into a Version Array (VA) page. */ - static inline int __epa(void *addr) - { - unsigned long rbx = SGX_PAGE_TYPE_VA; -@@ -194,6 +208,7 @@ static inline int __epa(void *addr) - return __encls_2(EPA, rbx, addr); - } - -+/* Invalidate an EPC page and write it out to main memory. 
*/ - static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, - void *va) - { --- -2.36.1 - diff --git a/0007-x86-sgx-Add-wrapper-for-SGX2-EMODPR-function.patch b/0007-x86-sgx-Add-wrapper-for-SGX2-EMODPR-function.patch deleted file mode 100644 index 3fe683efeab0..000000000000 --- a/0007-x86-sgx-Add-wrapper-for-SGX2-EMODPR-function.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 5c8cc5465580ccb7614303a2ba49f9ec253a6ff1 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:38 -0700 -Subject: [PATCH 07/36] x86/sgx: Add wrapper for SGX2 EMODPR function - -Add a wrapper for the EMODPR ENCLS leaf function used to -restrict enclave page permissions as maintained in the -SGX hardware's Enclave Page Cache Map (EPCM). - -EMODPR: -1) Updates the EPCM permissions of an enclave page by treating - the new permissions as a mask. Supplying a value that attempts - to relax EPCM permissions has no effect on EPCM permissions - (PR bit, see below, is changed). -2) Sets the PR bit in the EPCM entry of the enclave page to - indicate that permission restriction is in progress. The bit - is reset by the enclave by invoking ENCLU leaf function - EACCEPT or EACCEPTCOPY. - -The enclave may access the page throughout the entire process -if conforming to the EPCM permissions for the enclave page. - -After performing the permission restriction by issuing EMODPR -the kernel needs to collaborate with the hardware to ensure that -all logical processors sees the new restricted permissions. This -is required for the enclave's EACCEPT/EACCEPTCOPY to succeed and -is accomplished with the ETRACK flow. - -Expand enum sgx_return_code with the possible EMODPR return -values. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/include/asm/sgx.h | 5 +++++ - arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ - 2 files changed, 11 insertions(+) - -diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h -index 3f9334ef67cd..d67810b50a81 100644 ---- a/arch/x86/include/asm/sgx.h -+++ b/arch/x86/include/asm/sgx.h -@@ -65,17 +65,22 @@ enum sgx_encls_function { - - /** - * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV -+ * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not - * been completed yet. - * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's - * public key does not match IA32_SGXLEPUBKEYHASH. -+ * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it -+ * is in the PENDING or MODIFIED state. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received - */ - enum sgx_return_code { -+ SGX_EPC_PAGE_CONFLICT = 7, - SGX_NOT_TRACKED = 11, - SGX_CHILD_PRESENT = 13, - SGX_INVALID_EINITTOKEN = 16, -+ SGX_PAGE_NOT_MODIFIABLE = 20, - SGX_UNMASKED_EVENT = 128, - }; - -diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h -index 0e22fa8f77c5..2b091912f038 100644 ---- a/arch/x86/kernel/cpu/sgx/encls.h -+++ b/arch/x86/kernel/cpu/sgx/encls.h -@@ -215,4 +215,10 @@ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, - return __encls_ret_3(EWB, pginfo, addr, va); - } - -+/* Restrict the EPCM permissions of an EPC page. 
*/ -+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) -+{ -+ return __encls_ret_2(EMODPR, secinfo, addr); -+} -+ - #endif /* _X86_ENCLS_H */ --- -2.36.1 - diff --git a/0008-x86-sgx-Add-wrapper-for-SGX2-EMODT-function.patch b/0008-x86-sgx-Add-wrapper-for-SGX2-EMODT-function.patch deleted file mode 100644 index b9e5783f860f..000000000000 --- a/0008-x86-sgx-Add-wrapper-for-SGX2-EMODT-function.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 86327191b4402a01042948f410f810837105b97d Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:39 -0700 -Subject: [PATCH 08/36] x86/sgx: Add wrapper for SGX2 EMODT function - -Add a wrapper for the EMODT ENCLS leaf function used to -change the type of an enclave page as maintained in the -SGX hardware's Enclave Page Cache Map (EPCM). - -EMODT: -1) Updates the EPCM page type of the enclave page. -2) Sets the MODIFIED bit in the EPCM entry of the enclave page. - This bit is reset by the enclave by invoking ENCLU leaf - function EACCEPT or EACCEPTCOPY. - -Access from within the enclave to the enclave page is not possible -while the MODIFIED bit is set. - -After changing the enclave page type by issuing EMODT the kernel -needs to collaborate with the hardware to ensure that no logical -processor continues to hold a reference to the changed page. This -is required to ensure no required security checks are circumvented -and is required for the enclave's EACCEPT/EACCEPTCOPY to succeed. -Ensuring that no references to the changed page remain is -accomplished with the ETRACK flow. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h -index 2b091912f038..7a1ecf704ec1 100644 ---- a/arch/x86/kernel/cpu/sgx/encls.h -+++ b/arch/x86/kernel/cpu/sgx/encls.h -@@ -221,4 +221,10 @@ static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) - return __encls_ret_2(EMODPR, secinfo, addr); - } - -+/* Change the type of an EPC page. */ -+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) -+{ -+ return __encls_ret_2(EMODT, secinfo, addr); -+} -+ - #endif /* _X86_ENCLS_H */ --- -2.36.1 - diff --git a/0009-x86-sgx-Add-wrapper-for-SGX2-EAUG-function.patch b/0009-x86-sgx-Add-wrapper-for-SGX2-EAUG-function.patch deleted file mode 100644 index 6c025947f265..000000000000 --- a/0009-x86-sgx-Add-wrapper-for-SGX2-EAUG-function.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 904af2dd1bb04758ad2327793ee10f24a944f7b6 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:40 -0700 -Subject: [PATCH 09/36] x86/sgx: Add wrapper for SGX2 EAUG function - -Add a wrapper for the EAUG ENCLS leaf function used to -add a page to an initialized enclave. - -EAUG: -1) Stores all properties of the new enclave page in the SGX - hardware's Enclave Page Cache Map (EPCM). -2) Sets the PENDING bit in the EPCM entry of the enclave page. - This bit is cleared by the enclave by invoking ENCLU leaf - function EACCEPT or EACCEPTCOPY. - -Access from within the enclave to the new enclave page is not -possible until the PENDING bit is cleared. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h -index 7a1ecf704ec1..99004b02e2ed 100644 ---- a/arch/x86/kernel/cpu/sgx/encls.h -+++ b/arch/x86/kernel/cpu/sgx/encls.h -@@ -227,4 +227,10 @@ static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) - return __encls_ret_2(EMODT, secinfo, addr); - } - -+/* Zero a page of EPC memory and add it to an initialized enclave. */ -+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) -+{ -+ return __encls_2(EAUG, pginfo, addr); -+} -+ - #endif /* _X86_ENCLS_H */ --- -2.36.1 - diff --git a/0010-x86-sgx-Support-loading-enclave-page-without-VMA-per.patch b/0010-x86-sgx-Support-loading-enclave-page-without-VMA-per.patch deleted file mode 100644 index 3b096adb8f8b..000000000000 --- a/0010-x86-sgx-Support-loading-enclave-page-without-VMA-per.patch +++ /dev/null @@ -1,137 +0,0 @@ -From d88ad0efa7127c861cd5aaf241349b5f9a0fbd53 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:41 -0700 -Subject: [PATCH 10/36] x86/sgx: Support loading enclave page without VMA - permissions check - -sgx_encl_load_page() is used to find and load an enclave page into -enclave (EPC) memory, potentially loading it from the backing storage. -Both usages of sgx_encl_load_page() are during an access to the -enclave page from a VMA and thus the permissions of the VMA are -considered before the enclave page is loaded. - -SGX2 functions operating on enclave pages belonging to an initialized -enclave require the page to be in EPC. It is thus required to -support loading enclave pages into the EPC independent from a VMA. 
- -Split the current sgx_encl_load_page() to support the two usages: -A new call, sgx_encl_load_page_in_vma(), behaves exactly like the -current sgx_encl_load_page() that takes VMA permissions into account, -while sgx_encl_load_page() just loads an enclave page into EPC. - -VMA, PTE, and EPCM permissions continue to dictate whether -the pages can be accessed from within an enclave. - -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++------------ - arch/x86/kernel/cpu/sgx/encl.h | 2 ++ - 2 files changed, 40 insertions(+), 19 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 3c24e6124d95..7ad8b475306a 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, - return epc_page; - } - --static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, -- unsigned long addr, -- unsigned long vm_flags) -+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, -+ struct sgx_encl_page *entry) - { -- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); - struct sgx_epc_page *epc_page; -- struct sgx_encl_page *entry; -- -- entry = xa_load(&encl->page_array, PFN_DOWN(addr)); -- if (!entry) -- return ERR_PTR(-EFAULT); -- -- /* -- * Verify that the faulted page has equal or higher build time -- * permissions than the VMA permissions (i.e. the subset of {VM_READ, -- * VM_WRITE, VM_EXECUTE} in vma->vm_flags). -- */ -- if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) -- return ERR_PTR(-EFAULT); - - /* Entry successfully located. 
*/ - if (entry->epc_page) { -@@ -276,6 +261,40 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - return entry; - } - -+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, -+ unsigned long addr, -+ unsigned long vm_flags) -+{ -+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); -+ struct sgx_encl_page *entry; -+ -+ entry = xa_load(&encl->page_array, PFN_DOWN(addr)); -+ if (!entry) -+ return ERR_PTR(-EFAULT); -+ -+ /* -+ * Verify that the page has equal or higher build time -+ * permissions than the VMA permissions (i.e. the subset of {VM_READ, -+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags). -+ */ -+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) -+ return ERR_PTR(-EFAULT); -+ -+ return __sgx_encl_load_page(encl, entry); -+} -+ -+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, -+ unsigned long addr) -+{ -+ struct sgx_encl_page *entry; -+ -+ entry = xa_load(&encl->page_array, PFN_DOWN(addr)); -+ if (!entry) -+ return ERR_PTR(-EFAULT); -+ -+ return __sgx_encl_load_page(encl, entry); -+} -+ - static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) - { - unsigned long addr = (unsigned long)vmf->address; -@@ -297,7 +316,7 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) - - mutex_lock(&encl->lock); - -- entry = sgx_encl_load_page(encl, addr, vma->vm_flags); -+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); - if (IS_ERR(entry)) { - mutex_unlock(&encl->lock); - -@@ -445,7 +464,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, - for ( ; ; ) { - mutex_lock(&encl->lock); - -- entry = sgx_encl_load_page(encl, addr, vm_flags); -+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); - if (PTR_ERR(entry) != -EBUSY) - break; - -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index d44e7372151f..522a17e4fd2d 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ 
-116,5 +116,7 @@ unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); - void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); - bool sgx_va_page_full(struct sgx_va_page *va_page); - void sgx_encl_free_epc_page(struct sgx_epc_page *page); -+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, -+ unsigned long addr); - - #endif /* _X86_ENCL_H */ --- -2.36.1 - diff --git a/0011-x86-sgx-Export-sgx_encl_ewb_cpumask.patch b/0011-x86-sgx-Export-sgx_encl_ewb_cpumask.patch deleted file mode 100644 index 2d636c371e62..000000000000 --- a/0011-x86-sgx-Export-sgx_encl_ewb_cpumask.patch +++ /dev/null @@ -1,165 +0,0 @@ -From 3d04d12e9ee7efadb490cba7ba12e8d4b833b9af Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:42 -0700 -Subject: [PATCH 11/36] x86/sgx: Export sgx_encl_ewb_cpumask() - -Using sgx_encl_ewb_cpumask() to learn which CPUs might have executed -an enclave is useful to ensure that TLBs are cleared when changes are -made to enclave pages. - -sgx_encl_ewb_cpumask() is used within the reclaimer when an enclave -page is evicted. The upcoming SGX2 support enables changes to be -made to enclave pages and will require TLBs to not refer to the -changed pages and thus will be needing sgx_encl_ewb_cpumask(). - -Relocate sgx_encl_ewb_cpumask() to be with the rest of the enclave -code in encl.c now that it is no longer unique to the reclaimer. - -Take care to ensure that any future usage maintains the -current context requirement that ETRACK has been called first. -Expand the existing comments to highlight this while moving them -to a more prominent location before the function. - -No functional change. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 67 ++++++++++++++++++++++++++++++++++ - arch/x86/kernel/cpu/sgx/encl.h | 1 + - arch/x86/kernel/cpu/sgx/main.c | 29 --------------- - 3 files changed, 68 insertions(+), 29 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 7ad8b475306a..6953d331f8d5 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -714,6 +714,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) - return 0; - } - -+/** -+ * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave -+ * @encl: the enclave -+ * -+ * Some SGX functions require that no cached linear-to-physical address -+ * mappings are present before they can succeed. For example, ENCLS[EWB] -+ * copies a page from the enclave page cache to regular main memory but -+ * it fails if it cannot ensure that there are no cached -+ * linear-to-physical address mappings referring to the page. -+ * -+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU -+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave -+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical -+ * address mappings are cleared but coordination with the tracking done within -+ * the SGX hardware is needed to support the SGX functions that depend on this -+ * cache clearing. -+ * -+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware -+ * tracks threads operating inside the enclave at that time. 
The SGX -+ * hardware tracking require that all the identified threads must have -+ * exited the enclave in order to flush the mappings before a function such -+ * as ENCLS[EWB] will be permitted -+ * -+ * The following flow is used to support SGX functions that require that -+ * no cached linear-to-physical address mappings are present: -+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. -+ * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be -+ * accessing the enclave. -+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and -+ * thus flushing all locally cached linear-to-physical address mappings. -+ * 4) Execute SGX function. -+ * -+ * Context: It is required to call this function after ENCLS[ETRACK]. -+ * This will ensure that if any new mm appears (racing with -+ * sgx_encl_mm_add()) then the new mm will enter into the -+ * enclave with fresh linear-to-physical address mappings. -+ * -+ * It is required that all IPIs are completed before a new -+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 -+ * of the above flow with the enclave's mutex. 
-+ * -+ * Return: cpumask of CPUs that might be accessing @encl -+ */ -+const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -+{ -+ cpumask_t *cpumask = &encl->cpumask; -+ struct sgx_encl_mm *encl_mm; -+ int idx; -+ -+ cpumask_clear(cpumask); -+ -+ idx = srcu_read_lock(&encl->srcu); -+ -+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { -+ if (!mmget_not_zero(encl_mm->mm)) -+ continue; -+ -+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); -+ -+ mmput_async(encl_mm->mm); -+ } -+ -+ srcu_read_unlock(&encl->srcu, idx); -+ -+ return cpumask; -+} -+ - static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, - pgoff_t index) - { -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index 522a17e4fd2d..c6afa58ea3e6 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -105,6 +105,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, - - void sgx_encl_release(struct kref *ref); - int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -+const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); - int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); - void sgx_encl_put_backing(struct sgx_backing *backing); -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index ab4ec54bbdd9..2a926278dd29 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -205,35 +205,6 @@ static void sgx_ipi_cb(void *info) - { - } - --static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) --{ -- cpumask_t *cpumask = &encl->cpumask; -- struct sgx_encl_mm *encl_mm; -- int idx; -- -- /* -- * Can race with sgx_encl_mm_add(), but ETRACK has already been -- * executed, which means that the CPUs running in the new mm will enter -- * into the enclave with a fresh epoch. 
-- */ -- cpumask_clear(cpumask); -- -- idx = srcu_read_lock(&encl->srcu); -- -- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { -- if (!mmget_not_zero(encl_mm->mm)) -- continue; -- -- cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); -- -- mmput_async(encl_mm->mm); -- } -- -- srcu_read_unlock(&encl->srcu, idx); -- -- return cpumask; --} -- - /* - * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no longer be referenced (no new TLB entries). --- -2.36.1 - diff --git a/0012-x86-sgx-Rename-sgx_encl_ewb_cpumask-as-sgx_encl_cpum.patch b/0012-x86-sgx-Rename-sgx_encl_ewb_cpumask-as-sgx_encl_cpum.patch deleted file mode 100644 index 1e5e7875039f..000000000000 --- a/0012-x86-sgx-Rename-sgx_encl_ewb_cpumask-as-sgx_encl_cpum.patch +++ /dev/null @@ -1,88 +0,0 @@ -From c8182e673630ff65fad540a4773ac7d2888ac7e5 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:43 -0700 -Subject: [PATCH 12/36] x86/sgx: Rename sgx_encl_ewb_cpumask() as - sgx_encl_cpumask() - -sgx_encl_ewb_cpumask() is no longer unique to the reclaimer where it -is used during the EWB ENCLS leaf function when EPC pages are written -out to main memory and sgx_encl_ewb_cpumask() is used to learn which -CPUs might have executed the enclave to ensure that TLBs are cleared. - -Upcoming SGX2 enabling will use sgx_encl_ewb_cpumask() during the -EMODPR and EMODT ENCLS leaf functions that make changes to enclave -pages. The function is needed for the same reason it is used now: to -learn which CPUs might have executed the enclave to ensure that TLBs -no longer point to the changed pages. - -Rename sgx_encl_ewb_cpumask() to sgx_encl_cpumask() to reflect the -broader usage. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 6 +++--- - arch/x86/kernel/cpu/sgx/encl.h | 2 +- - arch/x86/kernel/cpu/sgx/main.c | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 6953d331f8d5..7539cef6e66b 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -715,7 +715,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) - } - - /** -- * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave -+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave - * @encl: the enclave - * - * Some SGX functions require that no cached linear-to-physical address -@@ -740,7 +740,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) - * The following flow is used to support SGX functions that require that - * no cached linear-to-physical address mappings are present: - * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. -- * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be -+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be - * accessing the enclave. - * 3) Send IPI to identified CPUs, kicking them out of the enclave and - * thus flushing all locally cached linear-to-physical address mappings. 
-@@ -757,7 +757,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) - * - * Return: cpumask of CPUs that might be accessing @encl - */ --const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) - { - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index c6afa58ea3e6..ef8cf106904b 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -105,7 +105,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, - - void sgx_encl_release(struct kref *ref); - int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); --const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); -+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); - int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); - void sgx_encl_put_backing(struct sgx_backing *backing); -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index 2a926278dd29..7b53a69d501f 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -251,7 +251,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, - * miss cpus that entered the enclave between - * generating the mask and incrementing epoch. 
- */ -- on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), -+ on_each_cpu_mask(sgx_encl_cpumask(encl), - sgx_ipi_cb, NULL, 1); - ret = __sgx_encl_ewb(epc_page, va_slot, backing); - } --- -2.36.1 - diff --git a/0013-x86-sgx-Move-PTE-zap-code-to-new-sgx_zap_enclave_pte.patch b/0013-x86-sgx-Move-PTE-zap-code-to-new-sgx_zap_enclave_pte.patch deleted file mode 100644 index d3884fb03a10..000000000000 --- a/0013-x86-sgx-Move-PTE-zap-code-to-new-sgx_zap_enclave_pte.patch +++ /dev/null @@ -1,155 +0,0 @@ -From 3068bc10f54b083eb4d7b85e1420fa1d57ff5b1d Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:44 -0700 -Subject: [PATCH 13/36] x86/sgx: Move PTE zap code to new - sgx_zap_enclave_ptes() - -The SGX reclaimer removes page table entries pointing to pages that are -moved to swap. - -SGX2 enables changes to pages belonging to an initialized enclave, thus -enclave pages may have their permission or type changed while the page -is being accessed by an enclave. Supporting SGX2 requires page table -entries to be removed so that any cached mappings to changed pages -are removed. For example, with the ability to change enclave page types -a regular enclave page may be changed to a Thread Control Structure -(TCS) page that may not be accessed by an enclave. - -Factor out the code removing page table entries to a separate function -sgx_zap_enclave_ptes(), fixing accuracy of comments in the process, -and make it available to the upcoming SGX2 code. - -Place sgx_zap_enclave_ptes() with the rest of the enclave code in -encl.c interacting with the page table since this code is no longer -unique to the reclaimer. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 45 +++++++++++++++++++++++++++++++++- - arch/x86/kernel/cpu/sgx/encl.h | 2 +- - arch/x86/kernel/cpu/sgx/main.c | 31 ++--------------------- - 3 files changed, 47 insertions(+), 31 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 7539cef6e66b..c6cac43b40d6 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -706,7 +706,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) - - spin_lock(&encl->mm_lock); - list_add_rcu(&encl_mm->list, &encl->mm_list); -- /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ -+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ - smp_wmb(); - encl->mm_list_version++; - spin_unlock(&encl->mm_lock); -@@ -887,6 +887,49 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, - return ret; - } - -+/** -+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave -+ * @encl: the enclave -+ * @addr: page aligned pointer to single page for which PTEs will be removed -+ * -+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping -+ * @addr from each VMA. Ensure that page fault handler is ready to handle -+ * new mappings of @addr before calling this function. -+ */ -+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) -+{ -+ unsigned long mm_list_version; -+ struct sgx_encl_mm *encl_mm; -+ struct vm_area_struct *vma; -+ int idx, ret; -+ -+ do { -+ mm_list_version = encl->mm_list_version; -+ -+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). 
*/ -+ smp_rmb(); -+ -+ idx = srcu_read_lock(&encl->srcu); -+ -+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { -+ if (!mmget_not_zero(encl_mm->mm)) -+ continue; -+ -+ mmap_read_lock(encl_mm->mm); -+ -+ ret = sgx_encl_find(encl_mm->mm, addr, &vma); -+ if (!ret && encl == vma->vm_private_data) -+ zap_vma_ptes(vma, addr, PAGE_SIZE); -+ -+ mmap_read_unlock(encl_mm->mm); -+ -+ mmput_async(encl_mm->mm); -+ } -+ -+ srcu_read_unlock(&encl->srcu, idx); -+ } while (unlikely(encl->mm_list_version != mm_list_version)); -+} -+ - /** - * sgx_alloc_va_page() - Allocate a Version Array (VA) page - * -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index ef8cf106904b..f72a674e2605 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -111,7 +111,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - void sgx_encl_put_backing(struct sgx_backing *backing); - int sgx_encl_test_and_clear_young(struct mm_struct *mm, - struct sgx_encl_page *page); -- -+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); - struct sgx_epc_page *sgx_alloc_va_page(void); - unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); - void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index 7b53a69d501f..9df2221af498 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) - struct sgx_encl_page *page = epc_page->owner; - unsigned long addr = page->desc & PAGE_MASK; - struct sgx_encl *encl = page->encl; -- unsigned long mm_list_version; -- struct sgx_encl_mm *encl_mm; -- struct vm_area_struct *vma; -- int idx, ret; -- -- do { -- mm_list_version = encl->mm_list_version; -- -- /* Pairs with smp_rmb() in sgx_encl_mm_add(). 
*/ -- smp_rmb(); -- -- idx = srcu_read_lock(&encl->srcu); -- -- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { -- if (!mmget_not_zero(encl_mm->mm)) -- continue; -- -- mmap_read_lock(encl_mm->mm); -- -- ret = sgx_encl_find(encl_mm->mm, addr, &vma); -- if (!ret && encl == vma->vm_private_data) -- zap_vma_ptes(vma, addr, PAGE_SIZE); -- -- mmap_read_unlock(encl_mm->mm); -- -- mmput_async(encl_mm->mm); -- } -+ int ret; - -- srcu_read_unlock(&encl->srcu, idx); -- } while (unlikely(encl->mm_list_version != mm_list_version)); -+ sgx_zap_enclave_ptes(encl, addr); - - mutex_lock(&encl->lock); - --- -2.36.1 - diff --git a/0014-x86-sgx-Make-sgx_ipi_cb-available-internally.patch b/0014-x86-sgx-Make-sgx_ipi_cb-available-internally.patch deleted file mode 100644 index 914073b8ea0a..000000000000 --- a/0014-x86-sgx-Make-sgx_ipi_cb-available-internally.patch +++ /dev/null @@ -1,47 +0,0 @@ -From f3aa084da6b464114361270c037e177de0dbfa5f Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:45 -0700 -Subject: [PATCH 14/36] x86/sgx: Make sgx_ipi_cb() available internally - -The ETRACK function followed by an IPI to all CPUs within an enclave -is a common pattern with more frequent use in support of SGX2. - -Make the (empty) IPI callback function available internally in -preparation for usage by SGX2. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/main.c | 2 +- - arch/x86/kernel/cpu/sgx/sgx.h | 2 ++ - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index 9df2221af498..180ad840b226 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -174,7 +174,7 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, - return ret; - } - --static void sgx_ipi_cb(void *info) -+void sgx_ipi_cb(void *info) - { - } - -diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h -index 0f17def9fe6f..b30cee4de903 100644 ---- a/arch/x86/kernel/cpu/sgx/sgx.h -+++ b/arch/x86/kernel/cpu/sgx/sgx.h -@@ -90,6 +90,8 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); - int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); - struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); - -+void sgx_ipi_cb(void *info); -+ - #ifdef CONFIG_X86_SGX_KVM - int __init sgx_vepc_init(void); - #else --- -2.36.1 - diff --git a/0015-x86-sgx-Create-utility-to-validate-user-provided-off.patch b/0015-x86-sgx-Create-utility-to-validate-user-provided-off.patch deleted file mode 100644 index 8610e8b5d785..000000000000 --- a/0015-x86-sgx-Create-utility-to-validate-user-provided-off.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 7e214d2e919619ab3dbc4547f0f03986009245f3 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:46 -0700 -Subject: [PATCH 15/36] x86/sgx: Create utility to validate user provided - offset and length - -User provided offset and length is validated when parsing the parameters -of the SGX_IOC_ENCLAVE_ADD_PAGES ioctl(). Extract this validation -(with consistent use of IS_ALIGNED) into a utility that can be used -by the SGX2 ioctl()s that will also provide these values. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/ioctl.c | 28 ++++++++++++++++++++++------ - 1 file changed, 22 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 83df20e3e633..a66795e0b685 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -372,6 +372,26 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, - return ret; - } - -+/* -+ * Ensure user provided offset and length values are valid for -+ * an enclave. -+ */ -+static int sgx_validate_offset_length(struct sgx_encl *encl, -+ unsigned long offset, -+ unsigned long length) -+{ -+ if (!IS_ALIGNED(offset, PAGE_SIZE)) -+ return -EINVAL; -+ -+ if (!length || !IS_ALIGNED(length, PAGE_SIZE)) -+ return -EINVAL; -+ -+ if (offset + length - PAGE_SIZE >= encl->size) -+ return -EINVAL; -+ -+ return 0; -+} -+ - /** - * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES - * @encl: an enclave pointer -@@ -425,14 +445,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) - if (copy_from_user(&add_arg, arg, sizeof(add_arg))) - return -EFAULT; - -- if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || -- !IS_ALIGNED(add_arg.src, PAGE_SIZE)) -- return -EINVAL; -- -- if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) -+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) - return -EINVAL; - -- if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) -+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) - return -EINVAL; - - if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, --- -2.36.1 - diff --git a/0016-x86-sgx-Keep-record-of-SGX-page-type.patch b/0016-x86-sgx-Keep-record-of-SGX-page-type.patch deleted file mode 100644 index 1b68f9062ccb..000000000000 --- a/0016-x86-sgx-Keep-record-of-SGX-page-type.patch +++ /dev/null @@ -1,88 +0,0 @@ 
-From f00cceb541c0ed451da982dd4bcf2df721c23902 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:47 -0700 -Subject: [PATCH 16/36] x86/sgx: Keep record of SGX page type - -SGX2 functions are not allowed on all page types. For example, -ENCLS[EMODPR] is only allowed on regular SGX enclave pages and -ENCLS[EMODT] is only allowed on TCS and regular pages. If these -functions are attempted on another type of page the hardware would -trigger a fault. - -Keep a record of the SGX page type so that there is more -certainty whether an SGX2 instruction can succeed and faults -can be treated as real failures. - -The page type is a property of struct sgx_encl_page -and thus does not cover the VA page type. VA pages are maintained -in separate structures and their type can be determined in -a different way. The SGX2 instructions needing the page type do not -operate on VA pages and this is thus not a scenario needing to -be covered at this time. - -struct sgx_encl_page hosting this information is maintained for each -enclave page so the space consumed by the struct is important. -The existing sgx_encl_page->vm_max_prot_bits is already unsigned long -while only using three bits. Transition to a bitfield for the two -members to support the additional information without increasing -the space consumed by the struct. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/include/asm/sgx.h | 3 +++ - arch/x86/kernel/cpu/sgx/encl.h | 3 ++- - arch/x86/kernel/cpu/sgx/ioctl.c | 2 ++ - 3 files changed, 7 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h -index d67810b50a81..eae20fa52b93 100644 ---- a/arch/x86/include/asm/sgx.h -+++ b/arch/x86/include/asm/sgx.h -@@ -239,6 +239,9 @@ struct sgx_pageinfo { - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state -+ * -+ * Make sure when making changes to this enum that its values can still fit -+ * in the bitfield within &struct sgx_encl_page - */ - enum sgx_page_type { - SGX_PAGE_TYPE_SECS, -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index f72a674e2605..799d4cdb12d5 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -27,7 +27,8 @@ - - struct sgx_encl_page { - unsigned long desc; -- unsigned long vm_max_prot_bits; -+ unsigned long vm_max_prot_bits:8; -+ enum sgx_page_type type:16; - struct sgx_epc_page *epc_page; - struct sgx_encl *encl; - struct sgx_va_page *va_page; -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index a66795e0b685..21078c6643f7 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) - set_bit(SGX_ENCL_DEBUG, &encl->flags); - - encl->secs.encl = encl; -+ encl->secs.type = SGX_PAGE_TYPE_SECS; - encl->base = secs->base; - encl->size = secs->size; - encl->attributes = secs->attributes; -@@ -344,6 +345,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, - */ - encl_page->encl = encl; - encl_page->epc_page = epc_page; -+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; - 
encl->secs_child_cnt++; - - if (flags & SGX_PAGE_MEASURE) { --- -2.36.1 - diff --git a/0017-x86-sgx-Export-sgx_encl_-grow-shrink.patch b/0017-x86-sgx-Export-sgx_encl_-grow-shrink.patch deleted file mode 100644 index a3088a91009e..000000000000 --- a/0017-x86-sgx-Export-sgx_encl_-grow-shrink.patch +++ /dev/null @@ -1,53 +0,0 @@ -From f2b77aae84955b467adb8e9b570c04d84cebb863 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:48 -0700 -Subject: [PATCH 17/36] x86/sgx: Export sgx_encl_{grow,shrink}() - -In order to use sgx_encl_{grow,shrink}() in the page augmentation code -located in encl.c, export these functions. - -Suggested-by: Jarkko Sakkinen <jarkko@kernel.org> -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.h | 2 ++ - arch/x86/kernel/cpu/sgx/ioctl.c | 4 ++-- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index 799d4cdb12d5..b6b53c0346ad 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -120,5 +120,7 @@ bool sgx_va_page_full(struct sgx_va_page *va_page); - void sgx_encl_free_epc_page(struct sgx_epc_page *page); - struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr); -+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); -+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); - - #endif /* _X86_ENCL_H */ -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 21078c6643f7..2df27dd8b30d 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -17,7 +17,7 @@ - #include "encl.h" - #include "encls.h" - --static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) -+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) - { - struct sgx_va_page *va_page = NULL; - void *err; 
-@@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) - return va_page; - } - --static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) -+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) - { - encl->page_cnt--; - --- -2.36.1 - diff --git a/0018-x86-sgx-Export-sgx_encl_page_alloc.patch b/0018-x86-sgx-Export-sgx_encl_page_alloc.patch deleted file mode 100644 index 0098ef6799b7..000000000000 --- a/0018-x86-sgx-Export-sgx_encl_page_alloc.patch +++ /dev/null @@ -1,120 +0,0 @@ -From dd73a72bdd8afde9390e4cb281a858ba1482413c Mon Sep 17 00:00:00 2001 -From: Jarkko Sakkinen <jarkko@kernel.org> -Date: Tue, 10 May 2022 11:08:49 -0700 -Subject: [PATCH 18/36] x86/sgx: Export sgx_encl_page_alloc() - -Move sgx_encl_page_alloc() to encl.c and export it so that it can be -used in the implementation for support of adding pages to initialized -enclaves, which requires to allocate new enclave pages. - -Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 32 ++++++++++++++++++++++++++++++++ - arch/x86/kernel/cpu/sgx/encl.h | 3 +++ - arch/x86/kernel/cpu/sgx/ioctl.c | 32 -------------------------------- - 3 files changed, 35 insertions(+), 32 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index c6cac43b40d6..5e6a64d8e3d6 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -887,6 +887,38 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, - return ret; - } - -+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, -+ unsigned long offset, -+ u64 secinfo_flags) -+{ -+ struct sgx_encl_page *encl_page; -+ unsigned long prot; -+ -+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); -+ if (!encl_page) -+ return ERR_PTR(-ENOMEM); -+ -+ encl_page->desc = encl->base + offset; -+ encl_page->encl = encl; -+ -+ prot = 
_calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | -+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | -+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); -+ -+ /* -+ * TCS pages must always RW set for CPU access while the SECINFO -+ * permissions are *always* zero - the CPU ignores the user provided -+ * values and silently overwrites them with zero permissions. -+ */ -+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) -+ prot |= PROT_READ | PROT_WRITE; -+ -+ /* Calculate maximum of the VM flags for the page. */ -+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); -+ -+ return encl_page; -+} -+ - /** - * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave - * @encl: the enclave -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index b6b53c0346ad..2cb58ab868e5 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -112,6 +112,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - void sgx_encl_put_backing(struct sgx_backing *backing); - int sgx_encl_test_and_clear_young(struct mm_struct *mm, - struct sgx_encl_page *page); -+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, -+ unsigned long offset, -+ u64 secinfo_flags); - void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); - struct sgx_epc_page *sgx_alloc_va_page(void); - unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 2df27dd8b30d..bb8cdb2ad0d1 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -169,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) - return ret; - } - --static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, -- unsigned long offset, -- u64 secinfo_flags) --{ -- struct sgx_encl_page *encl_page; -- unsigned long prot; -- 
-- encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); -- if (!encl_page) -- return ERR_PTR(-ENOMEM); -- -- encl_page->desc = encl->base + offset; -- encl_page->encl = encl; -- -- prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | -- _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | -- _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); -- -- /* -- * TCS pages must always RW set for CPU access while the SECINFO -- * permissions are *always* zero - the CPU ignores the user provided -- * values and silently overwrites them with zero permissions. -- */ -- if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) -- prot |= PROT_READ | PROT_WRITE; -- -- /* Calculate maximum of the VM flags for the page. */ -- encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); -- -- return encl_page; --} -- - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) - { - u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; --- -2.36.1 - diff --git a/0019-x86-sgx-Support-VA-page-allocation-without-reclaimin.patch b/0019-x86-sgx-Support-VA-page-allocation-without-reclaimin.patch deleted file mode 100644 index 7fe43ad0f976..000000000000 --- a/0019-x86-sgx-Support-VA-page-allocation-without-reclaimin.patch +++ /dev/null @@ -1,136 +0,0 @@ -From fd4616da8dc62cdf1976b27742a567efd5aac146 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:50 -0700 -Subject: [PATCH 19/36] x86/sgx: Support VA page allocation without reclaiming - -struct sgx_encl should be protected with the mutex -sgx_encl->lock. One exception is sgx_encl->page_cnt that -is incremented (in sgx_encl_grow()) when an enclave page -is added to the enclave. The reason the mutex is not held -is to allow the reclaimer to be called directly if there are -no EPC pages (in support of a new VA page) available at the time. 
- -Incrementing sgx_encl->page_cnt without sgx_encl->lock held -is currently (before SGX2) safe from concurrent updates because -all paths in which sgx_encl_grow() is called occur before -enclave initialization and are protected with an atomic -operation on SGX_ENCL_IOCTL. - -SGX2 includes support for dynamically adding pages after -enclave initialization where the protection of SGX_ENCL_IOCTL -is not available. - -Make direct reclaim of EPC pages optional when new VA pages -are added to the enclave. Essentially the existing "reclaim" -flag used when regular EPC pages are added to an enclave -becomes available to the caller when used to allocate VA pages -instead of always being "true". - -When adding pages without invoking the reclaimer it is possible -to do so with sgx_encl->lock held, gaining its protection against -concurrent updates to sgx_encl->page_cnt after enclave -initialization. - -No functional change. - -Reported-by: Haitao Huang <haitao.huang@intel.com> -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 6 ++++-- - arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- - arch/x86/kernel/cpu/sgx/ioctl.c | 8 ++++---- - 3 files changed, 10 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index 5e6a64d8e3d6..ea81e597dd18 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -964,6 +964,8 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) - - /** - * sgx_alloc_va_page() - Allocate a Version Array (VA) page -+ * @reclaim: Reclaim EPC pages directly if none available. Enclave -+ * mutex should not be held if this is set. - * - * Allocate a free EPC page and convert it to a Version Array (VA) page.
- * -@@ -971,12 +973,12 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) - * a VA page, - * -errno otherwise - */ --struct sgx_epc_page *sgx_alloc_va_page(void) -+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) - { - struct sgx_epc_page *epc_page; - int ret; - -- epc_page = sgx_alloc_epc_page(NULL, true); -+ epc_page = sgx_alloc_epc_page(NULL, reclaim); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - -diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h -index 2cb58ab868e5..3d0e0ba3edf5 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.h -+++ b/arch/x86/kernel/cpu/sgx/encl.h -@@ -116,14 +116,14 @@ struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags); - void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); --struct sgx_epc_page *sgx_alloc_va_page(void); -+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); - unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); - void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); - bool sgx_va_page_full(struct sgx_va_page *va_page); - void sgx_encl_free_epc_page(struct sgx_epc_page *page); - struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr); --struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); -+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); - void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); - - #endif /* _X86_ENCL_H */ -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index bb8cdb2ad0d1..5d41aa204761 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -17,7 +17,7 @@ - #include "encl.h" - #include "encls.h" - --struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) -+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) - { - struct sgx_va_page *va_page = NULL; - void *err; -@@ -30,7 +30,7 @@ struct 
sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) - if (!va_page) - return ERR_PTR(-ENOMEM); - -- va_page->epc_page = sgx_alloc_va_page(); -+ va_page->epc_page = sgx_alloc_va_page(reclaim); - if (IS_ERR(va_page->epc_page)) { - err = ERR_CAST(va_page->epc_page); - kfree(va_page); -@@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) - struct file *backing; - long ret; - -- va_page = sgx_encl_grow(encl); -+ va_page = sgx_encl_grow(encl, true); - if (IS_ERR(va_page)) - return PTR_ERR(va_page); - else if (va_page) -@@ -275,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, - return PTR_ERR(epc_page); - } - -- va_page = sgx_encl_grow(encl); -+ va_page = sgx_encl_grow(encl, true); - if (IS_ERR(va_page)) { - ret = PTR_ERR(va_page); - goto err_out_free; --- -2.36.1 - diff --git a/0020-x86-sgx-Support-restricting-of-enclave-page-permissi.patch b/0020-x86-sgx-Support-restricting-of-enclave-page-permissi.patch deleted file mode 100644 index 943aed066ea8..000000000000 --- a/0020-x86-sgx-Support-restricting-of-enclave-page-permissi.patch +++ /dev/null @@ -1,331 +0,0 @@ -From f77e5b90440a4709d58907d7a467bedbfe8ed4d8 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:51 -0700 -Subject: [PATCH 20/36] x86/sgx: Support restricting of enclave page - permissions - -In the initial (SGX1) version of SGX, pages in an enclave need to be -created with permissions that support all usages of the pages, from the -time the enclave is initialized until it is unloaded. For example, -pages used by a JIT compiler or when code needs to otherwise be -relocated need to always have RWX permissions. - -SGX2 includes a new function ENCLS[EMODPR] that is run from the kernel -and can be used to restrict the EPCM permissions of regular enclave -pages within an initialized enclave. - -Introduce ioctl() SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS to support -restricting EPCM permissions. 
With this ioctl() the user specifies -a page range and the EPCM permissions to be applied to all pages in -the provided range. ENCLS[EMODPR] is run to restrict the EPCM -permissions followed by the ENCLS[ETRACK] flow that will ensure -no cached linear-to-physical address mappings to the changed -pages remain. - -It is possible for the permission change request to fail on any -page within the provided range, either with an error encountered -by the kernel or by the SGX hardware while running -ENCLS[EMODPR]. To support partial success the ioctl() returns an -error code based on failures encountered by the kernel as well -as two result output parameters: one for the number of pages -that were successfully changed and one for the SGX return code. - -The page table entry permissions are not impacted by the EPCM -permission changes. VMAs and PTEs will continue to allow the -maximum vetted permissions determined at the time the pages -are added to the enclave. The SGX error code in a page fault -will indicate if it was an EPCM permission check that prevented -an access attempt. - -No checking is done to ensure that the permissions are actually -being restricted. This is because the enclave may have relaxed -the EPCM permissions from within the enclave without the kernel -knowing. An attempt to relax permissions using this call will -be ignored by the hardware. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/include/uapi/asm/sgx.h | 21 ++++ - arch/x86/kernel/cpu/sgx/ioctl.c | 216 ++++++++++++++++++++++++++++++++ - 2 files changed, 237 insertions(+) - -diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h -index f4b81587e90b..82648c006470 100644 ---- a/arch/x86/include/uapi/asm/sgx.h -+++ b/arch/x86/include/uapi/asm/sgx.h -@@ -29,6 +29,8 @@ enum sgx_page_flags { - _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) - #define SGX_IOC_VEPC_REMOVE_ALL \ - _IO(SGX_MAGIC, 0x04) -+#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ -+ _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) - - /** - * struct sgx_enclave_create - parameter structure for the -@@ -76,6 +78,25 @@ struct sgx_enclave_provision { - __u64 fd; - }; - -+/** -+ * struct sgx_enclave_restrict_permissions - parameters for ioctl -+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS -+ * @offset: starting page offset (page aligned relative to enclave base -+ * address defined in SECS) -+ * @length: length of memory (multiple of the page size) -+ * @permissions:new permission bits for pages in range described by @offset -+ * and @length -+ * @result: (output) SGX result code of ENCLS[EMODPR] function -+ * @count: (output) bytes successfully changed (multiple of page size) -+ */ -+struct sgx_enclave_restrict_permissions { -+ __u64 offset; -+ __u64 length; -+ __u64 permissions; -+ __u64 result; -+ __u64 count; -+}; -+ - struct sgx_enclave_run; - - /** -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 5d41aa204761..720188d86ed4 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -660,6 +660,218 @@ static long sgx_ioc_enclave_provision(struct sgx_encl 
*encl, void __user *arg) - return sgx_set_attribute(&encl->attributes_mask, params.fd); - } - -+/* -+ * Ensure enclave is ready for SGX2 functions. Readiness is checked -+ * by ensuring the hardware supports SGX2 and the enclave is initialized -+ * and thus able to handle requests to modify pages within it. -+ */ -+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) -+{ -+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) -+ return -ENODEV; -+ -+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* -+ * Some SGX functions require that no cached linear-to-physical address -+ * mappings are present before they can succeed. Collaborate with -+ * hardware via ENCLS[ETRACK] to ensure that all cached -+ * linear-to-physical address mappings belonging to all threads of -+ * the enclave are cleared. See sgx_encl_cpumask() for details. -+ * -+ * Must be called with enclave's mutex held from the time the -+ * SGX function requiring that no cached linear-to-physical mappings -+ * are present is executed until this ETRACK flow is complete. -+ */ -+static int sgx_enclave_etrack(struct sgx_encl *encl) -+{ -+ void *epc_virt; -+ int ret; -+ -+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); -+ ret = __etrack(epc_virt); -+ if (ret) { -+ /* -+ * ETRACK only fails when there is an OS issue. For -+ * example, two consecutive ETRACK was sent without -+ * completed IPI between. -+ */ -+ pr_err_once("ETRACK returned %d (0x%x)", ret, ret); -+ /* -+ * Send IPIs to kick CPUs out of the enclave and -+ * try ETRACK again. -+ */ -+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); -+ ret = __etrack(epc_virt); -+ if (ret) { -+ pr_err_once("ETRACK repeat returned %d (0x%x)", -+ ret, ret); -+ return -EFAULT; -+ } -+ } -+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); -+ -+ return 0; -+} -+ -+/** -+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions -+ * @encl: Enclave to which the pages belong. 
-+ * @modp: Checked parameters from user on which pages need modifying and -+ * their new permissions. -+ * -+ * Return: -+ * - 0: Success. -+ * - -errno: Otherwise. -+ */ -+static long -+sgx_enclave_restrict_permissions(struct sgx_encl *encl, -+ struct sgx_enclave_restrict_permissions *modp) -+{ -+ struct sgx_encl_page *entry; -+ struct sgx_secinfo secinfo; -+ unsigned long addr; -+ unsigned long c; -+ void *epc_virt; -+ int ret; -+ -+ memset(&secinfo, 0, sizeof(secinfo)); -+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; -+ -+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) { -+ addr = encl->base + modp->offset + c; -+ -+ mutex_lock(&encl->lock); -+ -+ entry = sgx_encl_load_page(encl, addr); -+ if (IS_ERR(entry)) { -+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; -+ goto out_unlock; -+ } -+ -+ /* -+ * Changing EPCM permissions is only supported on regular -+ * SGX pages. Attempting this change on other pages will -+ * result in #PF. -+ */ -+ if (entry->type != SGX_PAGE_TYPE_REG) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ -+ /* -+ * Apart from ensuring that read-access remains, do not verify -+ * the permission bits requested. Kernel has no control over -+ * how EPCM permissions can be relaxed from within the enclave. -+ * ENCLS[EMODPR] can only remove existing EPCM permissions, -+ * attempting to set new permissions will be ignored by the -+ * hardware. -+ */ -+ -+ /* Change EPCM permissions. */ -+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); -+ ret = __emodpr(&secinfo, epc_virt); -+ if (encls_faulted(ret)) { -+ /* -+ * All possible faults should be avoidable: -+ * parameters have been checked, will only change -+ * permissions of a regular page, and no concurrent -+ * SGX1/SGX2 ENCLS instructions since these -+ * are protected with mutex. 
-+ */ -+ pr_err_once("EMODPR encountered exception %d\n", -+ ENCLS_TRAPNR(ret)); -+ ret = -EFAULT; -+ goto out_unlock; -+ } -+ if (encls_failed(ret)) { -+ modp->result = ret; -+ ret = -EFAULT; -+ goto out_unlock; -+ } -+ -+ ret = sgx_enclave_etrack(encl); -+ if (ret) { -+ ret = -EFAULT; -+ goto out_unlock; -+ } -+ -+ mutex_unlock(&encl->lock); -+ } -+ -+ ret = 0; -+ goto out; -+ -+out_unlock: -+ mutex_unlock(&encl->lock); -+out: -+ modp->count = c; -+ -+ return ret; -+} -+ -+/** -+ * sgx_ioc_enclave_restrict_permissions() - handler for -+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS -+ * @encl: an enclave pointer -+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions -+ * instance -+ * -+ * SGX2 distinguishes between relaxing and restricting the enclave page -+ * permissions maintained by the hardware (EPCM permissions) of pages -+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). -+ * -+ * EPCM permissions cannot be restricted from within the enclave, the enclave -+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] -+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call -+ * will be ignored by the hardware. -+ * -+ * Return: -+ * - 0: Success -+ * - -errno: Otherwise -+ */ -+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, -+ void __user *arg) -+{ -+ struct sgx_enclave_restrict_permissions params; -+ long ret; -+ -+ ret = sgx_ioc_sgx2_ready(encl); -+ if (ret) -+ return ret; -+ -+ if (copy_from_user(&params, arg, sizeof(params))) -+ return -EFAULT; -+ -+ if (sgx_validate_offset_length(encl, params.offset, params.length)) -+ return -EINVAL; -+ -+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) -+ return -EINVAL; -+ -+ /* -+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] -+ * from faulting later when the CPU does the same check.
-+ */ -+ if ((params.permissions & SGX_SECINFO_W) && -+ !(params.permissions & SGX_SECINFO_R)) -+ return -EINVAL; -+ -+ if (params.result || params.count) -+ return -EINVAL; -+ -+ ret = sgx_enclave_restrict_permissions(encl, &params); -+ -+ if (copy_to_user(arg, &params, sizeof(params))) -+ return -EFAULT; -+ -+ return ret; -+} -+ - long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - { - struct sgx_encl *encl = filep->private_data; -@@ -681,6 +893,10 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - case SGX_IOC_ENCLAVE_PROVISION: - ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); - break; -+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: -+ ret = sgx_ioc_enclave_restrict_permissions(encl, -+ (void __user *)arg); -+ break; - default: - ret = -ENOIOCTLCMD; - break; --- -2.36.1 - diff --git a/0021-x86-sgx-Support-adding-of-pages-to-an-initialized-en.patch b/0021-x86-sgx-Support-adding-of-pages-to-an-initialized-en.patch deleted file mode 100644 index d8a57679dd30..000000000000 --- a/0021-x86-sgx-Support-adding-of-pages-to-an-initialized-en.patch +++ /dev/null @@ -1,189 +0,0 @@ -From a819692d9ebd58f1791a63fdaf5d412ac41a0147 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:52 -0700 -Subject: [PATCH 21/36] x86/sgx: Support adding of pages to an initialized - enclave - -With SGX1 an enclave needs to be created with its maximum memory demands -allocated. Pages cannot be added to an enclave after it is initialized. -SGX2 introduces a new function, ENCLS[EAUG], that can be used to add -pages to an initialized enclave. With SGX2 the enclave still needs to -set aside address space for its maximum memory demands during enclave -creation, but all pages need not be added before enclave initialization. -Pages can be added during enclave runtime.
- -Add support for dynamically adding pages to an initialized enclave, -architecturally limited to RW permission at creation but allowed to -obtain RWX permissions after trusted enclave runs EMODPE. Add pages -via the page fault handler at the time an enclave address without a -backing enclave page is accessed, potentially directly reclaiming -pages if no free pages are available. - -The enclave is still required to run ENCLU[EACCEPT] on the page before -it can be used. A useful flow is for the enclave to run ENCLU[EACCEPT] -on an uninitialized address. This will trigger the page fault handler -that will add the enclave page and return execution to the enclave to -repeat the ENCLU[EACCEPT] instruction, this time successful. - -If the enclave accesses an uninitialized address in another way, for -example by expanding the enclave stack to a page that has not yet been -added, then the page fault handler would add the page on the first -write but upon returning to the enclave the instruction that triggered -the page fault would be repeated and since ENCLU[EACCEPT] was not run -yet it would trigger a second page fault, this time with the SGX flag -set in the page fault error code. This can only be recovered by entering -the enclave again and directly running the ENCLU[EACCEPT] instruction on -the now initialized address. - -Accessing an uninitialized address from outside the enclave also -triggers this flow but the page will remain inaccessible (access will -result in #PF) until accepted from within the enclave via -ENCLU[EACCEPT]. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 117 +++++++++++++++++++++++++++++++++ - 1 file changed, 117 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index ea81e597dd18..e5b61a59199f 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -295,6 +295,112 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - return __sgx_encl_load_page(encl, entry); - } - -+/** -+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave -+ * @vma: VMA obtained from fault info from where page is accessed -+ * @encl: enclave accessing the page -+ * @addr: address that triggered the page fault -+ * -+ * When an initialized enclave accesses a page with no backing EPC page -+ * on a SGX2 system then the EPC can be added dynamically via the SGX2 -+ * ENCLS[EAUG] instruction. -+ * -+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed -+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. -+ */ -+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, -+ struct sgx_encl *encl, unsigned long addr) -+{ -+ vm_fault_t vmret = VM_FAULT_SIGBUS; -+ struct sgx_pageinfo pginfo = {0}; -+ struct sgx_encl_page *encl_page; -+ struct sgx_epc_page *epc_page; -+ struct sgx_va_page *va_page; -+ unsigned long phys_addr; -+ u64 secinfo_flags; -+ int ret; -+ -+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) -+ return VM_FAULT_SIGBUS; -+ -+ /* -+ * Ignore internal permission checking for dynamically added pages. -+ * They matter only for data added during the pre-initialization -+ * phase. The enclave decides the permissions by the means of -+ * EACCEPT, EACCEPTCOPY and EMODPE. 
-+ */ -+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; -+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); -+ if (IS_ERR(encl_page)) -+ return VM_FAULT_OOM; -+ -+ mutex_lock(&encl->lock); -+ -+ epc_page = sgx_alloc_epc_page(encl_page, false); -+ if (IS_ERR(epc_page)) { -+ if (PTR_ERR(epc_page) == -EBUSY) -+ vmret = VM_FAULT_NOPAGE; -+ goto err_out_unlock; -+ } -+ -+ va_page = sgx_encl_grow(encl, false); -+ if (IS_ERR(va_page)) -+ goto err_out_epc; -+ -+ if (va_page) -+ list_add(&va_page->list, &encl->va_pages); -+ -+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), -+ encl_page, GFP_KERNEL); -+ /* -+ * If ret == -EBUSY then page was created in another flow while -+ * running without encl->lock -+ */ -+ if (ret) -+ goto err_out_shrink; -+ -+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); -+ pginfo.addr = encl_page->desc & PAGE_MASK; -+ pginfo.metadata = 0; -+ -+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); -+ if (ret) -+ goto err_out; -+ -+ encl_page->encl = encl; -+ encl_page->epc_page = epc_page; -+ encl_page->type = SGX_PAGE_TYPE_REG; -+ encl->secs_child_cnt++; -+ -+ sgx_mark_page_reclaimable(encl_page->epc_page); -+ -+ phys_addr = sgx_get_epc_phys_addr(epc_page); -+ /* -+ * Do not undo everything when creating PTE entry fails - next #PF -+ * would find page ready for a PTE. 
-+ */ -+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); -+ if (vmret != VM_FAULT_NOPAGE) { -+ mutex_unlock(&encl->lock); -+ return VM_FAULT_SIGBUS; -+ } -+ mutex_unlock(&encl->lock); -+ return VM_FAULT_NOPAGE; -+ -+err_out: -+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); -+ -+err_out_shrink: -+ sgx_encl_shrink(encl, va_page); -+err_out_epc: -+ sgx_encl_free_epc_page(epc_page); -+err_out_unlock: -+ mutex_unlock(&encl->lock); -+ kfree(encl_page); -+ -+ return vmret; -+} -+ - static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) - { - unsigned long addr = (unsigned long)vmf->address; -@@ -314,6 +420,17 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) - if (unlikely(!encl)) - return VM_FAULT_SIGBUS; - -+ /* -+ * The page_array keeps track of all enclave pages, whether they -+ * are swapped out or not. If there is no entry for this page and -+ * the system supports SGX2 then it is possible to dynamically add -+ * a new enclave page. This is only possible for an initialized -+ * enclave that will be checked for right away. -+ */ -+ if (cpu_feature_enabled(X86_FEATURE_SGX2) && -+ (!xa_load(&encl->page_array, PFN_DOWN(addr)))) -+ return sgx_encl_eaug_page(vma, encl, addr); -+ - mutex_lock(&encl->lock); - - entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); --- -2.36.1 - diff --git a/0022-x86-sgx-Tighten-accessible-memory-range-after-enclav.patch b/0022-x86-sgx-Tighten-accessible-memory-range-after-enclav.patch deleted file mode 100644 index c7de7fe6289f..000000000000 --- a/0022-x86-sgx-Tighten-accessible-memory-range-after-enclav.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 1f35f20bc83570295e19d6fbf5fb053527975e9b Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:53 -0700 -Subject: [PATCH 22/36] x86/sgx: Tighten accessible memory range after enclave - initialization - -Before an enclave is initialized the enclave's memory range is unknown. 
-The enclave's memory range is learned at the time it is created via the -SGX_IOC_ENCLAVE_CREATE ioctl() where the provided memory range is -obtained from an earlier mmap() of /dev/sgx_enclave. After an enclave -is initialized its memory can be mapped into user space (mmap()) from -where it can be entered at its defined entry points. - -With the enclave's memory range known after it is initialized there is -no reason why it should be possible to map memory outside this range. - -Lock down access to the initialized enclave's memory range by denying -any attempt to map memory outside its memory range. - -Locking down the memory range also makes adding pages to an initialized -enclave more efficient. Pages are added to an initialized enclave by -accessing memory that belongs to the enclave's memory range but not yet -backed by an enclave page. If it is possible for user space to map -memory that does not form part of the enclave then an access to this -memory would eventually fail. Failures range from a prompt general -protection fault if the access was an ENCLU[EACCEPT] from within the -enclave, or a page fault via the vDSO if it was another access from -within the enclave, or a SIGBUS (also resulting from a page fault) if -the access was from outside the enclave. - -Disallowing invalid memory to be mapped in the first place avoids -preventable failures. - -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/encl.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c -index e5b61a59199f..295a9c946cef 100644 ---- a/arch/x86/kernel/cpu/sgx/encl.c -+++ b/arch/x86/kernel/cpu/sgx/encl.c -@@ -503,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, - - XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); - -+ /* Disallow mapping outside enclave's address range. 
*/ -+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && -+ (start < encl->base || end > encl->base + encl->size)) -+ return -EACCES; -+ - /* - * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might - * conflict with the enclave page permissions. --- -2.36.1 - diff --git a/0023-x86-sgx-Support-modifying-SGX-page-type.patch b/0023-x86-sgx-Support-modifying-SGX-page-type.patch deleted file mode 100644 index 900563e5980d..000000000000 --- a/0023-x86-sgx-Support-modifying-SGX-page-type.patch +++ /dev/null @@ -1,323 +0,0 @@ -From f53f0d00eb02b3893c2a424b84661df18216cd24 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:54 -0700 -Subject: [PATCH 23/36] x86/sgx: Support modifying SGX page type - -Every enclave contains one or more Thread Control Structures (TCS). The -TCS contains meta-data used by the hardware to save and restore thread -specific information when entering/exiting the enclave. With SGX1 an -enclave needs to be created with enough TCSs to support the largest -number of threads expecting to use the enclave and enough enclave pages -to meet all its anticipated memory demands. In SGX1 all pages remain in -the enclave until the enclave is unloaded. - -SGX2 introduces a new function, ENCLS[EMODT], that is used to change -the type of an enclave page from a regular (SGX_PAGE_TYPE_REG) enclave -page to a TCS (SGX_PAGE_TYPE_TCS) page or change the type from a -regular (SGX_PAGE_TYPE_REG) or TCS (SGX_PAGE_TYPE_TCS) -page to a trimmed (SGX_PAGE_TYPE_TRIM) page (setting it up for later -removal). - -With the existing support of dynamically adding regular enclave pages -to an initialized enclave and changing the page type to TCS it is -possible to dynamically increase the number of threads supported by an -enclave. - -Changing the enclave page type to SGX_PAGE_TYPE_TRIM is the first step -of dynamically removing pages from an initialized enclave. 
The complete -page removal flow is: -1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM - using the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() introduced here. -2) Approve the page removal by running ENCLU[EACCEPT] from within - the enclave. -3) Initiate actual page removal using the ioctl() introduced in the - following patch. - -Add ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES to support changing SGX -enclave page types within an initialized enclave. With -SGX_IOC_ENCLAVE_MODIFY_TYPES the user specifies a page range and the -enclave page type to be applied to all pages in the provided range. -The ioctl() itself can return an error code based on failures -encountered by the kernel. It is also possible for SGX specific -failures to be encountered. Add a result output parameter to -communicate the SGX return code. It is possible for the enclave page -type change request to fail on any page within the provided range. -Support partial success by returning the number of pages that were -successfully changed. - -After the page type is changed the page continues to be accessible -from the kernel perspective with page table entries and internal -state. The page may be moved to swap. Any access until ENCLU[EACCEPT] -will encounter a page fault with SGX flag set in error code. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/include/uapi/asm/sgx.h | 20 ++++ - arch/x86/kernel/cpu/sgx/ioctl.c | 202 ++++++++++++++++++++++++++++++++ - 2 files changed, 222 insertions(+) - -diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h -index 82648c006470..567f6166c24a 100644 ---- a/arch/x86/include/uapi/asm/sgx.h -+++ b/arch/x86/include/uapi/asm/sgx.h -@@ -31,6 +31,8 @@ enum sgx_page_flags { - _IO(SGX_MAGIC, 0x04) - #define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ - _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) -+#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ -+ _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) - - /** - * struct sgx_enclave_create - parameter structure for the -@@ -97,6 +99,24 @@ struct sgx_enclave_restrict_permissions { - __u64 count; - }; - -+/** -+ * struct sgx_enclave_modify_types - parameters for ioctl -+ * %SGX_IOC_ENCLAVE_MODIFY_TYPES -+ * @offset: starting page offset (page aligned relative to enclave base -+ * address defined in SECS) -+ * @length: length of memory (multiple of the page size) -+ * @page_type: new type for pages in range described by @offset and @length -+ * @result: (output) SGX result code of ENCLS[EMODT] function -+ * @count: (output) bytes successfully changed (multiple of page size) -+ */ -+struct sgx_enclave_modify_types { -+ __u64 offset; -+ __u64 length; -+ __u64 page_type; -+ __u64 result; -+ __u64 count; -+}; -+ - struct sgx_enclave_run; - - /** -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 720188d86ed4..9ccafbfc4811 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -872,6 +872,205 @@ static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, - 
return ret; - } - -+/** -+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages -+ * @encl: Enclave to which the pages belong. -+ * @modt: Checked parameters from user about which pages need modifying -+ * and their new page type. -+ * -+ * Return: -+ * - 0: Success -+ * - -errno: Otherwise -+ */ -+static long sgx_enclave_modify_types(struct sgx_encl *encl, -+ struct sgx_enclave_modify_types *modt) -+{ -+ unsigned long max_prot_restore; -+ enum sgx_page_type page_type; -+ struct sgx_encl_page *entry; -+ struct sgx_secinfo secinfo; -+ unsigned long prot; -+ unsigned long addr; -+ unsigned long c; -+ void *epc_virt; -+ int ret; -+ -+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK; -+ -+ /* -+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM. -+ */ -+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) -+ return -EINVAL; -+ -+ memset(&secinfo, 0, sizeof(secinfo)); -+ -+ secinfo.flags = page_type << 8; -+ -+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) { -+ addr = encl->base + modt->offset + c; -+ -+ mutex_lock(&encl->lock); -+ -+ entry = sgx_encl_load_page(encl, addr); -+ if (IS_ERR(entry)) { -+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; -+ goto out_unlock; -+ } -+ -+ /* -+ * Borrow the logic from the Intel SDM. Regular pages -+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS -+ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. -+ * CET pages not supported yet. -+ */ -+ if (!(entry->type == SGX_PAGE_TYPE_REG || -+ (entry->type == SGX_PAGE_TYPE_TCS && -+ page_type == SGX_PAGE_TYPE_TRIM))) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ -+ max_prot_restore = entry->vm_max_prot_bits; -+ -+ /* -+ * Once a regular page becomes a TCS page it cannot be -+ * changed back. So the maximum allowed protection reflects -+ * the TCS page that is always RW from kernel perspective but -+ * will be inaccessible from within enclave. 
Before doing -+ * so, do make sure that the new page type continues to -+ * respect the originally vetted page permissions. -+ */ -+ if (entry->type == SGX_PAGE_TYPE_REG && -+ page_type == SGX_PAGE_TYPE_TCS) { -+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { -+ ret = -EPERM; -+ goto out_unlock; -+ } -+ prot = PROT_READ | PROT_WRITE; -+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); -+ -+ /* -+ * Prevent page from being reclaimed while mutex -+ * is released. -+ */ -+ if (sgx_unmark_page_reclaimable(entry->epc_page)) { -+ ret = -EAGAIN; -+ goto out_entry_changed; -+ } -+ -+ /* -+ * Do not keep encl->lock because of dependency on -+ * mmap_lock acquired in sgx_zap_enclave_ptes(). -+ */ -+ mutex_unlock(&encl->lock); -+ -+ sgx_zap_enclave_ptes(encl, addr); -+ -+ mutex_lock(&encl->lock); -+ -+ sgx_mark_page_reclaimable(entry->epc_page); -+ } -+ -+ /* Change EPC type */ -+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); -+ ret = __emodt(&secinfo, epc_virt); -+ if (encls_faulted(ret)) { -+ /* -+ * All possible faults should be avoidable: -+ * parameters have been checked, will only change -+ * valid page types, and no concurrent -+ * SGX1/SGX2 ENCLS instructions since these are -+ * protected with mutex. 
-+ */ -+ pr_err_once("EMODT encountered exception %d\n", -+ ENCLS_TRAPNR(ret)); -+ ret = -EFAULT; -+ goto out_entry_changed; -+ } -+ if (encls_failed(ret)) { -+ modt->result = ret; -+ ret = -EFAULT; -+ goto out_entry_changed; -+ } -+ -+ ret = sgx_enclave_etrack(encl); -+ if (ret) { -+ ret = -EFAULT; -+ goto out_unlock; -+ } -+ -+ entry->type = page_type; -+ -+ mutex_unlock(&encl->lock); -+ } -+ -+ ret = 0; -+ goto out; -+ -+out_entry_changed: -+ entry->vm_max_prot_bits = max_prot_restore; -+out_unlock: -+ mutex_unlock(&encl->lock); -+out: -+ modt->count = c; -+ -+ return ret; -+} -+ -+/** -+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES -+ * @encl: an enclave pointer -+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance -+ * -+ * Ability to change the enclave page type supports the following use cases: -+ * -+ * * It is possible to add TCS pages to an enclave by changing the type of -+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. -+ * With this support the number of threads supported by an initialized -+ * enclave can be increased dynamically. -+ * -+ * * Regular or TCS pages can dynamically be removed from an initialized -+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the -+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual -+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called -+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the -+ * enclave. 
-+ * -+ * Return: -+ * - 0: Success -+ * - -errno: Otherwise -+ */ -+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, -+ void __user *arg) -+{ -+ struct sgx_enclave_modify_types params; -+ long ret; -+ -+ ret = sgx_ioc_sgx2_ready(encl); -+ if (ret) -+ return ret; -+ -+ if (copy_from_user(&params, arg, sizeof(params))) -+ return -EFAULT; -+ -+ if (sgx_validate_offset_length(encl, params.offset, params.length)) -+ return -EINVAL; -+ -+ if (params.page_type & ~SGX_PAGE_TYPE_MASK) -+ return -EINVAL; -+ -+ if (params.result || params.count) -+ return -EINVAL; -+ -+ ret = sgx_enclave_modify_types(encl, &params); -+ -+ if (copy_to_user(arg, &params, sizeof(params))) -+ return -EFAULT; -+ -+ return ret; -+} -+ - long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - { - struct sgx_encl *encl = filep->private_data; -@@ -897,6 +1096,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - ret = sgx_ioc_enclave_restrict_permissions(encl, - (void __user *)arg); - break; -+ case SGX_IOC_ENCLAVE_MODIFY_TYPES: -+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); -+ break; - default: - ret = -ENOIOCTLCMD; - break; --- -2.36.1 - diff --git a/0024-x86-sgx-Support-complete-page-removal.patch b/0024-x86-sgx-Support-complete-page-removal.patch deleted file mode 100644 index fc69b5333f0f..000000000000 --- a/0024-x86-sgx-Support-complete-page-removal.patch +++ /dev/null @@ -1,248 +0,0 @@ -From 7d00f6a6de9df6f5daf6f6d866220de01cedd7c7 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:55 -0700 -Subject: [PATCH 24/36] x86/sgx: Support complete page removal - -The SGX2 page removal flow was introduced in previous patch and is -as follows: -1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM - using the ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES introduced in - previous patch. -2) Approve the page removal by running ENCLU[EACCEPT] from within - the enclave. 
-3) Initiate actual page removal using the ioctl() - SGX_IOC_ENCLAVE_REMOVE_PAGES introduced here. - -Support the final step of the SGX2 page removal flow with ioctl() -SGX_IOC_ENCLAVE_REMOVE_PAGES. With this ioctl() the user specifies -a page range that should be removed. All pages in the provided -range should have the SGX_PAGE_TYPE_TRIM page type and the request -will fail with EPERM (Operation not permitted) if a page that does -not have the correct type is encountered. Page removal can fail -on any page within the provided range. Support partial success by -returning the number of pages that were successfully removed. - -Since actual page removal will succeed even if ENCLU[EACCEPT] was not -run from within the enclave the ENCLS[EMODPR] instruction with RWX -permissions is used as a no-op mechanism to ensure ENCLU[EACCEPT] was -successfully run from within the enclave before the enclave page is -removed. - -If the user omits running SGX_IOC_ENCLAVE_REMOVE_PAGES the pages will -still be removed when the enclave is unloaded. 
- -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Tested-by: Haitao Huang <haitao.huang@intel.com> -Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> -Tested-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/include/uapi/asm/sgx.h | 21 +++++ - arch/x86/kernel/cpu/sgx/ioctl.c | 145 ++++++++++++++++++++++++++++++++ - 2 files changed, 166 insertions(+) - -diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h -index 567f6166c24a..2dd35bbdc822 100644 ---- a/arch/x86/include/uapi/asm/sgx.h -+++ b/arch/x86/include/uapi/asm/sgx.h -@@ -33,6 +33,8 @@ enum sgx_page_flags { - _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) - #define SGX_IOC_ENCLAVE_MODIFY_TYPES \ - _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) -+#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ -+ _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) - - /** - * struct sgx_enclave_create - parameter structure for the -@@ -117,6 +119,25 @@ struct sgx_enclave_modify_types { - __u64 count; - }; - -+/** -+ * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters -+ * @offset: starting page offset (page aligned relative to enclave base -+ * address defined in SECS) -+ * @length: length of memory (multiple of the page size) -+ * @count: (output) bytes successfully changed (multiple of page size) -+ * -+ * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized -+ * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES -+ * ioctl() should be used to change the page type to PT_TRIM. After that -+ * succeeds ENCLU[EACCEPT] should be run from within the enclave and then -+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. 
-+ */ -+struct sgx_enclave_remove_pages { -+ __u64 offset; -+ __u64 length; -+ __u64 count; -+}; -+ - struct sgx_enclave_run; - - /** -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 9ccafbfc4811..1a2595f261d3 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -1071,6 +1071,148 @@ static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, - return ret; - } - -+/** -+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave -+ * @encl: Enclave to which the pages belong -+ * @params: Checked parameters from user on which pages need to be removed -+ * -+ * Return: -+ * - 0: Success. -+ * - -errno: Otherwise. -+ */ -+static long sgx_encl_remove_pages(struct sgx_encl *encl, -+ struct sgx_enclave_remove_pages *params) -+{ -+ struct sgx_encl_page *entry; -+ struct sgx_secinfo secinfo; -+ unsigned long addr; -+ unsigned long c; -+ void *epc_virt; -+ int ret; -+ -+ memset(&secinfo, 0, sizeof(secinfo)); -+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; -+ -+ for (c = 0 ; c < params->length; c += PAGE_SIZE) { -+ addr = encl->base + params->offset + c; -+ -+ mutex_lock(&encl->lock); -+ -+ entry = sgx_encl_load_page(encl, addr); -+ if (IS_ERR(entry)) { -+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; -+ goto out_unlock; -+ } -+ -+ if (entry->type != SGX_PAGE_TYPE_TRIM) { -+ ret = -EPERM; -+ goto out_unlock; -+ } -+ -+ /* -+ * ENCLS[EMODPR] is a no-op instruction used to inform if -+ * ENCLU[EACCEPT] was run from within the enclave. If -+ * ENCLS[EMODPR] is run with RWX on a trimmed page that is -+ * not yet accepted then it will return -+ * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is -+ * accepted the instruction will encounter a page fault. 
-+ */ -+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); -+ ret = __emodpr(&secinfo, epc_virt); -+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { -+ ret = -EPERM; -+ goto out_unlock; -+ } -+ -+ if (sgx_unmark_page_reclaimable(entry->epc_page)) { -+ ret = -EBUSY; -+ goto out_unlock; -+ } -+ -+ /* -+ * Do not keep encl->lock because of dependency on -+ * mmap_lock acquired in sgx_zap_enclave_ptes(). -+ */ -+ mutex_unlock(&encl->lock); -+ -+ sgx_zap_enclave_ptes(encl, addr); -+ -+ mutex_lock(&encl->lock); -+ -+ sgx_encl_free_epc_page(entry->epc_page); -+ encl->secs_child_cnt--; -+ entry->epc_page = NULL; -+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); -+ sgx_encl_shrink(encl, NULL); -+ kfree(entry); -+ -+ mutex_unlock(&encl->lock); -+ } -+ -+ ret = 0; -+ goto out; -+ -+out_unlock: -+ mutex_unlock(&encl->lock); -+out: -+ params->count = c; -+ -+ return ret; -+} -+ -+/** -+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES -+ * @encl: an enclave pointer -+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance -+ * -+ * Final step of the flow removing pages from an initialized enclave. The -+ * complete flow is: -+ * -+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM -+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). -+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within -+ * the enclave. -+ * 3) User initiates actual page removal using the -+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. -+ * -+ * First remove any page table entries pointing to the page and then proceed -+ * with the actual removal of the enclave page and data in support of it. -+ * -+ * VA pages are not affected by this removal. It is thus possible that the -+ * enclave may end up with more VA pages than needed to support all its -+ * pages. 
-+ * -+ * Return: -+ * - 0: Success -+ * - -errno: Otherwise -+ */ -+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, -+ void __user *arg) -+{ -+ struct sgx_enclave_remove_pages params; -+ long ret; -+ -+ ret = sgx_ioc_sgx2_ready(encl); -+ if (ret) -+ return ret; -+ -+ if (copy_from_user(&params, arg, sizeof(params))) -+ return -EFAULT; -+ -+ if (sgx_validate_offset_length(encl, params.offset, params.length)) -+ return -EINVAL; -+ -+ if (params.count) -+ return -EINVAL; -+ -+ ret = sgx_encl_remove_pages(encl, &params); -+ -+ if (copy_to_user(arg, &params, sizeof(params))) -+ return -EFAULT; -+ -+ return ret; -+} -+ - long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - { - struct sgx_encl *encl = filep->private_data; -@@ -1099,6 +1241,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) - case SGX_IOC_ENCLAVE_MODIFY_TYPES: - ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); - break; -+ case SGX_IOC_ENCLAVE_REMOVE_PAGES: -+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); -+ break; - default: - ret = -ENOIOCTLCMD; - break; --- -2.36.1 - diff --git a/0025-x86-sgx-Free-up-EPC-pages-directly-to-support-large-.patch b/0025-x86-sgx-Free-up-EPC-pages-directly-to-support-large-.patch deleted file mode 100644 index b5400b4daf6f..000000000000 --- a/0025-x86-sgx-Free-up-EPC-pages-directly-to-support-large-.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 1414f7b29435dc53d5e0d685244210e50f55c1e5 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:56 -0700 -Subject: [PATCH 25/36] x86/sgx: Free up EPC pages directly to support large - page ranges - -The page reclaimer ensures availability of EPC pages across all -enclaves. In support of this it runs independently from the -individual enclaves in order to take locks from the different -enclaves as it writes pages to swap. 
- -When needing to load a page from swap an EPC page needs to be -available for its contents to be loaded into. Loading an existing -enclave page from swap does not reclaim EPC pages directly if -none are available, instead the reclaimer is woken when the -available EPC pages are found to be below a watermark. - -When iterating over a large number of pages in an oversubscribed -environment there is a race between the reclaimer woken up and -EPC pages reclaimed fast enough for the page operations to proceed. - -Ensure there are EPC pages available before attempting to load -a page that may potentially be pulled from swap into an available -EPC page. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - arch/x86/kernel/cpu/sgx/ioctl.c | 6 ++++++ - arch/x86/kernel/cpu/sgx/main.c | 11 +++++++++++ - arch/x86/kernel/cpu/sgx/sgx.h | 1 + - 3 files changed, 18 insertions(+) - -diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c -index 1a2595f261d3..ebe79d60619f 100644 ---- a/arch/x86/kernel/cpu/sgx/ioctl.c -+++ b/arch/x86/kernel/cpu/sgx/ioctl.c -@@ -745,6 +745,8 @@ sgx_enclave_restrict_permissions(struct sgx_encl *encl, - for (c = 0 ; c < modp->length; c += PAGE_SIZE) { - addr = encl->base + modp->offset + c; - -+ sgx_reclaim_direct(); -+ - mutex_lock(&encl->lock); - - entry = sgx_encl_load_page(encl, addr); -@@ -910,6 +912,8 @@ static long sgx_enclave_modify_types(struct sgx_encl *encl, - for (c = 0 ; c < modt->length; c += PAGE_SIZE) { - addr = encl->base + modt->offset + c; - -+ sgx_reclaim_direct(); -+ - mutex_lock(&encl->lock); - - entry = sgx_encl_load_page(encl, addr); -@@ -1096,6 +1100,8 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl, - for (c = 0 ; c < params->length; c += PAGE_SIZE) { - addr = encl->base + params->offset + c; - -+ sgx_reclaim_direct(); -+ - mutex_lock(&encl->lock); - - entry = sgx_encl_load_page(encl, addr); -diff --git 
a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c -index 180ad840b226..5acd4c54d904 100644 ---- a/arch/x86/kernel/cpu/sgx/main.c -+++ b/arch/x86/kernel/cpu/sgx/main.c -@@ -375,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark) - !list_empty(&sgx_active_page_list); - } - -+/* -+ * sgx_reclaim_direct() should be called (without enclave's mutex held) -+ * in locations where SGX memory resources might be low and might be -+ * needed in order to make forward progress. -+ */ -+void sgx_reclaim_direct(void) -+{ -+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) -+ sgx_reclaim_pages(); -+} -+ - static int ksgxd(void *p) - { - set_freezable(); -diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h -index b30cee4de903..0f2020653fba 100644 ---- a/arch/x86/kernel/cpu/sgx/sgx.h -+++ b/arch/x86/kernel/cpu/sgx/sgx.h -@@ -86,6 +86,7 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) - struct sgx_epc_page *__sgx_alloc_epc_page(void); - void sgx_free_epc_page(struct sgx_epc_page *page); - -+void sgx_reclaim_direct(void); - void sgx_mark_page_reclaimable(struct sgx_epc_page *page); - int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); - struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); --- -2.36.1 - diff --git a/0026-Documentation-x86-Introduce-enclave-runtime-manageme.patch b/0026-Documentation-x86-Introduce-enclave-runtime-manageme.patch deleted file mode 100644 index a4d973cf42f0..000000000000 --- a/0026-Documentation-x86-Introduce-enclave-runtime-manageme.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 77a37ace712749fe260acdf83b1a59e1b4340342 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:57 -0700 -Subject: [PATCH 26/36] Documentation/x86: Introduce enclave runtime management - section - -Enclave runtime management is introduced following the pattern -of the section describing enclave building. 
Provide a brief -summary of enclave runtime management, pointing to the functions -implementing the ioctl()s that will contain details within their -kernel-doc. - -Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - Documentation/x86/sgx.rst | 15 +++++++++++++++ - 1 file changed, 15 insertions(+) - -diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst -index 265568a9292c..2bcbffacbed5 100644 ---- a/Documentation/x86/sgx.rst -+++ b/Documentation/x86/sgx.rst -@@ -100,6 +100,21 @@ pages and establish enclave page permissions. - sgx_ioc_enclave_init - sgx_ioc_enclave_provision - -+Enclave runtime management -+-------------------------- -+ -+Systems supporting SGX2 additionally support changes to initialized -+enclaves: modifying enclave page permissions and type, and dynamically -+adding and removing of enclave pages. When an enclave accesses an address -+within its address range that does not have a backing page then a new -+regular page will be dynamically added to the enclave. The enclave is -+still required to run EACCEPT on the new page before it can be used. -+ -+.. 
kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c -+ :functions: sgx_ioc_enclave_restrict_permissions -+ sgx_ioc_enclave_modify_types -+ sgx_ioc_enclave_remove_pages -+ - Enclave vDSO - ------------ - --- -2.36.1 - diff --git a/0027-selftests-sgx-Add-test-for-EPCM-permission-changes.patch b/0027-selftests-sgx-Add-test-for-EPCM-permission-changes.patch deleted file mode 100644 index 06016886a48a..000000000000 --- a/0027-selftests-sgx-Add-test-for-EPCM-permission-changes.patch +++ /dev/null @@ -1,356 +0,0 @@ -From d2441d3dc8f024a09ddb07268947c76f19c36555 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:58 -0700 -Subject: [PATCH 27/36] selftests/sgx: Add test for EPCM permission changes - -EPCM permission changes could be made from within (to relax -permissions) or out (to restrict permissions) the enclave. Kernel -support is needed when permissions are restricted to be able to -call the privileged ENCLS[EMODPR] instruction. EPCM permissions -can be relaxed via ENCLU[EMODPE] from within the enclave but the -enclave still depends on the kernel to install PTEs with the needed -permissions. - -Add a test that exercises a few of the enclave page permission flows: -1) Test starts with a RW (from enclave and kernel perspective) - enclave page that is mapped via a RW VMA. -2) Use the SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl() to restrict - the enclave (EPCM) page permissions to read-only. -3) Run ENCLU[EACCEPT] from within the enclave to accept the new page - permissions. -4) Attempt to write to the enclave page from within the enclave - this - should fail with a page fault on the EPCM permissions since the page - table entry continues to allow RW access. -5) Restore EPCM permissions to RW by running ENCLU[EMODPE] from within - the enclave. -6) Attempt to write to the enclave page from within the enclave - this - should succeed since both EPCM and PTE permissions allow this access. 
- -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/defines.h | 15 ++ - tools/testing/selftests/sgx/main.c | 214 ++++++++++++++++++++++++ - tools/testing/selftests/sgx/test_encl.c | 38 +++++ - 3 files changed, 267 insertions(+) - -diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h -index 02d775789ea7..b638eb98c80c 100644 ---- a/tools/testing/selftests/sgx/defines.h -+++ b/tools/testing/selftests/sgx/defines.h -@@ -24,6 +24,8 @@ enum encl_op_type { - ENCL_OP_PUT_TO_ADDRESS, - ENCL_OP_GET_FROM_ADDRESS, - ENCL_OP_NOP, -+ ENCL_OP_EACCEPT, -+ ENCL_OP_EMODPE, - ENCL_OP_MAX, - }; - -@@ -53,4 +55,17 @@ struct encl_op_get_from_addr { - uint64_t addr; - }; - -+struct encl_op_eaccept { -+ struct encl_op_header header; -+ uint64_t epc_addr; -+ uint64_t flags; -+ uint64_t ret; -+}; -+ -+struct encl_op_emodpe { -+ struct encl_op_header header; -+ uint64_t epc_addr; -+ uint64_t flags; -+}; -+ - #endif /* DEFINES_H */ -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index dd74fa42302e..46eac09cd955 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -25,6 +25,18 @@ static const uint64_t MAGIC = 0x1122334455667788ULL; - static const uint64_t MAGIC2 = 0x8877665544332211ULL; - vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; - -+/* -+ * Security Information (SECINFO) data structure needed by a few SGX -+ * instructions (eg. ENCLU[EACCEPT] and ENCLU[EMODPE]) holds meta-data -+ * about an enclave page. &enum sgx_secinfo_page_state specifies the -+ * secinfo flags used for page state. 
-+ */ -+enum sgx_secinfo_page_state { -+ SGX_SECINFO_PENDING = (1 << 3), -+ SGX_SECINFO_MODIFIED = (1 << 4), -+ SGX_SECINFO_PR = (1 << 5), -+}; -+ - struct vdso_symtab { - Elf64_Sym *elf_symtab; - const char *elf_symstrtab; -@@ -555,4 +567,206 @@ TEST_F(enclave, pte_permissions) - EXPECT_EQ(self->run.exception_addr, 0); - } - -+/* -+ * Enclave page permission test. -+ * -+ * Modify and restore enclave page's EPCM (enclave) permissions from -+ * outside enclave (ENCLS[EMODPR] via kernel) as well as from within -+ * enclave (via ENCLU[EMODPE]). Check for page fault if -+ * VMA allows access but EPCM permissions do not. -+ */ -+TEST_F(enclave, epcm_permissions) -+{ -+ struct sgx_enclave_restrict_permissions restrict_ioc; -+ struct encl_op_get_from_addr get_addr_op; -+ struct encl_op_put_to_addr put_addr_op; -+ struct encl_op_eaccept eaccept_op; -+ struct encl_op_emodpe emodpe_op; -+ unsigned long data_start; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Ensure kernel supports needed ioctl() and system supports needed -+ * commands. -+ */ -+ memset(&restrict_ioc, 0, sizeof(restrict_ioc)); -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, -+ &restrict_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ ASSERT_EQ(ret, -1); -+ -+ /* ret == -1 */ -+ if (errno_save == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); -+ else if (errno_save == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ -+ /* -+ * Page that will have its permissions changed is the second data -+ * page in the .data segment. This forms part of the local encl_buffer -+ * within the enclave. 
-+ * -+ * At start of test @data_start should have EPCM as well as PTE and -+ * VMA permissions of RW. -+ */ -+ -+ data_start = self->encl.encl_base + -+ encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ -+ /* -+ * Sanity check that page at @data_start is writable before making -+ * any changes to page permissions. -+ * -+ * Start by writing MAGIC to test page. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = data_start; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory that was just written to, confirming that -+ * page is writable. -+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = data_start; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Change EPCM permissions to read-only. Kernel still considers -+ * the page writable. -+ */ -+ memset(&restrict_ioc, 0, sizeof(restrict_ioc)); -+ -+ restrict_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ restrict_ioc.length = PAGE_SIZE; -+ restrict_ioc.permissions = SGX_SECINFO_R; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, -+ &restrict_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(restrict_ioc.result, 0); -+ EXPECT_EQ(restrict_ioc.count, 4096); -+ -+ /* -+ * EPCM permissions changed from kernel, need to EACCEPT from enclave. 
-+ */ -+ eaccept_op.epc_addr = data_start; -+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_REG | SGX_SECINFO_PR; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* -+ * EPCM permissions of page is now read-only, expect #PF -+ * on EPCM when attempting to write to page from within enclave. -+ */ -+ put_addr_op.value = MAGIC2; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(self->run.function, ERESUME); -+ EXPECT_EQ(self->run.exception_vector, 14); -+ EXPECT_EQ(self->run.exception_error_code, 0x8007); -+ EXPECT_EQ(self->run.exception_addr, data_start); -+ -+ self->run.exception_vector = 0; -+ self->run.exception_error_code = 0; -+ self->run.exception_addr = 0; -+ -+ /* -+ * Received AEX but cannot return to enclave at same entrypoint, -+ * need different TCS from where EPCM permission can be made writable -+ * again. -+ */ -+ self->run.tcs = self->encl.encl_base + PAGE_SIZE; -+ -+ /* -+ * Enter enclave at new TCS to change EPCM permissions to be -+ * writable again and thus fix the page fault that triggered the -+ * AEX. -+ */ -+ -+ emodpe_op.epc_addr = data_start; -+ emodpe_op.flags = SGX_SECINFO_R | SGX_SECINFO_W; -+ emodpe_op.header.type = ENCL_OP_EMODPE; -+ -+ EXPECT_EQ(ENCL_CALL(&emodpe_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Attempt to return to main TCS to resume execution at faulting -+ * instruction, PTE should continue to allow writing to the page. 
-+ */ -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Wrong page permissions that caused original fault has -+ * now been fixed via EPCM permissions. -+ * Resume execution in main TCS to re-attempt the memory access. -+ */ -+ self->run.tcs = self->encl.encl_base; -+ -+ EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, -+ ERESUME, 0, 0, -+ &self->run), -+ 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ get_addr_op.value = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC2); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.user_data, 0); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+} -+ - TEST_HARNESS_MAIN -diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c -index 4fca01cfd898..5b6c65331527 100644 ---- a/tools/testing/selftests/sgx/test_encl.c -+++ b/tools/testing/selftests/sgx/test_encl.c -@@ -11,6 +11,42 @@ - */ - static uint8_t encl_buffer[8192] = { 1 }; - -+enum sgx_enclu_function { -+ EACCEPT = 0x5, -+ EMODPE = 0x6, -+}; -+ -+static void do_encl_emodpe(void *_op) -+{ -+ struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; -+ struct encl_op_emodpe *op = _op; -+ -+ secinfo.flags = op->flags; -+ -+ asm volatile(".byte 0x0f, 0x01, 0xd7" -+ : -+ : "a" (EMODPE), -+ "b" (&secinfo), -+ "c" (op->epc_addr)); -+} -+ -+static void do_encl_eaccept(void *_op) -+{ -+ struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; -+ struct encl_op_eaccept *op = _op; -+ int rax; -+ -+ secinfo.flags = op->flags; -+ -+ asm volatile(".byte 0x0f, 0x01, 0xd7" -+ : "=a" (rax) -+ : "a" (EACCEPT), -+ "b" (&secinfo), -+ "c" (op->epc_addr)); -+ -+ op->ret = rax; -+} -+ - static void *memcpy(void *dest, const void 
*src, size_t n) - { - size_t i; -@@ -62,6 +98,8 @@ void encl_body(void *rdi, void *rsi) - do_encl_op_put_to_addr, - do_encl_op_get_from_addr, - do_encl_op_nop, -+ do_encl_eaccept, -+ do_encl_emodpe, - }; - - struct encl_op_header *op = (struct encl_op_header *)rdi; --- -2.36.1 - diff --git a/0028-selftests-sgx-Add-test-for-TCS-page-permission-chang.patch b/0028-selftests-sgx-Add-test-for-TCS-page-permission-chang.patch deleted file mode 100644 index ebdf31843b1f..000000000000 --- a/0028-selftests-sgx-Add-test-for-TCS-page-permission-chang.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 9df29e1b7db7988ade8f497ff8f29752c7fab17e Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:08:59 -0700 -Subject: [PATCH 28/36] selftests/sgx: Add test for TCS page permission changes - -Kernel should not allow permission changes on TCS pages. Add test to -confirm this behavior. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 71 ++++++++++++++++++++++++++++++ - 1 file changed, 71 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 46eac09cd955..016ae3e5f398 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -121,6 +121,24 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) - return NULL; - } - -+/* -+ * Return the offset in the enclave where the TCS segment can be found. -+ * The first RW segment loaded is the TCS. -+ */ -+static off_t encl_get_tcs_offset(struct encl *encl) -+{ -+ int i; -+ -+ for (i = 0; i < encl->nr_segments; i++) { -+ struct encl_segment *seg = &encl->segment_tbl[i]; -+ -+ if (i == 0 && seg->prot == (PROT_READ | PROT_WRITE)) -+ return seg->offset; -+ } -+ -+ return -1; -+} -+ - /* - * Return the offset in the enclave where the data segment can be found. 
- * The first RW segment loaded is the TCS, skip that to get info on the -@@ -567,6 +585,59 @@ TEST_F(enclave, pte_permissions) - EXPECT_EQ(self->run.exception_addr, 0); - } - -+/* -+ * Modifying permissions of TCS page should not be possible. -+ */ -+TEST_F(enclave, tcs_permissions) -+{ -+ struct sgx_enclave_restrict_permissions ioc; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ memset(&ioc, 0, sizeof(ioc)); -+ -+ /* -+ * Ensure kernel supports needed ioctl() and system supports needed -+ * commands. -+ */ -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ ASSERT_EQ(ret, -1); -+ -+ /* ret == -1 */ -+ if (errno_save == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); -+ else if (errno_save == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ -+ /* -+ * Attempt to make TCS page read-only. This is not allowed and -+ * should be prevented by the kernel. -+ */ -+ ioc.offset = encl_get_tcs_offset(&self->encl); -+ ioc.length = PAGE_SIZE; -+ ioc.permissions = SGX_SECINFO_R; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, -1); -+ EXPECT_EQ(errno_save, EINVAL); -+ EXPECT_EQ(ioc.result, 0); -+ EXPECT_EQ(ioc.count, 0); -+} -+ - /* - * Enclave page permission test. 
- * --- -2.36.1 - diff --git a/0029-selftests-sgx-Test-two-different-SGX2-EAUG-flows.patch b/0029-selftests-sgx-Test-two-different-SGX2-EAUG-flows.patch deleted file mode 100644 index daf53975f4bd..000000000000 --- a/0029-selftests-sgx-Test-two-different-SGX2-EAUG-flows.patch +++ /dev/null @@ -1,302 +0,0 @@ -From 322b7009f14f32e6de0e323703e902c0c4515852 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:00 -0700 -Subject: [PATCH 29/36] selftests/sgx: Test two different SGX2 EAUG flows - -Enclave pages can be added to an initialized enclave when an address -belonging to the enclave but without a backing page is accessed from -within the enclave. - -Accessing memory without a backing enclave page from within an enclave -can be in different ways: -1) Pre-emptively run ENCLU[EACCEPT]. Since the addition of a page - always needs to be accepted by the enclave via ENCLU[EACCEPT] this - flow is efficient since the first execution of ENCLU[EACCEPT] - triggers the addition of the page and when execution returns to the - same instruction the second execution would be successful as an - acceptance of the page. - -2) A direct read or write. The flow where a direct read or write - triggers the page addition execution cannot resume from the - instruction (read/write) that triggered the fault but instead - the enclave needs to be entered at a different entry point to - run needed ENCLU[EACCEPT] before execution can return to the - original entry point and the read/write instruction that faulted. - -Add tests for both flows. 
- -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 250 +++++++++++++++++++++++++++++ - 1 file changed, 250 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 016ae3e5f398..79c08e347112 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -86,6 +86,15 @@ static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) - return true; - } - -+static inline int sgx2_supported(void) -+{ -+ unsigned int eax, ebx, ecx, edx; -+ -+ __cpuid_count(SGX_CPUID, 0x0, eax, ebx, ecx, edx); -+ -+ return eax & 0x2; -+} -+ - static unsigned long elf_sym_hash(const char *name) - { - unsigned long h = 0, high; -@@ -840,4 +849,245 @@ TEST_F(enclave, epcm_permissions) - EXPECT_EQ(self->run.exception_addr, 0); - } - -+/* -+ * Test the addition of pages to an initialized enclave via writing to -+ * a page belonging to the enclave's address space but was not added -+ * during enclave creation. -+ */ -+TEST_F(enclave, augment) -+{ -+ struct encl_op_get_from_addr get_addr_op; -+ struct encl_op_put_to_addr put_addr_op; -+ struct encl_op_eaccept eaccept_op; -+ size_t total_size = 0; -+ void *addr; -+ int i; -+ -+ if (!sgx2_supported()) -+ SKIP(return, "SGX2 not supported"); -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ for (i = 0; i < self->encl.nr_segments; i++) { -+ struct encl_segment *seg = &self->encl.segment_tbl[i]; -+ -+ total_size += seg->size; -+ } -+ -+ /* -+ * Actual enclave size is expected to be larger than the loaded -+ * test enclave since enclave size must be a power of 2 in bytes -+ * and test_encl does not consume it all. 
-+ */ -+ EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); -+ -+ /* -+ * Create memory mapping for the page that will be added. New -+ * memory mapping is for one page right after all existing -+ * mappings. -+ * Kernel will allow new mapping using any permissions if it -+ * falls into the enclave's address range but not backed -+ * by existing enclave pages. -+ */ -+ addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, -+ PROT_READ | PROT_WRITE | PROT_EXEC, -+ MAP_SHARED | MAP_FIXED, self->encl.fd, 0); -+ EXPECT_NE(addr, MAP_FAILED); -+ -+ self->run.exception_vector = 0; -+ self->run.exception_error_code = 0; -+ self->run.exception_addr = 0; -+ -+ /* -+ * Attempt to write to the new page from within enclave. -+ * Expected to fail since page is not (yet) part of the enclave. -+ * The first #PF will trigger the addition of the page to the -+ * enclave, but since the new page needs an EACCEPT from within the -+ * enclave before it can be used it would not be possible -+ * to successfully return to the failing instruction. This is the -+ * cause of the second #PF captured here having the SGX bit set, -+ * it is from hardware preventing the page from being used. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = (unsigned long)addr; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(self->run.function, ERESUME); -+ EXPECT_EQ(self->run.exception_vector, 14); -+ EXPECT_EQ(self->run.exception_addr, (unsigned long)addr); -+ -+ if (self->run.exception_error_code == 0x6) { -+ munmap(addr, PAGE_SIZE); -+ SKIP(return, "Kernel does not support adding pages to initialized enclave"); -+ } -+ -+ EXPECT_EQ(self->run.exception_error_code, 0x8007); -+ -+ self->run.exception_vector = 0; -+ self->run.exception_error_code = 0; -+ self->run.exception_addr = 0; -+ -+ /* Handle AEX by running EACCEPT from new entry point. 
*/ -+ self->run.tcs = self->encl.encl_base + PAGE_SIZE; -+ -+ eaccept_op.epc_addr = self->encl.encl_base + total_size; -+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* Can now return to main TCS to resume execution. */ -+ self->run.tcs = self->encl.encl_base; -+ -+ EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, -+ ERESUME, 0, 0, -+ &self->run), -+ 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory from newly added page that was just written to, -+ * confirming that data previously written (MAGIC) is present. -+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = (unsigned long)addr; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ munmap(addr, PAGE_SIZE); -+} -+ -+/* -+ * Test for the addition of pages to an initialized enclave via a -+ * pre-emptive run of EACCEPT on page to be added. 
-+ */ -+TEST_F(enclave, augment_via_eaccept) -+{ -+ struct encl_op_get_from_addr get_addr_op; -+ struct encl_op_put_to_addr put_addr_op; -+ struct encl_op_eaccept eaccept_op; -+ size_t total_size = 0; -+ void *addr; -+ int i; -+ -+ if (!sgx2_supported()) -+ SKIP(return, "SGX2 not supported"); -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ for (i = 0; i < self->encl.nr_segments; i++) { -+ struct encl_segment *seg = &self->encl.segment_tbl[i]; -+ -+ total_size += seg->size; -+ } -+ -+ /* -+ * Actual enclave size is expected to be larger than the loaded -+ * test enclave since enclave size must be a power of 2 in bytes while -+ * test_encl does not consume it all. -+ */ -+ EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); -+ -+ /* -+ * mmap() a page at end of existing enclave to be used for dynamic -+ * EPC page. -+ * -+ * Kernel will allow new mapping using any permissions if it -+ * falls into the enclave's address range but not backed -+ * by existing enclave pages. -+ */ -+ -+ addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, -+ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_FIXED, -+ self->encl.fd, 0); -+ EXPECT_NE(addr, MAP_FAILED); -+ -+ self->run.exception_vector = 0; -+ self->run.exception_error_code = 0; -+ self->run.exception_addr = 0; -+ -+ /* -+ * Run EACCEPT on new page to trigger the #PF->EAUG->EACCEPT(again -+ * without a #PF). All should be transparent to userspace. 
-+ */ -+ eaccept_op.epc_addr = self->encl.encl_base + total_size; -+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ if (self->run.exception_vector == 14 && -+ self->run.exception_error_code == 4 && -+ self->run.exception_addr == self->encl.encl_base + total_size) { -+ munmap(addr, PAGE_SIZE); -+ SKIP(return, "Kernel does not support adding pages to initialized enclave"); -+ } -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* -+ * New page should be accessible from within enclave - attempt to -+ * write to it. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = (unsigned long)addr; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory from newly added page that was just written to, -+ * confirming that data previously written (MAGIC) is present. 
-+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = (unsigned long)addr; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ munmap(addr, PAGE_SIZE); -+} -+ - TEST_HARNESS_MAIN --- -2.36.1 - diff --git a/0030-selftests-sgx-Introduce-dynamic-entry-point.patch b/0030-selftests-sgx-Introduce-dynamic-entry-point.patch deleted file mode 100644 index 4bfcfbe42560..000000000000 --- a/0030-selftests-sgx-Introduce-dynamic-entry-point.patch +++ /dev/null @@ -1,50 +0,0 @@ -From ec26d201dbda5703a17e31aeba91b6f8ad0ed47e Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:01 -0700 -Subject: [PATCH 30/36] selftests/sgx: Introduce dynamic entry point - -The test enclave (test_encl.elf) is built with two initialized -Thread Control Structures (TCS) included in the binary. Both TCS are -initialized with the same entry point, encl_entry, that correctly -computes the absolute address of the stack based on the stack of each -TCS that is also built into the binary. - -A new TCS can be added dynamically to the enclave and requires to be -initialized with an entry point used to enter the enclave. Since the -existing entry point, encl_entry, assumes that the TCS and its stack -exists at particular offsets within the binary it is not able to handle -a dynamically added TCS and its stack. - -Introduce a new entry point, encl_dyn_entry, that initializes the -absolute address of that thread's stack to the address immediately -preceding the TCS itself. It is now possible to dynamically add a -contiguous memory region to the enclave with the new stack preceding -the new TCS. 
With the new TCS initialized with encl_dyn_entry as entry -point the absolute address of the stack is computed correctly on entry. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/test_encl_bootstrap.S | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S -index 82fb0dfcbd23..03ae0f57e29d 100644 ---- a/tools/testing/selftests/sgx/test_encl_bootstrap.S -+++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S -@@ -45,6 +45,12 @@ encl_entry: - # TCS #2. By adding the value of encl_stack to it, we get - # the absolute address for the stack. - lea (encl_stack)(%rbx), %rax -+ jmp encl_entry_core -+encl_dyn_entry: -+ # Entry point for dynamically created TCS page expected to follow -+ # its stack directly. -+ lea -1(%rbx), %rax -+encl_entry_core: - xchg %rsp, %rax - push %rax - --- -2.36.1 - diff --git a/0031-selftests-sgx-Introduce-TCS-initialization-enclave-o.patch b/0031-selftests-sgx-Introduce-TCS-initialization-enclave-o.patch deleted file mode 100644 index 2f7205e28264..000000000000 --- a/0031-selftests-sgx-Introduce-TCS-initialization-enclave-o.patch +++ /dev/null @@ -1,102 +0,0 @@ -From 6a11aaed59fdac23b4ec6c5605381061488e3c07 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:02 -0700 -Subject: [PATCH 31/36] selftests/sgx: Introduce TCS initialization enclave - operation - -The Thread Control Structure (TCS) contains meta-data used by the -hardware to save and restore thread specific information when -entering/exiting the enclave. A TCS can be added to an initialized -enclave by first adding a new regular enclave page, initializing the -content of the new page from within the enclave, and then changing that -page's type to a TCS. - -Support the initialization of a TCS from within the enclave. 
-The variable information needed that should be provided from outside -the enclave is the address of the TCS, address of the State Save Area -(SSA), and the entry point that the thread should use to enter the -enclave. With this information provided all needed fields of a TCS -can be initialized. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/defines.h | 8 +++++++ - tools/testing/selftests/sgx/test_encl.c | 30 +++++++++++++++++++++++++ - 2 files changed, 38 insertions(+) - -diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h -index b638eb98c80c..d8587c971941 100644 ---- a/tools/testing/selftests/sgx/defines.h -+++ b/tools/testing/selftests/sgx/defines.h -@@ -26,6 +26,7 @@ enum encl_op_type { - ENCL_OP_NOP, - ENCL_OP_EACCEPT, - ENCL_OP_EMODPE, -+ ENCL_OP_INIT_TCS_PAGE, - ENCL_OP_MAX, - }; - -@@ -68,4 +69,11 @@ struct encl_op_emodpe { - uint64_t flags; - }; - -+struct encl_op_init_tcs_page { -+ struct encl_op_header header; -+ uint64_t tcs_page; -+ uint64_t ssa; -+ uint64_t entry; -+}; -+ - #endif /* DEFINES_H */ -diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c -index 5b6c65331527..c0d6397295e3 100644 ---- a/tools/testing/selftests/sgx/test_encl.c -+++ b/tools/testing/selftests/sgx/test_encl.c -@@ -57,6 +57,35 @@ static void *memcpy(void *dest, const void *src, size_t n) - return dest; - } - -+static void *memset(void *dest, int c, size_t n) -+{ -+ size_t i; -+ -+ for (i = 0; i < n; i++) -+ ((char *)dest)[i] = c; -+ -+ return dest; -+} -+ -+static void do_encl_init_tcs_page(void *_op) -+{ -+ struct encl_op_init_tcs_page *op = _op; -+ void *tcs = (void *)op->tcs_page; -+ uint32_t val_32; -+ -+ memset(tcs, 0, 16); /* STATE and FLAGS */ -+ memcpy(tcs + 16, &op->ssa, 8); /* OSSA */ -+ memset(tcs + 24, 0, 4); /* CSSA */ -+ val_32 = 1; -+ memcpy(tcs + 28, &val_32, 4); /* NSSA */ -+ memcpy(tcs 
+ 32, &op->entry, 8); /* OENTRY */ -+ memset(tcs + 40, 0, 24); /* AEP, OFSBASE, OGSBASE */ -+ val_32 = 0xFFFFFFFF; -+ memcpy(tcs + 64, &val_32, 4); /* FSLIMIT */ -+ memcpy(tcs + 68, &val_32, 4); /* GSLIMIT */ -+ memset(tcs + 72, 0, 4024); /* Reserved */ -+} -+ - static void do_encl_op_put_to_buf(void *op) - { - struct encl_op_put_to_buf *op2 = op; -@@ -100,6 +129,7 @@ void encl_body(void *rdi, void *rsi) - do_encl_op_nop, - do_encl_eaccept, - do_encl_emodpe, -+ do_encl_init_tcs_page, - }; - - struct encl_op_header *op = (struct encl_op_header *)rdi; --- -2.36.1 - diff --git a/0032-selftests-sgx-Test-complete-changing-of-page-type-fl.patch b/0032-selftests-sgx-Test-complete-changing-of-page-type-fl.patch deleted file mode 100644 index ac5aaf1ab695..000000000000 --- a/0032-selftests-sgx-Test-complete-changing-of-page-type-fl.patch +++ /dev/null @@ -1,448 +0,0 @@ -From 7e1e0ea10dad3e8d2b372c220153e1455fab0619 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:03 -0700 -Subject: [PATCH 32/36] selftests/sgx: Test complete changing of page type flow - -Support for changing an enclave page's type enables an initialized -enclave to be expanded with support for more threads by changing the -type of a regular enclave page to that of a Thread Control Structure -(TCS). Additionally, being able to change a TCS or regular enclave -page's type to be trimmed (SGX_PAGE_TYPE_TRIM) initiates the removal -of the page from the enclave. - -Test changing page type to TCS as well as page removal flows -in two phases: In the first phase support for a new thread is -dynamically added to an initialized enclave and in the second phase -the pages associated with the new thread are removed from the enclave. -As an additional sanity check after the second phase the page used as -a TCS page during the first phase is added back as a regular page and -ensured that it can be written to (which is not possible if it was a -TCS page). 
- -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/load.c | 41 ++++ - tools/testing/selftests/sgx/main.c | 343 +++++++++++++++++++++++++++++ - tools/testing/selftests/sgx/main.h | 1 + - 3 files changed, 385 insertions(+) - -diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c -index 006b464c8fc9..94bdeac1cf04 100644 ---- a/tools/testing/selftests/sgx/load.c -+++ b/tools/testing/selftests/sgx/load.c -@@ -130,6 +130,47 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) - return true; - } - -+/* -+ * Parse the enclave code's symbol table to locate and return address of -+ * the provided symbol -+ */ -+uint64_t encl_get_entry(struct encl *encl, const char *symbol) -+{ -+ Elf64_Shdr *sections; -+ Elf64_Sym *symtab; -+ Elf64_Ehdr *ehdr; -+ char *sym_names; -+ int num_sym; -+ int i; -+ -+ ehdr = encl->bin; -+ sections = encl->bin + ehdr->e_shoff; -+ -+ for (i = 0; i < ehdr->e_shnum; i++) { -+ if (sections[i].sh_type == SHT_SYMTAB) { -+ symtab = (Elf64_Sym *)((char *)encl->bin + sections[i].sh_offset); -+ num_sym = sections[i].sh_size / sections[i].sh_entsize; -+ break; -+ } -+ } -+ -+ for (i = 0; i < ehdr->e_shnum; i++) { -+ if (sections[i].sh_type == SHT_STRTAB) { -+ sym_names = (char *)encl->bin + sections[i].sh_offset; -+ break; -+ } -+ } -+ -+ for (i = 0; i < num_sym; i++) { -+ Elf64_Sym *sym = &symtab[i]; -+ -+ if (!strcmp(symbol, sym_names + sym->st_name)) -+ return (uint64_t)sym->st_value; -+ } -+ -+ return 0; -+} -+ - bool encl_load(const char *path, struct encl *encl, unsigned long heap_size) - { - const char device_path[] = "/dev/sgx_enclave"; -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 79c08e347112..8bf43646e0bb 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -1090,4 +1090,347 @@ TEST_F(enclave, augment_via_eaccept) 
- munmap(addr, PAGE_SIZE); - } - -+/* -+ * SGX2 page type modification test in two phases: -+ * Phase 1: -+ * Create a new TCS, consisting out of three new pages (stack page with regular -+ * page type, SSA page with regular page type, and TCS page with TCS page -+ * type) in an initialized enclave and run a simple workload within it. -+ * Phase 2: -+ * Remove the three pages added in phase 1, add a new regular page at the -+ * same address that previously hosted the TCS page and verify that it can -+ * be modified. -+ */ -+TEST_F(enclave, tcs_create) -+{ -+ struct encl_op_init_tcs_page init_tcs_page_op; -+ struct sgx_enclave_remove_pages remove_ioc; -+ struct encl_op_get_from_addr get_addr_op; -+ struct sgx_enclave_modify_types modt_ioc; -+ struct encl_op_put_to_addr put_addr_op; -+ struct encl_op_get_from_buf get_buf_op; -+ struct encl_op_put_to_buf put_buf_op; -+ void *addr, *tcs, *stack_end, *ssa; -+ struct encl_op_eaccept eaccept_op; -+ size_t total_size = 0; -+ uint64_t val_64; -+ int errno_save; -+ int ret, i; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, -+ _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. -+ */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* -+ * Add three regular pages via EAUG: one will be the TCS stack, one -+ * will be the TCS SSA, and one will be the new TCS. 
The stack and -+ * SSA will remain as regular pages, the TCS page will need its -+ * type changed after populated with needed data. -+ */ -+ for (i = 0; i < self->encl.nr_segments; i++) { -+ struct encl_segment *seg = &self->encl.segment_tbl[i]; -+ -+ total_size += seg->size; -+ } -+ -+ /* -+ * Actual enclave size is expected to be larger than the loaded -+ * test enclave since enclave size must be a power of 2 in bytes while -+ * test_encl does not consume it all. -+ */ -+ EXPECT_LT(total_size + 3 * PAGE_SIZE, self->encl.encl_size); -+ -+ /* -+ * mmap() three pages at end of existing enclave to be used for the -+ * three new pages. -+ */ -+ addr = mmap((void *)self->encl.encl_base + total_size, 3 * PAGE_SIZE, -+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, -+ self->encl.fd, 0); -+ EXPECT_NE(addr, MAP_FAILED); -+ -+ self->run.exception_vector = 0; -+ self->run.exception_error_code = 0; -+ self->run.exception_addr = 0; -+ -+ stack_end = (void *)self->encl.encl_base + total_size; -+ tcs = (void *)self->encl.encl_base + total_size + PAGE_SIZE; -+ ssa = (void *)self->encl.encl_base + total_size + 2 * PAGE_SIZE; -+ -+ /* -+ * Run EACCEPT on each new page to trigger the -+ * EACCEPT->(#PF)->EAUG->EACCEPT(again without a #PF) flow. 
-+ */ -+ -+ eaccept_op.epc_addr = (unsigned long)stack_end; -+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ if (self->run.exception_vector == 14 && -+ self->run.exception_error_code == 4 && -+ self->run.exception_addr == (unsigned long)stack_end) { -+ munmap(addr, 3 * PAGE_SIZE); -+ SKIP(return, "Kernel does not support adding pages to initialized enclave"); -+ } -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ eaccept_op.epc_addr = (unsigned long)ssa; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ eaccept_op.epc_addr = (unsigned long)tcs; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* -+ * Three new pages added to enclave. Now populate the TCS page with -+ * needed data. This should be done from within enclave. Provide -+ * the function that will do the actual data population with needed -+ * data. -+ */ -+ -+ /* -+ * New TCS will use the "encl_dyn_entry" entrypoint that expects -+ * stack to begin in page before TCS page. 
-+ */ -+ val_64 = encl_get_entry(&self->encl, "encl_dyn_entry"); -+ EXPECT_NE(val_64, 0); -+ -+ init_tcs_page_op.tcs_page = (unsigned long)tcs; -+ init_tcs_page_op.ssa = (unsigned long)total_size + 2 * PAGE_SIZE; -+ init_tcs_page_op.entry = val_64; -+ init_tcs_page_op.header.type = ENCL_OP_INIT_TCS_PAGE; -+ -+ EXPECT_EQ(ENCL_CALL(&init_tcs_page_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* Change TCS page type to TCS. */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ -+ modt_ioc.offset = total_size + PAGE_SIZE; -+ modt_ioc.length = PAGE_SIZE; -+ modt_ioc.page_type = SGX_PAGE_TYPE_TCS; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(modt_ioc.result, 0); -+ EXPECT_EQ(modt_ioc.count, 4096); -+ -+ /* EACCEPT new TCS page from enclave. */ -+ eaccept_op.epc_addr = (unsigned long)tcs; -+ eaccept_op.flags = SGX_SECINFO_TCS | SGX_SECINFO_MODIFIED; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* Run workload from new TCS. */ -+ self->run.tcs = (unsigned long)tcs; -+ -+ /* -+ * Simple workload to write to data buffer and read value back. 
-+ */ -+ put_buf_op.header.type = ENCL_OP_PUT_TO_BUFFER; -+ put_buf_op.value = MAGIC; -+ -+ EXPECT_EQ(ENCL_CALL(&put_buf_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ get_buf_op.header.type = ENCL_OP_GET_FROM_BUFFER; -+ get_buf_op.value = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&get_buf_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_buf_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Phase 2 of test: -+ * Remove pages associated with new TCS, create a regular page -+ * where TCS page used to be and verify it can be used as a regular -+ * page. -+ */ -+ -+ /* Start page removal by requesting change of page type to PT_TRIM. */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ -+ modt_ioc.offset = total_size; -+ modt_ioc.length = 3 * PAGE_SIZE; -+ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(modt_ioc.result, 0); -+ EXPECT_EQ(modt_ioc.count, 3 * PAGE_SIZE); -+ -+ /* -+ * Enter enclave via TCS #1 and approve page removal by sending -+ * EACCEPT for each of three removed pages. 
-+ */ -+ self->run.tcs = self->encl.encl_base; -+ -+ eaccept_op.epc_addr = (unsigned long)stack_end; -+ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ eaccept_op.epc_addr = (unsigned long)tcs; -+ eaccept_op.ret = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ eaccept_op.epc_addr = (unsigned long)ssa; -+ eaccept_op.ret = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* Send final ioctl() to complete page removal. */ -+ memset(&remove_ioc, 0, sizeof(remove_ioc)); -+ -+ remove_ioc.offset = total_size; -+ remove_ioc.length = 3 * PAGE_SIZE; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(remove_ioc.count, 3 * PAGE_SIZE); -+ -+ /* -+ * Enter enclave via TCS #1 and access location where TCS #3 was to -+ * trigger dynamic add of regular page at that location. 
-+ */ -+ eaccept_op.epc_addr = (unsigned long)tcs; -+ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* -+ * New page should be accessible from within enclave - write to it. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = (unsigned long)tcs; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory from newly added page that was just written to, -+ * confirming that data previously written (MAGIC) is present. 
-+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = (unsigned long)tcs; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ munmap(addr, 3 * PAGE_SIZE); -+} -+ - TEST_HARNESS_MAIN -diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h -index b45c52ec7ab3..fc585be97e2f 100644 ---- a/tools/testing/selftests/sgx/main.h -+++ b/tools/testing/selftests/sgx/main.h -@@ -38,6 +38,7 @@ void encl_delete(struct encl *ctx); - bool encl_load(const char *path, struct encl *encl, unsigned long heap_size); - bool encl_measure(struct encl *encl); - bool encl_build(struct encl *encl); -+uint64_t encl_get_entry(struct encl *encl, const char *symbol); - - int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, - struct sgx_enclave_run *run); --- -2.36.1 - diff --git a/0033-selftests-sgx-Test-faulty-enclave-behavior.patch b/0033-selftests-sgx-Test-faulty-enclave-behavior.patch deleted file mode 100644 index adf4755a6198..000000000000 --- a/0033-selftests-sgx-Test-faulty-enclave-behavior.patch +++ /dev/null @@ -1,149 +0,0 @@ -From aedac9dcb6fcb4be2650b05d71a85c77026282bc Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:04 -0700 -Subject: [PATCH 33/36] selftests/sgx: Test faulty enclave behavior - -Removing a page from an initialized enclave involves three steps: -first the user requests changing the page type to SGX_PAGE_TYPE_TRIM -via an ioctl(), on success the ENCLU[EACCEPT] instruction needs to be -run from within the enclave to accept the page removal, finally the -user requests page removal to be completed via an ioctl(). 
Only after -acceptance (ENCLU[EACCEPT]) from within the enclave can the kernel -remove the page from a running enclave. - -Test the behavior when the user's request to change the page type -succeeds, but the ENCLU[EACCEPT] instruction is not run before the -ioctl() requesting page removal is run. This should not be permitted. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 114 +++++++++++++++++++++++++++++ - 1 file changed, 114 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 8bf43646e0bb..3a82bae915d1 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -1433,4 +1433,118 @@ TEST_F(enclave, tcs_create) - munmap(addr, 3 * PAGE_SIZE); - } - -+/* -+ * Ensure sane behavior if user requests page removal, does not run -+ * EACCEPT from within enclave but still attempts to finalize page removal -+ * with the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). The latter should fail -+ * because the removal was not EACCEPTed from within the enclave. -+ */ -+TEST_F(enclave, remove_added_page_no_eaccept) -+{ -+ struct sgx_enclave_remove_pages remove_ioc; -+ struct encl_op_get_from_addr get_addr_op; -+ struct sgx_enclave_modify_types modt_ioc; -+ struct encl_op_put_to_addr put_addr_op; -+ unsigned long data_start; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. 
-+ */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* -+ * Page that will be removed is the second data page in the .data -+ * segment. This forms part of the local encl_buffer within the -+ * enclave. -+ */ -+ data_start = self->encl.encl_base + -+ encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ -+ /* -+ * Sanity check that page at @data_start is writable before -+ * removing it. -+ * -+ * Start by writing MAGIC to test page. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = data_start; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory that was just written to, confirming that data -+ * previously written (MAGIC) is present. 
-+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = data_start; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* Start page removal by requesting change of page type to PT_TRIM */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ -+ modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ modt_ioc.length = PAGE_SIZE; -+ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(modt_ioc.result, 0); -+ EXPECT_EQ(modt_ioc.count, 4096); -+ -+ /* Skip EACCEPT */ -+ -+ /* Send final ioctl() to complete page removal */ -+ memset(&remove_ioc, 0, sizeof(remove_ioc)); -+ -+ remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ remove_ioc.length = PAGE_SIZE; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ /* Operation not permitted since EACCEPT was omitted. 
*/ -+ EXPECT_EQ(ret, -1); -+ EXPECT_EQ(errno_save, EPERM); -+ EXPECT_EQ(remove_ioc.count, 0); -+} -+ - TEST_HARNESS_MAIN --- -2.36.1 - diff --git a/0034-selftests-sgx-Test-invalid-access-to-removed-enclave.patch b/0034-selftests-sgx-Test-invalid-access-to-removed-enclave.patch deleted file mode 100644 index 17d72bdc2ccf..000000000000 --- a/0034-selftests-sgx-Test-invalid-access-to-removed-enclave.patch +++ /dev/null @@ -1,290 +0,0 @@ -From 8dabc1d4d87c7e7aabcee17020b76e1ce9b1bb1a Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:05 -0700 -Subject: [PATCH 34/36] selftests/sgx: Test invalid access to removed enclave - page - -Removing a page from an initialized enclave involves three steps: -(1) the user requests changing the page type to SGX_PAGE_TYPE_TRIM -via the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(), (2) on success the -ENCLU[EACCEPT] instruction is run from within the enclave to accept -the page removal, (3) the user initiates the actual removal of the -page via the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). - -Test two possible invalid accesses during the page removal flow: -* Test the behavior when a request to remove the page by changing its - type to SGX_PAGE_TYPE_TRIM completes successfully but instead of - executing ENCLU[EACCEPT] from within the enclave the enclave attempts - to read from the page. Even though the page is accessible from the - page table entries its type is SGX_PAGE_TYPE_TRIM and thus not - accessible according to SGX. The expected behavior is a page fault - with the SGX flag set in the error code. -* Test the behavior when the page type is changed successfully and - ENCLU[EACCEPT] was run from within the enclave. The final ioctl(), - SGX_IOC_ENCLAVE_REMOVE_PAGES, is omitted and replaced with an - attempt to access the page. Even though the page is accessible - from the page table entries its type is SGX_PAGE_TYPE_TRIM and - thus not accessible according to SGX. 
The expected behavior is - a page fault with the SGX flag set in the error code. - -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 243 +++++++++++++++++++++++++++++ - 1 file changed, 243 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 3a82bae915d1..2c69045253b2 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -1547,4 +1547,247 @@ TEST_F(enclave, remove_added_page_no_eaccept) - EXPECT_EQ(remove_ioc.count, 0); - } - -+/* -+ * Request enclave page removal but instead of correctly following with -+ * EACCEPT a read attempt to page is made from within the enclave. -+ */ -+TEST_F(enclave, remove_added_page_invalid_access) -+{ -+ struct encl_op_get_from_addr get_addr_op; -+ struct encl_op_put_to_addr put_addr_op; -+ struct sgx_enclave_modify_types ioc; -+ unsigned long data_start; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. -+ */ -+ memset(&ioc, 0, sizeof(ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* -+ * Page that will be removed is the second data page in the .data -+ * segment. This forms part of the local encl_buffer within the -+ * enclave. 
-+ */ -+ data_start = self->encl.encl_base + -+ encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ -+ /* -+ * Sanity check that page at @data_start is writable before -+ * removing it. -+ * -+ * Start by writing MAGIC to test page. -+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = data_start; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory that was just written to, confirming that data -+ * previously written (MAGIC) is present. -+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = data_start; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* Start page removal by requesting change of page type to PT_TRIM. */ -+ memset(&ioc, 0, sizeof(ioc)); -+ -+ ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ ioc.length = PAGE_SIZE; -+ ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(ioc.result, 0); -+ EXPECT_EQ(ioc.count, 4096); -+ -+ /* -+ * Read from page that was just removed. -+ */ -+ get_addr_op.value = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ /* -+ * From kernel perspective the page is present but according to SGX the -+ * page should not be accessible so a #PF with SGX bit set is -+ * expected. 
-+ */ -+ -+ EXPECT_EQ(self->run.function, ERESUME); -+ EXPECT_EQ(self->run.exception_vector, 14); -+ EXPECT_EQ(self->run.exception_error_code, 0x8005); -+ EXPECT_EQ(self->run.exception_addr, data_start); -+} -+ -+/* -+ * Request enclave page removal and correctly follow with -+ * EACCEPT but do not follow with removal ioctl() but instead a read attempt -+ * to removed page is made from within the enclave. -+ */ -+TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) -+{ -+ struct encl_op_get_from_addr get_addr_op; -+ struct encl_op_put_to_addr put_addr_op; -+ struct sgx_enclave_modify_types ioc; -+ struct encl_op_eaccept eaccept_op; -+ unsigned long data_start; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. -+ */ -+ memset(&ioc, 0, sizeof(ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* -+ * Page that will be removed is the second data page in the .data -+ * segment. This forms part of the local encl_buffer within the -+ * enclave. -+ */ -+ data_start = self->encl.encl_base + -+ encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ -+ /* -+ * Sanity check that page at @data_start is writable before -+ * removing it. -+ * -+ * Start by writing MAGIC to test page. 
-+ */ -+ put_addr_op.value = MAGIC; -+ put_addr_op.addr = data_start; -+ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* -+ * Read memory that was just written to, confirming that data -+ * previously written (MAGIC) is present. -+ */ -+ get_addr_op.value = 0; -+ get_addr_op.addr = data_start; -+ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ EXPECT_EQ(get_addr_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ -+ /* Start page removal by requesting change of page type to PT_TRIM. */ -+ memset(&ioc, 0, sizeof(ioc)); -+ -+ ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ ioc.length = PAGE_SIZE; -+ ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(ioc.result, 0); -+ EXPECT_EQ(ioc.count, 4096); -+ -+ eaccept_op.epc_addr = (unsigned long)data_start; -+ eaccept_op.ret = 0; -+ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ /* Skip ioctl() to remove page. */ -+ -+ /* -+ * Read from page that was just removed. 
-+ */ -+ get_addr_op.value = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); -+ -+ /* -+ * From kernel perspective the page is present but according to SGX the -+ * page should not be accessible so a #PF with SGX bit set is -+ * expected. -+ */ -+ -+ EXPECT_EQ(self->run.function, ERESUME); -+ EXPECT_EQ(self->run.exception_vector, 14); -+ EXPECT_EQ(self->run.exception_error_code, 0x8005); -+ EXPECT_EQ(self->run.exception_addr, data_start); -+} -+ - TEST_HARNESS_MAIN --- -2.36.1 - diff --git a/0035-selftests-sgx-Test-reclaiming-of-untouched-page.patch b/0035-selftests-sgx-Test-reclaiming-of-untouched-page.patch deleted file mode 100644 index acc9b95fe5c4..000000000000 --- a/0035-selftests-sgx-Test-reclaiming-of-untouched-page.patch +++ /dev/null @@ -1,119 +0,0 @@ -From dd6d57606c6704b5e53e71fd4501b4d96242d007 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:06 -0700 -Subject: [PATCH 35/36] selftests/sgx: Test reclaiming of untouched page - -Removing a page from an initialized enclave involves three steps: -(1) the user requests changing the page type to PT_TRIM via the - SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() -(2) on success the ENCLU[EACCEPT] instruction is run from within - the enclave to accept the page removal -(3) the user initiates the actual removal of the page via the - SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). - -Remove a page that has never been accessed. This means that when the -first ioctl() requesting page removal arrives, there will be no page -table entry, yet a valid page table entry needs to exist for the -ENCLU[EACCEPT] function to succeed. In this test it is verified that -a page table entry can still be installed for a page that is in the -process of being removed. 
- -Suggested-by: Haitao Huang <haitao.huang@intel.com> -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 80 ++++++++++++++++++++++++++++++ - 1 file changed, 80 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index 2c69045253b2..ba16671aef79 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -1790,4 +1790,84 @@ TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) - EXPECT_EQ(self->run.exception_addr, data_start); - } - -+TEST_F(enclave, remove_untouched_page) -+{ -+ struct sgx_enclave_remove_pages remove_ioc; -+ struct sgx_enclave_modify_types modt_ioc; -+ struct encl_op_eaccept eaccept_op; -+ unsigned long data_start; -+ int ret, errno_save; -+ -+ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. -+ */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. -+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* SGX2 is supported by kernel and hardware, test can proceed. 
*/ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ data_start = self->encl.encl_base + -+ encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ -+ modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ modt_ioc.length = PAGE_SIZE; -+ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(modt_ioc.result, 0); -+ EXPECT_EQ(modt_ioc.count, 4096); -+ -+ /* -+ * Enter enclave via TCS #1 and approve page removal by sending -+ * EACCEPT for removed page. -+ */ -+ -+ eaccept_op.epc_addr = data_start; -+ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; -+ eaccept_op.ret = 0; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ EXPECT_EQ(eaccept_op.ret, 0); -+ -+ memset(&remove_ioc, 0, sizeof(remove_ioc)); -+ -+ remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; -+ remove_ioc.length = PAGE_SIZE; -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); -+ errno_save = ret == -1 ? 
errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(remove_ioc.count, 4096); -+} -+ - TEST_HARNESS_MAIN --- -2.36.1 - diff --git a/0036-selftests-sgx-Page-removal-stress-test.patch b/0036-selftests-sgx-Page-removal-stress-test.patch deleted file mode 100644 index de29fe7a8ecc..000000000000 --- a/0036-selftests-sgx-Page-removal-stress-test.patch +++ /dev/null @@ -1,155 +0,0 @@ -From 60d3fab7ef151f02110786f64633d70e0a0f8f14 Mon Sep 17 00:00:00 2001 -From: Reinette Chatre <reinette.chatre@intel.com> -Date: Tue, 10 May 2022 11:09:07 -0700 -Subject: [PATCH 36/36] selftests/sgx: Page removal stress test - -Create enclave with additional heap that consumes all physical SGX -memory and then remove it. - -Depending on the available SGX memory this test could take a -significant time to run (several minutes) as it (1) creates the -enclave, (2) changes the type of every page to be trimmed, -(3) enters the enclave once per page to run EACCEPT, before -(4) the pages are finally removed. 
- -Acked-by: Jarkko Sakkinen <jarkko@kernel.org> -Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> ---- - tools/testing/selftests/sgx/main.c | 120 +++++++++++++++++++++++++++++ - 1 file changed, 120 insertions(+) - -diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c -index ba16671aef79..9820b3809c69 100644 ---- a/tools/testing/selftests/sgx/main.c -+++ b/tools/testing/selftests/sgx/main.c -@@ -378,7 +378,127 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) - EXPECT_EQ(get_op.value, MAGIC); - EXPECT_EEXIT(&self->run); - EXPECT_EQ(self->run.user_data, 0); -+} -+ -+TEST_F_TIMEOUT(enclave, unclobbered_vdso_oversubscribed_remove, 900) -+{ -+ struct sgx_enclave_remove_pages remove_ioc; -+ struct sgx_enclave_modify_types modt_ioc; -+ struct encl_op_get_from_buf get_op; -+ struct encl_op_eaccept eaccept_op; -+ struct encl_op_put_to_buf put_op; -+ struct encl_segment *heap; -+ unsigned long total_mem; -+ int ret, errno_save; -+ unsigned long addr; -+ unsigned long i; -+ -+ /* -+ * Create enclave with additional heap that is as big as all -+ * available physical SGX memory. -+ */ -+ total_mem = get_total_epc_mem(); -+ ASSERT_NE(total_mem, 0); -+ TH_LOG("Creating an enclave with %lu bytes heap may take a while ...", -+ total_mem); -+ ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata)); -+ -+ /* -+ * Hardware (SGX2) and kernel support is needed for this test. Start -+ * with check that test has a chance of succeeding. -+ */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ -+ if (ret == -1) { -+ if (errno == ENOTTY) -+ SKIP(return, -+ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); -+ else if (errno == ENODEV) -+ SKIP(return, "System does not support SGX2"); -+ } -+ -+ /* -+ * Invalid parameters were provided during sanity check, -+ * expect command to fail. 
-+ */ -+ EXPECT_EQ(ret, -1); -+ -+ /* SGX2 is supported by kernel and hardware, test can proceed. */ -+ memset(&self->run, 0, sizeof(self->run)); -+ self->run.tcs = self->encl.encl_base; -+ -+ heap = &self->encl.segment_tbl[self->encl.nr_segments - 1]; -+ -+ put_op.header.type = ENCL_OP_PUT_TO_BUFFER; -+ put_op.value = MAGIC; -+ -+ EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); -+ -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.user_data, 0); -+ -+ get_op.header.type = ENCL_OP_GET_FROM_BUFFER; -+ get_op.value = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); -+ -+ EXPECT_EQ(get_op.value, MAGIC); -+ EXPECT_EEXIT(&self->run); -+ EXPECT_EQ(self->run.user_data, 0); - -+ /* Trim entire heap. */ -+ memset(&modt_ioc, 0, sizeof(modt_ioc)); -+ -+ modt_ioc.offset = heap->offset; -+ modt_ioc.length = heap->size; -+ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; -+ -+ TH_LOG("Changing type of %zd bytes to trimmed may take a while ...", -+ heap->size); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(modt_ioc.result, 0); -+ EXPECT_EQ(modt_ioc.count, heap->size); -+ -+ /* EACCEPT all removed pages. */ -+ addr = self->encl.encl_base + heap->offset; -+ -+ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; -+ eaccept_op.header.type = ENCL_OP_EACCEPT; -+ -+ TH_LOG("Entering enclave to run EACCEPT for each page of %zd bytes may take a while ...", -+ heap->size); -+ for (i = 0; i < heap->size; i += 4096) { -+ eaccept_op.epc_addr = addr + i; -+ eaccept_op.ret = 0; -+ -+ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); -+ -+ EXPECT_EQ(self->run.exception_vector, 0); -+ EXPECT_EQ(self->run.exception_error_code, 0); -+ EXPECT_EQ(self->run.exception_addr, 0); -+ ASSERT_EQ(eaccept_op.ret, 0); -+ ASSERT_EQ(self->run.function, EEXIT); -+ } -+ -+ /* Complete page removal. 
*/ -+ memset(&remove_ioc, 0, sizeof(remove_ioc)); -+ -+ remove_ioc.offset = heap->offset; -+ remove_ioc.length = heap->size; -+ -+ TH_LOG("Removing %zd bytes from enclave may take a while ...", -+ heap->size); -+ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); -+ errno_save = ret == -1 ? errno : 0; -+ -+ EXPECT_EQ(ret, 0); -+ EXPECT_EQ(errno_save, 0); -+ EXPECT_EQ(remove_ioc.count, heap->size); - } - - TEST_F(enclave, clobbered_vdso) --- -2.36.1 - diff --git a/patch-5.18-enarx.patch b/patch-5.18-enarx.patch new file mode 100644 index 000000000000..0413ef2ff4b8 --- /dev/null +++ b/patch-5.18-enarx.patch @@ -0,0 +1,13623 @@ +From 7abfca61f8595c036e1bd9f1d65ab78af0006627 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Mon, 25 Apr 2022 18:57:35 +0000 +Subject: [PATCH 01/90] x86/cpufeatures: Add SEV-SNP CPU feature + +Add CPU feature detection for Secure Encrypted Virtualization with +Secure Nested Paging. This feature adds a strong memory integrity +protection to help prevent malicious hypervisor-based attacks like +data replay, memory re-mapping, and more. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/amd.c | 3 ++- + tools/arch/x86/include/asm/cpufeatures.h | 1 + + 3 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 30da1341f226..1cba0217669f 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -407,6 +407,7 @@ + #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ + #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ + #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ ++#define X86_FEATURE_SEV_SNP (19*32+4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ + #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + + /* +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 0c0b09796ced..2e87015a9d69 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -559,7 +559,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) + * If the kernel has not enabled SME via any means then + * don't advertise the SME feature. + * For SEV: If BIOS has not enabled SEV then don't advertise the +- * SEV and SEV_ES feature (set in scattered.c). ++ * SEV, SEV_ES and SEV_SNP feature. + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. 
+@@ -594,6 +594,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) + clear_sev: + setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); ++ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + } + } + +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index 73e643ae94b6..a636342ecb26 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -405,6 +405,7 @@ + #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ + #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ + #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ ++#define X86_FEATURE_SEV_SNP (19*32+4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ + #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + + /* +-- +2.36.1 + + +From 12df64394b1788156c8a3c2ee8dfd62b51ab3a81 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Mon, 25 Apr 2022 19:59:43 +0000 +Subject: [PATCH 02/90] iommu/amd: Introduce function to check SEV-SNP support + +The SEV-SNP support requires that IOMMU must be enabled, see the IOMMU +spec section 2.12 for further details. If IOMMU is not enabled or the +SNPSup extended feature register is not set then the SNP_INIT command +(used for initializing firmware) will fail. + +The iommu_sev_snp_supported() can be used to check if IOMMU supports the +SEV-SNP feature. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/iommu/amd/init.c | 30 ++++++++++++++++++++++++++++++ + include/linux/iommu.h | 9 +++++++++ + 2 files changed, 39 insertions(+) + +diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c +index 1a3ad58ba846..82be8067ddf5 100644 +--- a/drivers/iommu/amd/init.c ++++ b/drivers/iommu/amd/init.c +@@ -3361,3 +3361,33 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 + + return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true); + } ++ ++bool iommu_sev_snp_supported(void) ++{ ++ struct amd_iommu *iommu; ++ ++ /* ++ * The SEV-SNP support requires that IOMMU must be enabled, and is ++ * not configured in the passthrough mode. ++ */ ++ if (no_iommu || iommu_default_passthrough()) { ++ pr_err("SEV-SNP: IOMMU is either disabled or configured in passthrough mode.\n"); ++ return false; ++ } ++ ++ /* ++ * Iterate through all the IOMMUs and verify the SNPSup feature is ++ * enabled. ++ */ ++ for_each_iommu(iommu) { ++ if (!iommu_feature(iommu, FEATURE_SNP)) { ++ pr_err("SNPSup is disabled (devid: %02x:%02x.%x)\n", ++ PCI_BUS_NUM(iommu->devid), PCI_SLOT(iommu->devid), ++ PCI_FUNC(iommu->devid)); ++ return false; ++ } ++ } ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(iommu_sev_snp_supported); +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index 9208eca4b0d1..fecb72e1b11b 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -675,6 +675,12 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, + void iommu_sva_unbind_device(struct iommu_sva *handle); + u32 iommu_sva_get_pasid(struct iommu_sva *handle); + ++#ifdef CONFIG_AMD_MEM_ENCRYPT ++bool iommu_sev_snp_supported(void); ++#else ++static inline bool iommu_sev_snp_supported(void) { return false; } ++#endif ++ + #else /* CONFIG_IOMMU_API */ + + struct iommu_ops {}; +@@ -1031,6 +1037,9 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) + { + return NULL; + 
} ++ ++static inline bool iommu_sev_snp_supported(void) { return false; } ++ + #endif /* CONFIG_IOMMU_API */ + + /** +-- +2.36.1 + + +From 8f4eef289aba5067582d0d3535299c22a4e5c4c4 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Mon, 25 Apr 2022 20:06:47 +0000 +Subject: [PATCH 03/90] x86/sev: Add the host SEV-SNP initialization support + +The memory integrity guarantees of SEV-SNP are enforced through a new +structure called the Reverse Map Table (RMP). The RMP is a single data +structure shared across the system that contains one entry for every 4K +page of DRAM that may be used by SEV-SNP VMs. The goal of RMP is to +track the owner of each page of memory. Pages of memory can be owned by +the hypervisor, owned by a specific VM or owned by the AMD-SP. See APM2 +section 15.36.3 for more detail on RMP. + +The RMP table is used to enforce access control to memory. The table itself +is not directly writable by the software. New CPU instructions (RMPUPDATE, +PVALIDATE, RMPADJUST) are used to manipulate the RMP entries. + +Based on the platform configuration, the BIOS reserves the memory used +for the RMP table. The start and end address of the RMP table must be +queried by reading the RMP_BASE and RMP_END MSRs. If the RMP_BASE and +RMP_END are not set then disable the SEV-SNP feature. + +The SEV-SNP feature is enabled only after the RMP table is successfully +initialized. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/disabled-features.h | 8 +- + arch/x86/include/asm/msr-index.h | 6 + + arch/x86/kernel/sev.c | 144 +++++++++++++++++++++++ + 3 files changed, 157 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index 36369e76cc63..c1be3091a383 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -68,6 +68,12 @@ + # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) + #endif + ++#ifdef CONFIG_AMD_MEM_ENCRYPT ++# define DISABLE_SEV_SNP 0 ++#else ++# define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -91,7 +97,7 @@ + DISABLE_ENQCMD) + #define DISABLED_MASK17 0 + #define DISABLED_MASK18 0 +-#define DISABLED_MASK19 0 ++#define DISABLED_MASK19 (DISABLE_SEV_SNP) + #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) + + #endif /* _ASM_X86_DISABLED_FEATURES_H */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 9e2e7185fc1d..57a8280e283a 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -507,6 +507,8 @@ + #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) + #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) + #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) ++#define MSR_AMD64_RMP_BASE 0xc0010132 ++#define MSR_AMD64_RMP_END 0xc0010133 + + #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f + +@@ -581,6 +583,10 @@ + #define MSR_AMD64_SYSCFG 0xc0010010 + #define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 + #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) ++#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 ++#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) ++#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 ++#define 
MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 + /* C1E active bits in int pending message */ + #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index f01f4550e2c6..3a233b5d47c5 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -22,6 +22,8 @@ + #include <linux/efi.h> + #include <linux/platform_device.h> + #include <linux/io.h> ++#include <linux/cpumask.h> ++#include <linux/iommu.h> + + #include <asm/cpu_entry_area.h> + #include <asm/stacktrace.h> +@@ -38,6 +40,7 @@ + #include <asm/apic.h> + #include <asm/cpuid.h> + #include <asm/cmdline.h> ++#include <asm/iommu.h> + + #define DR7_RESET_VALUE 0x400 + +@@ -57,6 +60,12 @@ + #define AP_INIT_CR0_DEFAULT 0x60000010 + #define AP_INIT_MXCSR_DEFAULT 0x1f80 + ++/* ++ * The first 16KB from the RMP_BASE is used by the processor for the ++ * bookkeeping, the range need to be added during the RMP entry lookup. ++ */ ++#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 ++ + /* For early boot hypervisor communication in SEV-ES enabled guests */ + static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +@@ -69,6 +78,10 @@ static struct ghcb *boot_ghcb __section(".data"); + /* Bitmap of SEV features supported by the hypervisor */ + static u64 sev_hv_features __ro_after_init; + ++static unsigned long rmptable_start __ro_after_init; ++static unsigned long rmptable_end __ro_after_init; ++ ++ + /* #VC handler runtime per-CPU data */ + struct sev_es_runtime_data { + struct ghcb ghcb_page; +@@ -2218,3 +2231,134 @@ static int __init snp_init_platform_device(void) + return 0; + } + device_initcall(snp_init_platform_device); ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "SEV-SNP: " fmt ++ ++static int __snp_enable(unsigned int cpu) ++{ ++ u64 val; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return 0; ++ ++ rdmsrl(MSR_AMD64_SYSCFG, val); ++ ++ val |= MSR_AMD64_SYSCFG_SNP_EN; ++ val |= 
MSR_AMD64_SYSCFG_SNP_VMPL_EN; ++ ++ wrmsrl(MSR_AMD64_SYSCFG, val); ++ ++ return 0; ++} ++ ++static __init void snp_enable(void *arg) ++{ ++ __snp_enable(smp_processor_id()); ++} ++ ++static bool get_rmptable_info(u64 *start, u64 *len) ++{ ++ u64 calc_rmp_sz, rmp_sz, rmp_base, rmp_end, nr_pages; ++ ++ rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); ++ rdmsrl(MSR_AMD64_RMP_END, rmp_end); ++ ++ if (!rmp_base || !rmp_end) { ++ pr_info("Memory for the RMP table has not been reserved by BIOS\n"); ++ return false; ++ } ++ ++ rmp_sz = rmp_end - rmp_base + 1; ++ ++ /* ++ * Calculate the amount the memory that must be reserved by the BIOS to ++ * address the full system RAM. The reserved memory should also cover the ++ * RMP table itself. ++ * ++ * See PPR Family 19h Model 01h, Revision B1 section 2.1.4.2 for more ++ * information on memory requirement. ++ */ ++ nr_pages = totalram_pages(); ++ calc_rmp_sz = (((rmp_sz >> PAGE_SHIFT) + nr_pages) << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; ++ ++ if (calc_rmp_sz > rmp_sz) { ++ pr_info("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", ++ calc_rmp_sz, rmp_sz); ++ return false; ++ } ++ ++ *start = rmp_base; ++ *len = rmp_sz; ++ ++ pr_info("RMP table physical address 0x%016llx - 0x%016llx\n", rmp_base, rmp_end); ++ ++ return true; ++} ++ ++static __init int __snp_rmptable_init(void) ++{ ++ u64 rmp_base, sz; ++ void *start; ++ u64 val; ++ ++ if (!get_rmptable_info(&rmp_base, &sz)) ++ return 1; ++ ++ start = memremap(rmp_base, sz, MEMREMAP_WB); ++ if (!start) { ++ pr_err("Failed to map RMP table 0x%llx+0x%llx\n", rmp_base, sz); ++ return 1; ++ } ++ ++ /* ++ * Check if SEV-SNP is already enabled, this can happen if we are coming from ++ * kexec boot. ++ */ ++ rdmsrl(MSR_AMD64_SYSCFG, val); ++ if (val & MSR_AMD64_SYSCFG_SNP_EN) ++ goto skip_enable; ++ ++ /* Initialize the RMP table to zero */ ++ memset(start, 0, sz); ++ ++ /* Flush the caches to ensure that data is written before SNP is enabled. 
*/ ++ wbinvd_on_all_cpus(); ++ ++ /* Enable SNP on all CPUs. */ ++ on_each_cpu(snp_enable, NULL, 1); ++ ++skip_enable: ++ rmptable_start = (unsigned long)start; ++ rmptable_end = rmptable_start + sz; ++ ++ return 0; ++} ++ ++static int __init snp_rmptable_init(void) ++{ ++ if (!boot_cpu_has(X86_FEATURE_SEV_SNP)) ++ return 0; ++ ++ if (!iommu_sev_snp_supported()) ++ goto nosnp; ++ ++ if (__snp_rmptable_init()) ++ goto nosnp; ++ ++ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); ++ ++ return 0; ++ ++nosnp: ++ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); ++ return 1; ++} ++ ++/* ++ * This must be called after the PCI subsystem. This is because before enabling ++ * the SNP feature we need to ensure that IOMMU supports the SEV-SNP feature. ++ * The iommu_sev_snp_support() is used for checking the feature, and it is ++ * available after subsys_initcall(). ++ */ ++fs_initcall(snp_rmptable_init); +-- +2.36.1 + + +From c933e87762d78e5dce78e9bbf9c41aa0b30ddba2 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Mon, 25 Apr 2022 21:58:02 +0000 +Subject: [PATCH 04/90] x86/sev: set SYSCFG.MFDM + +SEV-SNP FW >= 1.51 requires that SYSCFG.MFDM must be set. + +Subsequent CCP patches will require 1.51 as the minimum SEV-SNP +firmware version. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/msr-index.h | 3 +++ + arch/x86/kernel/sev.c | 24 ++++++++++++++++++++++++ + 2 files changed, 27 insertions(+) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 57a8280e283a..1e36f16daa56 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -587,6 +587,9 @@ + #define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) + #define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 + #define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) ++#define MSR_AMD64_SYSCFG_MFDM_BIT 19 ++#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) ++ + #define MSR_K8_INT_PENDING_MSG 0xc0010055 + /* C1E active bits in int pending message */ + #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 3a233b5d47c5..25c7feb367f6 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2257,6 +2257,27 @@ static __init void snp_enable(void *arg) + __snp_enable(smp_processor_id()); + } + ++static int __mfdm_enable(unsigned int cpu) ++{ ++ u64 val; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return 0; ++ ++ rdmsrl(MSR_AMD64_SYSCFG, val); ++ ++ val |= MSR_AMD64_SYSCFG_MFDM; ++ ++ wrmsrl(MSR_AMD64_SYSCFG, val); ++ ++ return 0; ++} ++ ++static __init void mfdm_enable(void *arg) ++{ ++ __mfdm_enable(smp_processor_id()); ++} ++ + static bool get_rmptable_info(u64 *start, u64 *len) + { + u64 calc_rmp_sz, rmp_sz, rmp_base, rmp_end, nr_pages; +@@ -2325,6 +2346,9 @@ static __init int __snp_rmptable_init(void) + /* Flush the caches to ensure that data is written before SNP is enabled. */ + wbinvd_on_all_cpus(); + ++ /* MFDM must be enabled on all the CPUs prior to enabling SNP. */ ++ on_each_cpu(mfdm_enable, NULL, 1); ++ + /* Enable SNP on all CPUs. 
*/ + on_each_cpu(snp_enable, NULL, 1); + +-- +2.36.1 + + +From 8f63961f00fd170ba0e561f499292175f3155d26 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Mon, 25 Apr 2022 23:13:01 +0000 +Subject: [PATCH 05/90] x86/sev: Add RMP entry lookup helpers + +The snp_lookup_page_in_rmptable() can be used by the host to read the RMP +entry for a given page. The RMP entry format is documented in AMD PPR, see +https://bugzilla.kernel.org/attachment.cgi?id=296015. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev.h | 27 ++++++++++++++++++++++++ + arch/x86/kernel/sev.c | 43 ++++++++++++++++++++++++++++++++++++++ + include/linux/sev.h | 30 ++++++++++++++++++++++++++ + 3 files changed, 100 insertions(+) + create mode 100644 include/linux/sev.h + +diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h +index 9c2d33f1cfee..cb16f0e5b585 100644 +--- a/arch/x86/include/asm/sev.h ++++ b/arch/x86/include/asm/sev.h +@@ -9,6 +9,7 @@ + #define __ASM_ENCRYPTED_STATE_H + + #include <linux/types.h> ++#include <linux/sev.h> + #include <asm/insn.h> + #include <asm/sev-common.h> + #include <asm/bootparam.h> +@@ -84,6 +85,32 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + + /* RMP page size */ + #define RMP_PG_SIZE_4K 0 ++#define RMP_TO_X86_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) ++ ++/* ++ * The RMP entry format is not architectural. The format is defined in PPR ++ * Family 19h Model 01h, Rev B1 processor. 
++ */ ++struct __packed rmpentry { ++ union { ++ struct { ++ u64 assigned : 1, ++ pagesize : 1, ++ immutable : 1, ++ rsvd1 : 9, ++ gpa : 39, ++ asid : 10, ++ vmsa : 1, ++ validated : 1, ++ rsvd2 : 1; ++ } info; ++ u64 low; ++ }; ++ u64 high; ++}; ++ ++#define rmpentry_assigned(x) ((x)->info.assigned) ++#define rmpentry_pagesize(x) ((x)->info.pagesize) + + #define RMPADJUST_VMSA_PAGE_BIT BIT(16) + +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 25c7feb367f6..59e7ec6b0326 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -65,6 +65,8 @@ + * bookkeeping, the range need to be added during the RMP entry lookup. + */ + #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 ++#define RMPENTRY_SHIFT 8 ++#define rmptable_page_offset(x) (RMPTABLE_CPU_BOOKKEEPING_SZ + (((unsigned long)x) >> RMPENTRY_SHIFT)) + + /* For early boot hypervisor communication in SEV-ES enabled guests */ + static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); +@@ -2386,3 +2388,44 @@ static int __init snp_rmptable_init(void) + * available after subsys_initcall(). + */ + fs_initcall(snp_rmptable_init); ++ ++static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) ++{ ++ unsigned long vaddr, paddr = pfn << PAGE_SHIFT; ++ struct rmpentry *entry, *large_entry; ++ ++ if (!pfn_valid(pfn)) ++ return ERR_PTR(-EINVAL); ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return ERR_PTR(-ENXIO); ++ ++ vaddr = rmptable_start + rmptable_page_offset(paddr); ++ if (unlikely(vaddr > rmptable_end)) ++ return ERR_PTR(-ENXIO); ++ ++ entry = (struct rmpentry *)vaddr; ++ ++ /* Read a large RMP entry to get the correct page level used in RMP entry. 
*/ ++ vaddr = rmptable_start + rmptable_page_offset(paddr & PMD_MASK); ++ large_entry = (struct rmpentry *)vaddr; ++ *level = RMP_TO_X86_PG_LEVEL(rmpentry_pagesize(large_entry)); ++ ++ return entry; ++} ++ ++/* ++ * Return 1 if the RMP entry is assigned, 0 if it exists but is not assigned, ++ * and -errno if there is no corresponding RMP entry. ++ */ ++int snp_lookup_rmpentry(u64 pfn, int *level) ++{ ++ struct rmpentry *e; ++ ++ e = __snp_lookup_rmpentry(pfn, level); ++ if (IS_ERR(e)) ++ return PTR_ERR(e); ++ ++ return !!rmpentry_assigned(e); ++} ++EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); +diff --git a/include/linux/sev.h b/include/linux/sev.h +new file mode 100644 +index 000000000000..1a68842789e1 +--- /dev/null ++++ b/include/linux/sev.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * AMD Secure Encrypted Virtualization ++ * ++ * Author: Brijesh Singh <brijesh.singh@amd.com> ++ */ ++ ++#ifndef __LINUX_SEV_H ++#define __LINUX_SEV_H ++ ++/* RMUPDATE detected 4K page and 2MB page overlap. 
*/ ++#define RMPUPDATE_FAIL_OVERLAP 7 ++ ++#ifdef CONFIG_AMD_MEM_ENCRYPT ++int snp_lookup_rmpentry(u64 pfn, int *level); ++int psmash(u64 pfn); ++int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable); ++int rmp_make_shared(u64 pfn, enum pg_level level); ++#else ++static inline int snp_lookup_rmpentry(u64 pfn, int *level) { return 0; } ++static inline int psmash(u64 pfn) { return -ENXIO; } ++static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, ++ bool immutable) ++{ ++ return -ENODEV; ++} ++static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } ++ ++#endif /* CONFIG_AMD_MEM_ENCRYPT */ ++#endif /* __LINUX_SEV_H */ +-- +2.36.1 + + +From e4643e9d37fcb025d0aec9080feefaae5e9245d5 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:31:13 +0000 +Subject: [PATCH 06/90] x86/sev: Add helper functions for RMPUPDATE and PSMASH + instruction + +The RMPUPDATE instruction writes a new RMP entry in the RMP Table. The +hypervisor will use the instruction to add pages to the RMP table. See +APM3 for details on the instruction operations. + +The PSMASH instruction expands a 2MB RMP entry into a corresponding set of +contiguous 4KB-Page RMP entries. The hypervisor will use this instruction +to adjust the RMP entry without invalidating the previous RMP entry. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev.h | 11 ++++++ + arch/x86/kernel/sev.c | 72 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 83 insertions(+) + +diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h +index cb16f0e5b585..6ab872311544 100644 +--- a/arch/x86/include/asm/sev.h ++++ b/arch/x86/include/asm/sev.h +@@ -85,7 +85,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + + /* RMP page size */ + #define RMP_PG_SIZE_4K 0 ++#define RMP_PG_SIZE_2M 1 + #define RMP_TO_X86_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) ++#define X86_TO_RMP_PG_LEVEL(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) + + /* + * The RMP entry format is not architectural. The format is defined in PPR +@@ -126,6 +128,15 @@ struct snp_guest_platform_data { + u64 secrets_gpa; + }; + ++struct rmpupdate { ++ u64 gpa; ++ u8 assigned; ++ u8 pagesize; ++ u8 immutable; ++ u8 rsvd; ++ u32 asid; ++} __packed; ++ + #ifdef CONFIG_AMD_MEM_ENCRYPT + extern struct static_key_false sev_es_enable_key; + extern void __sev_es_ist_enter(struct pt_regs *regs); +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 59e7ec6b0326..f6c64a722e94 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2429,3 +2429,75 @@ int snp_lookup_rmpentry(u64 pfn, int *level) + return !!rmpentry_assigned(e); + } + EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); ++ ++int psmash(u64 pfn) ++{ ++ unsigned long paddr = pfn << PAGE_SHIFT; ++ int ret; ++ ++ if (!pfn_valid(pfn)) ++ return -EINVAL; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return -ENXIO; ++ ++ /* Binutils version 2.36 supports the PSMASH mnemonic. 
*/ ++ asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" ++ : "=a"(ret) ++ : "a"(paddr) ++ : "memory", "cc"); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(psmash); ++ ++static int rmpupdate(u64 pfn, struct rmpupdate *val) ++{ ++ unsigned long paddr = pfn << PAGE_SHIFT; ++ int ret; ++ ++ if (!pfn_valid(pfn)) ++ return -EINVAL; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return -ENXIO; ++ ++ /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ ++ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" ++ : "=a"(ret) ++ : "a"(paddr), "c"((unsigned long)val) ++ : "memory", "cc"); ++ return ret; ++} ++ ++int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable) ++{ ++ struct rmpupdate val; ++ ++ if (!pfn_valid(pfn)) ++ return -EINVAL; ++ ++ memset(&val, 0, sizeof(val)); ++ val.assigned = 1; ++ val.asid = asid; ++ val.immutable = immutable; ++ val.gpa = gpa; ++ val.pagesize = X86_TO_RMP_PG_LEVEL(level); ++ ++ return rmpupdate(pfn, &val); ++} ++EXPORT_SYMBOL_GPL(rmp_make_private); ++ ++int rmp_make_shared(u64 pfn, enum pg_level level) ++{ ++ struct rmpupdate val; ++ ++ if (!pfn_valid(pfn)) ++ return -EINVAL; ++ ++ memset(&val, 0, sizeof(val)); ++ val.pagesize = X86_TO_RMP_PG_LEVEL(level); ++ ++ return rmpupdate(pfn, &val); ++} ++EXPORT_SYMBOL_GPL(rmp_make_shared); +-- +2.36.1 + + +From 243778c282cd55a554af9c11d2ecd3ff9ea6820f Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:35:48 +0000 +Subject: [PATCH 07/90] x86/sev: Invalid pages from direct map when adding it + to RMP table + +The integrity guarantee of SEV-SNP is enforced through the RMP table. +The RMP is used with standard x86 and IOMMU page tables to enforce memory +restrictions and page access rights. The RMP check is enforced as soon as +SEV-SNP is enabled globally in the system. When hardware encounters an +RMP checks failure, it raises a page-fault exception. 
+ +The rmp_make_private() and rmp_make_shared() helpers are used to add +or remove the pages from the RMP table. Improve the rmp_make_private() to +invalidate the pages in the direct map so that they cannot be used there +after they are added to the RMP table, and restore the default valid +permission after the pages are removed from the RMP table. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kernel/sev.c | 61 ++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 60 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index f6c64a722e94..734cddd837f5 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2451,10 +2451,42 @@ int psmash(u64 pfn) + } + EXPORT_SYMBOL_GPL(psmash); + ++static int restore_direct_map(u64 pfn, int npages) ++{ ++ int i, ret = 0; ++ ++ for (i = 0; i < npages; i++) { ++ ret = set_direct_map_default_noflush(pfn_to_page(pfn + i)); ++ if (ret) ++ goto cleanup; ++ } ++ ++cleanup: ++ WARN(ret > 0, "Failed to restore direct map for pfn 0x%llx\n", pfn + i); ++ return ret; ++} ++ ++static int invalid_direct_map(unsigned long pfn, int npages) ++{ ++ int i, ret = 0; ++ ++ for (i = 0; i < npages; i++) { ++ ret = set_direct_map_invalid_noflush(pfn_to_page(pfn + i)); ++ if (ret) ++ goto cleanup; ++ } ++ ++ return 0; ++ ++cleanup: ++ restore_direct_map(pfn, i); ++ return ret; ++} ++ + static int rmpupdate(u64 pfn, struct rmpupdate *val) + { + unsigned long paddr = pfn << PAGE_SHIFT; +- int ret; ++ int ret, level, npages; + + if (!pfn_valid(pfn)) + return -EINVAL; +@@ -2462,11 +2494,38 @@ static int rmpupdate(u64 pfn, struct rmpupdate *val) + if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return -ENXIO; + ++ level = RMP_TO_X86_PG_LEVEL(val->pagesize); ++ npages = page_level_size(level) / PAGE_SIZE; ++ ++ /* ++ * If page is getting assigned in the RMP table then unmap it from the ++ * direct map. 
++ */ ++ if (val->assigned) { ++ if (invalid_direct_map(pfn, npages)) { ++ pr_err("Failed to unmap pfn 0x%llx pages %d from direct_map\n", ++ pfn, npages); ++ return -EFAULT; ++ } ++ } ++ + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a"(ret) + : "a"(paddr), "c"((unsigned long)val) + : "memory", "cc"); ++ ++ /* ++ * Restore the direct map after the page is removed from the RMP table. ++ */ ++ if (!ret && !val->assigned) { ++ if (restore_direct_map(pfn, npages)) { ++ pr_err("Failed to map pfn 0x%llx pages %d in direct_map\n", ++ pfn, npages); ++ return -EFAULT; ++ } ++ } ++ + return ret; + } + +-- +2.36.1 + + +From 5328a76b3fab1f20b3ffc400ca2402bec19d9700 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:39:04 +0000 +Subject: [PATCH 08/90] x86/traps: Define RMP violation #PF error code + +Bit 31 in the page-fault error code will be set when the processor encounters +an RMP violation. + +While at it, use the BIT_ULL() macro. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/trap_pf.h | 18 +++++++++++------- + arch/x86/mm/fault.c | 1 + + 2 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h +index 10b1de500ab1..89b705114b3f 100644 +--- a/arch/x86/include/asm/trap_pf.h ++++ b/arch/x86/include/asm/trap_pf.h +@@ -2,6 +2,8 @@ + #ifndef _ASM_X86_TRAP_PF_H + #define _ASM_X86_TRAP_PF_H + ++#include <linux/bits.h> /* BIT() macro */ ++ + /* + * Page fault error code bits: + * +@@ -12,15 +14,17 @@ + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault ++ * bit 31 == 1: fault was due to RMP violation + */ + enum x86_pf_error_code { +- X86_PF_PROT = 1 << 0, +- X86_PF_WRITE = 1 << 1, +- X86_PF_USER = 1 << 2, +- X86_PF_RSVD = 1 << 3, +- X86_PF_INSTR = 1 << 4, +- X86_PF_PK = 1 << 5, +- X86_PF_SGX = 1 << 15, ++ X86_PF_PROT = BIT_ULL(0), ++ X86_PF_WRITE = BIT_ULL(1), ++ X86_PF_USER = BIT_ULL(2), ++ X86_PF_RSVD = BIT_ULL(3), ++ X86_PF_INSTR = BIT_ULL(4), ++ X86_PF_PK = BIT_ULL(5), ++ X86_PF_SGX = BIT_ULL(15), ++ X86_PF_RMP = BIT_ULL(31), + }; + + #endif /* _ASM_X86_TRAP_PF_H */ +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index fad8faa29d04..a4c270e99f7f 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -546,6 +546,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : ++ (error_code & X86_PF_RMP) ? 
"RMP violation" : + "permissions violation"); + + if (!(error_code & X86_PF_USER) && user_mode(regs)) { +-- +2.36.1 + + +From 0ecb0a4781be933fcadeb56a85070818ef3566e7 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:46:18 +0000 +Subject: [PATCH 09/90] x86/fault: Add support to handle the RMP fault for user + address + +When SEV-SNP is enabled globally, a write from the host goes through the +RMP check. When the host writes to pages, hardware checks the following +conditions at the end of page walk: + +1. Assigned bit in the RMP table is zero (i.e page is shared). +2. If the page table entry that gives the sPA indicates that the target + page size is a large page, then all RMP entries for the 4KB + constituting pages of the target must have the assigned bit 0. +3. Immutable bit in the RMP table is not zero. + +The hardware will raise page fault if one of the above conditions is not +met. Try resolving the fault instead of taking fault again and again. If +the host attempts to write to the guest private memory then send the +SIGBUS signal to kill the process. If the page level between the host and +RMP entry does not match, then split the address to keep the RMP and host +page levels in sync. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/mm/fault.c | 66 ++++++++++++++++++++++++++++++++++++++++ + include/linux/mm.h | 3 +- + include/linux/mm_types.h | 3 ++ + mm/memory.c | 13 ++++++++ + 4 files changed, 84 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index a4c270e99f7f..f5de9673093a 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -19,6 +19,7 @@ + #include <linux/uaccess.h> /* faulthandler_disabled() */ + #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ + #include <linux/mm_types.h> ++#include <linux/sev.h> /* snp_lookup_rmpentry() */ + + #include <asm/cpufeature.h> /* boot_cpu_has, ... 
*/ + #include <asm/traps.h> /* dotraplinkage, ... */ +@@ -1209,6 +1210,60 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + } + NOKPROBE_SYMBOL(do_kern_addr_fault); + ++static inline size_t pages_per_hpage(int level) ++{ ++ return page_level_size(level) / PAGE_SIZE; ++} ++ ++/* ++ * Return 1 if the caller need to retry, 0 if it the address need to be split ++ * in order to resolve the fault. ++ */ ++static int handle_user_rmp_page_fault(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) ++{ ++ int rmp_level, level; ++ pte_t *pte; ++ u64 pfn; ++ ++ pte = lookup_address_in_mm(current->mm, address, &level); ++ ++ /* ++ * It can happen if there was a race between an unmap event and ++ * the RMP fault delivery. ++ */ ++ if (!pte || !pte_present(*pte)) ++ return 1; ++ ++ pfn = pte_pfn(*pte); ++ ++ /* If its large page then calculte the fault pfn */ ++ if (level > PG_LEVEL_4K) { ++ unsigned long mask; ++ ++ mask = pages_per_hpage(level) - pages_per_hpage(level - 1); ++ pfn |= (address >> PAGE_SHIFT) & mask; ++ } ++ ++ /* ++ * If its a guest private page, then the fault cannot be resolved. ++ * Send a SIGBUS to terminate the process. ++ */ ++ if (snp_lookup_rmpentry(pfn, &rmp_level)) { ++ do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); ++ return 1; ++ } ++ ++ /* ++ * The backing page level is higher than the RMP page level, request ++ * to split the page. ++ */ ++ if (level > rmp_level) ++ return 0; ++ ++ return 1; ++} ++ + /* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost +@@ -1306,6 +1361,17 @@ void do_user_addr_fault(struct pt_regs *regs, + if (error_code & X86_PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; + ++ /* ++ * If its an RMP violation, try resolving it. 
++ */ ++ if (error_code & X86_PF_RMP) { ++ if (handle_user_rmp_page_fault(regs, error_code, address)) ++ return; ++ ++ /* Ask to split the page */ ++ flags |= FAULT_FLAG_PAGE_SPLIT; ++ } ++ + #ifdef CONFIG_X86_64 + /* + * Faults in the vsyscall page might need emulation. The +diff --git a/include/linux/mm.h b/include/linux/mm.h +index de32c0383387..2ccc562d166f 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -463,7 +463,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ +- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } ++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ ++ { FAULT_FLAG_PAGE_SPLIT, "PAGESPLIT" } + + /* + * vm_fault is filled by the pagefault handler and passed to the vma's +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 6dfaf271ebf8..aa2d8d48ce3e 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -818,6 +818,8 @@ typedef struct { + * mapped R/O. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. ++ * @FAULT_FLAG_PAGE_SPLIT: The fault was due page size mismatch, split the ++ * region to smaller page size and retry. 
+ * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -855,6 +857,7 @@ enum fault_flag { + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, ++ FAULT_FLAG_PAGE_SPLIT = 1 << 12, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/mm/memory.c b/mm/memory.c +index 7274f2b52bca..c2187ffcbb8e 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4945,6 +4945,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + return 0; + } + ++static int handle_split_page_fault(struct vm_fault *vmf) ++{ ++ if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) ++ return VM_FAULT_SIGBUS; ++ ++ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); ++ return 0; ++} ++ + /* + * By the time we get here, we already hold the mm semaphore + * +@@ -5024,6 +5033,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } ++ ++ if (flags & FAULT_FLAG_PAGE_SPLIT) ++ return handle_split_page_fault(&vmf); ++ + if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); +-- +2.36.1 + + +From af381cc88410c0e2c48fda5732741edd0d7609ac Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:50:54 +0000 +Subject: [PATCH 10/90] x86/fault: Add support to dump RMP entry on fault + +When SEV-SNP is enabled globally, a write from the host goes through the +RMP check. If the hardware encounters the check failure, then it raises +the #PF (with RMP set). Dump the RMP entry at the faulting pfn to help +the debug. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev.h | 7 +++++++ + arch/x86/kernel/sev.c | 43 ++++++++++++++++++++++++++++++++++++++ + arch/x86/mm/fault.c | 17 +++++++++++---- + include/linux/sev.h | 2 ++ + 4 files changed, 65 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h +index 6ab872311544..c0c4df817159 100644 +--- a/arch/x86/include/asm/sev.h ++++ b/arch/x86/include/asm/sev.h +@@ -113,6 +113,11 @@ struct __packed rmpentry { + + #define rmpentry_assigned(x) ((x)->info.assigned) + #define rmpentry_pagesize(x) ((x)->info.pagesize) ++#define rmpentry_vmsa(x) ((x)->info.vmsa) ++#define rmpentry_asid(x) ((x)->info.asid) ++#define rmpentry_validated(x) ((x)->info.validated) ++#define rmpentry_gpa(x) ((unsigned long)(x)->info.gpa) ++#define rmpentry_immutable(x) ((x)->info.immutable) + + #define RMPADJUST_VMSA_PAGE_BIT BIT(16) + +@@ -205,6 +210,7 @@ void snp_set_wakeup_secondary_cpu(void); + bool snp_init(struct boot_params *bp); + void snp_abort(void); + int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); ++void dump_rmpentry(u64 pfn); + #else + static inline void sev_es_ist_enter(struct pt_regs *regs) { } + static inline void sev_es_ist_exit(void) { } +@@ -229,6 +235,7 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in + { + return -ENOTTY; + } ++static inline void dump_rmpentry(u64 pfn) {} + #endif + + #endif +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 734cddd837f5..6640a639fffc 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2414,6 +2414,49 @@ static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) + return entry; + } + ++void dump_rmpentry(u64 pfn) ++{ ++ unsigned long pfn_end; ++ struct rmpentry *e; ++ int level; ++ ++ e = __snp_lookup_rmpentry(pfn, &level); ++ if (!e) { ++ pr_alert("failed to read RMP entry pfn 0x%llx\n", pfn); ++ return; ++ 
} ++ ++ if (rmpentry_assigned(e)) { ++ pr_alert("RMPEntry paddr 0x%llx [assigned=%d immutable=%d pagesize=%d gpa=0x%lx" ++ " asid=%d vmsa=%d validated=%d]\n", pfn << PAGE_SHIFT, ++ rmpentry_assigned(e), rmpentry_immutable(e), rmpentry_pagesize(e), ++ rmpentry_gpa(e), rmpentry_asid(e), rmpentry_vmsa(e), ++ rmpentry_validated(e)); ++ return; ++ } ++ ++ /* ++ * If the RMP entry at the faulting pfn was not assigned, then we do not ++ * know what caused the RMP violation. To get some useful debug information, ++ * let iterate through the entire 2MB region, and dump the RMP entries if ++ * one of the bit in the RMP entry is set. ++ */ ++ pfn = pfn & ~(PTRS_PER_PMD - 1); ++ pfn_end = pfn + PTRS_PER_PMD; ++ ++ while (pfn < pfn_end) { ++ e = __snp_lookup_rmpentry(pfn, &level); ++ if (!e) ++ return; ++ ++ if (e->low || e->high) ++ pr_alert("RMPEntry paddr 0x%llx: [high=0x%016llx low=0x%016llx]\n", ++ pfn << PAGE_SHIFT, e->high, e->low); ++ pfn++; ++ } ++} ++EXPORT_SYMBOL_GPL(dump_rmpentry); ++ + /* + * Return 1 if the RMP entry is assigned, 0 if it exists but is not assigned, + * and -errno if there is no corresponding RMP entry. 
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index f5de9673093a..25896a6ba04a 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -34,6 +34,7 @@ + #include <asm/kvm_para.h> /* kvm_handle_async_pf */ + #include <asm/vdso.h> /* fixup_vdso_exception() */ + #include <asm/irq_stack.h> ++#include <asm/sev.h> /* dump_rmpentry() */ + + #define CREATE_TRACE_POINTS + #include <asm/trace/exceptions.h> +@@ -290,7 +291,7 @@ static bool low_pfn(unsigned long pfn) + return pfn < max_low_pfn; + } + +-static void dump_pagetable(unsigned long address) ++static void dump_pagetable(unsigned long address, bool show_rmpentry) + { + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = &base[pgd_index(address)]; +@@ -346,10 +347,11 @@ static int bad_address(void *p) + return get_kernel_nofault(dummy, (unsigned long *)p); + } + +-static void dump_pagetable(unsigned long address) ++static void dump_pagetable(unsigned long address, bool show_rmpentry) + { + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = base + pgd_index(address); ++ unsigned long pfn; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; +@@ -367,6 +369,7 @@ static void dump_pagetable(unsigned long address) + if (bad_address(p4d)) + goto bad; + ++ pfn = p4d_pfn(*p4d); + pr_cont("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; +@@ -375,6 +378,7 @@ static void dump_pagetable(unsigned long address) + if (bad_address(pud)) + goto bad; + ++ pfn = pud_pfn(*pud); + pr_cont("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto out; +@@ -383,6 +387,7 @@ static void dump_pagetable(unsigned long address) + if (bad_address(pmd)) + goto bad; + ++ pfn = pmd_pfn(*pmd); + pr_cont("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; +@@ -391,9 +396,13 @@ static void dump_pagetable(unsigned long address) + if (bad_address(pte)) + goto bad; + ++ pfn = pte_pfn(*pte); + pr_cont("PTE %lx", pte_val(*pte)); + out: + pr_cont("\n"); ++ ++ if 
(show_rmpentry) ++ dump_rmpentry(pfn); + return; + bad: + pr_info("BAD\n"); +@@ -579,7 +588,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad + show_ldttss(&gdt, "TR", tr); + } + +- dump_pagetable(address); ++ dump_pagetable(address, error_code & X86_PF_RMP); + } + + static noinline void +@@ -596,7 +605,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + tsk->comm, address); +- dump_pagetable(address); ++ dump_pagetable(address, false); + + if (__die("Bad pagetable", regs, error_code)) + sig = 0; +diff --git a/include/linux/sev.h b/include/linux/sev.h +index 1a68842789e1..734b13a69c54 100644 +--- a/include/linux/sev.h ++++ b/include/linux/sev.h +@@ -16,6 +16,7 @@ int snp_lookup_rmpentry(u64 pfn, int *level); + int psmash(u64 pfn); + int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable); + int rmp_make_shared(u64 pfn, enum pg_level level); ++void dump_rmpentry(u64 pfn); + #else + static inline int snp_lookup_rmpentry(u64 pfn, int *level) { return 0; } + static inline int psmash(u64 pfn) { return -ENXIO; } +@@ -25,6 +26,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int as + return -ENODEV; + } + static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } ++static inline void dump_rmpentry(u64 pfn) { } + + #endif /* CONFIG_AMD_MEM_ENCRYPT */ + #endif /* __LINUX_SEV_H */ +-- +2.36.1 + + +From f5dec307d246096768afd770d16a26be25fa28b3 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:54:46 +0000 +Subject: [PATCH 11/90] crypto:ccp: Define the SEV-SNP commands + +AMD introduced the next generation of SEV called SEV-SNP (Secure Nested +Paging). SEV-SNP builds upon existing SEV and SEV-ES functionality +while adding new hardware security protection. 
+ +Define the commands and structures used to communicate with the AMD-SP +when creating and managing the SEV-SNP guests. The SEV-SNP firmware spec +is available at developer.amd.com/sev. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 14 +++ + include/linux/psp-sev.h | 222 +++++++++++++++++++++++++++++++++++ + include/uapi/linux/psp-sev.h | 42 +++++++ + 3 files changed, 278 insertions(+) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index fd928199bf1e..9cb3265f3bef 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -153,6 +153,20 @@ static int sev_cmd_buffer_len(int cmd) + case SEV_CMD_GET_ID: return sizeof(struct sev_data_get_id); + case SEV_CMD_ATTESTATION_REPORT: return sizeof(struct sev_data_attestation_report); + case SEV_CMD_SEND_CANCEL: return sizeof(struct sev_data_send_cancel); ++ case SEV_CMD_SNP_GCTX_CREATE: return sizeof(struct sev_data_snp_gctx_create); ++ case SEV_CMD_SNP_LAUNCH_START: return sizeof(struct sev_data_snp_launch_start); ++ case SEV_CMD_SNP_LAUNCH_UPDATE: return sizeof(struct sev_data_snp_launch_update); ++ case SEV_CMD_SNP_ACTIVATE: return sizeof(struct sev_data_snp_activate); ++ case SEV_CMD_SNP_DECOMMISSION: return sizeof(struct sev_data_snp_decommission); ++ case SEV_CMD_SNP_PAGE_RECLAIM: return sizeof(struct sev_data_snp_page_reclaim); ++ case SEV_CMD_SNP_GUEST_STATUS: return sizeof(struct sev_data_snp_guest_status); ++ case SEV_CMD_SNP_LAUNCH_FINISH: return sizeof(struct sev_data_snp_launch_finish); ++ case SEV_CMD_SNP_DBG_DECRYPT: return sizeof(struct sev_data_snp_dbg); ++ case SEV_CMD_SNP_DBG_ENCRYPT: return sizeof(struct sev_data_snp_dbg); ++ case SEV_CMD_SNP_PAGE_UNSMASH: return sizeof(struct sev_data_snp_page_unsmash); ++ case SEV_CMD_SNP_PLATFORM_STATUS: return sizeof(struct sev_data_snp_platform_status_buf); ++ case SEV_CMD_SNP_GUEST_REQUEST: return sizeof(struct sev_data_snp_guest_request); ++ case 
SEV_CMD_SNP_CONFIG: return sizeof(struct sev_user_data_snp_config); + default: return 0; + } + +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index 1595088c428b..01ba9dc46ca3 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -86,6 +86,34 @@ enum sev_cmd { + SEV_CMD_DBG_DECRYPT = 0x060, + SEV_CMD_DBG_ENCRYPT = 0x061, + ++ /* SNP specific commands */ ++ SEV_CMD_SNP_INIT = 0x81, ++ SEV_CMD_SNP_SHUTDOWN = 0x82, ++ SEV_CMD_SNP_PLATFORM_STATUS = 0x83, ++ SEV_CMD_SNP_DF_FLUSH = 0x84, ++ SEV_CMD_SNP_INIT_EX = 0x85, ++ SEV_CMD_SNP_DECOMMISSION = 0x90, ++ SEV_CMD_SNP_ACTIVATE = 0x91, ++ SEV_CMD_SNP_GUEST_STATUS = 0x92, ++ SEV_CMD_SNP_GCTX_CREATE = 0x93, ++ SEV_CMD_SNP_GUEST_REQUEST = 0x94, ++ SEV_CMD_SNP_ACTIVATE_EX = 0x95, ++ SEV_CMD_SNP_LAUNCH_START = 0xA0, ++ SEV_CMD_SNP_LAUNCH_UPDATE = 0xA1, ++ SEV_CMD_SNP_LAUNCH_FINISH = 0xA2, ++ SEV_CMD_SNP_DBG_DECRYPT = 0xB0, ++ SEV_CMD_SNP_DBG_ENCRYPT = 0xB1, ++ SEV_CMD_SNP_PAGE_SWAP_OUT = 0xC0, ++ SEV_CMD_SNP_PAGE_SWAP_IN = 0xC1, ++ SEV_CMD_SNP_PAGE_MOVE = 0xC2, ++ SEV_CMD_SNP_PAGE_MD_INIT = 0xC3, ++ SEV_CMD_SNP_PAGE_MD_RECLAIM = 0xC4, ++ SEV_CMD_SNP_PAGE_RO_RECLAIM = 0xC5, ++ SEV_CMD_SNP_PAGE_RO_RESTORE = 0xC6, ++ SEV_CMD_SNP_PAGE_RECLAIM = 0xC7, ++ SEV_CMD_SNP_PAGE_UNSMASH = 0xC8, ++ SEV_CMD_SNP_CONFIG = 0xC9, ++ + SEV_CMD_MAX, + }; + +@@ -531,6 +559,200 @@ struct sev_data_attestation_report { + u32 len; /* In/Out */ + } __packed; + ++/** ++ * struct sev_data_snp_platform_status_buf - SNP_PLATFORM_STATUS command params ++ * ++ * @address: physical address where the status should be copied ++ */ ++struct sev_data_snp_platform_status_buf { ++ u64 status_paddr; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_download_firmware - SNP_DOWNLOAD_FIRMWARE command params ++ * ++ * @address: physical address of firmware image ++ * @len: len of the firmware image ++ */ ++struct sev_data_snp_download_firmware { ++ u64 address; /* In */ ++ u32 len; /* In */ ++} __packed; ++ ++/** ++ * struct 
sev_data_snp_gctx_create - SNP_GCTX_CREATE command params ++ * ++ * @gctx_paddr: system physical address of the page donated to firmware by ++ * the hypervisor to contain the guest context. ++ */ ++struct sev_data_snp_gctx_create { ++ u64 gctx_paddr; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_activate - SNP_ACTIVATE command params ++ * ++ * @gctx_paddr: system physical address guest context page ++ * @asid: ASID to bind to the guest ++ */ ++struct sev_data_snp_activate { ++ u64 gctx_paddr; /* In */ ++ u32 asid; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_decommission - SNP_DECOMMISSION command params ++ * ++ * @address: system physical address guest context page ++ */ ++struct sev_data_snp_decommission { ++ u64 gctx_paddr; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_launch_start - SNP_LAUNCH_START command params ++ * ++ * @gctx_addr: system physical address of guest context page ++ * @policy: guest policy ++ * @ma_gctx_addr: system physical address of migration agent ++ * @imi_en: launch flow is launching an IMI for the purpose of ++ * guest-assisted migration. 
++ * @ma_en: the guest is associated with a migration agent ++ */ ++struct sev_data_snp_launch_start { ++ u64 gctx_paddr; /* In */ ++ u64 policy; /* In */ ++ u64 ma_gctx_paddr; /* In */ ++ u32 ma_en:1; /* In */ ++ u32 imi_en:1; /* In */ ++ u32 rsvd:30; ++ u8 gosvw[16]; /* In */ ++} __packed; ++ ++/* SNP support page type */ ++enum { ++ SNP_PAGE_TYPE_NORMAL = 0x1, ++ SNP_PAGE_TYPE_VMSA = 0x2, ++ SNP_PAGE_TYPE_ZERO = 0x3, ++ SNP_PAGE_TYPE_UNMEASURED = 0x4, ++ SNP_PAGE_TYPE_SECRET = 0x5, ++ SNP_PAGE_TYPE_CPUID = 0x6, ++ ++ SNP_PAGE_TYPE_MAX ++}; ++ ++/** ++ * struct sev_data_snp_launch_update - SNP_LAUNCH_UPDATE command params ++ * ++ * @gctx_paddr: system physical address of guest context page ++ * @imi_page: indicates that this page is part of the IMI of the guest ++ * @page_type: encoded page type ++ * @page_size: page size 0 indicates 4K and 1 indicates 2MB page ++ * @address: system physical address of destination page to encrypt ++ * @vmpl1_perms: VMPL permission mask for VMPL1 ++ * @vmpl2_perms: VMPL permission mask for VMPL2 ++ * @vmpl3_perms: VMPL permission mask for VMPL3 ++ */ ++struct sev_data_snp_launch_update { ++ u64 gctx_paddr; /* In */ ++ u32 page_size:1; /* In */ ++ u32 page_type:3; /* In */ ++ u32 imi_page:1; /* In */ ++ u32 rsvd:27; ++ u32 rsvd2; ++ u64 address; /* In */ ++ u32 rsvd3:8; ++ u32 vmpl1_perms:8; /* In */ ++ u32 vmpl2_perms:8; /* In */ ++ u32 vmpl3_perms:8; /* In */ ++ u32 rsvd4; ++} __packed; ++ ++/** ++ * struct sev_data_snp_launch_finish - SNP_LAUNCH_FINISH command params ++ * ++ * @gctx_paddr: system physical address of guest context page ++ */ ++struct sev_data_snp_launch_finish { ++ u64 gctx_paddr; ++ u64 id_block_paddr; ++ u64 id_auth_paddr; ++ u8 id_block_en:1; ++ u8 auth_key_en:1; ++ u64 rsvd:62; ++ u8 host_data[32]; ++} __packed; ++ ++/** ++ * struct sev_data_snp_guest_status - SNP_GUEST_STATUS command params ++ * ++ * @gctx_paddr: system physical address of guest context page ++ * @address: system physical address of guest 
status page ++ */ ++struct sev_data_snp_guest_status { ++ u64 gctx_paddr; ++ u64 address; ++} __packed; ++ ++/** ++ * struct sev_data_snp_page_reclaim - SNP_PAGE_RECLAIM command params ++ * ++ * @paddr: system physical address of page to be reclaimed. BIT0 indicates ++ * the page size. 0h indicates 4 kB and 1h indicates 2 MB page. ++ */ ++struct sev_data_snp_page_reclaim { ++ u64 paddr; ++} __packed; ++ ++/** ++ * struct sev_data_snp_page_unsmash - SNP_PAGE_UNSMASH command params ++ * ++ * @paddr: system physical address of page to be unsmashed. BIT0 indicates ++ * the page size. 0h indicates 4 kB and 1h indicates 2 MB page. ++ */ ++struct sev_data_snp_page_unsmash { ++ u64 paddr; ++} __packed; ++ ++/** ++ * struct sev_data_snp_dbg - SNP_DBG_ENCRYPT/SNP_DBG_DECRYPT command parameters ++ * ++ * @gctx_paddr: system physical address of guest context page ++ * @src_addr: source address of data to operate on ++ * @dst_addr: destination address of data to operate on ++ * @len: len of data to operate on ++ */ ++struct sev_data_snp_dbg { ++ u64 gctx_paddr; /* In */ ++ u64 src_addr; /* In */ ++ u64 dst_addr; /* In */ ++ u32 len; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_guest_request - SNP_GUEST_REQUEST command params ++ * ++ * @gctx_paddr: system physical address of guest context page ++ * @req_paddr: system physical address of request page ++ * @res_paddr: system physical address of response page ++ */ ++struct sev_data_snp_guest_request { ++ u64 gctx_paddr; /* In */ ++ u64 req_paddr; /* In */ ++ u64 res_paddr; /* In */ ++} __packed; ++ ++/** ++ * struct sev_data_snp_init_ex - SNP_INIT_EX structure ++ * ++ * @init_rmp: indicate that the RMP should be initialized. 
++ */ ++struct sev_data_snp_init_ex { ++ u32 init_rmp:1; ++ u32 rsvd:31; ++ u8 rsvd1[60]; ++} __packed; ++ + #ifdef CONFIG_CRYPTO_DEV_SP_PSP + + /** +diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h +index 91b4c63d5cbf..bed65a891223 100644 +--- a/include/uapi/linux/psp-sev.h ++++ b/include/uapi/linux/psp-sev.h +@@ -61,6 +61,13 @@ typedef enum { + SEV_RET_INVALID_PARAM, + SEV_RET_RESOURCE_LIMIT, + SEV_RET_SECURE_DATA_INVALID, ++ SEV_RET_INVALID_PAGE_SIZE, ++ SEV_RET_INVALID_PAGE_STATE, ++ SEV_RET_INVALID_MDATA_ENTRY, ++ SEV_RET_INVALID_PAGE_OWNER, ++ SEV_RET_INVALID_PAGE_AEAD_OFLOW, ++ SEV_RET_RMP_INIT_REQUIRED, ++ + SEV_RET_MAX, + } sev_ret_code; + +@@ -147,6 +154,41 @@ struct sev_user_data_get_id2 { + __u32 length; /* In/Out */ + } __packed; + ++/** ++ * struct sev_user_data_snp_status - SNP status ++ * ++ * @api_major: API major version ++ * @api_minor: API minor version ++ * @state: current platform state ++ * @build_id: firmware build id for the API version ++ * @guest_count: the number of guests currently managed by the firmware ++ * @tcb_version: current TCB version ++ */ ++struct sev_user_data_snp_status { ++ __u8 api_major; /* Out */ ++ __u8 api_minor; /* Out */ ++ __u8 state; /* Out */ ++ __u8 rsvd; ++ __u32 build_id; /* Out */ ++ __u32 rsvd1; ++ __u32 guest_count; /* Out */ ++ __u64 tcb_version; /* Out */ ++ __u64 rsvd2; ++} __packed; ++ ++/* ++ * struct sev_user_data_snp_config - system wide configuration value for SNP. ++ * ++ * @reported_tcb: The TCB version to report in the guest attestation report. ++ * @mask_chip_id: Indicates that the CHIP_ID field in the attestation report ++ * will always be zero. 
++ */ ++struct sev_user_data_snp_config { ++ __u64 reported_tcb; /* In */ ++ __u32 mask_chip_id; /* In */ ++ __u8 rsvd[52]; ++} __packed; ++ + /** + * struct sev_issue_cmd - SEV ioctl parameters + * +-- +2.36.1 + + +From b6338036ed8245f13be219c958d95aec290b16c6 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:57:14 +0000 +Subject: [PATCH 12/90] crypto: ccp: Add support to initialize the AMD-SP for + SEV-SNP + +Before SNP VMs can be launched, the platform must be appropriately +configured and initialized. Platform initialization is accomplished via +the SNP_INIT command. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 118 +++++++++++++++++++++++++++++++++++ + drivers/crypto/ccp/sev-dev.h | 2 + + include/linux/psp-sev.h | 16 +++++ + 3 files changed, 136 insertions(+) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index 9cb3265f3bef..8a26735cefb6 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -33,6 +33,10 @@ + #define SEV_FW_FILE "amd/sev.fw" + #define SEV_FW_NAME_SIZE 64 + ++/* Minimum firmware version required for the SEV-SNP support */ ++#define SNP_MIN_API_MAJOR 1 ++#define SNP_MIN_API_MINOR 51 ++ + static DEFINE_MUTEX(sev_cmd_mutex); + static struct sev_misc_dev *misc_dev; + +@@ -775,6 +779,95 @@ static int sev_update_firmware(struct device *dev) + return ret; + } + ++static void snp_set_hsave_pa(void *arg) ++{ ++ wrmsrl(MSR_VM_HSAVE_PA, 0); ++} ++ ++static int __sev_snp_init_locked(int *error) ++{ ++ struct psp_device *psp = psp_master; ++ struct sev_device *sev; ++ int rc = 0; ++ ++ if (!psp || !psp->sev_data) ++ return -ENODEV; ++ ++ sev = psp->sev_data; ++ ++ if (sev->snp_inited) ++ return 0; ++ ++ /* ++ * The SNP_INIT requires the MSR_VM_HSAVE_PA must be set to 0h ++ * across all cores. 
++ */ ++ on_each_cpu(snp_set_hsave_pa, NULL, 1); ++ ++ /* Prepare for first SEV guest launch after INIT */ ++ wbinvd_on_all_cpus(); ++ ++ /* Issue the SNP_INIT firmware command. */ ++ rc = __sev_do_cmd_locked(SEV_CMD_SNP_INIT, NULL, error); ++ if (rc) ++ return rc; ++ ++ sev->snp_inited = true; ++ dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); ++ ++ return rc; ++} ++ ++int sev_snp_init(int *error) ++{ ++ int rc; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return -ENODEV; ++ ++ mutex_lock(&sev_cmd_mutex); ++ rc = __sev_snp_init_locked(error); ++ mutex_unlock(&sev_cmd_mutex); ++ ++ return rc; ++} ++EXPORT_SYMBOL_GPL(sev_snp_init); ++ ++static int __sev_snp_shutdown_locked(int *error) ++{ ++ struct sev_device *sev = psp_master->sev_data; ++ int ret; ++ ++ if (!sev->snp_inited) ++ return 0; ++ ++ /* SHUTDOWN requires the DF_FLUSH */ ++ wbinvd_on_all_cpus(); ++ __sev_do_cmd_locked(SEV_CMD_SNP_DF_FLUSH, NULL, NULL); ++ ++ ret = __sev_do_cmd_locked(SEV_CMD_SNP_SHUTDOWN, NULL, error); ++ if (ret) { ++ dev_err(sev->dev, "SEV-SNP firmware shutdown failed\n"); ++ return ret; ++ } ++ ++ sev->snp_inited = false; ++ dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n"); ++ ++ return ret; ++} ++ ++static int sev_snp_shutdown(int *error) ++{ ++ int rc; ++ ++ mutex_lock(&sev_cmd_mutex); ++ rc = __sev_snp_shutdown_locked(NULL); ++ mutex_unlock(&sev_cmd_mutex); ++ ++ return rc; ++} ++ + static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable) + { + struct sev_device *sev = psp_master->sev_data; +@@ -1231,6 +1324,8 @@ static void sev_firmware_shutdown(struct sev_device *sev) + get_order(NV_LENGTH)); + sev_init_ex_buffer = NULL; + } ++ ++ sev_snp_shutdown(NULL); + } + + void sev_dev_destroy(struct psp_device *psp) +@@ -1287,6 +1382,26 @@ void sev_pci_init(void) + } + } + ++ /* ++ * If boot CPU supports the SNP, then first attempt to initialize ++ * the SNP firmware. 
++ */ ++ if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) { ++ if (!sev_version_greater_or_equal(SNP_MIN_API_MAJOR, SNP_MIN_API_MINOR)) { ++ dev_err(sev->dev, "SEV-SNP support requires firmware version >= %d:%d\n", ++ SNP_MIN_API_MAJOR, SNP_MIN_API_MINOR); ++ } else { ++ rc = sev_snp_init(&error); ++ if (rc) { ++ /* ++ * If we failed to INIT SNP then don't abort the probe. ++ * Continue to initialize the legacy SEV firmware. ++ */ ++ dev_err(sev->dev, "SEV-SNP: failed to INIT error %#x\n", error); ++ } ++ } ++ } ++ + /* Obtain the TMR memory area for SEV-ES use */ + sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); + if (!sev_es_tmr) +@@ -1302,6 +1417,9 @@ void sev_pci_init(void) + dev_err(sev->dev, "SEV: failed to INIT error %#x, rc %d\n", + error, rc); + ++ dev_info(sev->dev, "SEV%s API:%d.%d build:%d\n", sev->snp_inited ? ++ "-SNP" : "", sev->api_major, sev->api_minor, sev->build); ++ + return; + + err: +diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h +index 666c21eb81ab..186ad20cbd24 100644 +--- a/drivers/crypto/ccp/sev-dev.h ++++ b/drivers/crypto/ccp/sev-dev.h +@@ -52,6 +52,8 @@ struct sev_device { + u8 build; + + void *cmd_buf; ++ ++ bool snp_inited; + }; + + int sev_dev_init(struct psp_device *psp); +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index 01ba9dc46ca3..ef4d42e8c96e 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -769,6 +769,20 @@ struct sev_data_snp_init_ex { + */ + int sev_platform_init(int *error); + ++/** ++ * sev_snp_init - perform SEV SNP_INIT command ++ * ++ * @error: SEV command return code ++ * ++ * Returns: ++ * 0 if the SEV successfully processed the command ++ * -%ENODEV if the SEV device is not available ++ * -%ENOTSUPP if the SEV does not support SEV ++ * -%ETIMEDOUT if the SEV command timed out ++ * -%EIO if the SEV returned a non-zero return code ++ */ ++int sev_snp_init(int *error); ++ + /** + * sev_platform_status - perform SEV PLATFORM_STATUS command + * +@@ -876,6 
+890,8 @@ sev_platform_status(struct sev_user_data_status *status, int *error) { return -E + + static inline int sev_platform_init(int *error) { return -ENODEV; } + ++static inline int sev_snp_init(int *error) { return -ENODEV; } ++ + static inline int + sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENODEV; } + +-- +2.36.1 + + +From 804d7336e9fe02000538a93f127a68db53838642 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 17:58:50 +0000 +Subject: [PATCH 13/90] crypto:ccp: Provide APIs to issue SEV-SNP commands + +Provide the APIs for the hypervisor to manage an SEV-SNP guest. The +commands for SEV-SNP is defined in the SEV-SNP firmware specification. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 24 ++++++++++++ + include/linux/psp-sev.h | 73 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 97 insertions(+) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index 8a26735cefb6..056fd04074e4 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -1202,6 +1202,30 @@ int sev_guest_df_flush(int *error) + } + EXPORT_SYMBOL_GPL(sev_guest_df_flush); + ++int snp_guest_decommission(struct sev_data_snp_decommission *data, int *error) ++{ ++ return sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, data, error); ++} ++EXPORT_SYMBOL_GPL(snp_guest_decommission); ++ ++int snp_guest_df_flush(int *error) ++{ ++ return sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, error); ++} ++EXPORT_SYMBOL_GPL(snp_guest_df_flush); ++ ++int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error) ++{ ++ return sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, data, error); ++} ++EXPORT_SYMBOL_GPL(snp_guest_page_reclaim); ++ ++int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error) ++{ ++ return sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, data, error); ++} ++EXPORT_SYMBOL_GPL(snp_guest_dbg_decrypt); ++ + static void 
sev_exit(struct kref *ref) + { + misc_deregister(&misc_dev->misc); +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index ef4d42e8c96e..9f921d221b75 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -881,6 +881,64 @@ int sev_guest_df_flush(int *error); + */ + int sev_guest_decommission(struct sev_data_decommission *data, int *error); + ++/** ++ * snp_guest_df_flush - perform SNP DF_FLUSH command ++ * ++ * @sev_ret: sev command return code ++ * ++ * Returns: ++ * 0 if the sev successfully processed the command ++ * -%ENODEV if the sev device is not available ++ * -%ENOTSUPP if the sev does not support SEV ++ * -%ETIMEDOUT if the sev command timed out ++ * -%EIO if the sev returned a non-zero return code ++ */ ++int snp_guest_df_flush(int *error); ++ ++/** ++ * snp_guest_decommission - perform SNP_DECOMMISSION command ++ * ++ * @decommission: sev_data_decommission structure to be processed ++ * @sev_ret: sev command return code ++ * ++ * Returns: ++ * 0 if the sev successfully processed the command ++ * -%ENODEV if the sev device is not available ++ * -%ENOTSUPP if the sev does not support SEV ++ * -%ETIMEDOUT if the sev command timed out ++ * -%EIO if the sev returned a non-zero return code ++ */ ++int snp_guest_decommission(struct sev_data_snp_decommission *data, int *error); ++ ++/** ++ * snp_guest_page_reclaim - perform SNP_PAGE_RECLAIM command ++ * ++ * @decommission: sev_snp_page_reclaim structure to be processed ++ * @sev_ret: sev command return code ++ * ++ * Returns: ++ * 0 if the sev successfully processed the command ++ * -%ENODEV if the sev device is not available ++ * -%ENOTSUPP if the sev does not support SEV ++ * -%ETIMEDOUT if the sev command timed out ++ * -%EIO if the sev returned a non-zero return code ++ */ ++int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); ++ ++/** ++ * snp_guest_dbg_decrypt - perform SEV SNP_DBG_DECRYPT command ++ * ++ * @sev_ret: sev command return code ++ * 
++ * Returns: ++ * 0 if the sev successfully processed the command ++ * -%ENODEV if the sev device is not available ++ * -%ENOTSUPP if the sev does not support SEV ++ * -%ETIMEDOUT if the sev command timed out ++ * -%EIO if the sev returned a non-zero return code ++ */ ++int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); ++ + void *psp_copy_user_blob(u64 uaddr, u32 len); + + #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ +@@ -908,6 +966,21 @@ sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int + + static inline void *psp_copy_user_blob(u64 __user uaddr, u32 len) { return ERR_PTR(-EINVAL); } + ++static inline int ++snp_guest_decommission(struct sev_data_snp_decommission *data, int *error) { return -ENODEV; } ++ ++static inline int snp_guest_df_flush(int *error) { return -ENODEV; } ++ ++static inline int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error) ++{ ++ return -ENODEV; ++} ++ ++static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error) ++{ ++ return -ENODEV; ++} ++ + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ + + #endif /* __PSP_SEV_H__ */ +-- +2.36.1 + + +From b0f1e20b35b2c5ff9bbc92d5eb6e26fd8b5790f1 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:01:31 +0000 +Subject: [PATCH 14/90] crypto: ccp: Handle the legacy TMR allocation when SNP + is enabled + +The behavior and requirement for the SEV-legacy command is altered when +the SNP firmware is in the INIT state. See SEV-SNP firmware specification +for more details. + +Allocate the Trusted Memory Region (TMR) as a 2mb sized/aligned region +when SNP is enabled to satisfy new requirements for the SNP. Continue +allocating a 1mb region for !SNP configuration. + +While at it, provide an API that can be used by others to allocate a page +that can be used by the firmware. The immediate user for this API will +be the KVM driver.
The KVM driver needs to allocate a firmware context +page during guest creation. The context page needs to be updated +by the firmware. See the SEV-SNP specification for further details. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 173 +++++++++++++++++++++++++++++++++-- + include/linux/psp-sev.h | 11 +++ + 2 files changed, 178 insertions(+), 6 deletions(-) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index 056fd04074e4..4a7ba1872140 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -79,6 +79,14 @@ static void *sev_es_tmr; + #define NV_LENGTH (32 * 1024) + static void *sev_init_ex_buffer; + ++/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */ ++#define SEV_SNP_ES_TMR_SIZE (2 * 1024 * 1024) ++ ++static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE; ++ ++static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret); ++static int sev_do_cmd(int cmd, void *data, int *psp_ret); ++ + static inline bool sev_version_greater_or_equal(u8 maj, u8 min) + { + struct sev_device *sev = psp_master->sev_data; +@@ -177,11 +185,161 @@ static int sev_cmd_buffer_len(int cmd) + return 0; + } + ++static void snp_leak_pages(unsigned long pfn, unsigned int npages) ++{ ++ WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); ++ while (npages--) { ++ memory_failure(pfn, 0); ++ dump_rmpentry(pfn); ++ pfn++; ++ } ++} ++ ++static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) ++{ ++ struct sev_data_snp_page_reclaim data; ++ int ret, err, i, n = 0; ++ ++ for (i = 0; i < npages; i++) { ++ memset(&data, 0, sizeof(data)); ++ data.paddr = pfn << PAGE_SHIFT; ++ ++ if (locked) ++ ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); ++ else ++ ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); ++ if (ret) ++ goto cleanup; ++ ++ ret = rmp_make_shared(pfn, PG_LEVEL_4K); ++ if (ret) ++ goto cleanup; ++ ++ 
pfn++; ++ n++; ++ } ++ ++ return 0; ++ ++cleanup: ++ /* ++ * If failed to reclaim the page then page is no longer safe to ++ * be released, leak it. ++ */ ++ snp_leak_pages(pfn, npages - n); ++ return ret; ++} ++ ++static inline int rmp_make_firmware(unsigned long pfn, int level) ++{ ++ return rmp_make_private(pfn, 0, level, 0, true); ++} ++ ++static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, ++ bool need_reclaim) ++{ ++ unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ ++ int rc, n = 0, i; ++ ++ for (i = 0; i < npages; i++) { ++ if (to_fw) ++ rc = rmp_make_firmware(pfn, PG_LEVEL_4K); ++ else ++ rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) : ++ rmp_make_shared(pfn, PG_LEVEL_4K); ++ if (rc) ++ goto cleanup; ++ ++ pfn++; ++ n++; ++ } ++ ++ return 0; ++ ++cleanup: ++ /* Try unrolling the firmware state changes */ ++ if (to_fw) { ++ /* ++ * Reclaim the pages which were already changed to the ++ * firmware state. ++ */ ++ snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); ++ ++ return rc; ++ } ++ ++ /* ++ * If failed to change the page state to shared, then its not safe ++ * to release the page back to the system, leak it. ++ */ ++ snp_leak_pages(pfn, npages - n); ++ ++ return rc; ++} ++ ++static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked) ++{ ++ unsigned long npages = 1ul << order, paddr; ++ struct sev_device *sev; ++ struct page *page; ++ ++ if (!psp_master || !psp_master->sev_data) ++ return NULL; ++ ++ page = alloc_pages(gfp_mask, order); ++ if (!page) ++ return NULL; ++ ++ /* If SEV-SNP is initialized then add the page in RMP table. 
*/ ++ sev = psp_master->sev_data; ++ if (!sev->snp_inited) ++ return page; ++ ++ paddr = __pa((unsigned long)page_address(page)); ++ if (snp_set_rmp_state(paddr, npages, true, locked, false)) ++ return NULL; ++ ++ return page; ++} ++ ++void *snp_alloc_firmware_page(gfp_t gfp_mask) ++{ ++ struct page *page; ++ ++ page = __snp_alloc_firmware_pages(gfp_mask, 0, false); ++ ++ return page ? page_address(page) : NULL; ++} ++EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); ++ ++static void __snp_free_firmware_pages(struct page *page, int order, bool locked) ++{ ++ unsigned long paddr, npages = 1ul << order; ++ ++ if (!page) ++ return; ++ ++ paddr = __pa((unsigned long)page_address(page)); ++ if (snp_set_rmp_state(paddr, npages, false, locked, true)) ++ return; ++ ++ __free_pages(page, order); ++} ++ ++void snp_free_firmware_page(void *addr) ++{ ++ if (!addr) ++ return; ++ ++ __snp_free_firmware_pages(virt_to_page(addr), 0, false); ++} ++EXPORT_SYMBOL(snp_free_firmware_page); ++ + static void *sev_fw_alloc(unsigned long len) + { + struct page *page; + +- page = alloc_pages(GFP_KERNEL, get_order(len)); ++ page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false); + if (!page) + return NULL; + +@@ -393,7 +551,7 @@ static int __sev_init_locked(int *error) + data.tmr_address = __pa(sev_es_tmr); + + data.flags |= SEV_INIT_FLAGS_SEV_ES; +- data.tmr_len = SEV_ES_TMR_SIZE; ++ data.tmr_len = sev_es_tmr_size; + } + + return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error); +@@ -421,7 +579,7 @@ static int __sev_init_ex_locked(int *error) + data.tmr_address = __pa(sev_es_tmr); + + data.flags |= SEV_INIT_FLAGS_SEV_ES; +- data.tmr_len = SEV_ES_TMR_SIZE; ++ data.tmr_len = sev_es_tmr_size; + } + + return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error); +@@ -815,6 +973,8 @@ static int __sev_snp_init_locked(int *error) + sev->snp_inited = true; + dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); + ++ sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE; ++ + return rc; + } + +@@ -1338,8 +1498,9 
@@ static void sev_firmware_shutdown(struct sev_device *sev) + /* The TMR area was encrypted, flush it from the cache */ + wbinvd_on_all_cpus(); + +- free_pages((unsigned long)sev_es_tmr, +- get_order(SEV_ES_TMR_SIZE)); ++ __snp_free_firmware_pages(virt_to_page(sev_es_tmr), ++ get_order(sev_es_tmr_size), ++ false); + sev_es_tmr = NULL; + } + +@@ -1427,7 +1588,7 @@ void sev_pci_init(void) + } + + /* Obtain the TMR memory area for SEV-ES use */ +- sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); ++ sev_es_tmr = sev_fw_alloc(sev_es_tmr_size); + if (!sev_es_tmr) + dev_warn(sev->dev, + "SEV: TMR allocation failed, SEV-ES support unavailable\n"); +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index 9f921d221b75..a3bb792bb842 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -12,6 +12,8 @@ + #ifndef __PSP_SEV_H__ + #define __PSP_SEV_H__ + ++#include <linux/sev.h> ++ + #include <uapi/linux/psp-sev.h> + + #ifdef CONFIG_X86 +@@ -940,6 +942,8 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); + int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); + + void *psp_copy_user_blob(u64 uaddr, u32 len); ++void *snp_alloc_firmware_page(gfp_t mask); ++void snp_free_firmware_page(void *addr); + + #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ + +@@ -981,6 +985,13 @@ static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro + return -ENODEV; + } + ++static inline void *snp_alloc_firmware_page(gfp_t mask) ++{ ++ return NULL; ++} ++ ++static inline void snp_free_firmware_page(void *addr) { } ++ + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ + + #endif /* __PSP_SEV_H__ */ +-- +2.36.1 + + +From 4eada0dfd9dd20946c6db4d6f1626ff757137642 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:03:10 +0000 +Subject: [PATCH 15/90] crypto: ccp: Handle the legacy SEV command when SNP is + enabled + +The behavior of the SEV-legacy commands is altered when the SNP 
firmware +is in the INIT state. When SNP is in INIT state, any memory that a +SEV-legacy command causes the firmware to write to must be in the +firmware state before issuing the command. + +A command buffer may contain a system physical address that the firmware +may write to. There are two cases that need to be handled: + +1) system physical address points to a guest memory +2) system physical address points to a host memory + +To handle the case #1, change the page state to the firmware in the RMP +table before issuing the command and restore the state to shared after the +command completes. + +For the case #2, use a bounce buffer to complete the request. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 346 ++++++++++++++++++++++++++++++++++- + drivers/crypto/ccp/sev-dev.h | 12 ++ + 2 files changed, 348 insertions(+), 10 deletions(-) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index 4a7ba1872140..a744ec3f9571 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -441,12 +441,295 @@ static void sev_write_init_ex_file_if_required(int cmd_id) + sev_write_init_ex_file(); + } + ++static int alloc_snp_host_map(struct sev_device *sev) ++{ ++ struct page *page; ++ int i; ++ ++ for (i = 0; i < MAX_SNP_HOST_MAP_BUFS; i++) { ++ struct snp_host_map *map = &sev->snp_host_map[i]; ++ ++ memset(map, 0, sizeof(*map)); ++ ++ page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(SEV_FW_BLOB_MAX_SIZE)); ++ if (!page) ++ return -ENOMEM; ++ ++ map->host = page_address(page); ++ } ++ ++ return 0; ++} ++ ++static void free_snp_host_map(struct sev_device *sev) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_SNP_HOST_MAP_BUFS; i++) { ++ struct snp_host_map *map = &sev->snp_host_map[i]; ++ ++ if (map->host) { ++ __free_pages(virt_to_page(map->host), get_order(SEV_FW_BLOB_MAX_SIZE)); ++ memset(map, 0, sizeof(*map)); ++ } ++ } ++} ++ ++static int map_firmware_writeable(u64 *paddr, u32 len, bool 
guest, struct snp_host_map *map) ++{ ++ unsigned int npages = PAGE_ALIGN(len) >> PAGE_SHIFT; ++ ++ map->active = false; ++ ++ if (!paddr || !len) ++ return 0; ++ ++ map->paddr = *paddr; ++ map->len = len; ++ ++ /* If paddr points to a guest memory then change the page state to firmwware. */ ++ if (guest) { ++ if (snp_set_rmp_state(*paddr, npages, true, true, false)) ++ return -EFAULT; ++ ++ goto done; ++ } ++ ++ if (!map->host) ++ return -ENOMEM; ++ ++ /* Check if the pre-allocated buffer can be used to fullfil the request. */ ++ if (len > SEV_FW_BLOB_MAX_SIZE) ++ return -EINVAL; ++ ++ /* Transition the pre-allocated buffer to the firmware state. */ ++ if (snp_set_rmp_state(__pa(map->host), npages, true, true, false)) ++ return -EFAULT; ++ ++ /* Set the paddr to use pre-allocated firmware buffer */ ++ *paddr = __psp_pa(map->host); ++ ++done: ++ map->active = true; ++ return 0; ++} ++ ++static int unmap_firmware_writeable(u64 *paddr, u32 len, bool guest, struct snp_host_map *map) ++{ ++ unsigned int npages = PAGE_ALIGN(len) >> PAGE_SHIFT; ++ ++ if (!map->active) ++ return 0; ++ ++ /* If paddr points to a guest memory then restore the page state to hypervisor. */ ++ if (guest) { ++ if (snp_set_rmp_state(*paddr, npages, false, true, true)) ++ return -EFAULT; ++ ++ goto done; ++ } ++ ++ /* ++ * Transition the pre-allocated buffer to hypervisor state before the access. ++ * ++ * This is because while changing the page state to firmware, the kernel unmaps ++ * the pages from the direct map, and to restore the direct map we must ++ * transition the pages to shared state. ++ */ ++ if (snp_set_rmp_state(__pa(map->host), npages, false, true, true)) ++ return -EFAULT; ++ ++ /* Copy the response data firmware buffer to the callers buffer. 
*/ ++ memcpy(__va(__sme_clr(map->paddr)), map->host, min_t(size_t, len, map->len)); ++ *paddr = map->paddr; ++ ++done: ++ map->active = false; ++ return 0; ++} ++ ++static bool sev_legacy_cmd_buf_writable(int cmd) ++{ ++ switch (cmd) { ++ case SEV_CMD_PLATFORM_STATUS: ++ case SEV_CMD_GUEST_STATUS: ++ case SEV_CMD_LAUNCH_START: ++ case SEV_CMD_RECEIVE_START: ++ case SEV_CMD_LAUNCH_MEASURE: ++ case SEV_CMD_SEND_START: ++ case SEV_CMD_SEND_UPDATE_DATA: ++ case SEV_CMD_SEND_UPDATE_VMSA: ++ case SEV_CMD_PEK_CSR: ++ case SEV_CMD_PDH_CERT_EXPORT: ++ case SEV_CMD_GET_ID: ++ case SEV_CMD_ATTESTATION_REPORT: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++#define prep_buffer(name, addr, len, guest, map) \ ++ func(&((typeof(name *))cmd_buf)->addr, ((typeof(name *))cmd_buf)->len, guest, map) ++ ++static int __snp_cmd_buf_copy(int cmd, void *cmd_buf, bool to_fw, int fw_err) ++{ ++ int (*func)(u64 *paddr, u32 len, bool guest, struct snp_host_map *map); ++ struct sev_device *sev = psp_master->sev_data; ++ bool from_fw = !to_fw; ++ ++ /* ++ * After the command is completed, change the command buffer memory to ++ * hypervisor state. ++ * ++ * The immutable bit is automatically cleared by the firmware, so ++ * no not need to reclaim the page. ++ */ ++ if (from_fw && sev_legacy_cmd_buf_writable(cmd)) { ++ if (snp_set_rmp_state(__pa(cmd_buf), 1, false, true, false)) ++ return -EFAULT; ++ ++ /* No need to go further if firmware failed to execute command. */ ++ if (fw_err) ++ return 0; ++ } ++ ++ if (to_fw) ++ func = map_firmware_writeable; ++ else ++ func = unmap_firmware_writeable; ++ ++ /* ++ * A command buffer may contains a system physical address. If the address ++ * points to a host memory then use an intermediate firmware page otherwise ++ * change the page state in the RMP table. 
++ */ ++ switch (cmd) { ++ case SEV_CMD_PDH_CERT_EXPORT: ++ if (prep_buffer(struct sev_data_pdh_cert_export, pdh_cert_address, ++ pdh_cert_len, false, &sev->snp_host_map[0])) ++ goto err; ++ if (prep_buffer(struct sev_data_pdh_cert_export, cert_chain_address, ++ cert_chain_len, false, &sev->snp_host_map[1])) ++ goto err; ++ break; ++ case SEV_CMD_GET_ID: ++ if (prep_buffer(struct sev_data_get_id, address, len, ++ false, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_PEK_CSR: ++ if (prep_buffer(struct sev_data_pek_csr, address, len, ++ false, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_LAUNCH_UPDATE_DATA: ++ if (prep_buffer(struct sev_data_launch_update_data, address, len, ++ true, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_LAUNCH_UPDATE_VMSA: ++ if (prep_buffer(struct sev_data_launch_update_vmsa, address, len, ++ true, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_LAUNCH_MEASURE: ++ if (prep_buffer(struct sev_data_launch_measure, address, len, ++ false, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_LAUNCH_UPDATE_SECRET: ++ if (prep_buffer(struct sev_data_launch_secret, guest_address, guest_len, ++ true, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_DBG_DECRYPT: ++ if (prep_buffer(struct sev_data_dbg, dst_addr, len, false, ++ &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_DBG_ENCRYPT: ++ if (prep_buffer(struct sev_data_dbg, dst_addr, len, true, ++ &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_ATTESTATION_REPORT: ++ if (prep_buffer(struct sev_data_attestation_report, address, len, ++ false, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_SEND_START: ++ if (prep_buffer(struct sev_data_send_start, session_address, ++ session_len, false, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_SEND_UPDATE_DATA: ++ if (prep_buffer(struct sev_data_send_update_data, hdr_address, hdr_len, ++ false, 
&sev->snp_host_map[0])) ++ goto err; ++ if (prep_buffer(struct sev_data_send_update_data, trans_address, ++ trans_len, false, &sev->snp_host_map[1])) ++ goto err; ++ break; ++ case SEV_CMD_SEND_UPDATE_VMSA: ++ if (prep_buffer(struct sev_data_send_update_vmsa, hdr_address, hdr_len, ++ false, &sev->snp_host_map[0])) ++ goto err; ++ if (prep_buffer(struct sev_data_send_update_vmsa, trans_address, ++ trans_len, false, &sev->snp_host_map[1])) ++ goto err; ++ break; ++ case SEV_CMD_RECEIVE_UPDATE_DATA: ++ if (prep_buffer(struct sev_data_receive_update_data, guest_address, ++ guest_len, true, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ case SEV_CMD_RECEIVE_UPDATE_VMSA: ++ if (prep_buffer(struct sev_data_receive_update_vmsa, guest_address, ++ guest_len, true, &sev->snp_host_map[0])) ++ goto err; ++ break; ++ default: ++ break; ++ } ++ ++ /* The command buffer need to be in the firmware state. */ ++ if (to_fw && sev_legacy_cmd_buf_writable(cmd)) { ++ if (snp_set_rmp_state(__pa(cmd_buf), 1, true, true, false)) ++ return -EFAULT; ++ } ++ ++ return 0; ++ ++err: ++ return -EINVAL; ++} ++ ++static inline bool need_firmware_copy(int cmd) ++{ ++ struct sev_device *sev = psp_master->sev_data; ++ ++ /* After SNP is INIT'ed, the behavior of legacy SEV command is changed. */ ++ return ((cmd < SEV_CMD_SNP_INIT) && sev->snp_inited) ? 
true : false; ++} ++ ++static int snp_aware_copy_to_firmware(int cmd, void *data) ++{ ++ return __snp_cmd_buf_copy(cmd, data, true, 0); ++} ++ ++static int snp_aware_copy_from_firmware(int cmd, void *data, int fw_err) ++{ ++ return __snp_cmd_buf_copy(cmd, data, false, fw_err); ++} ++ + static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) + { + struct psp_device *psp = psp_master; + struct sev_device *sev; + unsigned int phys_lsb, phys_msb; + unsigned int reg, ret = 0; ++ void *cmd_buf; + int buf_len; + + if (!psp || !psp->sev_data) +@@ -466,12 +749,28 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) + * work for some memory, e.g. vmalloc'd addresses, and @data may not be + * physically contiguous. + */ +- if (data) +- memcpy(sev->cmd_buf, data, buf_len); ++ if (data) { ++ if (sev->cmd_buf_active > 2) ++ return -EBUSY; ++ ++ cmd_buf = sev->cmd_buf_active ? sev->cmd_buf_backup : sev->cmd_buf; ++ ++ memcpy(cmd_buf, data, buf_len); ++ sev->cmd_buf_active++; ++ ++ /* ++ * The behavior of the SEV-legacy commands is altered when the ++ * SNP firmware is in the INIT state. ++ */ ++ if (need_firmware_copy(cmd) && snp_aware_copy_to_firmware(cmd, sev->cmd_buf)) ++ return -EFAULT; ++ } else { ++ cmd_buf = sev->cmd_buf; ++ } + + /* Get the physical address of the command buffer */ +- phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0; +- phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0; ++ phys_lsb = data ? lower_32_bits(__psp_pa(cmd_buf)) : 0; ++ phys_msb = data ? upper_32_bits(__psp_pa(cmd_buf)) : 0; + + dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + cmd, phys_msb, phys_lsb, psp_timeout); +@@ -514,15 +813,24 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) + sev_write_init_ex_file_if_required(cmd); + } + +- print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data, +- buf_len, false); +- + /* + * Copy potential output from the PSP back to data. 
Do this even on + * failure in case the caller wants to glean something from the error. + */ +- if (data) +- memcpy(data, sev->cmd_buf, buf_len); ++ if (data) { ++ /* ++ * Restore the page state after the command completes. ++ */ ++ if (need_firmware_copy(cmd) && ++ snp_aware_copy_from_firmware(cmd, cmd_buf, ret)) ++ return -EFAULT; ++ ++ memcpy(data, cmd_buf, buf_len); ++ sev->cmd_buf_active--; ++ } ++ ++ print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data, ++ buf_len, false); + + return ret; + } +@@ -1448,10 +1756,12 @@ int sev_dev_init(struct psp_device *psp) + if (!sev) + goto e_err; + +- sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0); ++ sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 1); + if (!sev->cmd_buf) + goto e_sev; + ++ sev->cmd_buf_backup = (uint8_t *)sev->cmd_buf + PAGE_SIZE; ++ + psp->sev_data = sev; + + sev->dev = dev; +@@ -1510,6 +1820,12 @@ static void sev_firmware_shutdown(struct sev_device *sev) + sev_init_ex_buffer = NULL; + } + ++ /* ++ * The host map need to clear the immutable bit so it must be free'd before the ++ * SNP firmware shutdown. ++ */ ++ free_snp_host_map(sev); ++ + sev_snp_shutdown(NULL); + } + +@@ -1585,6 +1901,14 @@ void sev_pci_init(void) + dev_err(sev->dev, "SEV-SNP: failed to INIT error %#x\n", error); + } + } ++ ++ /* ++ * Allocate the intermediate buffers used for the legacy command handling. ++ */ ++ if (alloc_snp_host_map(sev)) { ++ dev_notice(sev->dev, "Failed to alloc host map (disabling legacy SEV)\n"); ++ goto skip_legacy; ++ } + } + + /* Obtain the TMR memory area for SEV-ES use */ +@@ -1602,12 +1926,14 @@ void sev_pci_init(void) + dev_err(sev->dev, "SEV: failed to INIT error %#x, rc %d\n", + error, rc); + ++skip_legacy: + dev_info(sev->dev, "SEV%s API:%d.%d build:%d\n", sev->snp_inited ? 
+ "-SNP" : "", sev->api_major, sev->api_minor, sev->build); + + return; + + err: ++ free_snp_host_map(sev); + psp_master->sev_data = NULL; + } + +diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h +index 186ad20cbd24..fe5d7a3ebace 100644 +--- a/drivers/crypto/ccp/sev-dev.h ++++ b/drivers/crypto/ccp/sev-dev.h +@@ -29,11 +29,20 @@ + #define SEV_CMDRESP_CMD_SHIFT 16 + #define SEV_CMDRESP_IOC BIT(0) + ++#define MAX_SNP_HOST_MAP_BUFS 2 ++ + struct sev_misc_dev { + struct kref refcount; + struct miscdevice misc; + }; + ++struct snp_host_map { ++ u64 paddr; ++ u32 len; ++ void *host; ++ bool active; ++}; ++ + struct sev_device { + struct device *dev; + struct psp_device *psp; +@@ -52,8 +61,11 @@ struct sev_device { + u8 build; + + void *cmd_buf; ++ void *cmd_buf_backup; ++ int cmd_buf_active; + + bool snp_inited; ++ struct snp_host_map snp_host_map[MAX_SNP_HOST_MAP_BUFS]; + }; + + int sev_dev_init(struct psp_device *psp); +-- +2.36.1 + + +From 17a3e7f59a93c904884fc824b28f906513dae53a Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:04:38 +0000 +Subject: [PATCH 16/90] crypto: ccp: Add the SNP_PLATFORM_STATUS command + +The command can be used by the userspace to query the SNP platform status +report. See the SEV-SNP spec for more details. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + Documentation/virt/coco/sevguest.rst | 27 +++++++++++++++++ + drivers/crypto/ccp/sev-dev.c | 45 ++++++++++++++++++++++++++++ + include/uapi/linux/psp-sev.h | 1 + + 3 files changed, 73 insertions(+) + +diff --git a/Documentation/virt/coco/sevguest.rst b/Documentation/virt/coco/sevguest.rst +index bf593e88cfd9..11ea67c944df 100644 +--- a/Documentation/virt/coco/sevguest.rst ++++ b/Documentation/virt/coco/sevguest.rst +@@ -61,6 +61,22 @@ counter (e.g. counter overflow), then -EIO will be returned. + __u64 fw_err; + }; + ++The host ioctl should be called to /dev/sev device. 
The ioctl accepts command ++id and command input structure. ++ ++:: ++ struct sev_issue_cmd { ++ /* Command ID */ ++ __u32 cmd; ++ ++ /* Command request structure */ ++ __u64 data; ++ ++ /* firmware error code on failure (see psp-sev.h) */ ++ __u32 error; ++ }; ++ ++ + 2.1 SNP_GET_REPORT + ------------------ + +@@ -118,6 +134,17 @@ be updated with the expected value. + + See GHCB specification for further detail on how to parse the certificate blob. + ++2.4 SNP_PLATFORM_STATUS ++----------------------- ++:Technology: sev-snp ++:Type: hypervisor ioctl cmd ++:Parameters (in): struct sev_data_snp_platform_status ++:Returns (out): 0 on success, -negative on error ++ ++The SNP_PLATFORM_STATUS command is used to query the SNP platform status. The ++status includes API major, minor version and more. See the SEV-SNP ++specification for further details. ++ + 3. SEV-SNP CPUID Enforcement + ============================ + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index a744ec3f9571..ca6bcc755595 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -1571,6 +1571,48 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable) + return ret; + } + ++static int sev_ioctl_snp_platform_status(struct sev_issue_cmd *argp) ++{ ++ struct sev_device *sev = psp_master->sev_data; ++ struct sev_data_snp_platform_status_buf buf; ++ struct page *status_page; ++ void *data; ++ int ret; ++ ++ if (!sev->snp_inited || !argp->data) ++ return -EINVAL; ++ ++ status_page = alloc_page(GFP_KERNEL_ACCOUNT); ++ if (!status_page) ++ return -ENOMEM; ++ ++ data = page_address(status_page); ++ if (snp_set_rmp_state(__pa(data), 1, true, true, false)) { ++ __free_pages(status_page, 0); ++ return -EFAULT; ++ } ++ ++ buf.status_paddr = __psp_pa(data); ++ ret = __sev_do_cmd_locked(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &argp->error); ++ ++ /* Change the page state before accessing it */ ++ if (snp_set_rmp_state(__pa(data), 1, false, 
true, true)) { ++ snp_leak_pages(__pa(data) >> PAGE_SHIFT, 1); ++ return -EFAULT; ++ } ++ ++ if (ret) ++ goto cleanup; ++ ++ if (copy_to_user((void __user *)argp->data, data, ++ sizeof(struct sev_user_data_snp_status))) ++ ret = -EFAULT; ++ ++cleanup: ++ __free_pages(status_page, 0); ++ return ret; ++} ++ + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) + { + void __user *argp = (void __user *)arg; +@@ -1622,6 +1664,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) + case SEV_GET_ID2: + ret = sev_ioctl_do_get_id2(&input); + break; ++ case SNP_PLATFORM_STATUS: ++ ret = sev_ioctl_snp_platform_status(&input); ++ break; + default: + ret = -EINVAL; + goto out; +diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h +index bed65a891223..ffd60e8b0a31 100644 +--- a/include/uapi/linux/psp-sev.h ++++ b/include/uapi/linux/psp-sev.h +@@ -28,6 +28,7 @@ enum { + SEV_PEK_CERT_IMPORT, + SEV_GET_ID, /* This command is deprecated, use SEV_GET_ID2 */ + SEV_GET_ID2, ++ SNP_PLATFORM_STATUS, + + SEV_MAX, + }; +-- +2.36.1 + + +From 8856594bb3cab98d33a1a093f61c968d7d2365ab Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:06:20 +0000 +Subject: [PATCH 17/90] crypto: ccp: Add the SNP_{SET,GET}_EXT_CONFIG command + +The SEV-SNP firmware provides the SNP_CONFIG command used to set the +system-wide configuration value for SNP guests. The information includes +the TCB version string to be reported in guest attestation reports. + +Version 2 of the GHCB specification adds an NAE (SNP extended guest +request) that a guest can use to query the reports that include additional +certificates. + +In both cases, userspace provided additional data is included in the +attestation reports. The userspace will use the SNP_SET_EXT_CONFIG +command to give the certificate blob and the reported TCB version string +at once. 
Note that the specification defines the certificate blob with a +specific GUID format; the userspace is responsible for building the +proper certificate blob. The ioctl treats it as an opaque blob. + +Although it is not defined in the spec, let's add the SNP_GET_EXT_CONFIG +command that can be used to obtain the data programmed through the +SNP_SET_EXT_CONFIG. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + Documentation/virt/coco/sevguest.rst | 27 +++++++ + drivers/crypto/ccp/sev-dev.c | 115 +++++++++++++++++++++++++++ + drivers/crypto/ccp/sev-dev.h | 3 + + include/uapi/linux/psp-sev.h | 17 ++++ + 4 files changed, 162 insertions(+) + +diff --git a/Documentation/virt/coco/sevguest.rst b/Documentation/virt/coco/sevguest.rst +index 11ea67c944df..3014de47e4ce 100644 +--- a/Documentation/virt/coco/sevguest.rst ++++ b/Documentation/virt/coco/sevguest.rst +@@ -145,6 +145,33 @@ The SNP_PLATFORM_STATUS command is used to query the SNP platform status. The + status includes API major, minor version and more. See the SEV-SNP + specification for further details. + ++2.5 SNP_SET_EXT_CONFIG ++---------------------- ++:Technology: sev-snp ++:Type: hypervisor ioctl cmd ++:Parameters (in): struct sev_user_data_ext_snp_config ++:Returns (out): 0 on success, -negative on error ++ ++The SNP_SET_EXT_CONFIG is used to set the system-wide configuration such as ++reported TCB version in the attestation report. The command is similar to ++SNP_CONFIG command defined in the SEV-SNP spec. The main difference is the ++command also accepts an additional certificate blob defined in the GHCB ++specification. ++ ++If the certs_address is zero, then the previous certificate blob will be deleted. ++For more information on the certificate blob layout, see the GHCB spec ++(extended guest request message). 
++ ++2.6 SNP_GET_EXT_CONFIG ++---------------------- ++:Technology: sev-snp ++:Type: hypervisor ioctl cmd ++:Parameters (in): struct sev_user_data_ext_snp_config ++:Returns (out): 0 on success, -negative on error ++ ++The SNP_GET_EXT_CONFIG is used to query the system-wide configuration set ++through the SNP_SET_EXT_CONFIG. ++ + 3. SEV-SNP CPUID Enforcement + ============================ + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index ca6bcc755595..eac10c93bdcd 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -1309,6 +1309,10 @@ static int __sev_snp_shutdown_locked(int *error) + if (!sev->snp_inited) + return 0; + ++ /* Free the memory used for caching the certificate data */ ++ kfree(sev->snp_certs_data); ++ sev->snp_certs_data = NULL; ++ + /* SHUTDOWN requires the DF_FLUSH */ + wbinvd_on_all_cpus(); + __sev_do_cmd_locked(SEV_CMD_SNP_DF_FLUSH, NULL, NULL); +@@ -1613,6 +1617,111 @@ static int sev_ioctl_snp_platform_status(struct sev_issue_cmd *argp) + return ret; + } + ++static int sev_ioctl_snp_get_config(struct sev_issue_cmd *argp) ++{ ++ struct sev_device *sev = psp_master->sev_data; ++ struct sev_user_data_ext_snp_config input; ++ int ret; ++ ++ if (!sev->snp_inited || !argp->data) ++ return -EINVAL; ++ ++ if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) ++ return -EFAULT; ++ ++ /* Copy the TCB version programmed through the SET_CONFIG to userspace */ ++ if (input.config_address) { ++ if (copy_to_user((void * __user)input.config_address, ++ &sev->snp_config, sizeof(struct sev_user_data_snp_config))) ++ return -EFAULT; ++ } ++ ++ /* Copy the extended certs programmed through the SNP_SET_CONFIG */ ++ if (input.certs_address && sev->snp_certs_data) { ++ if (input.certs_len < sev->snp_certs_len) { ++ /* Return the certs length to userspace */ ++ input.certs_len = sev->snp_certs_len; ++ ++ ret = -ENOSR; ++ goto e_done; ++ } ++ ++ if (copy_to_user((void * __user)input.certs_address, 
++ sev->snp_certs_data, sev->snp_certs_len)) ++ return -EFAULT; ++ } ++ ++ ret = 0; ++ ++e_done: ++ if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ ++static int sev_ioctl_snp_set_config(struct sev_issue_cmd *argp, bool writable) ++{ ++ struct sev_device *sev = psp_master->sev_data; ++ struct sev_user_data_ext_snp_config input; ++ struct sev_user_data_snp_config config; ++ void *certs = NULL; ++ int ret = 0; ++ ++ if (!sev->snp_inited || !argp->data) ++ return -EINVAL; ++ ++ if (!writable) ++ return -EPERM; ++ ++ if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) ++ return -EFAULT; ++ ++ /* Copy the certs from userspace */ ++ if (input.certs_address) { ++ if (!input.certs_len || !IS_ALIGNED(input.certs_len, PAGE_SIZE)) ++ return -EINVAL; ++ ++ certs = psp_copy_user_blob(input.certs_address, input.certs_len); ++ if (IS_ERR(certs)) ++ return PTR_ERR(certs); ++ } ++ ++ /* Issue the PSP command to update the TCB version using the SNP_CONFIG. */ ++ if (input.config_address) { ++ if (copy_from_user(&config, ++ (void __user *)input.config_address, sizeof(config))) { ++ ret = -EFAULT; ++ goto e_free; ++ } ++ ++ ret = __sev_do_cmd_locked(SEV_CMD_SNP_CONFIG, &config, &argp->error); ++ if (ret) ++ goto e_free; ++ ++ memcpy(&sev->snp_config, &config, sizeof(config)); ++ } ++ ++ /* ++ * If the new certs are passed then cache it else free the old certs. 
++ */ ++ if (certs) { ++ kfree(sev->snp_certs_data); ++ sev->snp_certs_data = certs; ++ sev->snp_certs_len = input.certs_len; ++ } else { ++ kfree(sev->snp_certs_data); ++ sev->snp_certs_data = NULL; ++ sev->snp_certs_len = 0; ++ } ++ ++ return 0; ++ ++e_free: ++ kfree(certs); ++ return ret; ++} ++ + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) + { + void __user *argp = (void __user *)arg; +@@ -1667,6 +1776,12 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) + case SNP_PLATFORM_STATUS: + ret = sev_ioctl_snp_platform_status(&input); + break; ++ case SNP_SET_EXT_CONFIG: ++ ret = sev_ioctl_snp_set_config(&input, writable); ++ break; ++ case SNP_GET_EXT_CONFIG: ++ ret = sev_ioctl_snp_get_config(&input); ++ break; + default: + ret = -EINVAL; + goto out; +diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h +index fe5d7a3ebace..d2fe1706311a 100644 +--- a/drivers/crypto/ccp/sev-dev.h ++++ b/drivers/crypto/ccp/sev-dev.h +@@ -66,6 +66,9 @@ struct sev_device { + + bool snp_inited; + struct snp_host_map snp_host_map[MAX_SNP_HOST_MAP_BUFS]; ++ void *snp_certs_data; ++ u32 snp_certs_len; ++ struct sev_user_data_snp_config snp_config; + }; + + int sev_dev_init(struct psp_device *psp); +diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h +index ffd60e8b0a31..60e7a8d1a18e 100644 +--- a/include/uapi/linux/psp-sev.h ++++ b/include/uapi/linux/psp-sev.h +@@ -29,6 +29,8 @@ enum { + SEV_GET_ID, /* This command is deprecated, use SEV_GET_ID2 */ + SEV_GET_ID2, + SNP_PLATFORM_STATUS, ++ SNP_SET_EXT_CONFIG, ++ SNP_GET_EXT_CONFIG, + + SEV_MAX, + }; +@@ -190,6 +192,21 @@ struct sev_user_data_snp_config { + __u8 rsvd[52]; + } __packed; + ++/** ++ * struct sev_data_snp_ext_config - system wide configuration value for SNP. ++ * ++ * @config_address: address of the struct sev_user_data_snp_config or 0 when ++ * reported_tcb does not need to be updated. 
++ * @certs_address: address of extended guest request certificate chain or ++ * 0 when previous certificate should be removed on SNP_SET_EXT_CONFIG. ++ * @certs_len: length of the certs ++ */ ++struct sev_user_data_ext_snp_config { ++ __u64 config_address; /* In */ ++ __u64 certs_address; /* In */ ++ __u32 certs_len; /* In */ ++}; ++ + /** + * struct sev_issue_cmd - SEV ioctl parameters + * +-- +2.36.1 + + +From 0e9f77f36537183cebe2e46ef954657f7b69fd1b Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:07:46 +0000 +Subject: [PATCH 18/90] crypto: ccp: Provide APIs to query extended attestation + report + +Version 2 of the GHCB specification defines VMGEXIT that is used to get +the extended attestation report. The extended attestation report includes +the certificate blobs provided through the SNP_SET_EXT_CONFIG. + +The snp_guest_ext_guest_request() will be used by the hypervisor to get +the extended attestation report. See the GHCB specification for more +details. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 43 ++++++++++++++++++++++++++++++++++++ + include/linux/psp-sev.h | 24 ++++++++++++++++++++ + 2 files changed, 67 insertions(+) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index eac10c93bdcd..740e610aeaf6 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -25,6 +25,7 @@ + #include <linux/fs.h> + + #include <asm/smp.h> ++#include <asm/sev.h> + + #include "psp-dev.h" + #include "sev-dev.h" +@@ -1854,6 +1855,48 @@ int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error) + } + EXPORT_SYMBOL_GPL(snp_guest_dbg_decrypt); + ++int snp_guest_ext_guest_request(struct sev_data_snp_guest_request *data, ++ unsigned long vaddr, unsigned long *npages, unsigned long *fw_err) ++{ ++ unsigned long expected_npages; ++ struct sev_device *sev; ++ int rc; ++ ++ if (!psp_master || !psp_master->sev_data) ++ return -ENODEV; ++ ++ sev = psp_master->sev_data; ++ ++ if (!sev->snp_inited) ++ return -EINVAL; ++ ++ /* ++ * Check if there is enough space to copy the certificate chain. Otherwise ++ * return ERROR code defined in the GHCB specification. 
++ */ ++ expected_npages = sev->snp_certs_len >> PAGE_SHIFT; ++ if (*npages < expected_npages) { ++ *npages = expected_npages; ++ *fw_err = SNP_GUEST_REQ_INVALID_LEN; ++ return -EINVAL; ++ } ++ ++ rc = sev_do_cmd(SEV_CMD_SNP_GUEST_REQUEST, data, (int *)&fw_err); ++ if (rc) ++ return rc; ++ ++ /* Copy the certificate blob */ ++ if (sev->snp_certs_data) { ++ *npages = expected_npages; ++ memcpy((void *)vaddr, sev->snp_certs_data, *npages << PAGE_SHIFT); ++ } else { ++ *npages = 0; ++ } ++ ++ return rc; ++} ++EXPORT_SYMBOL_GPL(snp_guest_ext_guest_request); ++ + static void sev_exit(struct kref *ref) + { + misc_deregister(&misc_dev->misc); +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index a3bb792bb842..cd37ccd1fa1f 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -945,6 +945,23 @@ void *psp_copy_user_blob(u64 uaddr, u32 len); + void *snp_alloc_firmware_page(gfp_t mask); + void snp_free_firmware_page(void *addr); + ++/** ++ * snp_guest_ext_guest_request - perform the SNP extended guest request command ++ * defined in the GHCB specification. ++ * ++ * @data: the input guest request structure ++ * @vaddr: address where the certificate blob need to be copied. ++ * @npages: number of pages for the certificate blob. ++ * If the specified page count is less than the certificate blob size, then the ++ * required page count is returned with error code defined in the GHCB spec. ++ * If the specified page count is more than the certificate blob size, then ++ * page count is updated to reflect the amount of valid data copied in the ++ * vaddr. 
++ */ ++int snp_guest_ext_guest_request(struct sev_data_snp_guest_request *data, ++ unsigned long vaddr, unsigned long *npages, ++ unsigned long *error); ++ + #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ + + static inline int +@@ -992,6 +1009,13 @@ static inline void *snp_alloc_firmware_page(gfp_t mask) + + static inline void snp_free_firmware_page(void *addr) { } + ++static inline int snp_guest_ext_guest_request(struct sev_data_snp_guest_request *data, ++ unsigned long vaddr, unsigned long *n, ++ unsigned long *error) ++{ ++ return -ENODEV; ++} ++ + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ + + #endif /* __PSP_SEV_H__ */ +-- +2.36.1 + + +From a59b103843db8a57715f117ab06c642692733ef2 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Tue, 26 Apr 2022 18:11:10 +0000 +Subject: [PATCH 19/90] KVM: SVM: Add support to handle AP reset MSR protocol + +Add support for AP Reset Hold being invoked using the GHCB MSR protocol, +available in version 2 of the GHCB specification. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev-common.h | 2 ++ + arch/x86/kvm/svm/sev.c | 56 ++++++++++++++++++++++++++----- + arch/x86/kvm/svm/svm.h | 1 + + 3 files changed, 51 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h +index b8357d6ecd47..e15548d88f2a 100644 +--- a/arch/x86/include/asm/sev-common.h ++++ b/arch/x86/include/asm/sev-common.h +@@ -56,6 +56,8 @@ + /* AP Reset Hold */ + #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 + #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 ++#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 ++#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) + + /* GHCB GPA Register */ + #define GHCB_MSR_REG_GPA_REQ 0x012 +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 609471204c6e..a1318236acd2 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -56,6 +56,10 @@ 
module_param_named(sev_es, sev_es_enabled, bool, 0444); + #define sev_es_enabled false + #endif /* CONFIG_KVM_AMD_SEV */ + ++#define AP_RESET_HOLD_NONE 0 ++#define AP_RESET_HOLD_NAE_EVENT 1 ++#define AP_RESET_HOLD_MSR_PROTO 2 ++ + static u8 sev_enc_bit; + static DECLARE_RWSEM(sev_deactivate_lock); + static DEFINE_MUTEX(sev_bitmap_lock); +@@ -2511,6 +2515,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) + + void sev_es_unmap_ghcb(struct vcpu_svm *svm) + { ++ /* Clear any indication that the vCPU is in a type of AP Reset Hold */ ++ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; ++ + if (!svm->sev_es.ghcb) + return; + +@@ -2723,6 +2730,22 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + GHCB_MSR_INFO_POS); + break; + } ++ case GHCB_MSR_AP_RESET_HOLD_REQ: ++ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; ++ ret = kvm_emulate_ap_reset_hold(&svm->vcpu); ++ ++ /* ++ * Preset the result to a non-SIPI return and then only set ++ * the result to non-zero when delivering a SIPI. ++ */ ++ set_ghcb_msr_bits(svm, 0, ++ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, ++ GHCB_MSR_AP_RESET_HOLD_RESULT_POS); ++ ++ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, ++ GHCB_MSR_INFO_MASK, ++ GHCB_MSR_INFO_POS); ++ break; + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + +@@ -2823,6 +2846,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); + break; + case SVM_VMGEXIT_AP_HLT_LOOP: ++ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; + ret = kvm_emulate_ap_reset_hold(vcpu); + break; + case SVM_VMGEXIT_AP_JUMP_TABLE: { +@@ -2966,13 +2990,29 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) + return; + } + +- /* +- * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where +- * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a +- * non-zero value. 
+- */ +- if (!svm->sev_es.ghcb) +- return; ++ /* Subsequent SIPI */ ++ switch (svm->sev_es.ap_reset_hold_type) { ++ case AP_RESET_HOLD_NAE_EVENT: ++ /* ++ * Return from an AP Reset Hold VMGEXIT, where the guest will ++ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. ++ */ ++ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); ++ break; ++ case AP_RESET_HOLD_MSR_PROTO: ++ /* ++ * Return from an AP Reset Hold VMGEXIT, where the guest will ++ * set the CS and RIP. Set GHCB data field to a non-zero value. ++ */ ++ set_ghcb_msr_bits(svm, 1, ++ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, ++ GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + +- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); ++ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, ++ GHCB_MSR_INFO_MASK, ++ GHCB_MSR_INFO_POS); ++ break; ++ default: ++ break; ++ } + } +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index bb9ec9139af3..9f7eb1f18893 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -186,6 +186,7 @@ struct vcpu_sev_es_state { + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; ++ unsigned int ap_reset_hold_type; + + /* SEV-ES scratch area support */ + void *ghcb_sa; +-- +2.36.1 + + +From 6c8cd954ac5bb1681d379a87bd25f50c24888e63 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:13:09 +0000 +Subject: [PATCH 20/90] KVM: SVM: Provide the Hypervisor Feature support + VMGEXIT + +Version 2 of the GHCB specification introduced advertisement of features +that are supported by the Hypervisor. + +Now that KVM supports version 2 of the GHCB specification, bump the +maximum supported protocol version. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev-common.h | 2 ++ + arch/x86/kvm/svm/sev.c | 14 ++++++++++++++ + arch/x86/kvm/svm/svm.h | 3 ++- + 3 files changed, 18 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h +index e15548d88f2a..539de6b93420 100644 +--- a/arch/x86/include/asm/sev-common.h ++++ b/arch/x86/include/asm/sev-common.h +@@ -101,6 +101,8 @@ enum psc_op { + /* GHCB Hypervisor Feature Request/Response */ + #define GHCB_MSR_HV_FT_REQ 0x080 + #define GHCB_MSR_HV_FT_RESP 0x081 ++#define GHCB_MSR_HV_FT_POS 12 ++#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0) + #define GHCB_MSR_HV_FT_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index a1318236acd2..b49c370d5ae9 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2480,6 +2480,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) + case SVM_VMGEXIT_AP_HLT_LOOP: + case SVM_VMGEXIT_AP_JUMP_TABLE: + case SVM_VMGEXIT_UNSUPPORTED_EVENT: ++ case SVM_VMGEXIT_HV_FEATURES: + break; + default: + reason = GHCB_ERR_INVALID_EVENT; +@@ -2746,6 +2747,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; ++ case GHCB_MSR_HV_FT_REQ: { ++ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, ++ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); ++ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, ++ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); ++ break; ++ } + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + +@@ -2871,6 +2879,12 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + ret = 1; + break; + } ++ case SVM_VMGEXIT_HV_FEATURES: { ++ ghcb_set_sw_exit_info_2(ghcb, GHCB_HV_FT_SUPPORTED); ++ ++ ret = 1; ++ break; ++ } + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", 
+diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 9f7eb1f18893..1f4a8bd09c9e 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -629,9 +629,10 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); + + /* sev.c */ + +-#define GHCB_VERSION_MAX 1ULL ++#define GHCB_VERSION_MAX 2ULL + #define GHCB_VERSION_MIN 1ULL + ++#define GHCB_HV_FT_SUPPORTED 0 + + extern unsigned int max_sev_asid; + +-- +2.36.1 + + +From 2055af1db9b5ada44edb6bd3a8165c6a044c6aeb Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:16:07 +0000 +Subject: [PATCH 21/90] KVM: SVM: Make AVIC backing, VMSA and VMCB memory + allocation SNP safe + +Implement a workaround for an SNP erratum where the CPU will incorrectly +signal an RMP violation #PF if a hugepage (2mb or 1gb) collides with the +RMP entry of a VMCB, VMSA or AVIC backing page. + +When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC +backing pages as "in-use" in the RMP after a successful VMRUN. This +is done for _all_ VMs, not just SNP-Active VMs. + +If the hypervisor accesses an in-use page through a writable +translation, the CPU will throw an RMP violation #PF. On early SNP +hardware, if an in-use page is 2mb aligned and software accesses any +part of the associated 2mb region with a hugepage, the CPU will +incorrectly treat the entire 2mb region as in-use and signal a spurious +RMP violation #PF. + +The recommended workaround is to not use a hugepage for the VMCB, VMSA or +AVIC backing page. Add a generic allocator that will ensure that the +page returned is not a hugepage (2mb or 1gb) and is safe to be used when +SEV-SNP is enabled.
+ +Co-developed-by: Marc Orr <marcorr@google.com> +Signed-off-by: Marc Orr <marcorr@google.com> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 + + arch/x86/include/asm/kvm_host.h | 2 ++ + arch/x86/kvm/lapic.c | 5 ++++- + arch/x86/kvm/svm/sev.c | 35 ++++++++++++++++++++++++++++++ + arch/x86/kvm/svm/svm.c | 16 ++++++++++++-- + arch/x86/kvm/svm/svm.h | 1 + + 6 files changed, 57 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index da47f60a4650..a66292dae698 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -128,6 +128,7 @@ KVM_X86_OP(msr_filter_changed) + KVM_X86_OP(complete_emulated_msr) + KVM_X86_OP(vcpu_deliver_sipi_vector) + KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); ++KVM_X86_OP(alloc_apic_backing_page) + + #undef KVM_X86_OP + #undef KVM_X86_OP_OPTIONAL +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index c24a72ddc93b..0205e2944067 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1512,6 +1512,8 @@ struct kvm_x86_ops { + * Returns vCPU specific APICv inhibit reasons + */ + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); ++ ++ void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + }; + + struct kvm_x86_nested_ops { +diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c +index 66b0eb0bda94..7c7fc6c4a7f9 100644 +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -2506,7 +2506,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) + + vcpu->arch.apic = apic; + +- apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); ++ if (kvm_x86_ops.alloc_apic_backing_page) ++ apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu); ++ else ++ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!apic->regs) { + printk(KERN_ERR "malloc apic regs error 
for vcpu %x\n", + vcpu->vcpu_id); +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index b49c370d5ae9..93365996bd59 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3030,3 +3030,38 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) + break; + } + } ++ ++struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) ++{ ++ unsigned long pfn; ++ struct page *p; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); ++ ++ /* ++ * Allocate an SNP safe page to workaround the SNP erratum where ++ * the CPU will incorrectly signal an RMP violation #PF if a ++ * hugepage (2mb or 1gb) collides with the RMP entry of VMCB, VMSA ++ * or AVIC backing page. The recommeded workaround is to not use the ++ * hugepage. ++ * ++ * Allocate one extra page, use a page which is not 2mb aligned ++ * and free the other. ++ */ ++ p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); ++ if (!p) ++ return NULL; ++ ++ split_page(p, 1); ++ ++ pfn = page_to_pfn(p); ++ if (IS_ALIGNED(__pfn_to_phys(pfn), PMD_SIZE)) { ++ pfn++; ++ __free_page(p); ++ } else { ++ __free_page(pfn_to_page(pfn + 1)); ++ } ++ ++ return pfn_to_page(pfn); ++} +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index efc7623d0f90..b4bd64f94d3a 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1260,7 +1260,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) + svm = to_svm(vcpu); + + err = -ENOMEM; +- vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); ++ vmcb01_page = snp_safe_alloc_page(vcpu); + if (!vmcb01_page) + goto out; + +@@ -1269,7 +1269,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) + * SEV-ES guests require a separate VMSA page used to contain + * the encrypted register state of the guest. 
+ */ +- vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); ++ vmsa_page = snp_safe_alloc_page(vcpu); + if (!vmsa_page) + goto error_free_vmcb_page; + +@@ -4598,6 +4598,16 @@ static int svm_vm_init(struct kvm *kvm) + return 0; + } + ++static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) ++{ ++ struct page *page = snp_safe_alloc_page(vcpu); ++ ++ if (!page) ++ return NULL; ++ ++ return page_address(page); ++} ++ + static struct kvm_x86_ops svm_x86_ops __initdata = { + .name = "kvm_amd", + +@@ -4722,6 +4732,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + + .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, ++ ++ .alloc_apic_backing_page = svm_alloc_apic_backing_page, + }; + + /* +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 1f4a8bd09c9e..9672e25a338d 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -659,6 +659,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm); + void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); + void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); + void sev_es_unmap_ghcb(struct vcpu_svm *svm); ++struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); + + /* vmenter.S */ + +-- +2.36.1 + + +From e3ca74bb1d8b7db70d9e70a9a62dcb49fa308140 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:20:53 +0000 +Subject: [PATCH 22/90] KVM: SVM: Add initial SEV-SNP support + +The next generation of SEV is called SEV-SNP (Secure Nested Paging). +SEV-SNP builds upon existing SEV and SEV-ES functionality while adding new +hardware based security protection. SEV-SNP adds strong memory encryption +integrity protection to help prevent malicious hypervisor-based attacks +such as data replay, memory re-mapping, and more, to create an isolated +execution environment. 
+ +The SNP feature is added incrementally, the later patches adds a new module +parameters that can be used to enabled SEV-SNP in the KVM. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 10 +++++++++- + arch/x86/kvm/svm/svm.h | 8 ++++++++ + 2 files changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 93365996bd59..dc1f69a28aa7 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -56,6 +56,9 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); + #define sev_es_enabled false + #endif /* CONFIG_KVM_AMD_SEV */ + ++/* enable/disable SEV-SNP support */ ++static bool sev_snp_enabled; ++ + #define AP_RESET_HOLD_NONE 0 + #define AP_RESET_HOLD_NAE_EVENT 1 + #define AP_RESET_HOLD_MSR_PROTO 2 +@@ -2120,6 +2123,7 @@ void __init sev_hardware_setup(void) + { + #ifdef CONFIG_KVM_AMD_SEV + unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; ++ bool sev_snp_supported = false; + bool sev_es_supported = false; + bool sev_supported = false; + +@@ -2190,12 +2194,16 @@ void __init sev_hardware_setup(void) + if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) + goto out; + +- pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count); + sev_es_supported = true; ++ sev_snp_supported = sev_snp_enabled && cpu_feature_enabled(X86_FEATURE_SEV_SNP); ++ ++ pr_info("SEV-ES %ssupported: %u ASIDs\n", ++ sev_snp_supported ? 
"and SEV-SNP " : "", sev_es_asid_count); + + out: + sev_enabled = sev_supported; + sev_es_enabled = sev_es_supported; ++ sev_snp_enabled = sev_snp_supported; + #endif + } + +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 9672e25a338d..edecc5066517 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -75,6 +75,7 @@ enum { + struct kvm_sev_info { + bool active; /* SEV enabled guest */ + bool es_active; /* SEV-ES enabled guest */ ++ bool snp_active; /* SEV-SNP enabled guest */ + unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ +@@ -314,6 +315,13 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) + #endif + } + ++static inline bool sev_snp_guest(struct kvm *kvm) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ ++ return sev_es_guest(kvm) && sev->snp_active; ++} ++ + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) + { + vmcb->control.clean = 0; +-- +2.36.1 + + +From c67061ecca3ec218b2739d926b3d4008e6762007 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:23:14 +0000 +Subject: [PATCH 23/90] KVM: SVM: Add KVM_SNP_INIT command + +The KVM_SNP_INIT command is used by the hypervisor to initialize the +SEV-SNP platform context. In a typical workflow, this command should be the +first command issued. When creating SEV-SNP guest, the VMM must use this +command instead of the KVM_SEV_INIT or KVM_SEV_ES_INIT. + +The flags value must be zero, it will be extended in future SNP support to +communicate the optional features (such as restricted INT injection etc). 
+ +Co-developed-by: Pavan Kumar Paluri <papaluri@amd.com> +Signed-off-by: Pavan Kumar Paluri <papaluri@amd.com> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + .../virt/kvm/x86/amd-memory-encryption.rst | 27 ++++++++++++ + arch/x86/include/asm/svm.h | 1 + + arch/x86/kvm/svm/sev.c | 44 ++++++++++++++++++- + arch/x86/kvm/svm/svm.h | 4 ++ + include/uapi/linux/kvm.h | 13 ++++++ + 5 files changed, 87 insertions(+), 2 deletions(-) + +diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +index 2d307811978c..903023f524af 100644 +--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst ++++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +@@ -435,6 +435,33 @@ issued by the hypervisor to make the guest ready for execution. + + Returns: 0 on success, -negative on error + ++18. KVM_SNP_INIT ++---------------- ++ ++The KVM_SNP_INIT command can be used by the hypervisor to initialize SEV-SNP ++context. In a typical workflow, this command should be the first command issued. ++ ++Parameters (in/out): struct kvm_snp_init ++ ++Returns: 0 on success, -negative on error ++ ++:: ++ ++ struct kvm_snp_init { ++ __u64 flags; ++ }; ++ ++The flags bitmap is defined as:: ++ ++ /* enable the restricted injection */ ++ #define KVM_SEV_SNP_RESTRICTED_INJET (1<<0) ++ ++ /* enable the restricted injection timer */ ++ #define KVM_SEV_SNP_RESTRICTED_TIMER_INJET (1<<1) ++ ++If the specified flags is not supported then return -EOPNOTSUPP, and the supported ++flags are returned. 
++ + References + ========== + +diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h +index 1b07fba11704..284a8113227e 100644 +--- a/arch/x86/include/asm/svm.h ++++ b/arch/x86/include/asm/svm.h +@@ -263,6 +263,7 @@ enum avic_ipi_failure_cause { + #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + ++#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) + + struct vmcb_seg { + u16 selector; +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index dc1f69a28aa7..813bda7f7b55 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -241,6 +241,25 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) + sev_decommission(handle); + } + ++static int verify_snp_init_flags(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct kvm_snp_init params; ++ int ret = 0; ++ ++ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params))) ++ return -EFAULT; ++ ++ if (params.flags & ~SEV_SNP_SUPPORTED_FLAGS) ++ ret = -EOPNOTSUPP; ++ ++ params.flags = SEV_SNP_SUPPORTED_FLAGS; ++ ++ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ + static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; +@@ -254,13 +273,23 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + return ret; + + sev->active = true; +- sev->es_active = argp->id == KVM_SEV_ES_INIT; ++ sev->es_active = (argp->id == KVM_SEV_ES_INIT || argp->id == KVM_SEV_SNP_INIT); ++ sev->snp_active = argp->id == KVM_SEV_SNP_INIT; + asid = sev_asid_new(sev); + if (asid < 0) + goto e_no_asid; + sev->asid = asid; + +- ret = sev_platform_init(&argp->error); ++ if (sev->snp_active) { ++ ret = verify_snp_init_flags(kvm, argp); ++ if (ret) ++ goto e_free; ++ ++ ret = sev_snp_init(&argp->error); ++ } else { ++ ret = sev_platform_init(&argp->error); ++ } ++ + if (ret) + goto
e_free; + +@@ -275,6 +304,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + sev_asid_free(sev); + sev->asid = 0; + e_no_asid: ++ sev->snp_active = false; + sev->es_active = false; + sev->active = false; + return ret; +@@ -610,6 +640,10 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) + save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; + ++ /* Enable the SEV-SNP feature */ ++ if (sev_snp_guest(svm->vcpu.kvm)) ++ save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE; ++ + return 0; + } + +@@ -1815,6 +1849,12 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + } + + switch (sev_cmd.id) { ++ case KVM_SEV_SNP_INIT: ++ if (!sev_snp_enabled) { ++ r = -ENOTTY; ++ goto out; ++ } ++ fallthrough; + case KVM_SEV_ES_INIT: + if (!sev_es_enabled) { + r = -ENOTTY; +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index edecc5066517..2f45589ee596 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -72,6 +72,9 @@ enum { + /* TPR and CR2 are always written before VMRUN */ + #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) + ++/* Supported init feature flags */ ++#define SEV_SNP_SUPPORTED_FLAGS 0x0 ++ + struct kvm_sev_info { + bool active; /* SEV enabled guest */ + bool es_active; /* SEV-ES enabled guest */ +@@ -87,6 +90,7 @@ struct kvm_sev_info { + struct list_head mirror_entry; /* Use as a list entry of mirrors */ + struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; ++ u64 snp_init_flags; + }; + + struct kvm_svm { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 68ce07185f03..0f912cefc544 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1810,6 +1810,9 @@ enum sev_cmd_id { + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + ++ /* SNP specific commands */ ++ KVM_SEV_SNP_INIT, ++ + KVM_SEV_NR_MAX, + }; + +@@ -1906,6 +1909,16 @@ struct kvm_sev_receive_update_data { + __u32 trans_len; 
+ }; + ++/* enable the restricted injection */ ++#define KVM_SEV_SNP_RESTRICTED_INJET (1 << 0) ++ ++/* enable the restricted injection timer */ ++#define KVM_SEV_SNP_RESTRICTED_TIMER_INJET (1 << 1) ++ ++struct kvm_snp_init { ++ __u64 flags; ++}; ++ + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) + #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) +-- +2.36.1 + + +From 964cfb8ea8df7dd7055023e92c967f8565adb5e1 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:25:44 +0000 +Subject: [PATCH 24/90] KVM: SVM: Add KVM_SEV_SNP_LAUNCH_START command + +KVM_SEV_SNP_LAUNCH_START begins the launch process for an SEV-SNP guest. +The command initializes a cryptographic digest context used to construct +the measurement of the guest. If the guest is expected to be migrated, +the command also binds a migration agent (MA) to the guest. + +For more information see the SEV-SNP specification. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + .../virt/kvm/x86/amd-memory-encryption.rst | 24 ++++ + arch/x86/kvm/svm/sev.c | 115 +++++++++++++++++- + arch/x86/kvm/svm/svm.h | 1 + + include/uapi/linux/kvm.h | 10 ++ + 4 files changed, 147 insertions(+), 3 deletions(-) + +diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +index 903023f524af..878711f2dca6 100644 +--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst ++++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +@@ -462,6 +462,30 @@ The flags bitmap is defined as:: + If the specified flags is not supported then return -EOPNOTSUPP, and the supported + flags are returned. + ++19. KVM_SNP_LAUNCH_START ++------------------------ ++ ++The KVM_SNP_LAUNCH_START command is used for creating the memory encryption ++context for the SEV-SNP guest. 
To create the encryption context, user must ++provide a guest policy, migration agent (if any) and guest OS visible ++workarounds value as defined in the SEV-SNP specification. ++ ++Parameters (in): struct kvm_sev_snp_launch_start ++ ++Returns: 0 on success, -negative on error ++ ++:: ++ ++ struct kvm_sev_snp_launch_start { ++ __u64 policy; /* Guest policy to use. */ ++ __u64 ma_uaddr; /* userspace address of migration agent */ ++ __u8 ma_en; /* 1 if the migration agent is enabled */ ++ __u8 imi_en; /* set IMI to 1. */ ++ __u8 gosvw[16]; /* guest OS visible workarounds */ ++ }; ++ ++See the SEV-SNP specification for further detail on the launch input. ++ + References + ========== + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 813bda7f7b55..9e6fc7a94ed7 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -21,6 +21,7 @@ + #include <asm/pkru.h> + #include <asm/trapnr.h> + #include <asm/fpu/xcr.h> ++#include <asm/sev.h> + + #include "x86.h" + #include "svm.h" +@@ -73,6 +74,8 @@ static unsigned int nr_asids; + static unsigned long *sev_asid_bitmap; + static unsigned long *sev_reclaim_asid_bitmap; + ++static int snp_decommission_context(struct kvm *kvm); ++ + struct enc_region { + struct list_head list; + unsigned long npages; +@@ -98,12 +101,17 @@ static int sev_flush_asids(int min_asid, int max_asid) + down_write(&sev_deactivate_lock); + + wbinvd_on_all_cpus(); +- ret = sev_guest_df_flush(&error); ++ ++ if (sev_snp_enabled) ++ ret = snp_guest_df_flush(&error); ++ else ++ ret = sev_guest_df_flush(&error); + + up_write(&sev_deactivate_lock); + + if (ret) +- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); ++ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", ++ sev_snp_enabled ?
"-SNP" : "", ret, error); + + return ret; + } +@@ -1825,6 +1833,74 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) + return ret; + } + ++static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct sev_data_snp_gctx_create data = {}; ++ void *context; ++ int rc; ++ ++ /* Allocate memory for context page */ ++ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); ++ if (!context) ++ return NULL; ++ ++ data.gctx_paddr = __psp_pa(context); ++ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); ++ if (rc) { ++ snp_free_firmware_page(context); ++ return NULL; ++ } ++ ++ return context; ++} ++ ++static int snp_bind_asid(struct kvm *kvm, int *error) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_activate data = {0}; ++ ++ data.gctx_paddr = __psp_pa(sev->snp_context); ++ data.asid = sev_get_asid(kvm); ++ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); ++} ++ ++static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_launch_start start = {0}; ++ struct kvm_sev_snp_launch_start params; ++ int rc; ++ ++ if (!sev_snp_guest(kvm)) ++ return -ENOTTY; ++ ++ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) ++ return -EFAULT; ++ ++ sev->snp_context = snp_context_create(kvm, argp); ++ if (!sev->snp_context) ++ return -ENOTTY; ++ ++ start.gctx_paddr = __psp_pa(sev->snp_context); ++ start.policy = params.policy; ++ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); ++ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); ++ if (rc) ++ goto e_free_context; ++ ++ sev->fd = argp->sev_fd; ++ rc = snp_bind_asid(kvm, &argp->error); ++ if (rc) ++ goto e_free_context; ++ ++ return 0; ++ ++e_free_context: ++ snp_decommission_context(kvm); ++ ++ return rc; ++} ++ + int sev_mem_enc_ioctl(struct 
kvm *kvm, void __user *argp) + { + struct kvm_sev_cmd sev_cmd; +@@ -1915,6 +1991,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + case KVM_SEV_RECEIVE_FINISH: + r = sev_receive_finish(kvm, &sev_cmd); + break; ++ case KVM_SEV_SNP_LAUNCH_START: ++ r = snp_launch_start(kvm, &sev_cmd); ++ break; + default: + r = -EINVAL; + goto out; +@@ -2106,6 +2185,28 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) + return ret; + } + ++static int snp_decommission_context(struct kvm *kvm) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_decommission data = {}; ++ int ret; ++ ++ /* If context is not created then do nothing */ ++ if (!sev->snp_context) ++ return 0; ++ ++ data.gctx_paddr = __sme_pa(sev->snp_context); ++ ret = snp_guest_decommission(&data, NULL); ++ if (WARN_ONCE(ret, "failed to release guest context")) ++ return ret; ++ ++ /* free the context page now */ ++ snp_free_firmware_page(sev->snp_context); ++ sev->snp_context = NULL; ++ ++ return 0; ++} ++ + void sev_vm_destroy(struct kvm *kvm) + { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; +@@ -2147,7 +2248,15 @@ void sev_vm_destroy(struct kvm *kvm) + } + } + +- sev_unbind_asid(kvm, sev->handle); ++ if (sev_snp_guest(kvm)) { ++ if (snp_decommission_context(kvm)) { ++ WARN_ONCE(1, "Failed to free SNP guest context, leaking asid!\n"); ++ return; ++ } ++ } else { ++ sev_unbind_asid(kvm, sev->handle); ++ } ++ + sev_asid_free(sev); + } + +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 2f45589ee596..71c011af098e 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -91,6 +91,7 @@ struct kvm_sev_info { + struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; + u64 snp_init_flags; ++ void *snp_context; /* SNP guest context page */ + }; + + struct kvm_svm { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0f912cefc544..0cb119d66ae5 100644 +--- 
a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1812,6 +1812,7 @@ enum sev_cmd_id { + + /* SNP specific commands */ + KVM_SEV_SNP_INIT, ++ KVM_SEV_SNP_LAUNCH_START, + + KVM_SEV_NR_MAX, + }; +@@ -1919,6 +1920,15 @@ struct kvm_snp_init { + __u64 flags; + }; + ++struct kvm_sev_snp_launch_start { ++ __u64 policy; ++ __u64 ma_uaddr; ++ __u8 ma_en; ++ __u8 imi_en; ++ __u8 gosvw[16]; ++ __u8 pad[6]; ++}; ++ + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) + #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) +-- +2.36.1 + + +From 5516a4d4748f6a118b22836ee891782eecb7ae5c Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:27:38 +0000 +Subject: [PATCH 25/90] KVM: SVM: Disallow registering memory range from + HugeTLB for SNP guest + +While creating the VM, userspace call the KVM_MEMORY_ENCRYPT_REG_REGION +ioctl to register the memory regions for the guest. This registered +memory region is typically used as a guest RAM. Later, the guest may +issue the page state change (PSC) request that will require splitting +the large page into smaller page. If the memory is allocated from the +HugeTLB then hypervisor will not be able to split it. + +Do not allow registering the memory range backed by the HugeTLB until +the hypervisor support is added to handle the case. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 9e6fc7a94ed7..41b83aa6b5f4 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -17,6 +17,7 @@ + #include <linux/misc_cgroup.h> + #include <linux/processor.h> + #include <linux/trace_events.h> ++#include <linux/hugetlb.h> + + #include <asm/pkru.h> + #include <asm/trapnr.h> +@@ -2007,6 +2008,35 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + return r; + } + ++static bool is_range_hugetlb(struct kvm *kvm, struct kvm_enc_region *range) ++{ ++ struct vm_area_struct *vma; ++ u64 start, end; ++ bool ret = true; ++ ++ start = range->addr; ++ end = start + range->size; ++ ++ mmap_read_lock(kvm->mm); ++ ++ do { ++ vma = find_vma_intersection(kvm->mm, start, end); ++ if (!vma) ++ goto unlock; ++ ++ if (is_vm_hugetlb_page(vma)) ++ goto unlock; ++ ++ start = vma->vm_end; ++ } while (end > vma->vm_end); ++ ++ ret = false; ++ ++unlock: ++ mmap_read_unlock(kvm->mm); ++ return ret; ++} ++ + int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range) + { +@@ -2024,6 +2054,13 @@ int sev_mem_enc_register_region(struct kvm *kvm, + if (range->addr > ULONG_MAX || range->size > ULONG_MAX) + return -EINVAL; + ++ /* ++ * SEV-SNP does not support the backing pages from the HugeTLB. Verify ++ * that the registered memory range is not from the HugeTLB. 
++ */ ++ if (sev_snp_guest(kvm) && is_range_hugetlb(kvm, range)) ++ return -EINVAL; ++ + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); + if (!region) + return -ENOMEM; +-- +2.36.1 + + +From e944c65507bddf2807a31819d8383beb67f49ee3 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:29:08 +0000 +Subject: [PATCH 26/90] KVM: SVM: Add KVM_SEV_SNP_LAUNCH_UPDATE command + +The KVM_SEV_SNP_LAUNCH_UPDATE command can be used to insert data into the +guest's memory. The data is encrypted with the cryptographic context +created with the KVM_SEV_SNP_LAUNCH_START. + +In addition to the inserting data, it can insert a two special pages +into the guests memory: the secrets page and the CPUID page. + +While terminating the guest, reclaim the guest pages added in the RMP +table. If the reclaim fails, then the page is no longer safe to be +released back to the system and leak them. + +For more information see the SEV-SNP specification. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + .../virt/kvm/x86/amd-memory-encryption.rst | 29 +++ + arch/x86/kvm/svm/sev.c | 187 ++++++++++++++++++ + include/uapi/linux/kvm.h | 19 ++ + 3 files changed, 235 insertions(+) + +diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +index 878711f2dca6..62abd5c1f72b 100644 +--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst ++++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +@@ -486,6 +486,35 @@ Returns: 0 on success, -negative on error + + See the SEV-SNP specification for further detail on the launch input. + ++20. KVM_SNP_LAUNCH_UPDATE ++------------------------- ++ ++The KVM_SNP_LAUNCH_UPDATE is used for encrypting a memory region. It also ++calculates a measurement of the memory contents. 
The measurement is a signature ++of the memory contents that can be sent to the guest owner as an attestation ++that the memory was encrypted correctly by the firmware. ++ ++Parameters (in): struct kvm_snp_launch_update ++ ++Returns: 0 on success, -negative on error ++ ++:: ++ ++ struct kvm_sev_snp_launch_update { ++ __u64 start_gfn; /* Guest page number to start from. */ ++ __u64 uaddr; /* userspace address need to be encrypted */ ++ __u32 len; /* length of memory region */ ++ __u8 imi_page; /* 1 if memory is part of the IMI */ ++ __u8 page_type; /* page type */ ++ __u8 vmpl3_perms; /* VMPL3 permission mask */ ++ __u8 vmpl2_perms; /* VMPL2 permission mask */ ++ __u8 vmpl1_perms; /* VMPL1 permission mask */ ++ }; ++ ++See the SEV-SNP spec for further details on how to build the VMPL permission ++mask and page type. ++ ++ + References + ========== + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 41b83aa6b5f4..b5f0707d7ed6 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -18,6 +18,7 @@ + #include <linux/processor.h> + #include <linux/trace_events.h> + #include <linux/hugetlb.h> ++#include <linux/sev.h> + + #include <asm/pkru.h> + #include <asm/trapnr.h> +@@ -233,6 +234,49 @@ static void sev_decommission(unsigned int handle) + sev_guest_decommission(&decommission, NULL); + } + ++static inline void snp_leak_pages(u64 pfn, enum pg_level level) ++{ ++ unsigned int npages = page_level_size(level) >> PAGE_SHIFT; ++ ++ WARN(1, "psc failed pfn 0x%llx pages %d (leaking)\n", pfn, npages); ++ ++ while (npages) { ++ memory_failure(pfn, 0); ++ dump_rmpentry(pfn); ++ npages--; ++ pfn++; ++ } ++} ++ ++static int snp_page_reclaim(u64 pfn) ++{ ++ struct sev_data_snp_page_reclaim data = {0}; ++ int err, rc; ++ ++ data.paddr = __sme_set(pfn << PAGE_SHIFT); ++ rc = snp_guest_page_reclaim(&data, &err); ++ if (rc) { ++ /* ++ * If the reclaim failed, then page is no longer safe ++ * to use. 
++ */ ++ snp_leak_pages(pfn, PG_LEVEL_4K); ++ } ++ ++ return rc; ++} ++ ++static int host_rmp_make_shared(u64 pfn, enum pg_level level, bool leak) ++{ ++ int rc; ++ ++ rc = rmp_make_shared(pfn, level); ++ if (rc && leak) ++ snp_leak_pages(pfn, level); ++ ++ return rc; ++} ++ + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) + { + struct sev_data_deactivate deactivate; +@@ -1902,6 +1946,123 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) + return rc; + } + ++static bool is_hva_registered(struct kvm *kvm, hva_t hva, size_t len) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct list_head *head = &sev->regions_list; ++ struct enc_region *i; ++ ++ lockdep_assert_held(&kvm->lock); ++ ++ list_for_each_entry(i, head, list) { ++ u64 start = i->uaddr; ++ u64 end = start + i->size; ++ ++ if (start <= hva && end >= (hva + len)) ++ return true; ++ } ++ ++ return false; ++} ++ ++static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_launch_update data = {0}; ++ struct kvm_sev_snp_launch_update params; ++ unsigned long npages, pfn, n = 0; ++ int *error = &argp->error; ++ struct page **inpages; ++ int ret, i, level; ++ u64 gfn; ++ ++ if (!sev_snp_guest(kvm)) ++ return -ENOTTY; ++ ++ if (!sev->snp_context) ++ return -EINVAL; ++ ++ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) ++ return -EFAULT; ++ ++ /* Verify that the specified address range is registered. */ ++ if (!is_hva_registered(kvm, params.uaddr, params.len)) ++ return -EINVAL; ++ ++ /* ++ * The userspace memory is already locked so technically we don't ++ * need to lock it again. Later part of the function needs to know ++ * pfn so call the sev_pin_memory() so that we can get the list of ++ * pages to iterate through. 
++ */ ++ inpages = sev_pin_memory(kvm, params.uaddr, params.len, &npages, 1); ++ if (!inpages) ++ return -ENOMEM; ++ ++ /* ++ * Verify that all the pages are marked shared in the RMP table before ++ * going further. This is avoid the cases where the userspace may try ++ * updating the same page twice. ++ */ ++ for (i = 0; i < npages; i++) { ++ if (snp_lookup_rmpentry(page_to_pfn(inpages[i]), &level) != 0) { ++ sev_unpin_memory(kvm, inpages, npages); ++ return -EFAULT; ++ } ++ } ++ ++ gfn = params.start_gfn; ++ level = PG_LEVEL_4K; ++ data.gctx_paddr = __psp_pa(sev->snp_context); ++ ++ for (i = 0; i < npages; i++) { ++ pfn = page_to_pfn(inpages[i]); ++ ++ ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, level, sev_get_asid(kvm), true); ++ if (ret) { ++ ret = -EFAULT; ++ goto e_unpin; ++ } ++ ++ n++; ++ data.address = __sme_page_pa(inpages[i]); ++ data.page_size = X86_TO_RMP_PG_LEVEL(level); ++ data.page_type = params.page_type; ++ data.vmpl3_perms = params.vmpl3_perms; ++ data.vmpl2_perms = params.vmpl2_perms; ++ data.vmpl1_perms = params.vmpl1_perms; ++ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, error); ++ if (ret) { ++ /* ++ * If the command failed then need to reclaim the page. ++ */ ++ snp_page_reclaim(pfn); ++ goto e_unpin; ++ } ++ ++ gfn++; ++ } ++ ++e_unpin: ++ /* Content of memory is updated, mark pages dirty */ ++ for (i = 0; i < n; i++) { ++ set_page_dirty_lock(inpages[i]); ++ mark_page_accessed(inpages[i]); ++ ++ /* ++ * If its an error, then update RMP entry to change page ownership ++ * to the hypervisor. 
++ */ ++ if (ret) ++ host_rmp_make_shared(pfn, level, true); ++ } ++ ++ /* Unlock the user pages */ ++ sev_unpin_memory(kvm, inpages, npages); ++ ++ return ret; ++} ++ + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + { + struct kvm_sev_cmd sev_cmd; +@@ -1995,6 +2156,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; ++ case KVM_SEV_SNP_LAUNCH_UPDATE: ++ r = snp_launch_update(kvm, &sev_cmd); ++ break; + default: + r = -EINVAL; + goto out; +@@ -2113,6 +2277,29 @@ find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) + static void __unregister_enc_region_locked(struct kvm *kvm, + struct enc_region *region) + { ++ unsigned long i, pfn; ++ int level; ++ ++ /* ++ * The guest memory pages are assigned in the RMP table. Unassign it ++ * before releasing the memory. ++ */ ++ if (sev_snp_guest(kvm)) { ++ for (i = 0; i < region->npages; i++) { ++ pfn = page_to_pfn(region->pages[i]); ++ ++ if (!snp_lookup_rmpentry(pfn, &level)) ++ continue; ++ ++ cond_resched(); ++ ++ if (level > PG_LEVEL_4K) ++ pfn &= ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); ++ ++ host_rmp_make_shared(pfn, level, true); ++ } ++ } ++ + sev_unpin_memory(kvm, region->pages, region->npages); + list_del(®ion->list); + kfree(region); +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0cb119d66ae5..9b36b07414ea 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1813,6 +1813,7 @@ enum sev_cmd_id { + /* SNP specific commands */ + KVM_SEV_SNP_INIT, + KVM_SEV_SNP_LAUNCH_START, ++ KVM_SEV_SNP_LAUNCH_UPDATE, + + KVM_SEV_NR_MAX, + }; +@@ -1929,6 +1930,24 @@ struct kvm_sev_snp_launch_start { + __u8 pad[6]; + }; + ++#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 ++#define KVM_SEV_SNP_PAGE_TYPE_VMSA 0x2 ++#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 ++#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 ++#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 ++#define KVM_SEV_SNP_PAGE_TYPE_CPUID 
0x6 ++ ++struct kvm_sev_snp_launch_update { ++ __u64 start_gfn; ++ __u64 uaddr; ++ __u32 len; ++ __u8 imi_page; ++ __u8 page_type; ++ __u8 vmpl3_perms; ++ __u8 vmpl2_perms; ++ __u8 vmpl1_perms; ++}; ++ + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) + #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) +-- +2.36.1 + + +From e66d9cd4d88623939dbfaf2fb85419e29dd4b26c Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:30:45 +0000 +Subject: [PATCH 27/90] KVM: SVM: Mark the private vma unmerable for SEV-SNP + guests + +When SEV-SNP is enabled, the guest private pages are added in the RMP +table; while adding the pages, the rmp_make_private() unmaps the pages +from the direct map. If KSM attempts to access those unmapped pages then +it will trigger #PF (page-not-present). + +Encrypted guest pages cannot be shared between the process, so an +userspace should not mark the region mergeable but to be safe, mark the +process vma unmerable before adding the pages in the RMP table. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index b5f0707d7ed6..a9461d352eda 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -19,11 +19,13 @@ + #include <linux/trace_events.h> + #include <linux/hugetlb.h> + #include <linux/sev.h> ++#include <linux/ksm.h> + + #include <asm/pkru.h> + #include <asm/trapnr.h> + #include <asm/fpu/xcr.h> + #include <asm/sev.h> ++#include <asm/mman.h> + + #include "x86.h" + #include "svm.h" +@@ -1965,6 +1967,30 @@ static bool is_hva_registered(struct kvm *kvm, hva_t hva, size_t len) + return false; + } + ++static int snp_mark_unmergable(struct kvm *kvm, u64 start, u64 size) ++{ ++ struct vm_area_struct *vma; ++ u64 end = start + size; ++ int ret; ++ ++ do { ++ vma = find_vma_intersection(kvm->mm, start, end); ++ if (!vma) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, ++ MADV_UNMERGEABLE, &vma->vm_flags); ++ if (ret) ++ break; ++ ++ start = vma->vm_end; ++ } while (end > vma->vm_end); ++ ++ return ret; ++} ++ + static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) + { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; +@@ -1989,6 +2015,12 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) + if (!is_hva_registered(kvm, params.uaddr, params.len)) + return -EINVAL; + ++ mmap_write_lock(kvm->mm); ++ ret = snp_mark_unmergable(kvm, params.uaddr, params.len); ++ mmap_write_unlock(kvm->mm); ++ if (ret) ++ return -EFAULT; ++ + /* + * The userspace memory is already locked so technically we don't + * need to lock it again. 
Later part of the function needs to know +-- +2.36.1 + + +From 36992844a221523098a7bf2f67d249d921b4fb61 Mon Sep 17 00:00:00 2001 +From: Ashish Kalra <ashish.kalra@amd.com> +Date: Wed, 8 Jun 2022 19:15:26 +0000 +Subject: [PATCH 28/90] KVM: SVM: Add KVM_SEV_SNP_LAUNCH_FINISH command + +The KVM_SEV_SNP_LAUNCH_FINISH finalize the cryptographic digest and stores +it as the measurement of the guest at launch. + +While finalizing the launch flow, it also issues the LAUNCH_UPDATE command +to encrypt the VMSA pages. + +If its an SNP guest, then VMSA was added in the RMP entry as +a guest owned page and also removed from the kernel direct map +so flush it later after it is transitioned back to hypervisor +state and restored in the direct map. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> +--- + .../virt/kvm/x86/amd-memory-encryption.rst | 22 ++++ + arch/x86/kvm/svm/sev.c | 118 ++++++++++++++++++ + include/uapi/linux/kvm.h | 14 +++ + 3 files changed, 154 insertions(+) + +diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +index 62abd5c1f72b..750162cff87b 100644 +--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst ++++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst +@@ -514,6 +514,28 @@ Returns: 0 on success, -negative on error + See the SEV-SNP spec for further details on how to build the VMPL permission + mask and page type. + ++21. KVM_SNP_LAUNCH_FINISH ++------------------------- ++ ++After completion of the SNP guest launch flow, the KVM_SNP_LAUNCH_FINISH command can be ++issued to make the guest ready for the execution. 
++ ++Parameters (in): struct kvm_sev_snp_launch_finish ++ ++Returns: 0 on success, -negative on error ++ ++:: ++ ++ struct kvm_sev_snp_launch_finish { ++ __u64 id_block_uaddr; ++ __u64 id_auth_uaddr; ++ __u8 id_block_en; ++ __u8 auth_key_en; ++ __u8 host_data[32]; ++ }; ++ ++ ++See SEV-SNP specification for further details on launch finish input parameters. + + References + ========== +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index a9461d352eda..f540581d997f 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2095,6 +2095,106 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) + return ret; + } + ++static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_launch_update data = {}; ++ int i, ret; ++ ++ data.gctx_paddr = __psp_pa(sev->snp_context); ++ data.page_type = SNP_PAGE_TYPE_VMSA; ++ ++ for (i = 0; i < kvm->created_vcpus; i++) { ++ struct vcpu_svm *svm = to_svm(xa_load(&kvm->vcpu_array, i)); ++ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; ++ ++ /* Perform some pre-encryption checks against the VMSA */ ++ ret = sev_es_sync_vmsa(svm); ++ if (ret) ++ return ret; ++ ++ /* Transition the VMSA page to a firmware state. 
*/ ++ ret = rmp_make_private(pfn, -1, PG_LEVEL_4K, sev->asid, true); ++ if (ret) ++ return ret; ++ ++ /* Issue the SNP command to encrypt the VMSA */ ++ data.address = __sme_pa(svm->sev_es.vmsa); ++ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, ++ &data, &argp->error); ++ if (ret) { ++ snp_page_reclaim(pfn); ++ return ret; ++ } ++ ++ svm->vcpu.arch.guest_state_protected = true; ++ } ++ ++ return 0; ++} ++ ++static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ struct sev_data_snp_launch_finish *data; ++ void *id_block = NULL, *id_auth = NULL; ++ struct kvm_sev_snp_launch_finish params; ++ int ret; ++ ++ if (!sev_snp_guest(kvm)) ++ return -ENOTTY; ++ ++ if (!sev->snp_context) ++ return -EINVAL; ++ ++ if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) ++ return -EFAULT; ++ ++ /* Measure all vCPUs using LAUNCH_UPDATE before we finalize the launch flow. */ ++ ret = snp_launch_update_vmsa(kvm, argp); ++ if (ret) ++ return ret; ++ ++ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); ++ if (!data) ++ return -ENOMEM; ++ ++ if (params.id_block_en) { ++ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); ++ if (IS_ERR(id_block)) { ++ ret = PTR_ERR(id_block); ++ goto e_free; ++ } ++ ++ data->id_block_en = 1; ++ data->id_block_paddr = __sme_pa(id_block); ++ } ++ ++ if (params.auth_key_en) { ++ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); ++ if (IS_ERR(id_auth)) { ++ ret = PTR_ERR(id_auth); ++ goto e_free_id_block; ++ } ++ ++ data->auth_key_en = 1; ++ data->id_auth_paddr = __sme_pa(id_auth); ++ } ++ ++ data->gctx_paddr = __psp_pa(sev->snp_context); ++ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); ++ ++ kfree(id_auth); ++ ++e_free_id_block: ++ kfree(id_block); ++ ++e_free: ++ kfree(data); ++ ++ return ret; ++} ++ + int sev_mem_enc_ioctl(struct kvm *kvm, void __user 
*argp) + { + struct kvm_sev_cmd sev_cmd; +@@ -2191,6 +2291,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; ++ case KVM_SEV_SNP_LAUNCH_FINISH: ++ r = snp_launch_finish(kvm, &sev_cmd); ++ break; + default: + r = -EINVAL; + goto out; +@@ -2696,11 +2799,26 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) + + svm = to_svm(vcpu); + ++ /* ++ * If its an SNP guest, then VMSA was added in the RMP entry as ++ * a guest owned page. Transition the page to hyperivosr state ++ * before releasing it back to the system. ++ * Also the page is removed from the kernel direct map, so flush it ++ * later after it is transitioned back to hypervisor state and ++ * restored in the direct map. ++ */ ++ if (sev_snp_guest(vcpu->kvm)) { ++ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; ++ if (host_rmp_make_shared(pfn, PG_LEVEL_4K, false)) ++ goto skip_vmsa_free; ++ } ++ + if (vcpu->arch.guest_state_protected) + sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); + + __free_page(virt_to_page(svm->sev_es.vmsa)); + ++skip_vmsa_free: + if (svm->sev_es.ghcb_sa_free) + kvfree(svm->sev_es.ghcb_sa); + } +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 9b36b07414ea..5a4662716b6a 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -1814,6 +1814,7 @@ enum sev_cmd_id { + KVM_SEV_SNP_INIT, + KVM_SEV_SNP_LAUNCH_START, + KVM_SEV_SNP_LAUNCH_UPDATE, ++ KVM_SEV_SNP_LAUNCH_FINISH, + + KVM_SEV_NR_MAX, + }; +@@ -1948,6 +1949,19 @@ struct kvm_sev_snp_launch_update { + __u8 vmpl1_perms; + }; + ++#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 ++#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 ++#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 ++ ++struct kvm_sev_snp_launch_finish { ++ __u64 id_block_uaddr; ++ __u64 id_auth_uaddr; ++ __u8 id_block_en; ++ __u8 auth_key_en; ++ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; ++ __u8 pad[6]; ++}; ++ + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #define 
KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) + #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) +-- +2.36.1 + + +From d2203462d423ede4e6146a2649576fbf3d7c5d96 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:39:57 +0000 +Subject: [PATCH 29/90] KVM: X86: Keep the NPT and RMP page level in sync + +When running an SEV-SNP VM, the sPA used to index the RMP entry is +obtained through the NPT translation (gva->gpa->spa). The NPT page +level is checked against the page level programmed in the RMP entry. +If the page level does not match, then it will cause a nested page +fault with the RMP bit set to indicate the RMP violation. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 + + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/mmu/mmu.c | 5 ++++ + arch/x86/kvm/svm/sev.c | 46 ++++++++++++++++++++++++++++++ + arch/x86/kvm/svm/svm.c | 1 + + arch/x86/kvm/svm/svm.h | 1 + + 6 files changed, 55 insertions(+) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index a66292dae698..e0068e702692 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -129,6 +129,7 @@ KVM_X86_OP(complete_emulated_msr) + KVM_X86_OP(vcpu_deliver_sipi_vector) + KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); + KVM_X86_OP(alloc_apic_backing_page) ++KVM_X86_OP_OPTIONAL(rmp_page_level_adjust) + + #undef KVM_X86_OP + #undef KVM_X86_OP_OPTIONAL +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 0205e2944067..2748c69609e3 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1514,6 +1514,7 @@ struct kvm_x86_ops { + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); ++ void (*rmp_page_level_adjust)(struct kvm *kvm, kvm_pfn_t pfn, int *level); + }; + + struct kvm_x86_nested_ops { +diff --git 
a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index c623019929a7..997318ecebd1 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -43,6 +43,7 @@ + #include <linux/hash.h> + #include <linux/kern_levels.h> + #include <linux/kthread.h> ++#include <linux/sev.h> + + #include <asm/page.h> + #include <asm/memtype.h> +@@ -2824,6 +2825,10 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + if (unlikely(!pte)) + return PG_LEVEL_4K; + ++ /* Adjust the page level based on the SEV-SNP RMP page level. */ ++ if (kvm_x86_ops.rmp_page_level_adjust) ++ static_call(kvm_x86_rmp_page_level_adjust)(kvm, pfn, &level); ++ + return level; + } + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index f540581d997f..31be2d52ec2d 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3596,3 +3596,49 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) + + return pfn_to_page(pfn); + } ++ ++static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) ++{ ++ int level; ++ ++ while (end > start) { ++ if (snp_lookup_rmpentry(start, &level) != 0) ++ return false; ++ start++; ++ } ++ ++ return true; ++} ++ ++void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level) ++{ ++ int rmp_level, assigned; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) ++ return; ++ ++ assigned = snp_lookup_rmpentry(pfn, &rmp_level); ++ if (unlikely(assigned < 0)) ++ return; ++ ++ if (!assigned) { ++ /* ++ * If all the pages are shared then no need to keep the RMP ++ * and NPT in sync. ++ */ ++ pfn = pfn & ~(PTRS_PER_PMD - 1); ++ if (is_pfn_range_shared(pfn, pfn + PTRS_PER_PMD)) ++ return; ++ } ++ ++ /* ++ * The hardware installs 2MB TLB entries to access to 1GB pages, ++ * therefore allow NPT to use 1GB pages when pfn was added as 2MB ++ * in the RMP table. 
++ */ ++ if (rmp_level == PG_LEVEL_2M && (*level == PG_LEVEL_1G)) ++ return; ++ ++ /* Adjust the level to keep the NPT and RMP in sync */ ++ *level = min_t(size_t, *level, rmp_level); ++} +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index b4bd64f94d3a..18e2cd4d9559 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4734,6 +4734,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, + + .alloc_apic_backing_page = svm_alloc_apic_backing_page, ++ .rmp_page_level_adjust = sev_rmp_page_level_adjust, + }; + + /* +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 71c011af098e..7782312a1cda 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -673,6 +673,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); + void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); + void sev_es_unmap_ghcb(struct vcpu_svm *svm); + struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); ++void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level); + + /* vmenter.S */ + +-- +2.36.1 + + +From c0ae059b260cd7f56ff5198e13024285652b543d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson <sean.j.christopherson@intel.com> +Date: Tue, 26 Apr 2022 18:42:35 +0000 +Subject: [PATCH 30/90] KVM: x86/mmu: Introduce kvm_mmu_map_tdp_page() for use + by TDX and SNP + +Introduce a helper to directly (pun intended) fault-in a TDP page +without having to go through the full page fault path. This allows +TDX to get the resulting pfn and also allows the RET_PF_* enums to +stay in mmu.c where they belong. 
+ +Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> +Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/mmu.h | 3 +++ + arch/x86/kvm/mmu/mmu.c | 51 ++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 54 insertions(+) + +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index e6cae6f22683..c99b15e97a0a 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -204,6 +204,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + return vcpu->arch.mmu->page_fault(vcpu, &fault); + } + ++kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, ++ u32 error_code, int max_level); ++ + /* + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 997318ecebd1..569021af349a 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4100,6 +4100,57 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) + return direct_page_fault(vcpu, fault); + } + ++kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, ++ u32 err, int max_level) ++{ ++ struct kvm_page_fault fault = { ++ .addr = gpa, ++ .error_code = err, ++ .exec = err & PFERR_FETCH_MASK, ++ .write = err & PFERR_WRITE_MASK, ++ .present = err & PFERR_PRESENT_MASK, ++ .rsvd = err & PFERR_RSVD_MASK, ++ .user = err & PFERR_USER_MASK, ++ .prefetch = false, ++ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), ++ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), ++ ++ .max_level = max_level, ++ .req_level = PG_LEVEL_4K, ++ .goal_level = PG_LEVEL_4K, ++ }; ++ int r; ++ ++ if (mmu_topup_memory_caches(vcpu, false)) ++ return KVM_PFN_ERR_FAULT; ++ ++ /* ++ * Loop on the page fault path to handle the case where an mmu_notifier ++ * 
invalidation triggers RET_PF_RETRY. In the normal page fault path, ++ * KVM needs to resume the guest in case the invalidation changed any ++ * of the page fault properties, i.e. the gpa or error code. For this ++ * path, the gpa and error code are fixed by the caller, and the caller ++ * expects failure if and only if the page fault can't be fixed. ++ */ ++ do { ++ /* ++ * TODO: this should probably go through kvm_mmu_do_page_fault(), ++ * but we need a way to control the max_level, so maybe a direct ++ * call to kvm_tdp_page_fault, which will call into ++ * direct_page_fault() when appropriate. ++ */ ++ //r = direct_page_fault(vcpu, &fault); ++#if CONFIG_RETPOLINE ++ if (fault.is_tdp) ++ r = kvm_tdp_page_fault(vcpu, &fault); ++#else ++ r = vcpu->arch.mmu->page_fault(vcpu, &fault); ++#endif ++ } while (r == RET_PF_RETRY && !is_error_noslot_pfn(fault.pfn)); ++ return fault.pfn; ++} ++EXPORT_SYMBOL_GPL(kvm_mmu_map_tdp_page); ++ + static void nonpaging_init_context(struct kvm_mmu *context) + { + context->page_fault = nonpaging_page_fault; +-- +2.36.1 + + +From b797943039b696cf2def858b7f38af4fe2922577 Mon Sep 17 00:00:00 2001 +From: Ashish Kalra <ashish.kalra@amd.com> +Date: Wed, 8 Jun 2022 19:10:07 +0000 +Subject: [PATCH 31/90] KVM: x86: Introduce kvm_mmu_get_tdp_walk() for SEV-SNP + use + +The SEV-SNP VMs may call the page state change VMGEXIT to add the GPA +as private or shared in the RMP table. The page state change VMGEXIT +will contain the RMP page level to be used in the RMP entry. If the +page level between the TDP and RMP does not match then, it will result +in nested-page-fault (RMP violation). + +The SEV-SNP VMGEXIT handler will use the kvm_mmu_get_tdp_walk() to get +the current page-level in the TDP for the given GPA and calculate a +workable page level. If a GPA is mapped as a 4K-page in the TDP, but +the guest requested to add the GPA as a 2M in the RMP entry then the +2M request will be broken into 4K-pages to keep the RMP and TDP +page-levels in sync. 
+ +TDP SPTEs are RCU protected so need to put kvm_mmu_get_tdp_walk() in RCU +read-side critical section by using walk_shadow_page_lockless_begin() and +walk_shadow_page_lockless_end(). This fixes the "suspicious RCU usage" +message seen with lockdep kernel build. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> +--- + arch/x86/kvm/mmu.h | 2 ++ + arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++++++++++++++++++++++ + 2 files changed, 35 insertions(+) + +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index c99b15e97a0a..d55b5166389a 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -178,6 +178,8 @@ static inline bool is_nx_huge_page_enabled(void) + return READ_ONCE(nx_huge_pages); + } + ++bool kvm_mmu_get_tdp_walk(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t *pfn, int *level); ++ + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 err, bool prefetch) + { +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 569021af349a..c1ac486e096e 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4151,6 +4151,39 @@ kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, + } + EXPORT_SYMBOL_GPL(kvm_mmu_map_tdp_page); + ++bool kvm_mmu_get_tdp_walk(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t *pfn, int *level) ++{ ++ u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; ++ int leaf, root; ++ ++ walk_shadow_page_lockless_begin(vcpu); ++ ++ if (is_tdp_mmu(vcpu->arch.mmu)) ++ leaf = kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, &root); ++ else ++ leaf = get_walk(vcpu, gpa, sptes, &root); ++ ++ walk_shadow_page_lockless_end(vcpu); ++ ++ if (unlikely(leaf < 0)) ++ return false; ++ ++ /* Check if the leaf SPTE is present */ ++ if (!is_shadow_present_pte(sptes[leaf])) ++ return false; ++ ++ *pfn = spte_to_pfn(sptes[leaf]); ++ if (leaf > PG_LEVEL_4K) { ++ u64 page_mask = KVM_PAGES_PER_HPAGE(leaf) - KVM_PAGES_PER_HPAGE(leaf - 1); ++ *pfn |= (gpa_to_gfn(gpa) & 
page_mask); ++ } ++ ++ *level = leaf; ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(kvm_mmu_get_tdp_walk); ++ + static void nonpaging_init_context(struct kvm_mmu *context) + { + context->page_fault = nonpaging_page_fault; +-- +2.36.1 + + +From 0f89d525ae1495cdcb19184dc46207c2ef7281d2 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:46:34 +0000 +Subject: [PATCH 32/90] KVM: x86: Define RMP page fault error bits for #NPF + +When SEV-SNP is enabled globally, the hardware places restrictions on all +memory accesses based on the RMP entry, whether the hypervisor or a VM, +performs the accesses. When hardware encounters an RMP access violation +during a guest access, it will cause a #VMEXIT(NPF). + +See APM2 section 16.36.10 for more details. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/kvm_host.h | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 2748c69609e3..49b217dc8d7e 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -247,9 +247,13 @@ enum x86_intercept_stage; + #define PFERR_FETCH_BIT 4 + #define PFERR_PK_BIT 5 + #define PFERR_SGX_BIT 15 ++#define PFERR_GUEST_RMP_BIT 31 + #define PFERR_GUEST_FINAL_BIT 32 + #define PFERR_GUEST_PAGE_BIT 33 + #define PFERR_IMPLICIT_ACCESS_BIT 48 ++#define PFERR_GUEST_ENC_BIT 34 ++#define PFERR_GUEST_SIZEM_BIT 35 ++#define PFERR_GUEST_VMPL_BIT 36 + + #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) + #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +@@ -261,6 +265,10 @@ enum x86_intercept_stage; + #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) + #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + #define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) ++#define PFERR_GUEST_RMP_MASK (1ULL << PFERR_GUEST_RMP_BIT) ++#define PFERR_GUEST_ENC_MASK (1ULL << PFERR_GUEST_ENC_BIT) ++#define 
PFERR_GUEST_SIZEM_MASK (1ULL << PFERR_GUEST_SIZEM_BIT) ++#define PFERR_GUEST_VMPL_MASK (1ULL << PFERR_GUEST_VMPL_BIT) + + #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_WRITE_MASK | \ +-- +2.36.1 + + +From 9c3e136755d85cc5f1d1f8750a54f33593ff9abd Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:48:43 +0000 +Subject: [PATCH 33/90] KVM: x86: Update page-fault trace to log full 64-bit + error code + +The #NPT error code is a 64-bit value but the trace prints only the +lower 32-bits. Some of the fault error code (e.g PFERR_GUEST_FINAL_MASK) +are available in the upper 32-bits. + +Cc: <stable@kernel.org> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/trace.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h +index e3a24b8f04be..9b9bc5468103 100644 +--- a/arch/x86/kvm/trace.h ++++ b/arch/x86/kvm/trace.h +@@ -383,12 +383,12 @@ TRACE_EVENT(kvm_inj_exception, + * Tracepoint for page fault. 
+ */ + TRACE_EVENT(kvm_page_fault, +- TP_PROTO(unsigned long fault_address, unsigned int error_code), ++ TP_PROTO(unsigned long fault_address, u64 error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) +- __field( unsigned int, error_code ) ++ __field( u64, error_code ) + ), + + TP_fast_assign( +@@ -396,7 +396,7 @@ TRACE_EVENT(kvm_page_fault, + __entry->error_code = error_code; + ), + +- TP_printk("address %lx error_code %x", ++ TP_printk("address %lx error_code %llx", + __entry->fault_address, __entry->error_code) + ); + +-- +2.36.1 + + +From 5295a253366219ebd565fceab86a579486ac0c72 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:50:19 +0000 +Subject: [PATCH 34/90] KVM: SVM: Do not use long-lived GHCB map while setting + scratch area + +The setup_vmgexit_scratch() function may rely on a long-lived GHCB +mapping if the GHCB shared buffer area was used for the scratch area. +In preparation for eliminating the long-lived GHCB mapping, always +allocate a buffer for the scratch area so it can be accessed without +the GHCB mapping. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 74 +++++++++++++++++++----------------------- + arch/x86/kvm/svm/svm.h | 3 +- + 2 files changed, 36 insertions(+), 41 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 31be2d52ec2d..49291a70a6a0 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2819,8 +2819,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) + __free_page(virt_to_page(svm->sev_es.vmsa)); + + skip_vmsa_free: +- if (svm->sev_es.ghcb_sa_free) +- kvfree(svm->sev_es.ghcb_sa); ++ kvfree(svm->sev_es.ghcb_sa); + } + + static void dump_ghcb(struct vcpu_svm *svm) +@@ -2908,6 +2907,9 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) + control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); + control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + ++ /* Copy the GHCB scratch area GPA */ ++ svm->sev_es.ghcb_sa_gpa = ghcb_get_sw_scratch(ghcb); ++ + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + } +@@ -3053,23 +3055,12 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) + if (!svm->sev_es.ghcb) + return; + +- if (svm->sev_es.ghcb_sa_free) { +- /* +- * The scratch area lives outside the GHCB, so there is a +- * buffer that, depending on the operation performed, may +- * need to be synced, then freed. +- */ +- if (svm->sev_es.ghcb_sa_sync) { +- kvm_write_guest(svm->vcpu.kvm, +- ghcb_get_sw_scratch(svm->sev_es.ghcb), +- svm->sev_es.ghcb_sa, +- svm->sev_es.ghcb_sa_len); +- svm->sev_es.ghcb_sa_sync = false; +- } +- +- kvfree(svm->sev_es.ghcb_sa); +- svm->sev_es.ghcb_sa = NULL; +- svm->sev_es.ghcb_sa_free = false; ++ /* Sync the scratch buffer area. 
*/ ++ if (svm->sev_es.ghcb_sa_sync) { ++ kvm_write_guest(svm->vcpu.kvm, ++ ghcb_get_sw_scratch(svm->sev_es.ghcb), ++ svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); ++ svm->sev_es.ghcb_sa_sync = false; + } + + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); +@@ -3110,9 +3101,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + struct ghcb *ghcb = svm->sev_es.ghcb; + u64 ghcb_scratch_beg, ghcb_scratch_end; + u64 scratch_gpa_beg, scratch_gpa_end; +- void *scratch_va; + +- scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); ++ scratch_gpa_beg = svm->sev_es.ghcb_sa_gpa; + if (!scratch_gpa_beg) { + pr_err("vmgexit: scratch gpa not provided\n"); + goto e_scratch; +@@ -3142,9 +3132,6 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + scratch_gpa_beg, scratch_gpa_end); + goto e_scratch; + } +- +- scratch_va = (void *)svm->sev_es.ghcb; +- scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + } else { + /* + * The guest memory must be read into a kernel buffer, so +@@ -3155,29 +3142,36 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + len, GHCB_SCRATCH_AREA_LIMIT); + goto e_scratch; + } +- scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); +- if (!scratch_va) +- return -ENOMEM; ++ } + +- if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { +- /* Unable to copy scratch area from guest */ +- pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); ++ if (svm->sev_es.ghcb_sa_alloc_len < len) { ++ void *scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); + +- kvfree(scratch_va); +- return -EFAULT; +- } ++ if (!scratch_va) ++ return -ENOMEM; + + /* +- * The scratch area is outside the GHCB. The operation will +- * dictate whether the buffer needs to be synced before running +- * the vCPU next time (i.e. a read was requested so the data +- * must be written back to the guest memory). ++ * Free the old scratch area and switch to using newly ++ * allocated. 
+ */ +- svm->sev_es.ghcb_sa_sync = sync; +- svm->sev_es.ghcb_sa_free = true; ++ kvfree(svm->sev_es.ghcb_sa); ++ ++ svm->sev_es.ghcb_sa_alloc_len = len; ++ svm->sev_es.ghcb_sa = scratch_va; + } + +- svm->sev_es.ghcb_sa = scratch_va; ++ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, svm->sev_es.ghcb_sa, len)) { ++ /* Unable to copy scratch area from guest */ ++ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); ++ return -EFAULT; ++ } ++ ++ /* ++ * The operation will dictate whether the buffer needs to be synced ++ * before running the vCPU next time (i.e. a read was requested so ++ * the data must be written back to the guest memory). ++ */ ++ svm->sev_es.ghcb_sa_sync = sync; + svm->sev_es.ghcb_sa_len = len; + + return 0; +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 7782312a1cda..bd0db4d4a61e 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -197,8 +197,9 @@ struct vcpu_sev_es_state { + /* SEV-ES scratch area support */ + void *ghcb_sa; + u32 ghcb_sa_len; ++ u64 ghcb_sa_gpa; ++ u32 ghcb_sa_alloc_len; + bool ghcb_sa_sync; +- bool ghcb_sa_free; + }; + + struct vcpu_svm { +-- +2.36.1 + + +From 7a35659b11d5b24517159e3d67b604000fd83e27 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:53:13 +0000 +Subject: [PATCH 35/90] KVM: SVM: Remove the long-lived GHCB host map + +On VMGEXIT, sev_handle_vmgexit() creates a host mapping for the GHCB GPA, +and unmaps it just before VM-entry. This long-lived GHCB map is used by +the VMGEXIT handler through accessors such as ghcb_{set_get}_xxx(). + +A long-lived GHCB map can cause issue when SEV-SNP is enabled. When +SEV-SNP is enabled the mapped GPA needs to be protected against a page +state change. + +To eliminate the long-lived GHCB mapping, update the GHCB sync operations +to explicitly map the GHCB before access and unmap it after access is +complete. 
This requires that the setting of the GHCBs sw_exit_info_{1,2} +fields be done during sev_es_sync_to_ghcb(), so create two new fields in +the vcpu_svm struct to hold these values when required to be set outside +of the GHCB mapping. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 131 ++++++++++++++++++++++++++--------------- + arch/x86/kvm/svm/svm.c | 12 ++-- + arch/x86/kvm/svm/svm.h | 24 +++++++- + 3 files changed, 111 insertions(+), 56 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 49291a70a6a0..f356f80f9959 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2822,15 +2822,40 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) + kvfree(svm->sev_es.ghcb_sa); + } + ++static inline int svm_map_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map) ++{ ++ struct vmcb_control_area *control = &svm->vmcb->control; ++ u64 gfn = gpa_to_gfn(control->ghcb_gpa); ++ ++ if (kvm_vcpu_map(&svm->vcpu, gfn, map)) { ++ /* Unable to map GHCB from guest */ ++ pr_err("error mapping GHCB GFN [%#llx] from guest\n", gfn); ++ return -EFAULT; ++ } ++ ++ return 0; ++} ++ ++static inline void svm_unmap_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map) ++{ ++ kvm_vcpu_unmap(&svm->vcpu, map, true); ++} ++ + static void dump_ghcb(struct vcpu_svm *svm) + { +- struct ghcb *ghcb = svm->sev_es.ghcb; ++ struct kvm_host_map map; + unsigned int nbits; ++ struct ghcb *ghcb; ++ ++ if (svm_map_ghcb(svm, &map)) ++ return; ++ ++ ghcb = map.hva; + + /* Re-use the dump_invalid_vmcb module parameter */ + if (!dump_invalid_vmcb) { + pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); +- return; ++ goto e_unmap; + } + + nbits = sizeof(ghcb->save.valid_bitmap) * 8; +@@ -2845,12 +2870,21 @@ static void dump_ghcb(struct vcpu_svm *svm) + pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", + ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, 
ghcb->save.valid_bitmap); ++ ++e_unmap: ++ svm_unmap_ghcb(svm, &map); + } + +-static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) ++static bool sev_es_sync_to_ghcb(struct vcpu_svm *svm) + { + struct kvm_vcpu *vcpu = &svm->vcpu; +- struct ghcb *ghcb = svm->sev_es.ghcb; ++ struct kvm_host_map map; ++ struct ghcb *ghcb; ++ ++ if (svm_map_ghcb(svm, &map)) ++ return false; ++ ++ ghcb = map.hva; + + /* + * The GHCB protocol so far allows for the following data +@@ -2864,13 +2898,24 @@ static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) + ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); + ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); + ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); ++ ++ /* ++ * Copy the return values from the exit_info_{1,2}. ++ */ ++ ghcb_set_sw_exit_info_1(ghcb, svm->sev_es.ghcb_sw_exit_info_1); ++ ghcb_set_sw_exit_info_2(ghcb, svm->sev_es.ghcb_sw_exit_info_2); ++ ++ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, ghcb); ++ ++ svm_unmap_ghcb(svm, &map); ++ ++ return true; + } + +-static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) ++static void sev_es_sync_from_ghcb(struct vcpu_svm *svm, struct ghcb *ghcb) + { + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; +- struct ghcb *ghcb = svm->sev_es.ghcb; + u64 exit_code; + + /* +@@ -2914,20 +2959,25 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + } + +-static int sev_es_validate_vmgexit(struct vcpu_svm *svm) ++static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code) + { +- struct kvm_vcpu *vcpu; ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm_host_map map; + struct ghcb *ghcb; +- u64 exit_code; + u64 reason; + +- ghcb = svm->sev_es.ghcb; ++ if (svm_map_ghcb(svm, &map)) ++ return -EFAULT; ++ ++ ghcb = map.hva; ++ ++ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); + + /* + * Retrieve the exit code now even though it may not be marked valid + * as 
it could help with debugging. + */ +- exit_code = ghcb_get_sw_exit_code(ghcb); ++ *exit_code = ghcb_get_sw_exit_code(ghcb); + + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) { +@@ -3020,6 +3070,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) + goto vmgexit_err; + } + ++ sev_es_sync_from_ghcb(svm, ghcb); ++ ++ svm_unmap_ghcb(svm, &map); + return 0; + + vmgexit_err: +@@ -3030,10 +3083,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) + ghcb->ghcb_usage); + } else if (reason == GHCB_ERR_INVALID_EVENT) { + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", +- exit_code); ++ *exit_code); + } else { + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", +- exit_code); ++ *exit_code); + dump_ghcb(svm); + } + +@@ -3043,6 +3096,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, reason); + ++ svm_unmap_ghcb(svm, &map); ++ + /* Resume the guest to "return" the error code. */ + return 1; + } +@@ -3052,23 +3107,20 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + +- if (!svm->sev_es.ghcb) ++ if (!svm->sev_es.ghcb_in_use) + return; + + /* Sync the scratch buffer area. 
*/ + if (svm->sev_es.ghcb_sa_sync) { + kvm_write_guest(svm->vcpu.kvm, +- ghcb_get_sw_scratch(svm->sev_es.ghcb), ++ svm->sev_es.ghcb_sa_gpa, + svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; + } + +- trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); +- + sev_es_sync_to_ghcb(svm); + +- kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); +- svm->sev_es.ghcb = NULL; ++ svm->sev_es.ghcb_in_use = false; + } + + void pre_sev_run(struct vcpu_svm *svm, int cpu) +@@ -3098,7 +3150,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) + static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + { + struct vmcb_control_area *control = &svm->vmcb->control; +- struct ghcb *ghcb = svm->sev_es.ghcb; + u64 ghcb_scratch_beg, ghcb_scratch_end; + u64 scratch_gpa_beg, scratch_gpa_end; + +@@ -3177,8 +3228,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + return 0; + + e_scratch: +- ghcb_set_sw_exit_info_1(ghcb, 2); +- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); ++ svm_set_ghcb_sw_exit_info_1(&svm->vcpu, 2); ++ svm_set_ghcb_sw_exit_info_2(&svm->vcpu, GHCB_ERR_INVALID_SCRATCH_AREA); + + return 1; + } +@@ -3315,7 +3366,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + u64 ghcb_gpa, exit_code; +- struct ghcb *ghcb; + int ret; + + /* Validate the GHCB */ +@@ -3330,29 +3380,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + return 1; + } + +- if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { +- /* Unable to map GHCB from guest */ +- vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", +- ghcb_gpa); +- +- /* Without a GHCB, just return right back to the guest */ +- return 1; +- } +- +- svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; +- ghcb = svm->sev_es.ghcb_map.hva; +- +- trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); +- +- exit_code = 
ghcb_get_sw_exit_code(ghcb); +- +- ret = sev_es_validate_vmgexit(svm); ++ ret = sev_es_validate_vmgexit(svm, &exit_code); + if (ret) + return ret; + +- sev_es_sync_from_ghcb(svm); +- ghcb_set_sw_exit_info_1(ghcb, 0); +- ghcb_set_sw_exit_info_2(ghcb, 0); ++ svm->sev_es.ghcb_in_use = true; ++ ++ svm_set_ghcb_sw_exit_info_1(vcpu, 0); ++ svm_set_ghcb_sw_exit_info_2(vcpu, 0); + + switch (exit_code) { + case SVM_VMGEXIT_MMIO_READ: +@@ -3392,20 +3427,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + break; + case 1: + /* Get AP jump table address */ +- ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); ++ svm_set_ghcb_sw_exit_info_2(vcpu, sev->ap_jump_table); + break; + default: + pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", + control->exit_info_1); +- ghcb_set_sw_exit_info_1(ghcb, 2); +- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); ++ svm_set_ghcb_sw_exit_info_1(vcpu, 2); ++ svm_set_ghcb_sw_exit_info_2(vcpu, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; + } + case SVM_VMGEXIT_HV_FEATURES: { +- ghcb_set_sw_exit_info_2(ghcb, GHCB_HV_FT_SUPPORTED); ++ svm_set_ghcb_sw_exit_info_2(vcpu, GHCB_HV_FT_SUPPORTED); + + ret = 1; + break; +@@ -3536,7 +3571,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. 
+ */ +- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); ++ svm_set_ghcb_sw_exit_info_2(vcpu, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 18e2cd4d9559..b24e0171cbf2 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -2720,14 +2720,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) + { + struct vcpu_svm *svm = to_svm(vcpu); +- if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) ++ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb_in_use)) + return kvm_complete_insn_gp(vcpu, err); + +- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); +- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, +- X86_TRAP_GP | +- SVM_EVTINJ_TYPE_EXEPT | +- SVM_EVTINJ_VALID); ++ svm_set_ghcb_sw_exit_info_1(vcpu, 1); ++ svm_set_ghcb_sw_exit_info_2(vcpu, ++ X86_TRAP_GP | ++ SVM_EVTINJ_TYPE_EXEPT | ++ SVM_EVTINJ_VALID); + return 1; + } + +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index bd0db4d4a61e..c80352c9c0d6 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -189,8 +189,7 @@ struct svm_nested_state { + struct vcpu_sev_es_state { + /* SEV-ES support */ + struct sev_es_save_area *vmsa; +- struct ghcb *ghcb; +- struct kvm_host_map ghcb_map; ++ bool ghcb_in_use; + bool received_first_sipi; + unsigned int ap_reset_hold_type; + +@@ -200,6 +199,13 @@ struct vcpu_sev_es_state { + u64 ghcb_sa_gpa; + u32 ghcb_sa_alloc_len; + bool ghcb_sa_sync; ++ ++ /* ++ * SEV-ES support to hold the sw_exit_info return values to be ++ * sync'ed to the GHCB when mapped. 
++ */ ++ u64 ghcb_sw_exit_info_1; ++ u64 ghcb_sw_exit_info_2; + }; + + struct vcpu_svm { +@@ -614,6 +620,20 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); + void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); + void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); + ++static inline void svm_set_ghcb_sw_exit_info_1(struct kvm_vcpu *vcpu, u64 val) ++{ ++ struct vcpu_svm *svm = to_svm(vcpu); ++ ++ svm->sev_es.ghcb_sw_exit_info_1 = val; ++} ++ ++static inline void svm_set_ghcb_sw_exit_info_2(struct kvm_vcpu *vcpu, u64 val) ++{ ++ struct vcpu_svm *svm = to_svm(vcpu); ++ ++ svm->sev_es.ghcb_sw_exit_info_2 = val; ++} ++ + extern struct kvm_x86_nested_ops svm_nested_ops; + + /* avic.c */ +-- +2.36.1 + + +From 61e2c9e422a5186e87b801a5fac6aa2bc1c055b3 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:54:45 +0000 +Subject: [PATCH 36/90] KVM: SVM: Add support to handle GHCB GPA register + VMGEXIT + +SEV-SNP guests are required to perform a GHCB GPA registration. Before +using a GHCB GPA for a vCPU the first time, a guest must register the +vCPU GHCB GPA. If the hypervisor can work with the guest-requested GPA, +it must respond with the same GPA; otherwise it must return -1. + +On VMEXIT, verify that the GHCB GPA matches the registered value. If a +mismatch is detected, abort the guest. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev-common.h | 8 ++++++++ + arch/x86/kvm/svm/sev.c | 27 +++++++++++++++++++++++++++ + arch/x86/kvm/svm/svm.h | 7 +++++++ + 3 files changed, 42 insertions(+) + +diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h +index 539de6b93420..0a9055cdfae2 100644 +--- a/arch/x86/include/asm/sev-common.h ++++ b/arch/x86/include/asm/sev-common.h +@@ -59,6 +59,14 @@ + #define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 + #define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) + ++/* Preferred GHCB GPA Request */ ++#define GHCB_MSR_PREF_GPA_REQ 0x010 ++#define GHCB_MSR_GPA_VALUE_POS 12 ++#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) ++ ++#define GHCB_MSR_PREF_GPA_RESP 0x011 ++#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff ++ + /* GHCB GPA Register */ + #define GHCB_MSR_REG_GPA_REQ 0x012 + #define GHCB_MSR_REG_GPA_REQ_VAL(v) \ +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index f356f80f9959..a95b574c6569 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3330,6 +3330,27 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; + } ++ case GHCB_MSR_PREF_GPA_REQ: { ++ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, ++ GHCB_MSR_GPA_VALUE_POS); ++ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, ++ GHCB_MSR_INFO_POS); ++ break; ++ } ++ case GHCB_MSR_REG_GPA_REQ: { ++ u64 gfn; ++ ++ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, ++ GHCB_MSR_GPA_VALUE_POS); ++ ++ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); ++ ++ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, ++ GHCB_MSR_GPA_VALUE_POS); ++ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, ++ GHCB_MSR_INFO_POS); ++ break; ++ } + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + +@@ -3380,6 +3401,12 @@ int sev_handle_vmgexit(struct kvm_vcpu 
*vcpu) + return 1; + } + ++ /* SEV-SNP guest requires that the GHCB GPA must be registered */ ++ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { ++ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); ++ return -EINVAL; ++ } ++ + ret = sev_es_validate_vmgexit(svm, &exit_code); + if (ret) + return ret; +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index c80352c9c0d6..54ff56cb6125 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -206,6 +206,8 @@ struct vcpu_sev_es_state { + */ + u64 ghcb_sw_exit_info_1; + u64 ghcb_sw_exit_info_2; ++ ++ u64 ghcb_registered_gpa; + }; + + struct vcpu_svm { +@@ -334,6 +336,11 @@ static inline bool sev_snp_guest(struct kvm *kvm) + return sev_es_guest(kvm) && sev->snp_active; + } + ++static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) ++{ ++ return svm->sev_es.ghcb_registered_gpa == val; ++} ++ + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) + { + vmcb->control.clean = 0; +-- +2.36.1 + + +From cdb3ff5f5f51d8fb2dd5b84cb417bc813bf1da8f Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:58:08 +0000 +Subject: [PATCH 37/90] KVM: SVM: Add support to handle MSR based Page State + Change VMGEXIT + +SEV-SNP VMs can ask the hypervisor to change the page state in the RMP +table to be private or shared using the Page State Change MSR protocol +as defined in the GHCB specification. + +Before changing the page state in the RMP entry, lookup the page in the +NPT to make sure that there is a valid mapping for it. If the mapping +exist then try to find a workable page level between the NPT and RMP for +the page. If the page is not mapped in the NPT, then create a fault such +that it gets mapped before we change the page state in the RMP entry. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev-common.h | 9 ++ + arch/x86/kvm/svm/sev.c | 197 ++++++++++++++++++++++++++++++ + arch/x86/kvm/trace.h | 34 ++++++ + arch/x86/kvm/x86.c | 1 + + 4 files changed, 241 insertions(+) + +diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h +index 0a9055cdfae2..ee38f7408470 100644 +--- a/arch/x86/include/asm/sev-common.h ++++ b/arch/x86/include/asm/sev-common.h +@@ -93,6 +93,10 @@ enum psc_op { + }; + + #define GHCB_MSR_PSC_REQ 0x014 ++#define GHCB_MSR_PSC_GFN_POS 12 ++#define GHCB_MSR_PSC_GFN_MASK GENMASK_ULL(39, 0) ++#define GHCB_MSR_PSC_OP_POS 52 ++#define GHCB_MSR_PSC_OP_MASK 0xf + #define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ + /* GHCBData[55:52] */ \ + (((u64)((op) & 0xf) << 52) | \ +@@ -102,6 +106,11 @@ enum psc_op { + GHCB_MSR_PSC_REQ) + + #define GHCB_MSR_PSC_RESP 0x015 ++#define GHCB_MSR_PSC_ERROR_POS 32 ++#define GHCB_MSR_PSC_ERROR_MASK GENMASK_ULL(31, 0) ++#define GHCB_MSR_PSC_ERROR GENMASK_ULL(31, 0) ++#define GHCB_MSR_PSC_RSVD_POS 12 ++#define GHCB_MSR_PSC_RSVD_MASK GENMASK_ULL(19, 0) + #define GHCB_MSR_PSC_RESP_VAL(val) \ + /* GHCBData[63:32] */ \ + (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index a95b574c6569..a9048eeb2e5a 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -32,6 +32,7 @@ + #include "svm_ops.h" + #include "cpuid.h" + #include "trace.h" ++#include "mmu.h" + + #ifndef CONFIG_KVM_AMD_SEV + /* +@@ -3251,6 +3252,181 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) + svm->vmcb->control.ghcb_gpa = value; + } + ++static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn) ++{ ++ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); ++ ++ return psmash(pfn); ++} ++ ++static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level) ++{ ++ int rc, rmp_level; ++ ++ rc = snp_lookup_rmpentry(pfn, &rmp_level); ++ if (rc < 0) ++ 
return -EINVAL; ++ ++ /* If page is not assigned then do nothing */ ++ if (!rc) ++ return 0; ++ ++ /* ++ * Is the page part of an existing 2MB RMP entry ? Split the 2MB into ++ * multiple of 4K-page before making the memory shared. ++ */ ++ if (level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M) { ++ rc = snp_rmptable_psmash(kvm, pfn); ++ if (rc) ++ return rc; ++ } ++ ++ return rmp_make_shared(pfn, level); ++} ++ ++static int snp_check_and_build_npt(struct kvm_vcpu *vcpu, gpa_t gpa, int level) ++{ ++ struct kvm *kvm = vcpu->kvm; ++ int rc, npt_level; ++ kvm_pfn_t pfn; ++ ++ /* ++ * Get the pfn and level for the gpa from the nested page table. ++ * ++ * If the tdp walk fails, then its safe to say that there is no ++ * valid mapping for this gpa. Create a fault to build the map. ++ */ ++ write_lock(&kvm->mmu_lock); ++ rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); ++ write_unlock(&kvm->mmu_lock); ++ if (!rc) { ++ pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_USER_MASK, level); ++ if (is_error_noslot_pfn(pfn)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int snp_gpa_to_hva(struct kvm *kvm, gpa_t gpa, hva_t *hva) ++{ ++ struct kvm_memory_slot *slot; ++ gfn_t gfn = gpa_to_gfn(gpa); ++ int idx; ++ ++ idx = srcu_read_lock(&kvm->srcu); ++ slot = gfn_to_memslot(kvm, gfn); ++ if (!slot) { ++ srcu_read_unlock(&kvm->srcu, idx); ++ return -EINVAL; ++ } ++ ++ /* ++ * Note, using the __gfn_to_hva_memslot() is not solely for performance, ++ * it's also necessary to avoid the "writable" check in __gfn_to_hva_many(), ++ * which will always fail on read-only memslots due to gfn_to_hva() assuming ++ * writes. 
++ */ ++ *hva = __gfn_to_hva_memslot(slot, gfn); ++ srcu_read_unlock(&kvm->srcu, idx); ++ ++ return 0; ++} ++ ++static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa, ++ int level) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; ++ struct kvm *kvm = vcpu->kvm; ++ int rc, npt_level; ++ kvm_pfn_t pfn; ++ gpa_t gpa_end; ++ ++ gpa_end = gpa + page_level_size(level); ++ ++ while (gpa < gpa_end) { ++ /* ++ * If the gpa is not present in the NPT then build the NPT. ++ */ ++ rc = snp_check_and_build_npt(vcpu, gpa, level); ++ if (rc) ++ return -EINVAL; ++ ++ if (op == SNP_PAGE_STATE_PRIVATE) { ++ hva_t hva; ++ ++ if (snp_gpa_to_hva(kvm, gpa, &hva)) ++ return -EINVAL; ++ ++ /* ++ * Verify that the hva range is registered. This enforcement is ++ * required to avoid the cases where a page is marked private ++ * in the RMP table but never gets cleanup during the VM ++ * termination path. ++ */ ++ mutex_lock(&kvm->lock); ++ rc = is_hva_registered(kvm, hva, page_level_size(level)); ++ mutex_unlock(&kvm->lock); ++ if (!rc) ++ return -EINVAL; ++ ++ /* ++ * Mark the userspace range unmerable before adding the pages ++ * in the RMP table. ++ */ ++ mmap_write_lock(kvm->mm); ++ rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); ++ mmap_write_unlock(kvm->mm); ++ if (rc) ++ return -EINVAL; ++ } ++ ++ write_lock(&kvm->mmu_lock); ++ ++ rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); ++ if (!rc) { ++ /* ++ * This may happen if another vCPU unmapped the page ++ * before we acquire the lock. Retry the PSC. ++ */ ++ write_unlock(&kvm->mmu_lock); ++ return 0; ++ } ++ ++ /* ++ * Adjust the level so that we don't go higher than the backing ++ * page level. 
++ */ ++ level = min_t(size_t, level, npt_level); ++ ++ trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level); ++ ++ switch (op) { ++ case SNP_PAGE_STATE_SHARED: ++ rc = snp_make_page_shared(kvm, gpa, pfn, level); ++ break; ++ case SNP_PAGE_STATE_PRIVATE: ++ rc = rmp_make_private(pfn, gpa, level, sev->asid, false); ++ break; ++ default: ++ rc = -EINVAL; ++ break; ++ } ++ ++ write_unlock(&kvm->mmu_lock); ++ ++ if (rc) { ++ pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", ++ op, gpa, pfn, level, rc); ++ return rc; ++ } ++ ++ gpa = gpa + page_level_size(level); ++ } ++ ++ return 0; ++} ++ + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + { + struct vmcb_control_area *control = &svm->vmcb->control; +@@ -3351,6 +3527,27 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + GHCB_MSR_INFO_POS); + break; + } ++ case GHCB_MSR_PSC_REQ: { ++ gfn_t gfn; ++ int ret; ++ enum psc_op op; ++ ++ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_GFN_MASK, GHCB_MSR_PSC_GFN_POS); ++ op = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_OP_MASK, GHCB_MSR_PSC_OP_POS); ++ ++ ret = __snp_handle_page_state_change(vcpu, op, gfn_to_gpa(gfn), PG_LEVEL_4K); ++ ++ if (ret) ++ set_ghcb_msr_bits(svm, GHCB_MSR_PSC_ERROR, ++ GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); ++ else ++ set_ghcb_msr_bits(svm, 0, ++ GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); ++ ++ set_ghcb_msr_bits(svm, 0, GHCB_MSR_PSC_RSVD_MASK, GHCB_MSR_PSC_RSVD_POS); ++ set_ghcb_msr_bits(svm, GHCB_MSR_PSC_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); ++ break; ++ } + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + +diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h +index 9b9bc5468103..79801e50344a 100644 +--- a/arch/x86/kvm/trace.h ++++ b/arch/x86/kvm/trace.h +@@ -7,6 +7,7 @@ + #include <asm/svm.h> + #include <asm/clocksource.h> + #include <asm/pvclock-abi.h> ++#include <asm/sev-common.h> + + #undef TRACE_SYSTEM + #define TRACE_SYSTEM kvm +@@ -1755,6 +1756,39 @@ 
TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, + __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) + ); + ++/* ++ * Tracepoint for the SEV-SNP page state change processing ++ */ ++#define psc_operation \ ++ {SNP_PAGE_STATE_PRIVATE, "private"}, \ ++ {SNP_PAGE_STATE_SHARED, "shared"} \ ++ ++TRACE_EVENT(kvm_snp_psc, ++ TP_PROTO(unsigned int vcpu_id, u64 pfn, u64 gpa, u8 op, int level), ++ TP_ARGS(vcpu_id, pfn, gpa, op, level), ++ ++ TP_STRUCT__entry( ++ __field(int, vcpu_id) ++ __field(u64, pfn) ++ __field(u64, gpa) ++ __field(u8, op) ++ __field(int, level) ++ ), ++ ++ TP_fast_assign( ++ __entry->vcpu_id = vcpu_id; ++ __entry->pfn = pfn; ++ __entry->gpa = gpa; ++ __entry->op = op; ++ __entry->level = level; ++ ), ++ ++ TP_printk("vcpu %u, pfn %llx, gpa %llx, op %s, level %d", ++ __entry->vcpu_id, __entry->pfn, __entry->gpa, ++ __print_symbolic(__entry->op, psc_operation), ++ __entry->level) ++); ++ + #endif /* _TRACE_KVM_H */ + + #undef TRACE_INCLUDE_PATH +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 50fff5202e7e..4a1d16231e30 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -13066,6 +13066,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); ++EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc); + + static int __init kvm_x86_init(void) + { +-- +2.36.1 + + +From 7bb61b9143c67e7cd0f1e5e619c1555585f555bd Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 18:59:44 +0000 +Subject: [PATCH 38/90] KVM: SVM: Add support to handle Page State Change + VMGEXIT + +SEV-SNP VMs can ask the hypervisor to change the page state in the RMP +table to be private or shared using the Page State Change NAE event +as defined in the GHCB specification version 2. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/sev-common.h | 7 +++ + arch/x86/kvm/svm/sev.c | 79 +++++++++++++++++++++++++++++-- + 2 files changed, 81 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h +index ee38f7408470..1b111cde8c82 100644 +--- a/arch/x86/include/asm/sev-common.h ++++ b/arch/x86/include/asm/sev-common.h +@@ -130,6 +130,13 @@ enum psc_op { + /* SNP Page State Change NAE event */ + #define VMGEXIT_PSC_MAX_ENTRY 253 + ++/* The page state change hdr structure in not valid */ ++#define PSC_INVALID_HDR 1 ++/* The hdr.cur_entry or hdr.end_entry is not valid */ ++#define PSC_INVALID_ENTRY 2 ++/* Page state change encountered undefined error */ ++#define PSC_UNDEF_ERR 3 ++ + struct psc_hdr { + u16 cur_entry; + u16 end_entry; +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index a9048eeb2e5a..60fd747c5b05 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3065,6 +3065,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code) + case SVM_VMGEXIT_AP_JUMP_TABLE: + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: ++ case SVM_VMGEXIT_PSC: + break; + default: + reason = GHCB_ERR_INVALID_EVENT; +@@ -3350,13 +3351,13 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + */ + rc = snp_check_and_build_npt(vcpu, gpa, level); + if (rc) +- return -EINVAL; ++ return PSC_UNDEF_ERR; + + if (op == SNP_PAGE_STATE_PRIVATE) { + hva_t hva; + + if (snp_gpa_to_hva(kvm, gpa, &hva)) +- return -EINVAL; ++ return PSC_UNDEF_ERR; + + /* + * Verify that the hva range is registered. 
This enforcement is +@@ -3368,7 +3369,7 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + rc = is_hva_registered(kvm, hva, page_level_size(level)); + mutex_unlock(&kvm->lock); + if (!rc) +- return -EINVAL; ++ return PSC_UNDEF_ERR; + + /* + * Mark the userspace range unmerable before adding the pages +@@ -3378,7 +3379,7 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); + mmap_write_unlock(kvm->mm); + if (rc) +- return -EINVAL; ++ return PSC_UNDEF_ERR; + } + + write_lock(&kvm->mmu_lock); +@@ -3409,7 +3410,7 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + rc = rmp_make_private(pfn, gpa, level, sev->asid, false); + break; + default: +- rc = -EINVAL; ++ rc = PSC_INVALID_ENTRY; + break; + } + +@@ -3427,6 +3428,65 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + return 0; + } + ++static inline unsigned long map_to_psc_vmgexit_code(int rc) ++{ ++ switch (rc) { ++ case PSC_INVALID_HDR: ++ return ((1ul << 32) | 1); ++ case PSC_INVALID_ENTRY: ++ return ((1ul << 32) | 2); ++ case RMPUPDATE_FAIL_OVERLAP: ++ return ((3ul << 32) | 2); ++ default: return (4ul << 32); ++ } ++} ++ ++static unsigned long snp_handle_page_state_change(struct vcpu_svm *svm) ++{ ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ int level, op, rc = PSC_UNDEF_ERR; ++ struct snp_psc_desc *info; ++ struct psc_entry *entry; ++ u16 cur, end; ++ gpa_t gpa; ++ ++ if (!sev_snp_guest(vcpu->kvm)) ++ return PSC_INVALID_HDR; ++ ++ if (setup_vmgexit_scratch(svm, true, sizeof(*info))) { ++ pr_err("vmgexit: scratch area is not setup.\n"); ++ return PSC_INVALID_HDR; ++ } ++ ++ info = (struct snp_psc_desc *)svm->sev_es.ghcb_sa; ++ cur = info->hdr.cur_entry; ++ end = info->hdr.end_entry; ++ ++ if (cur >= VMGEXIT_PSC_MAX_ENTRY || ++ end >= VMGEXIT_PSC_MAX_ENTRY || cur > end) ++ return PSC_INVALID_ENTRY; ++ ++ for (; cur <= end; 
cur++) { ++ entry = &info->entries[cur]; ++ gpa = gfn_to_gpa(entry->gfn); ++ level = RMP_TO_X86_PG_LEVEL(entry->pagesize); ++ op = entry->operation; ++ ++ if (!IS_ALIGNED(gpa, page_level_size(level))) { ++ rc = PSC_INVALID_ENTRY; ++ goto out; ++ } ++ ++ rc = __snp_handle_page_state_change(vcpu, op, gpa, level); ++ if (rc) ++ goto out; ++ } ++ ++out: ++ info->hdr.cur_entry = cur; ++ return rc ? map_to_psc_vmgexit_code(rc) : 0; ++} ++ + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + { + struct vmcb_control_area *control = &svm->vmcb->control; +@@ -3669,6 +3729,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + ret = 1; + break; + } ++ case SVM_VMGEXIT_PSC: { ++ unsigned long rc; ++ ++ ret = 1; ++ ++ rc = snp_handle_page_state_change(svm); ++ svm_set_ghcb_sw_exit_info_2(vcpu, rc); ++ break; ++ } + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", +-- +2.36.1 + + +From 0b1cd5fdc8007f3e0e52902dc1c79df344515bb5 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:11:25 +0000 +Subject: [PATCH 39/90] KVM: SVM: Introduce ops for the post gfn map and unmap + +When SEV-SNP is enabled in the guest VM, the guest memory pages can +either be a private or shared. A write from the hypervisor goes through +the RMP checks. If hardware sees that hypervisor is attempting to write +to a guest private page, then it triggers an RMP violation #PF. + +To avoid the RMP violation with GHCB pages added new post_{map,unmap}_gfn +functions to verify if its safe to map GHCB pages. Need to add generic +post_{map,unmap}_gfn() ops that can be used to verify that its safe +to map a given guest page in the hypervisor. Uses a spinlock to protect +against the page state change for existing mapped pages. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 + + arch/x86/include/asm/kvm_host.h | 3 ++ + arch/x86/kvm/svm/sev.c | 48 ++++++++++++++++++++++++++++-- + arch/x86/kvm/svm/svm.c | 3 ++ + arch/x86/kvm/svm/svm.h | 11 +++++++ + 5 files changed, 64 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index e0068e702692..2dd2bc0cf4c3 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -130,6 +130,7 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) + KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); + KVM_X86_OP(alloc_apic_backing_page) + KVM_X86_OP_OPTIONAL(rmp_page_level_adjust) ++KVM_X86_OP(update_protected_guest_state) + + #undef KVM_X86_OP + #undef KVM_X86_OP_OPTIONAL +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 49b217dc8d7e..8abc0e724f5c 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1522,7 +1522,10 @@ struct kvm_x86_ops { + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); ++ + void (*rmp_page_level_adjust)(struct kvm *kvm, kvm_pfn_t pfn, int *level); ++ ++ int (*update_protected_guest_state)(struct kvm_vcpu *vcpu); + }; + + struct kvm_x86_nested_ops { +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 60fd747c5b05..cd8812d9ce33 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -341,6 +341,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + if (ret) + goto e_free; + ++ spin_lock_init(&sev->psc_lock); + ret = sev_snp_init(&argp->error); + } else { + ret = sev_platform_init(&argp->error); +@@ -2827,19 +2828,28 @@ static inline int svm_map_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map) + { + struct vmcb_control_area *control = 
&svm->vmcb->control; + u64 gfn = gpa_to_gfn(control->ghcb_gpa); ++ struct kvm_vcpu *vcpu = &svm->vcpu; + +- if (kvm_vcpu_map(&svm->vcpu, gfn, map)) { ++ if (kvm_vcpu_map(vcpu, gfn, map)) { + /* Unable to map GHCB from guest */ + pr_err("error mapping GHCB GFN [%#llx] from guest\n", gfn); + return -EFAULT; + } + ++ if (sev_post_map_gfn(vcpu->kvm, map->gfn, map->pfn)) { ++ kvm_vcpu_unmap(vcpu, map, false); ++ return -EBUSY; ++ } ++ + return 0; + } + + static inline void svm_unmap_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map) + { +- kvm_vcpu_unmap(&svm->vcpu, map, true); ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ ++ kvm_vcpu_unmap(vcpu, map, true); ++ sev_post_unmap_gfn(vcpu->kvm, map->gfn, map->pfn); + } + + static void dump_ghcb(struct vcpu_svm *svm) +@@ -3382,6 +3392,8 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + return PSC_UNDEF_ERR; + } + ++ spin_lock(&sev->psc_lock); ++ + write_lock(&kvm->mmu_lock); + + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); +@@ -3416,6 +3428,8 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, + + write_unlock(&kvm->mmu_lock); + ++ spin_unlock(&sev->psc_lock); ++ + if (rc) { + pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", + op, gpa, pfn, level, rc); +@@ -3964,3 +3978,33 @@ void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level) + /* Adjust the level to keep the NPT and RMP in sync */ + *level = min_t(size_t, *level, rmp_level); + } ++ ++int sev_post_map_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ int level; ++ ++ if (!sev_snp_guest(kvm)) ++ return 0; ++ ++ spin_lock(&sev->psc_lock); ++ ++ /* If pfn is not added as private then fail */ ++ if (snp_lookup_rmpentry(pfn, &level) == 1) { ++ spin_unlock(&sev->psc_lock); ++ pr_err_ratelimited("failed to map private gfn 0x%llx pfn 0x%llx\n", gfn, pfn); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ 
++void sev_post_unmap_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ ++ if (!sev_snp_guest(kvm)) ++ return; ++ ++ spin_unlock(&sev->psc_lock); ++} +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index b24e0171cbf2..1c8e035ba011 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -4734,7 +4734,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, + + .alloc_apic_backing_page = svm_alloc_apic_backing_page, ++ + .rmp_page_level_adjust = sev_rmp_page_level_adjust, ++ ++ .update_protected_guest_state = sev_snp_update_protected_guest_state, + }; + + /* +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 54ff56cb6125..3fd95193ed8d 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -79,19 +79,25 @@ struct kvm_sev_info { + bool active; /* SEV enabled guest */ + bool es_active; /* SEV-ES enabled guest */ + bool snp_active; /* SEV-SNP enabled guest */ ++ + unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ ++ + unsigned long pages_locked; /* Number of pages locked */ + struct list_head regions_list; /* List of registered regions */ ++ + u64 ap_jump_table; /* SEV-ES AP Jump Table address */ ++ + struct kvm *enc_context_owner; /* Owner of copied encryption context */ + struct list_head mirror_vms; /* List of VMs mirroring */ + struct list_head mirror_entry; /* Use as a list entry of mirrors */ + struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; ++ + u64 snp_init_flags; + void *snp_context; /* SNP guest context page */ ++ spinlock_t psc_lock; + }; + + struct kvm_svm { +@@ -702,6 +708,11 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); + void sev_es_unmap_ghcb(struct vcpu_svm *svm); + struct page *snp_safe_alloc_page(struct 
kvm_vcpu *vcpu); + void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level); ++int sev_post_map_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn); ++void sev_post_unmap_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn); ++void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); ++void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); ++int sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu); + + /* vmenter.S */ + +-- +2.36.1 + + +From 8e73f837c94bdad71039ff15d519bad23c90b0f6 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:14:13 +0000 +Subject: [PATCH 40/90] KVM: x86: Export the kvm_zap_gfn_range() for the SNP + use + +While resolving the RMP page fault, we may run into cases where the page +level between the RMP entry and TDP does not match and the 2M RMP entry +must be split into 4K RMP entries. Or a 2M TDP page needs to be broken +into multiple 4K pages. + +To keep the RMP and TDP page level in sync, we will zap the gfn range +after splitting the pages in the RMP entry. The zap should force the +TDP to get rebuilt with the new page level. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/kvm_host.h | 2 ++ + arch/x86/kvm/mmu.h | 2 -- + arch/x86/kvm/mmu/mmu.c | 1 + + 3 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 8abc0e724f5c..1db4d178eb1d 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1627,6 +1627,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + void kvm_mmu_zap_all(struct kvm *kvm); + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); + void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); ++void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); ++ + + int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); + +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index d55b5166389a..c5044958a0fa 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -267,8 +267,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + return -(u32)fault & errcode; + } + +-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); +- + int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + + int kvm_mmu_post_init_vm(struct kvm *kvm); +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index c1ac486e096e..67120bfeb667 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6084,6 +6084,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + + return need_tlb_flush; + } ++EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); + + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +-- +2.36.1 + + +From bfffb57ef84a1a86c1f4bd33402559033d8bec65 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:15:44 +0000 +Subject: [PATCH 41/90] KVM: SVM: Add support to handle the RMP nested page + fault + +When SEV-SNP is enabled in the guest, the 
hardware places restrictions on +all memory accesses based on the contents of the RMP table. When hardware +encounters RMP check failure caused by the guest memory access it raises +the #NPF. The error code contains additional information on the access +type. See the APM volume 2 for additional information. + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 76 ++++++++++++++++++++++++++++++++++++++++++ + arch/x86/kvm/svm/svm.c | 14 +++++--- + 2 files changed, 86 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index cd8812d9ce33..73ed0f03d623 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -4008,3 +4008,79 @@ void sev_post_unmap_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn) + + spin_unlock(&sev->psc_lock); + } ++ ++void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) ++{ ++ int rmp_level, npt_level, rc, assigned; ++ struct kvm *kvm = vcpu->kvm; ++ gfn_t gfn = gpa_to_gfn(gpa); ++ bool need_psc = false; ++ enum psc_op psc_op; ++ kvm_pfn_t pfn; ++ bool private; ++ ++ write_lock(&kvm->mmu_lock); ++ ++ if (unlikely(!kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level))) ++ goto unlock; ++ ++ assigned = snp_lookup_rmpentry(pfn, &rmp_level); ++ if (unlikely(assigned < 0)) ++ goto unlock; ++ ++ private = !!(error_code & PFERR_GUEST_ENC_MASK); ++ ++ /* ++ * If the fault was due to size mismatch, or NPT and RMP page level's ++ * are not in sync, then use PSMASH to split the RMP entry into 4K. ++ */ ++ if ((error_code & PFERR_GUEST_SIZEM_MASK) || ++ (npt_level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M && private)) { ++ rc = snp_rmptable_psmash(kvm, pfn); ++ if (rc) ++ pr_err_ratelimited("psmash failed, gpa 0x%llx pfn 0x%llx rc %d\n", ++ gpa, pfn, rc); ++ goto out; ++ } ++ ++ /* ++ * If it's a private access, and the page is not assigned in the ++ * RMP table, create a new private RMP entry. 
This can happen if ++ * guest did not use the PSC VMGEXIT to transition the page state ++ * before the access. ++ */ ++ if (!assigned && private) { ++ need_psc = 1; ++ psc_op = SNP_PAGE_STATE_PRIVATE; ++ goto out; ++ } ++ ++ /* ++ * If it's a shared access, but the page is private in the RMP table ++ * then make the page shared in the RMP table. This can happen if ++ * the guest did not use the PSC VMGEXIT to transition the page ++ * state before the access. ++ */ ++ if (assigned && !private) { ++ need_psc = 1; ++ psc_op = SNP_PAGE_STATE_SHARED; ++ } ++ ++out: ++ write_unlock(&kvm->mmu_lock); ++ ++ if (need_psc) ++ rc = __snp_handle_page_state_change(vcpu, psc_op, gpa, PG_LEVEL_4K); ++ ++ /* ++ * The fault handler has updated the RMP pagesize, zap the existing ++ * rmaps for large entry ranges so that nested page table gets rebuilt ++ * with the updated RMP pagesize. ++ */ ++ gfn = gpa_to_gfn(gpa) & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); ++ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); ++ return; ++ ++unlock: ++ write_unlock(&kvm->mmu_lock); ++} +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 1c8e035ba011..7742bc986afc 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1866,15 +1866,21 @@ static int pf_interception(struct kvm_vcpu *vcpu) + static int npf_interception(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ int rc; + + u64 fault_address = svm->vmcb->control.exit_info_2; + u64 error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); +- return kvm_mmu_page_fault(vcpu, fault_address, error_code, +- static_cpu_has(X86_FEATURE_DECODEASSISTS) ? +- svm->vmcb->control.insn_bytes : NULL, +- svm->vmcb->control.insn_len); ++ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, ++ static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 
++ svm->vmcb->control.insn_bytes : NULL, ++ svm->vmcb->control.insn_len); ++ ++ if (error_code & PFERR_GUEST_RMP_MASK) ++ handle_rmp_page_fault(vcpu, fault_address, error_code); ++ ++ return rc; + } + + static int db_interception(struct kvm_vcpu *vcpu) +-- +2.36.1 + + +From c8d55ce304392f9f54a91a561e2fa9f7d995448b Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:17:22 +0000 +Subject: [PATCH 42/90] KVM: SVM: Provide support for SNP_GUEST_REQUEST NAE + event + +Version 2 of GHCB specification added the support for two SNP Guest +Request Message NAE events. The events allow an SEV-SNP guest to +make requests to the SEV-SNP firmware through hypervisor using the +SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. + +The SNP_EXT_GUEST_REQUEST is similar to SNP_GUEST_REQUEST with the +difference of an additional certificate blob that can be passed through +the SNP_SET_CONFIG ioctl defined in the CCP driver. The CCP driver +provides snp_guest_ext_guest_request() that is used by the KVM to get +both the report and certificate data at once. 
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 196 +++++++++++++++++++++++++++++++++++++++-- + arch/x86/kvm/svm/svm.h | 2 + + 2 files changed, 192 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 73ed0f03d623..be980d4fe377 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -343,6 +343,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + + spin_lock_init(&sev->psc_lock); + ret = sev_snp_init(&argp->error); ++ mutex_init(&sev->guest_req_lock); + } else { + ret = sev_platform_init(&argp->error); + } +@@ -1884,23 +1885,39 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) + + static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) + { ++ void *context = NULL, *certs_data = NULL, *resp_page = NULL; ++ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_gctx_create data = {}; +- void *context; + int rc; + ++ /* Allocate memory used for the certs data in SNP guest request */ ++ certs_data = kmalloc(SEV_FW_BLOB_MAX_SIZE, GFP_KERNEL_ACCOUNT); ++ if (!certs_data) ++ return NULL; ++ + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) +- return NULL; ++ goto e_free; ++ ++ /* Allocate a firmware buffer used during the guest command handling. 
*/ ++ resp_page = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); ++ if (!resp_page) ++ goto e_free; + + data.gctx_paddr = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); +- if (rc) { +- snp_free_firmware_page(context); +- return NULL; +- } ++ if (rc) ++ goto e_free; ++ ++ sev->snp_certs_data = certs_data; + + return context; ++ ++e_free: ++ snp_free_firmware_page(context); ++ kfree(certs_data); ++ return NULL; + } + + static int snp_bind_asid(struct kvm *kvm, int *error) +@@ -2565,6 +2582,8 @@ static int snp_decommission_context(struct kvm *kvm) + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + ++ kfree(sev->snp_certs_data); ++ + return 0; + } + +@@ -3076,6 +3095,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code) + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_PSC: ++ case SVM_VMGEXIT_GUEST_REQUEST: ++ case SVM_VMGEXIT_EXT_GUEST_REQUEST: + break; + default: + reason = GHCB_ERR_INVALID_EVENT; +@@ -3501,6 +3522,155 @@ static unsigned long snp_handle_page_state_change(struct vcpu_svm *svm) + return rc ? 
map_to_psc_vmgexit_code(rc) : 0; + } + ++static unsigned long snp_setup_guest_buf(struct vcpu_svm *svm, ++ struct sev_data_snp_guest_request *data, ++ gpa_t req_gpa, gpa_t resp_gpa) ++{ ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm *kvm = vcpu->kvm; ++ kvm_pfn_t req_pfn, resp_pfn; ++ struct kvm_sev_info *sev; ++ ++ sev = &to_kvm_svm(kvm)->sev_info; ++ ++ if (!IS_ALIGNED(req_gpa, PAGE_SIZE) || !IS_ALIGNED(resp_gpa, PAGE_SIZE)) ++ return SEV_RET_INVALID_PARAM; ++ ++ req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa)); ++ if (is_error_noslot_pfn(req_pfn)) ++ return SEV_RET_INVALID_ADDRESS; ++ ++ resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa)); ++ if (is_error_noslot_pfn(resp_pfn)) ++ return SEV_RET_INVALID_ADDRESS; ++ ++ if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) ++ return SEV_RET_INVALID_ADDRESS; ++ ++ data->gctx_paddr = __psp_pa(sev->snp_context); ++ data->req_paddr = __sme_set(req_pfn << PAGE_SHIFT); ++ data->res_paddr = __sme_set(resp_pfn << PAGE_SHIFT); ++ ++ return 0; ++} ++ ++static void snp_cleanup_guest_buf(struct sev_data_snp_guest_request *data, unsigned long *rc) ++{ ++ u64 pfn = __sme_clr(data->res_paddr) >> PAGE_SHIFT; ++ int ret; ++ ++ ret = snp_page_reclaim(pfn); ++ if (ret) ++ *rc = SEV_RET_INVALID_ADDRESS; ++ ++ ret = rmp_make_shared(pfn, PG_LEVEL_4K); ++ if (ret) ++ *rc = SEV_RET_INVALID_ADDRESS; ++} ++ ++static void snp_handle_guest_request(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) ++{ ++ struct sev_data_snp_guest_request data = {0}; ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm *kvm = vcpu->kvm; ++ struct kvm_sev_info *sev; ++ unsigned long rc; ++ int err; ++ ++ if (!sev_snp_guest(vcpu->kvm)) { ++ rc = SEV_RET_INVALID_GUEST; ++ goto e_fail; ++ } ++ ++ sev = &to_kvm_svm(kvm)->sev_info; ++ ++ mutex_lock(&sev->guest_req_lock); ++ ++ rc = snp_setup_guest_buf(svm, &data, req_gpa, resp_gpa); ++ if (rc) ++ goto unlock; ++ ++ rc = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &err); ++ if (rc) ++ /* use the 
firmware error code */ ++ rc = err; ++ ++ snp_cleanup_guest_buf(&data, &rc); ++ ++unlock: ++ mutex_unlock(&sev->guest_req_lock); ++ ++e_fail: ++ svm_set_ghcb_sw_exit_info_2(vcpu, rc); ++} ++ ++static void snp_handle_ext_guest_request(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) ++{ ++ struct sev_data_snp_guest_request req = {0}; ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm *kvm = vcpu->kvm; ++ unsigned long data_npages; ++ struct kvm_sev_info *sev; ++ unsigned long rc, err; ++ u64 data_gpa; ++ ++ if (!sev_snp_guest(vcpu->kvm)) { ++ rc = SEV_RET_INVALID_GUEST; ++ goto e_fail; ++ } ++ ++ sev = &to_kvm_svm(kvm)->sev_info; ++ ++ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; ++ data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; ++ ++ if (!IS_ALIGNED(data_gpa, PAGE_SIZE)) { ++ rc = SEV_RET_INVALID_ADDRESS; ++ goto e_fail; ++ } ++ ++ /* Verify that requested blob will fit in certificate buffer */ ++ if ((data_npages << PAGE_SHIFT) > SEV_FW_BLOB_MAX_SIZE) { ++ rc = SEV_RET_INVALID_PARAM; ++ goto e_fail; ++ } ++ ++ mutex_lock(&sev->guest_req_lock); ++ ++ rc = snp_setup_guest_buf(svm, &req, req_gpa, resp_gpa); ++ if (rc) ++ goto unlock; ++ ++ rc = snp_guest_ext_guest_request(&req, (unsigned long)sev->snp_certs_data, ++ &data_npages, &err); ++ if (rc) { ++ /* ++ * If buffer length is small then return the expected ++ * length in rbx. 
++ */ ++ if (err == SNP_GUEST_REQ_INVALID_LEN) ++ vcpu->arch.regs[VCPU_REGS_RBX] = data_npages; ++ ++ /* pass the firmware error code */ ++ rc = err; ++ goto cleanup; ++ } ++ ++ /* Copy the certificate blob in the guest memory */ ++ if (data_npages && ++ kvm_write_guest(kvm, data_gpa, sev->snp_certs_data, data_npages << PAGE_SHIFT)) ++ rc = SEV_RET_INVALID_ADDRESS; ++ ++cleanup: ++ snp_cleanup_guest_buf(&req, &rc); ++ ++unlock: ++ mutex_unlock(&sev->guest_req_lock); ++ ++e_fail: ++ svm_set_ghcb_sw_exit_info_2(vcpu, rc); ++} ++ + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + { + struct vmcb_control_area *control = &svm->vmcb->control; +@@ -3752,6 +3922,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + svm_set_ghcb_sw_exit_info_2(vcpu, rc); + break; + } ++ case SVM_VMGEXIT_GUEST_REQUEST: { ++ snp_handle_guest_request(svm, control->exit_info_1, control->exit_info_2); ++ ++ ret = 1; ++ break; ++ } ++ case SVM_VMGEXIT_EXT_GUEST_REQUEST: { ++ snp_handle_ext_guest_request(svm, ++ control->exit_info_1, ++ control->exit_info_2); ++ ++ ret = 1; ++ break; ++ } + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 3fd95193ed8d..3be24da1a743 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -98,6 +98,8 @@ struct kvm_sev_info { + u64 snp_init_flags; + void *snp_context; /* SNP guest context page */ + spinlock_t psc_lock; ++ void *snp_certs_data; ++ struct mutex guest_req_lock; + }; + + struct kvm_svm { +-- +2.36.1 + + +From 66208e410232d535cb584003f66b85f78da067f2 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Tue, 26 Apr 2022 19:19:40 +0000 +Subject: [PATCH 43/90] KVM: SVM: Use a VMSA physical address variable for + populating VMCB + +In preparation to support SEV-SNP AP Creation, use a variable that holds +the VMSA physical address rather than converting the 
virtual address. +This will allow SEV-SNP AP Creation to set the new physical address that +will be used should the vCPU reset path be taken. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +--- + arch/x86/kvm/svm/sev.c | 5 ++--- + arch/x86/kvm/svm/svm.c | 9 ++++++++- + arch/x86/kvm/svm/svm.h | 1 + + 3 files changed, 11 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index be980d4fe377..62fc0cea34f1 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3979,10 +3979,9 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) + + /* + * An SEV-ES guest requires a VMSA area that is a separate from the +- * VMCB page. Do not include the encryption mask on the VMSA physical +- * address since hardware will access it using the guest key. ++ * VMCB page. + */ +- svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); ++ svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa; + + /* Can't intercept CR register access, HV can't modify CR registers */ + svm_clr_intercept(svm, INTERCEPT_CR0_READ); +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index 7742bc986afc..f7155abe7567 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1296,9 +1296,16 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) + svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); + svm_switch_vmcb(svm, &svm->vmcb01); + +- if (vmsa_page) ++ if (vmsa_page) { + svm->sev_es.vmsa = page_address(vmsa_page); + ++ /* ++ * Do not include the encryption mask on the VMSA physical ++ * address since hardware will access it using the guest key. 
++ */ ++ svm->sev_es.vmsa_pa = __pa(svm->sev_es.vmsa); ++ } ++ + svm->guest_state_loaded = false; + + return 0; +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 3be24da1a743..46790bab07a8 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -197,6 +197,7 @@ struct svm_nested_state { + struct vcpu_sev_es_state { + /* SEV-ES support */ + struct sev_es_save_area *vmsa; ++ hpa_t vmsa_pa; + bool ghcb_in_use; + bool received_first_sipi; + unsigned int ap_reset_hold_type; +-- +2.36.1 + + +From 529d713888c62e02894db94d3ad1ac09b94aa901 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Tue, 26 Apr 2022 19:21:40 +0000 +Subject: [PATCH 44/90] KVM: SVM: Support SEV-SNP AP Creation NAE event + +Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP +guests to alter the register state of the APs on their own. This allows +the guest a way of simulating INIT-SIPI. + +A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used +so as to avoid updating the VMSA pointer while the vCPU is running. + +For CREATE + The guest supplies the GPA of the VMSA to be used for the vCPU with + the specified APIC ID. The GPA is saved in the svm struct of the + target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added + to the vCPU and then the vCPU is kicked. + +For CREATE_ON_INIT: + The guest supplies the GPA of the VMSA to be used for the vCPU with + the specified APIC ID the next time an INIT is performed. The GPA is + saved in the svm struct of the target vCPU. + +For DESTROY: + The guest indicates it wishes to stop the vCPU. The GPA is cleared + from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is + added to vCPU and then the vCPU is kicked. + +The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked +as a result of the event or as a result of an INIT. 
The handler sets the +vCPU to the KVM_MP_STATE_UNINITIALIZED state, so that any errors will +leave the vCPU as not runnable. Any previous VMSA pages that were +installed as part of an SEV-SNP AP Creation NAE event are un-pinned. If +a new VMSA is to be installed, the VMSA guest page is pinned and set as +the VMSA in the vCPU VMCB and the vCPU state is set to +KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is +cleared in the vCPU VMCB and the vCPU state is left as +KVM_MP_STATE_UNINITIALIZED to prevent it from being run. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/include/asm/kvm-x86-ops.h | 1 - + arch/x86/include/asm/kvm_host.h | 3 +- + arch/x86/include/asm/svm.h | 7 +- + arch/x86/kvm/svm/sev.c | 197 +++++++++++++++++++++++++++++ + arch/x86/kvm/svm/svm.c | 5 +- + arch/x86/kvm/svm/svm.h | 6 + + arch/x86/kvm/x86.c | 9 +- + 7 files changed, 221 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index 2dd2bc0cf4c3..e0068e702692 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -130,7 +130,6 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) + KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); + KVM_X86_OP(alloc_apic_backing_page) + KVM_X86_OP_OPTIONAL(rmp_page_level_adjust) +-KVM_X86_OP(update_protected_guest_state) + + #undef KVM_X86_OP + #undef KVM_X86_OP_OPTIONAL +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 1db4d178eb1d..660cf39344fb 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -105,6 +105,7 @@ + KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) + #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ + KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) ++#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(32) + + #define CR0_RESERVED_BITS \ + 
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ +@@ -1524,8 +1525,6 @@ struct kvm_x86_ops { + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + + void (*rmp_page_level_adjust)(struct kvm *kvm, kvm_pfn_t pfn, int *level); +- +- int (*update_protected_guest_state)(struct kvm_vcpu *vcpu); + }; + + struct kvm_x86_nested_ops { +diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h +index 284a8113227e..a69b6da71a65 100644 +--- a/arch/x86/include/asm/svm.h ++++ b/arch/x86/include/asm/svm.h +@@ -263,7 +263,12 @@ enum avic_ipi_failure_cause { + #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + +-#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) ++#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) ++#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) ++#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) ++#define SVM_SEV_FEAT_INT_INJ_MODES \ ++ (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ ++ SVM_SEV_FEAT_ALTERNATE_INJECTION) + + struct vmcb_seg { + u16 selector; +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 62fc0cea34f1..90de603e9132 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -657,6 +657,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) + + static int sev_es_sync_vmsa(struct vcpu_svm *svm) + { ++ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct sev_es_save_area *save = svm->sev_es.vmsa; + + /* Check some debug related fields before encrypting the VMSA */ +@@ -702,6 +703,12 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) + if (sev_snp_guest(svm->vcpu.kvm)) + save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE; + ++ /* ++ * Save the VMSA synced SEV features. For now, they are the same for ++ * all vCPUs, so just save each time. 
++ */ ++ sev->sev_features = save->sev_features; ++ + return 0; + } + +@@ -3089,6 +3096,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code) + if (!ghcb_sw_scratch_is_valid(ghcb)) + goto vmgexit_err; + break; ++ case SVM_VMGEXIT_AP_CREATION: ++ if (!ghcb_rax_is_valid(ghcb)) ++ goto vmgexit_err; ++ break; + case SVM_VMGEXIT_NMI_COMPLETE: + case SVM_VMGEXIT_AP_HLT_LOOP: + case SVM_VMGEXIT_AP_JUMP_TABLE: +@@ -3671,6 +3682,178 @@ static void snp_handle_ext_guest_request(struct vcpu_svm *svm, gpa_t req_gpa, gp + svm_set_ghcb_sw_exit_info_2(vcpu, rc); + } + ++static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_svm *svm = to_svm(vcpu); ++ kvm_pfn_t pfn; ++ hpa_t cur_pa; ++ ++ WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); ++ ++ /* Save off the current VMSA PA for later checks */ ++ cur_pa = svm->sev_es.vmsa_pa; ++ ++ /* Mark the vCPU as offline and not runnable */ ++ vcpu->arch.pv.pv_unhalted = false; ++ vcpu->arch.mp_state = KVM_MP_STATE_STOPPED; ++ ++ /* Clear use of the VMSA */ ++ svm->sev_es.vmsa_pa = INVALID_PAGE; ++ svm->vmcb->control.vmsa_pa = INVALID_PAGE; ++ ++ if (cur_pa != __pa(svm->sev_es.vmsa) && VALID_PAGE(cur_pa)) { ++ /* ++ * The svm->sev_es.vmsa_pa field holds the hypervisor physical ++ * address of the about to be replaced VMSA which will no longer ++ * be used or referenced, so un-pin it. ++ */ ++ kvm_release_pfn_dirty(__phys_to_pfn(cur_pa)); ++ } ++ ++ if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { ++ /* ++ * The VMSA is referenced by the hypervisor physical address, ++ * so retrieve the PFN and pin it. 
++ */ ++ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(svm->sev_es.snp_vmsa_gpa)); ++ if (is_error_pfn(pfn)) ++ return -EINVAL; ++ ++ /* Use the new VMSA */ ++ svm->sev_es.vmsa_pa = pfn_to_hpa(pfn); ++ svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa; ++ ++ /* Mark the vCPU as runnable */ ++ vcpu->arch.pv.pv_unhalted = false; ++ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; ++ ++ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; ++ } ++ ++ /* ++ * When replacing the VMSA during SEV-SNP AP creation, ++ * mark the VMCB dirty so that full state is always reloaded. ++ */ ++ vmcb_mark_all_dirty(svm->vmcb); ++ ++ return 0; ++} ++ ++/* ++ * Invoked as part of svm_vcpu_reset() processing of an init event. ++ */ ++void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_svm *svm = to_svm(vcpu); ++ int ret; ++ ++ if (!sev_snp_guest(vcpu->kvm)) ++ return; ++ ++ mutex_lock(&svm->sev_es.snp_vmsa_mutex); ++ ++ if (!svm->sev_es.snp_ap_create) ++ goto unlock; ++ ++ svm->sev_es.snp_ap_create = false; ++ ++ ret = __sev_snp_update_protected_guest_state(vcpu); ++ if (ret) ++ vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); ++ ++unlock: ++ mutex_unlock(&svm->sev_es.snp_vmsa_mutex); ++} ++ ++static int sev_snp_ap_creation(struct vcpu_svm *svm) ++{ ++ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; ++ struct kvm_vcpu *vcpu = &svm->vcpu; ++ struct kvm_vcpu *target_vcpu; ++ struct vcpu_svm *target_svm; ++ unsigned int request; ++ unsigned int apic_id; ++ bool kick; ++ int ret; ++ ++ request = lower_32_bits(svm->vmcb->control.exit_info_1); ++ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); ++ ++ /* Validate the APIC ID */ ++ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); ++ if (!target_vcpu) { ++ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", ++ apic_id); ++ return -EINVAL; ++ } ++ ++ ret = 0; ++ ++ target_svm = to_svm(target_vcpu); ++ ++ /* ++ * We have a valid target vCPU, so the vCPU will be kicked unless the ++ * 
request is for CREATE_ON_INIT. For any errors at this stage, the ++ * kick will place the vCPU in an non-runnable state. ++ */ ++ kick = true; ++ ++ mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); ++ ++ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; ++ target_svm->sev_es.snp_ap_create = true; ++ ++ /* Interrupt injection mode shouldn't change for AP creation */ ++ if (request < SVM_VMGEXIT_AP_DESTROY) { ++ u64 sev_features; ++ ++ sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; ++ sev_features ^= sev->sev_features; ++ if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { ++ vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", ++ vcpu->arch.regs[VCPU_REGS_RAX]); ++ ret = -EINVAL; ++ goto out; ++ } ++ } ++ ++ switch (request) { ++ case SVM_VMGEXIT_AP_CREATE_ON_INIT: ++ kick = false; ++ fallthrough; ++ case SVM_VMGEXIT_AP_CREATE: ++ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { ++ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", ++ svm->vmcb->control.exit_info_2); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; ++ break; ++ case SVM_VMGEXIT_AP_DESTROY: ++ break; ++ default: ++ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", ++ request); ++ ret = -EINVAL; ++ break; ++ } ++ ++out: ++ if (kick) { ++ if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) ++ target_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; ++ ++ kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); ++ kvm_vcpu_kick(target_vcpu); ++ } ++ ++ mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); ++ ++ return ret; ++} ++ + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) + { + struct vmcb_control_area *control = &svm->vmcb->control; +@@ -3936,6 +4119,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) + ret = 1; + break; + } ++ case SVM_VMGEXIT_AP_CREATION: ++ ret = sev_snp_ap_creation(svm); ++ if (ret) { ++ 
svm_set_ghcb_sw_exit_info_1(vcpu, 1); ++ svm_set_ghcb_sw_exit_info_2(vcpu, ++ X86_TRAP_GP | ++ SVM_EVTINJ_TYPE_EXEPT | ++ SVM_EVTINJ_VALID); ++ } ++ ++ ret = 1; ++ break; + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", +@@ -4023,6 +4218,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + GHCB_VERSION_MIN, + sev_enc_bit)); ++ ++ mutex_init(&svm->sev_es.snp_vmsa_mutex); + } + + void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index f7155abe7567..fced6ea423ad 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1237,6 +1237,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + svm->spec_ctrl = 0; + svm->virt_spec_ctrl = 0; + ++ if (init_event) ++ sev_snp_init_protected_guest_state(vcpu); ++ + init_vmcb(vcpu); + + if (!init_event) +@@ -4749,8 +4752,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { + .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .rmp_page_level_adjust = sev_rmp_page_level_adjust, +- +- .update_protected_guest_state = sev_snp_update_protected_guest_state, + }; + + /* +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 46790bab07a8..971ff4e949fd 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -100,6 +100,8 @@ struct kvm_sev_info { + spinlock_t psc_lock; + void *snp_certs_data; + struct mutex guest_req_lock; ++ ++ u64 sev_features; /* Features set at VMSA creation */ + }; + + struct kvm_svm { +@@ -217,6 +219,10 @@ struct vcpu_sev_es_state { + u64 ghcb_sw_exit_info_2; + + u64 ghcb_registered_gpa; ++ ++ struct mutex snp_vmsa_mutex; ++ gpa_t snp_vmsa_gpa; ++ bool snp_ap_create; + }; + + struct vcpu_svm { +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 4a1d16231e30..c649d15efae3 100644 +--- a/arch/x86/kvm/x86.c ++++ 
b/arch/x86/kvm/x86.c +@@ -10095,6 +10095,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + + if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) + static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); ++ ++ if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { ++ kvm_vcpu_reset(vcpu, true); ++ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) ++ goto out; ++ } + } + + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || +@@ -12219,7 +12225,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + +- if (kvm_apic_has_events(vcpu)) ++ if (kvm_apic_has_events(vcpu) || ++ kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) +-- +2.36.1 + + +From 83bc130bc45fedfbbf15f19b1ed33b9323113722 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:24:13 +0000 +Subject: [PATCH 45/90] KVM: SVM: Add module parameter to enable the SEV-SNP + +Add a module parameter that can be used to enable or disable the SEV-SNP +feature. Now that KVM contains the support for SNP, set the GHCB +hypervisor feature flag to indicate that SNP is supported.
+ +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 7 ++++--- + arch/x86/kvm/svm/svm.h | 2 +- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 90de603e9132..fb4178f6e8dd 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -57,14 +57,15 @@ module_param_named(sev, sev_enabled, bool, 0444); + /* enable/disable SEV-ES support */ + static bool sev_es_enabled = true; + module_param_named(sev_es, sev_es_enabled, bool, 0444); ++ ++/* enable/disable SEV-SNP support */ ++static bool sev_snp_enabled = true; ++module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + #else + #define sev_enabled false + #define sev_es_enabled false + #endif /* CONFIG_KVM_AMD_SEV */ + +-/* enable/disable SEV-SNP support */ +-static bool sev_snp_enabled; +- + #define AP_RESET_HOLD_NONE 0 + #define AP_RESET_HOLD_NAE_EVENT 1 + #define AP_RESET_HOLD_MSR_PROTO 2 +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 971ff4e949fd..7b14b5ef1f8c 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -688,7 +688,7 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); + #define GHCB_VERSION_MAX 2ULL + #define GHCB_VERSION_MIN 1ULL + +-#define GHCB_HV_FT_SUPPORTED 0 ++#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) + + extern unsigned int max_sev_asid; + +-- +2.36.1 + + +From c031ce322e78d8350776e8089f1c9d22b439c629 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:28:16 +0000 +Subject: [PATCH 46/90] ccp: add support to decrypt the page + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + drivers/crypto/ccp/sev-dev.c | 33 ++++++++++++++++++++++++++++++--- + include/linux/psp-sev.h | 6 +++--- + 2 files changed, 33 insertions(+), 6 deletions(-) + +diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c +index 
740e610aeaf6..36277a146790 100644 +--- a/drivers/crypto/ccp/sev-dev.c ++++ b/drivers/crypto/ccp/sev-dev.c +@@ -1849,11 +1849,38 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error) + } + EXPORT_SYMBOL_GPL(snp_guest_page_reclaim); + +-int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error) ++int snp_guest_dbg_decrypt_page(u64 gctx_pfn, u64 src_pfn, u64 dst_pfn, int *error) + { +- return sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, data, error); ++ struct sev_data_snp_dbg data = {0}; ++ struct sev_device *sev; ++ int ret; ++ ++ if (!psp_master || !psp_master->sev_data) ++ return -ENODEV; ++ ++ sev = psp_master->sev_data; ++ ++ if (!sev->snp_inited) ++ return -EINVAL; ++ ++ data.gctx_paddr = sme_me_mask | (gctx_pfn << PAGE_SHIFT); ++ data.src_addr = sme_me_mask | (src_pfn << PAGE_SHIFT); ++ data.dst_addr = sme_me_mask | (dst_pfn << PAGE_SHIFT); ++ data.len = PAGE_SIZE; ++ ++ /* The destination page must be in the firmware state. */ ++ if (snp_set_rmp_state(data.dst_addr, 1, true, false, false)) ++ return -EIO; ++ ++ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &data, error); ++ ++ /* Restore the page state */ ++ if (snp_set_rmp_state(data.dst_addr, 1, false, false, true)) ++ ret = -EIO; ++ ++ return ret; + } +-EXPORT_SYMBOL_GPL(snp_guest_dbg_decrypt); ++EXPORT_SYMBOL_GPL(snp_guest_dbg_decrypt_page); + + int snp_guest_ext_guest_request(struct sev_data_snp_guest_request *data, + unsigned long vaddr, unsigned long *npages, unsigned long *fw_err) +diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h +index cd37ccd1fa1f..8d2565c70c39 100644 +--- a/include/linux/psp-sev.h ++++ b/include/linux/psp-sev.h +@@ -928,7 +928,7 @@ int snp_guest_decommission(struct sev_data_snp_decommission *data, int *error); + int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); + + /** +- * snp_guest_dbg_decrypt - perform SEV SNP_DBG_DECRYPT command ++ * snp_guest_dbg_decrypt_page - perform SEV SNP_DBG_DECRYPT command + * + * 
@sev_ret: sev command return code + * +@@ -939,7 +939,7 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); + * -%ETIMEDOUT if the sev command timed out + * -%EIO if the sev returned a non-zero return code + */ +-int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); ++int snp_guest_dbg_decrypt_page(u64 gctx_pfn, u64 src_pfn, u64 dst_pfn, int *error); + + void *psp_copy_user_blob(u64 uaddr, u32 len); + void *snp_alloc_firmware_page(gfp_t mask); +@@ -997,7 +997,7 @@ static inline int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, + return -ENODEV; + } + +-static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error) ++static inline int snp_guest_dbg_decrypt_page(u64 gctx_pfn, u64 src_pfn, u64 dst_pfn, int *error) + { + return -ENODEV; + } +-- +2.36.1 + + +From b7b303111ce74e975482c5d68fd5892a5aceac25 Mon Sep 17 00:00:00 2001 +From: Brijesh Singh <brijesh.singh@amd.com> +Date: Tue, 26 Apr 2022 19:30:49 +0000 +Subject: [PATCH 47/90] *debug: dump VMCBs on SNP guest destroy + +Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> +--- + arch/x86/kvm/svm/sev.c | 11 +++++++++ + arch/x86/kvm/svm/svm.c | 54 ++++++++++++++++++++++++++++++++++++++++-- + arch/x86/kvm/svm/svm.h | 2 ++ + 3 files changed, 65 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index fb4178f6e8dd..6c4c205091a2 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -1957,6 +1957,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; ++ start.policy |= (1ul << 19); // enable the debug + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) +@@ -2606,6 +2607,16 @@ void sev_vm_destroy(struct kvm *kvm) + + WARN_ON(!list_empty(&sev->mirror_vms)); + ++ if 
(sev_snp_guest(kvm)) { ++ unsigned int i; ++ ++ for (i = 0; i < kvm->created_vcpus; i++) { ++ struct kvm_vcpu *vcpu = xa_load(&kvm->vcpu_array, i); ++ ++ dump_vmcb(vcpu); ++ } ++ } ++ + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ + if (is_mirroring_enc_context(kvm)) { + struct kvm *owner_kvm = sev->enc_context_owner; +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index fced6ea423ad..ce81c1950539 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -190,7 +190,7 @@ module_param(avic, bool, 0444); + static bool force_avic; + module_param_unsafe(force_avic, bool, 0444); + +-bool __read_mostly dump_invalid_vmcb; ++bool __read_mostly dump_invalid_vmcb = true; + module_param(dump_invalid_vmcb, bool, 0644); + + +@@ -3120,7 +3120,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, + }; + +-static void dump_vmcb(struct kvm_vcpu *vcpu) ++void dump_vmcb(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; +@@ -3171,6 +3171,31 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) + pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); + pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); + pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); ++ ++ if (vcpu->arch.guest_state_protected && sev_snp_guest(vcpu->kvm)) { ++ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; ++ struct page *save_page; ++ int ret, error; ++ ++ save_page = alloc_page(GFP_KERNEL); ++ if (!save_page) ++ return; ++ ++ save = page_address(save_page); ++ save01 = save; ++ ++ wbinvd_on_all_cpus(); ++ ++ ret = snp_guest_dbg_decrypt_page(__pa(sev->snp_context) >> PAGE_SHIFT, ++ svm->vmcb->control.vmsa_pa >> PAGE_SHIFT, ++ __pa(save) >> PAGE_SHIFT, ++ &error); ++ if (ret) { ++ pr_err("%s: failed to decrypt vmsa %d\n", __func__, error); ++ return; ++ } ++ } ++ + 
pr_err("VMCB State Save Area:\n"); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "es:", +@@ -3241,6 +3266,31 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) + pr_err("%-15s %016llx %-13s %016llx\n", + "excp_from:", save->last_excp_from, + "excp_to:", save->last_excp_to); ++ ++ if (sev_snp_guest(vcpu->kvm)) { ++ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; ++ ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "rax:", vmsa->rax, "rbx:", vmsa->rbx); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "r8:", vmsa->r8, "r9:", vmsa->r9); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "r10:", vmsa->r10, "r11:", vmsa->r11); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "r12:", vmsa->r12, "r13:", vmsa->r13); ++ pr_err("%-15s %016llx %-13s %016llx\n", ++ "r14:", vmsa->r14, "r15:", vmsa->r15); ++ ++ wbinvd_on_all_cpus(); ++ __free_page(virt_to_page(save)); ++ } ++ + } + + static bool svm_check_exit_valid(u64 exit_code) +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 7b14b5ef1f8c..060c1eabee0a 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -382,6 +382,8 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) + return container_of(vcpu, struct vcpu_svm, vcpu); + } + ++void dump_vmcb(struct kvm_vcpu *vcpu); ++ + /* + * Only the PDPTRs are loaded on demand into the shadow MMU. All other + * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. 
+-- +2.36.1 + + +From 840f8f7d3a94d4cfa8e12d72e7b0869009e92850 Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:33:58 +0000 +Subject: [PATCH 48/90] *debug: trace_kvm_mmu_spte_requested: add + fault.max_level to output + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/mmu/mmutrace.h | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h +index 12247b96af01..4b60f755bc21 100644 +--- a/arch/x86/kvm/mmu/mmutrace.h ++++ b/arch/x86/kvm/mmu/mmutrace.h +@@ -374,16 +374,18 @@ TRACE_EVENT( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) ++ __field(u8, max_level) + ), + + TP_fast_assign( + __entry->gfn = fault->gfn; + __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); + __entry->level = fault->goal_level; ++ __entry->max_level = fault->max_level; + ), + +- TP_printk("gfn %llx pfn %llx level %d", +- __entry->gfn, __entry->pfn, __entry->level ++ TP_printk("gfn %llx pfn %llx level %d max_level %d", ++ __entry->gfn, __entry->pfn, __entry->level, __entry->max_level + ) + ); + +-- +2.36.1 + + +From 120de6a393c2da31be3f648828aea6a54822605a Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:42:23 +0000 +Subject: [PATCH 49/90] *debug: define trace_kvm_sev_es_unmap_ghcb + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/trace.h | 32 ++++++++++++++++++++++++++++++++ + arch/x86/kvm/x86.c | 1 + + 2 files changed, 33 insertions(+) + +diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h +index 79801e50344a..58c2a1ac0c6b 100644 +--- a/arch/x86/kvm/trace.h ++++ b/arch/x86/kvm/trace.h +@@ -1789,6 +1789,38 @@ TRACE_EVENT(kvm_snp_psc, + __entry->level) + ); + ++TRACE_EVENT(kvm_sev_es_unmap_ghcb, ++ TP_PROTO(void *ghcb_sa, u64 ghcb_sa_gpa, u32 ghcb_sa_len, u32 ghcb_sa_alloc_len, bool ghcb_sa_sync, bool ghcb_in_use, u8 ghcb_sa0, 
u8 ghcb_sa1), ++ TP_ARGS(ghcb_sa, ghcb_sa_gpa, ghcb_sa_len, ghcb_sa_alloc_len, ghcb_sa_sync, ghcb_in_use, ghcb_sa0, ghcb_sa1), ++ ++ TP_STRUCT__entry( ++ __field(u64, ghcb_sa_hva) ++ __field(u64, ghcb_sa_gpa) ++ __field(u32, ghcb_sa_len) ++ __field(u32, ghcb_sa_alloc_len) ++ __field(bool, ghcb_sa_sync) ++ __field(bool, ghcb_in_use) ++ __field(u8, ghcb_sa0) ++ __field(u8, ghcb_sa1) ++ ), ++ ++ TP_fast_assign( ++ __entry->ghcb_sa_hva = (u64)ghcb_sa; ++ __entry->ghcb_sa_gpa = ghcb_sa_gpa; ++ __entry->ghcb_sa_len = ghcb_sa_len; ++ __entry->ghcb_sa_alloc_len = ghcb_sa_alloc_len; ++ __entry->ghcb_sa_sync = ghcb_sa_sync; ++ __entry->ghcb_in_use = ghcb_in_use; ++ __entry->ghcb_sa0 = ghcb_sa0; ++ __entry->ghcb_sa1 = ghcb_sa1; ++ ), ++ ++ TP_printk("ghcb_sa_hva %016llx, ghcb_gpa %016llx, ghcb_sa_len 0x%x, ghcb_sa_alloc_len 0x%x, ghcb_sa_sync %d, ghcb_in_use %d, ghcb_sa0 0x%x, ghcb_sa1 0x%x", ++ __entry->ghcb_sa_hva, __entry->ghcb_sa_gpa, __entry->ghcb_sa_len, ++ __entry->ghcb_sa_alloc_len, __entry->ghcb_sa_sync, __entry->ghcb_in_use, ++ __entry->ghcb_sa0, __entry->ghcb_sa1) ++); ++ + #endif /* _TRACE_KVM_H */ + + #undef TRACE_INCLUDE_PATH +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index c649d15efae3..0cf968d0e182 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -13074,6 +13074,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc); ++EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_sev_es_unmap_ghcb); + + static int __init kvm_x86_init(void) + { +-- +2.36.1 + + +From 5b6226c6fa5add40cb9af67076bd61f14fddfc62 Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:44:39 +0000 +Subject: [PATCH 50/90] *debug: use trace_kvm_sev_es_unmap_ghcb + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/svm/sev.c | 18 ++++++++++++++++++ + 1 file changed, 
18 insertions(+) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 6c4c205091a2..38920c215fce 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3160,6 +3160,24 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code) + + void sev_es_unmap_ghcb(struct vcpu_svm *svm) + { ++ if (svm->sev_es.ghcb_sa_alloc_len >= 2) ++ trace_kvm_sev_es_unmap_ghcb(svm->sev_es.ghcb_sa, ++ svm->sev_es.ghcb_sa_gpa, ++ svm->sev_es.ghcb_sa_len, ++ svm->sev_es.ghcb_sa_alloc_len, ++ svm->sev_es.ghcb_sa_sync, ++ svm->sev_es.ghcb_in_use, ++ ((u8 *)svm->sev_es.ghcb_sa)[0], ++ ((u8 *)svm->sev_es.ghcb_sa)[1]); ++ else ++ trace_kvm_sev_es_unmap_ghcb(svm->sev_es.ghcb_sa, ++ svm->sev_es.ghcb_sa_gpa, ++ svm->sev_es.ghcb_sa_len, ++ svm->sev_es.ghcb_sa_alloc_len, ++ svm->sev_es.ghcb_sa_sync, ++ svm->sev_es.ghcb_in_use, ++ 0, 0); ++ + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + +-- +2.36.1 + + +From be537bcbc0f5f317834f287a7814d4a7f5fc1f2a Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:45:53 +0000 +Subject: [PATCH 51/90] *debug: warn when kvm_write_guest() fails in + sev_es_unmap_ghcb() + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/svm/sev.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 38920c215fce..b3df120d5e20 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -3186,9 +3186,13 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) + + /* Sync the scratch buffer area. 
*/ + if (svm->sev_es.ghcb_sa_sync) { +- kvm_write_guest(svm->vcpu.kvm, ++ int ret; ++ ++ ret = kvm_write_guest(svm->vcpu.kvm, + svm->sev_es.ghcb_sa_gpa, + svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); ++ if (ret) ++ pr_warn_ratelimited("unmap_ghcb: kvm_write_guest failed while syncing scratch area, gpa: %llx, ret: %d\n", svm->sev_es.ghcb_sa_gpa, ret); + svm->sev_es.ghcb_sa_sync = false; + } + +-- +2.36.1 + + +From ef8f1663c030db438879cbd8ae2efa17ed4efd63 Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:47:03 +0000 +Subject: [PATCH 52/90] *debug: warn if setup_vmgexit_scratch() fails in + sev_es_string_io() + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/svm/sev.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index b3df120d5e20..6cadfbbfae89 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -4192,8 +4192,10 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) + return -EINVAL; + + r = setup_vmgexit_scratch(svm, in, bytes); +- if (r) ++ if (r) { ++ pr_err("failed to setup vmgexit scratch\n"); + return r; ++ } + + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, + count, in); +-- +2.36.1 + + +From 701aa0d6326460665afdd291b06fb3c2a0a1edeb Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:48:27 +0000 +Subject: [PATCH 53/90] *debug: warn on specific failures within + __kvm_write_guest_page() + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + virt/kvm/kvm_main.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index f30bb8c16f26..5cf3e5915510 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2974,11 +2974,15 @@ static int __kvm_write_guest_page(struct kvm *kvm, + unsigned long addr; + + addr = 
gfn_to_hva_memslot(memslot, gfn); +- if (kvm_is_error_hva(addr)) ++ if (kvm_is_error_hva(addr)) { ++ pr_warn_ratelimited("__kvm_write_guest_page addr is error hva: %ld\n", addr); + return -EFAULT; ++ } + r = __copy_to_user((void __user *)addr + offset, data, len); +- if (r) ++ if (r) { ++ pr_warn_ratelimited("__kvm_write_guest_page: __copy_to_user failure: %d", r); + return -EFAULT; ++ } + mark_page_dirty_in_slot(kvm, memslot, gfn); + return 0; + } +-- +2.36.1 + + +From 04039fdc97c7ed0b6d003ae4a9bde2ed8e332e11 Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:51:03 +0000 +Subject: [PATCH 54/90] *fix for stale per-cpu pointer due to cond_resched + during ghcb mapping + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kvm/svm/svm.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c +index ce81c1950539..83dcac191ef4 100644 +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1352,7 +1352,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); +- struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); ++ struct svm_cpu_data *sd; + + if (sev_es_guest(vcpu->kvm)) + sev_es_unmap_ghcb(svm); +@@ -1360,6 +1360,10 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) + if (svm->guest_state_loaded) + return; + ++ /* sev_es_unmap_ghcb() can resched, so grab per-cpu pointer afterward. */ ++ barrier(); ++ sd = per_cpu(svm_data, vcpu->cpu); ++ + /* + * Save additional host state that will be restored on VMEXIT (sev-es) + * or subsequent vmload of host save area. 
+-- +2.36.1 + + +From b85ffefe0fa8611df26b6a2f058b418cd0d04bd6 Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:53:23 +0000 +Subject: [PATCH 55/90] *debug: warn and retry failed rmpupdates + +In some cases B0 hardware exhibits something like the following +behavior (where M < 512): + + Guest A | Guest B + |-------------------------------|----------------------------------| + | | rc = rmpupdate pfn=N*512,4K,priv + | rmpupdate pfn=N*512+M,4K,priv | + | rc = FAIL_OVERLAP | rc = SUCCESS + +The FAIL_OVERLAP might possibly be the result of hardware temporarily +treating Guest B's rmpupdate for pfn=N*512 as a 2M update, causing the +subsequent update from Guest A for pfn=N*512+M to report FAIL_OVERLAP +at that particular instance. Retrying the update for N*512+M immediately +afterward seems to resolve the FAIL_OVERLAP issue reliably, however. + +A similar failure has also been observed when transitioning pages back +to shared during VM destroy. In this case repeating the rmpupdate does +not always seem to resolve the failure immediately. + +Both situations are much more likely to occur if THP is disabled, or +if it is enabled/disabled while guests are actively being +started/stopped. + +Include some debug/error information to get a better idea of the +behavior on different hardware, and add the rmpupdate retry as a +workaround for Milan B0 testing. 
+ +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + arch/x86/kernel/sev.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 6640a639fffc..5ae8c9f853c8 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2530,6 +2530,7 @@ static int rmpupdate(u64 pfn, struct rmpupdate *val) + { + unsigned long paddr = pfn << PAGE_SHIFT; + int ret, level, npages; ++ int retries = 0; + + if (!pfn_valid(pfn)) + return -EINVAL; +@@ -2552,12 +2553,26 @@ static int rmpupdate(u64 pfn, struct rmpupdate *val) + } + } + ++retry: + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a"(ret) + : "a"(paddr), "c"((unsigned long)val) + : "memory", "cc"); + ++ if (ret) { ++ if (!retries) { ++ pr_err("rmpupdate failed, ret: %d, pfn: %llx, npages: %d, level: %d, retrying (max: %d)...\n", ++ ret, pfn, npages, level, 2 * num_present_cpus()); ++ dump_stack(); ++ } ++ retries++; ++ if (retries < 2 * num_present_cpus()) ++ goto retry; ++ } else if (retries > 0) { ++ pr_err("rmpupdate for pfn %llx succeeded after %d retries\n", pfn, retries); ++ } ++ + /* + * Restore the direct map after the page is removed from the RMP table. 
+ */ +-- +2.36.1 + + +From f69a7ab309a72df726c3d9550c10d498960b55dc Mon Sep 17 00:00:00 2001 +From: Michael Roth <michael.roth@amd.com> +Date: Tue, 26 Apr 2022 19:54:55 +0000 +Subject: [PATCH 56/90] *debug: print some debug output when handling userspace + RMP faults + +Signed-off-by: Michael Roth <michael.roth@amd.com> +--- + mm/memory.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/mm/memory.c b/mm/memory.c +index c2187ffcbb8e..60f41a7b17f0 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5034,8 +5034,14 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + return 0; + } + +- if (flags & FAULT_FLAG_PAGE_SPLIT) +- return handle_split_page_fault(&vmf); ++ if (flags & FAULT_FLAG_PAGE_SPLIT) { ++ int rcc; ++ ++ pr_warn("going to split page at addr 0x%lx, flags: 0x%x\n", address, flags); ++ rcc = handle_split_page_fault(&vmf); ++ pr_warn("done split page at addr 0x%lx, rcc: %d\n", address, rcc); ++ return rcc; ++ } + + if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) +-- +2.36.1 + + +From e4ae05c40a039cee9650f7d204e57873cf89bf22 Mon Sep 17 00:00:00 2001 +From: Ashish Kalra <ashish.kalra@amd.com> +Date: Mon, 6 Jun 2022 22:28:01 +0000 +Subject: [PATCH 57/90] KVM: SVM: Sync the GHCB scratch buffer using already + mapped ghcb + +Using kvm_write_guest() to sync the GHCB scratch buffer can fail +due to host mapping being 2M, but RMP being 4K. The page fault handling +in do_user_addr_fault() fails to split the 2M page to handle RMP fault due +to it being called here in a non-preemptible context. Instead use +the already kernel mapped ghcb to sync the scratch buffer when the +scratch buffer is contained within the GHCB. 
+ +Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> +--- + arch/x86/kvm/svm/sev.c | 32 ++++++++++++++++++++------------ + arch/x86/kvm/svm/svm.h | 2 ++ + 2 files changed, 22 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c +index 6cadfbbfae89..2bd437b2c1ff 100644 +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -2954,6 +2954,23 @@ static bool sev_es_sync_to_ghcb(struct vcpu_svm *svm) + ghcb_set_sw_exit_info_1(ghcb, svm->sev_es.ghcb_sw_exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, svm->sev_es.ghcb_sw_exit_info_2); + ++ /* Sync the scratch buffer area. */ ++ if (svm->sev_es.ghcb_sa_sync) { ++ if (svm->sev_es.ghcb_sa_contained) { ++ memcpy(ghcb->shared_buffer + svm->sev_es.ghcb_sa_offset, ++ svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); ++ } else { ++ int ret; ++ ++ ret = kvm_write_guest(svm->vcpu.kvm, ++ svm->sev_es.ghcb_sa_gpa, ++ svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); ++ if (ret) ++ pr_warn_ratelimited("unmap_ghcb: kvm_write_guest failed while syncing scratch area, gpa: %llx, ret: %d\n", svm->sev_es.ghcb_sa_gpa, ret); ++ } ++ svm->sev_es.ghcb_sa_sync = false; ++ } ++ + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, ghcb); + + svm_unmap_ghcb(svm, &map); +@@ -3184,18 +3201,6 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) + if (!svm->sev_es.ghcb_in_use) + return; + +- /* Sync the scratch buffer area. 
*/ +- if (svm->sev_es.ghcb_sa_sync) { +- int ret; +- +- ret = kvm_write_guest(svm->vcpu.kvm, +- svm->sev_es.ghcb_sa_gpa, +- svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); +- if (ret) +- pr_warn_ratelimited("unmap_ghcb: kvm_write_guest failed while syncing scratch area, gpa: %llx, ret: %d\n", svm->sev_es.ghcb_sa_gpa, ret); +- svm->sev_es.ghcb_sa_sync = false; +- } +- + sev_es_sync_to_ghcb(svm); + + svm->sev_es.ghcb_in_use = false; +@@ -3261,6 +3266,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + scratch_gpa_beg, scratch_gpa_end); + goto e_scratch; + } ++ svm->sev_es.ghcb_sa_contained = true; ++ svm->sev_es.ghcb_sa_offset = scratch_gpa_beg - ghcb_scratch_beg; + } else { + /* + * The guest memory must be read into a kernel buffer, so +@@ -3271,6 +3278,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) + len, GHCB_SCRATCH_AREA_LIMIT); + goto e_scratch; + } ++ svm->sev_es.ghcb_sa_contained = false; + } + + if (svm->sev_es.ghcb_sa_alloc_len < len) { +diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h +index 060c1eabee0a..7857f1800882 100644 +--- a/arch/x86/kvm/svm/svm.h ++++ b/arch/x86/kvm/svm/svm.h +@@ -210,6 +210,8 @@ struct vcpu_sev_es_state { + u64 ghcb_sa_gpa; + u32 ghcb_sa_alloc_len; + bool ghcb_sa_sync; ++ bool ghcb_sa_contained; ++ u32 ghcb_sa_offset; + + /* + * SEV-ES support to hold the sw_exit_info return values to be +-- +2.36.1 + + +From 4df99e018d9eee63765c4035ece62c5d9ae308ed Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:37 -0700 +Subject: [PATCH 58/90] x86/sgx: Add short descriptions to ENCLS wrappers + +The SGX ENCLS instruction uses EAX to specify an SGX function and +may require additional registers, depending on the SGX function. +ENCLS invokes the specified privileged SGX function for managing +and debugging enclaves. 
Macros are used to wrap the ENCLS +functionality and several wrappers are used to wrap the macros to +make the different SGX functions accessible in the code. + +The wrappers of the supported SGX functions are cryptic. Add short +descriptions of each as a comment. + +Suggested-by: Dave Hansen <dave.hansen@linux.intel.com> +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encls.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h +index fa04a73daf9c..0e22fa8f77c5 100644 +--- a/arch/x86/kernel/cpu/sgx/encls.h ++++ b/arch/x86/kernel/cpu/sgx/encls.h +@@ -136,57 +136,71 @@ static inline bool encls_failed(int ret) + ret; \ + }) + ++/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ + static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) + { + return __encls_2(ECREATE, pginfo, secs); + } + ++/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ + static inline int __eextend(void *secs, void *addr) + { + return __encls_2(EEXTEND, secs, addr); + } + ++/* ++ * Associate an EPC page to an enclave either as a REG or TCS page ++ * populated with the provided data. ++ */ + static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) + { + return __encls_2(EADD, pginfo, addr); + } + ++/* Finalize enclave build, initialize enclave for user code execution. */ + static inline int __einit(void *sigstruct, void *token, void *secs) + { + return __encls_ret_3(EINIT, sigstruct, secs, token); + } + ++/* Disassociate EPC page from its enclave and mark it as unused. */ + static inline int __eremove(void *addr) + { + return __encls_ret_1(EREMOVE, addr); + } + ++/* Copy data to an EPC page belonging to a debug enclave. 
*/ + static inline int __edbgwr(void *addr, unsigned long *data) + { + return __encls_2(EDGBWR, *data, addr); + } + ++/* Copy data from an EPC page belonging to a debug enclave. */ + static inline int __edbgrd(void *addr, unsigned long *data) + { + return __encls_1_1(EDGBRD, *data, addr); + } + ++/* Track that software has completed the required TLB address clears. */ + static inline int __etrack(void *addr) + { + return __encls_ret_1(ETRACK, addr); + } + ++/* Load, verify, and unblock an EPC page. */ + static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) + { + return __encls_ret_3(ELDU, pginfo, addr, va); + } + ++/* Make EPC page inaccessible to enclave, ready to be written to memory. */ + static inline int __eblock(void *addr) + { + return __encls_ret_1(EBLOCK, addr); + } + ++/* Initialize an EPC page into a Version Array (VA) page. */ + static inline int __epa(void *addr) + { + unsigned long rbx = SGX_PAGE_TYPE_VA; +@@ -194,6 +208,7 @@ static inline int __epa(void *addr) + return __encls_2(EPA, rbx, addr); + } + ++/* Invalidate an EPC page and write it out to main memory. */ + static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) + { +-- +2.36.1 + + +From 61faa9686e913ff895f22464689c6698e0194a58 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:38 -0700 +Subject: [PATCH 59/90] x86/sgx: Add wrapper for SGX2 EMODPR function + +Add a wrapper for the EMODPR ENCLS leaf function used to +restrict enclave page permissions as maintained in the +SGX hardware's Enclave Page Cache Map (EPCM). + +EMODPR: +1) Updates the EPCM permissions of an enclave page by treating + the new permissions as a mask. Supplying a value that attempts + to relax EPCM permissions has no effect on EPCM permissions + (PR bit, see below, is changed). +2) Sets the PR bit in the EPCM entry of the enclave page to + indicate that permission restriction is in progress. 
The bit + is reset by the enclave by invoking ENCLU leaf function + EACCEPT or EACCEPTCOPY. + +The enclave may access the page throughout the entire process +if conforming to the EPCM permissions for the enclave page. + +After performing the permission restriction by issuing EMODPR +the kernel needs to collaborate with the hardware to ensure that +all logical processors see the new restricted permissions. This +is required for the enclave's EACCEPT/EACCEPTCOPY to succeed and +is accomplished with the ETRACK flow. + +Expand enum sgx_return_code with the possible EMODPR return +values. + +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/include/asm/sgx.h | 5 +++++ + arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ + 2 files changed, 11 insertions(+) + +diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h +index 3f9334ef67cd..d67810b50a81 100644 +--- a/arch/x86/include/asm/sgx.h ++++ b/arch/x86/include/asm/sgx.h +@@ -65,17 +65,22 @@ enum sgx_encls_function { + + /** + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV ++ * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. + * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * public key does not match IA32_SGXLEPUBKEYHASH. ++ * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it ++ * is in the PENDING or MODIFIED state. + * %SGX_UNMASKED_EVENT: An unmasked event, e.g. 
INTR, was received + */ + enum sgx_return_code { ++ SGX_EPC_PAGE_CONFLICT = 7, + SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, + SGX_INVALID_EINITTOKEN = 16, ++ SGX_PAGE_NOT_MODIFIABLE = 20, + SGX_UNMASKED_EVENT = 128, + }; + +diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h +index 0e22fa8f77c5..2b091912f038 100644 +--- a/arch/x86/kernel/cpu/sgx/encls.h ++++ b/arch/x86/kernel/cpu/sgx/encls.h +@@ -215,4 +215,10 @@ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + return __encls_ret_3(EWB, pginfo, addr, va); + } + ++/* Restrict the EPCM permissions of an EPC page. */ ++static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) ++{ ++ return __encls_ret_2(EMODPR, secinfo, addr); ++} ++ + #endif /* _X86_ENCLS_H */ +-- +2.36.1 + + +From bf9ceed67565ab51eaf36564f0b582a2e753d23e Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:39 -0700 +Subject: [PATCH 60/90] x86/sgx: Add wrapper for SGX2 EMODT function + +Add a wrapper for the EMODT ENCLS leaf function used to +change the type of an enclave page as maintained in the +SGX hardware's Enclave Page Cache Map (EPCM). + +EMODT: +1) Updates the EPCM page type of the enclave page. +2) Sets the MODIFIED bit in the EPCM entry of the enclave page. + This bit is reset by the enclave by invoking ENCLU leaf + function EACCEPT or EACCEPTCOPY. + +Access from within the enclave to the enclave page is not possible +while the MODIFIED bit is set. + +After changing the enclave page type by issuing EMODT the kernel +needs to collaborate with the hardware to ensure that no logical +processor continues to hold a reference to the changed page. This +is required to ensure no required security checks are circumvented +and is required for the enclave's EACCEPT/EACCEPTCOPY to succeed. +Ensuring that no references to the changed page remain is +accomplished with the ETRACK flow. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h +index 2b091912f038..7a1ecf704ec1 100644 +--- a/arch/x86/kernel/cpu/sgx/encls.h ++++ b/arch/x86/kernel/cpu/sgx/encls.h +@@ -221,4 +221,10 @@ static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) + return __encls_ret_2(EMODPR, secinfo, addr); + } + ++/* Change the type of an EPC page. */ ++static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) ++{ ++ return __encls_ret_2(EMODT, secinfo, addr); ++} ++ + #endif /* _X86_ENCLS_H */ +-- +2.36.1 + + +From a3ca126aabf73a4dca3414922cb656a12762a454 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:40 -0700 +Subject: [PATCH 61/90] x86/sgx: Add wrapper for SGX2 EAUG function + +Add a wrapper for the EAUG ENCLS leaf function used to +add a page to an initialized enclave. + +EAUG: +1) Stores all properties of the new enclave page in the SGX + hardware's Enclave Page Cache Map (EPCM). +2) Sets the PENDING bit in the EPCM entry of the enclave page. + This bit is cleared by the enclave by invoking ENCLU leaf + function EACCEPT or EACCEPTCOPY. + +Access from within the enclave to the new enclave page is not +possible until the PENDING bit is cleared. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encls.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h +index 7a1ecf704ec1..99004b02e2ed 100644 +--- a/arch/x86/kernel/cpu/sgx/encls.h ++++ b/arch/x86/kernel/cpu/sgx/encls.h +@@ -227,4 +227,10 @@ static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) + return __encls_ret_2(EMODT, secinfo, addr); + } + ++/* Zero a page of EPC memory and add it to an initialized enclave. */ ++static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) ++{ ++ return __encls_2(EAUG, pginfo, addr); ++} ++ + #endif /* _X86_ENCLS_H */ +-- +2.36.1 + + +From 31c2faea4872fb789f9d58f1bd8819cbba669a0e Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:41 -0700 +Subject: [PATCH 62/90] x86/sgx: Support loading enclave page without VMA + permissions check + +sgx_encl_load_page() is used to find and load an enclave page into +enclave (EPC) memory, potentially loading it from the backing storage. +Both usages of sgx_encl_load_page() are during an access to the +enclave page from a VMA and thus the permissions of the VMA are +considered before the enclave page is loaded. + +SGX2 functions operating on enclave pages belonging to an initialized +enclave require the page to be in EPC. It is thus required to +support loading enclave pages into the EPC independent from a VMA. + +Split the current sgx_encl_load_page() to support the two usages: +A new call, sgx_encl_load_page_in_vma(), behaves exactly like the +current sgx_encl_load_page() that takes VMA permissions into account, +while sgx_encl_load_page() just loads an enclave page into EPC. + +VMA, PTE, and EPCM permissions continue to dictate whether +the pages can be accessed from within an enclave. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++------------ + arch/x86/kernel/cpu/sgx/encl.h | 2 ++ + 2 files changed, 40 insertions(+), 19 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 7c63a1911fae..05ae1168391c 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -131,25 +131,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + return epc_page; + } + +-static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, +- unsigned long addr, +- unsigned long vm_flags) ++static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, ++ struct sgx_encl_page *entry) + { +- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_epc_page *epc_page; +- struct sgx_encl_page *entry; +- +- entry = xa_load(&encl->page_array, PFN_DOWN(addr)); +- if (!entry) +- return ERR_PTR(-EFAULT); +- +- /* +- * Verify that the faulted page has equal or higher build time +- * permissions than the VMA permissions (i.e. the subset of {VM_READ, +- * VM_WRITE, VM_EXECUTE} in vma->vm_flags). +- */ +- if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) +- return ERR_PTR(-EFAULT); + + /* Entry successfully located. 
*/ + if (entry->epc_page) { +@@ -175,6 +160,40 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + return entry; + } + ++static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, ++ unsigned long addr, ++ unsigned long vm_flags) ++{ ++ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); ++ struct sgx_encl_page *entry; ++ ++ entry = xa_load(&encl->page_array, PFN_DOWN(addr)); ++ if (!entry) ++ return ERR_PTR(-EFAULT); ++ ++ /* ++ * Verify that the page has equal or higher build time ++ * permissions than the VMA permissions (i.e. the subset of {VM_READ, ++ * VM_WRITE, VM_EXECUTE} in vma->vm_flags). ++ */ ++ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) ++ return ERR_PTR(-EFAULT); ++ ++ return __sgx_encl_load_page(encl, entry); ++} ++ ++struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, ++ unsigned long addr) ++{ ++ struct sgx_encl_page *entry; ++ ++ entry = xa_load(&encl->page_array, PFN_DOWN(addr)); ++ if (!entry) ++ return ERR_PTR(-EFAULT); ++ ++ return __sgx_encl_load_page(encl, entry); ++} ++ + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) + { + unsigned long addr = (unsigned long)vmf->address; +@@ -196,7 +215,7 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) + + mutex_lock(&encl->lock); + +- entry = sgx_encl_load_page(encl, addr, vma->vm_flags); ++ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + +@@ -344,7 +363,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + for ( ; ; ) { + mutex_lock(&encl->lock); + +- entry = sgx_encl_load_page(encl, addr, vm_flags); ++ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index fec43ca65065..6b34efba1602 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ 
-116,5 +116,7 @@ unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); + void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); + bool sgx_va_page_full(struct sgx_va_page *va_page); + void sgx_encl_free_epc_page(struct sgx_epc_page *page); ++struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, ++ unsigned long addr); + + #endif /* _X86_ENCL_H */ +-- +2.36.1 + + +From b63502021bdff22d51a1f9bf2515fb3cd6c36856 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:42 -0700 +Subject: [PATCH 63/90] x86/sgx: Export sgx_encl_ewb_cpumask() + +Using sgx_encl_ewb_cpumask() to learn which CPUs might have executed +an enclave is useful to ensure that TLBs are cleared when changes are +made to enclave pages. + +sgx_encl_ewb_cpumask() is used within the reclaimer when an enclave +page is evicted. The upcoming SGX2 support enables changes to be +made to enclave pages and will require TLBs to not refer to the +changed pages and thus will be needing sgx_encl_ewb_cpumask(). + +Relocate sgx_encl_ewb_cpumask() to be with the rest of the enclave +code in encl.c now that it is no longer unique to the reclaimer. + +Take care to ensure that any future usage maintains the +current context requirement that ETRACK has been called first. +Expand the existing comments to highlight this while moving them +to a more prominent location before the function. + +No functional change. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 67 ++++++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/sgx/encl.h | 1 + + arch/x86/kernel/cpu/sgx/main.c | 29 --------------- + 3 files changed, 68 insertions(+), 29 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 05ae1168391c..c6525eba74e8 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -613,6 +613,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) + return 0; + } + ++/** ++ * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave ++ * @encl: the enclave ++ * ++ * Some SGX functions require that no cached linear-to-physical address ++ * mappings are present before they can succeed. For example, ENCLS[EWB] ++ * copies a page from the enclave page cache to regular main memory but ++ * it fails if it cannot ensure that there are no cached ++ * linear-to-physical address mappings referring to the page. ++ * ++ * SGX hardware flushes all cached linear-to-physical mappings on a CPU ++ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave ++ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical ++ * address mappings are cleared but coordination with the tracking done within ++ * the SGX hardware is needed to support the SGX functions that depend on this ++ * cache clearing. ++ * ++ * When the ENCLS[ETRACK] function is issued on an enclave the hardware ++ * tracks threads operating inside the enclave at that time. 
The SGX ++ * hardware tracking require that all the identified threads must have ++ * exited the enclave in order to flush the mappings before a function such ++ * as ENCLS[EWB] will be permitted ++ * ++ * The following flow is used to support SGX functions that require that ++ * no cached linear-to-physical address mappings are present: ++ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. ++ * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be ++ * accessing the enclave. ++ * 3) Send IPI to identified CPUs, kicking them out of the enclave and ++ * thus flushing all locally cached linear-to-physical address mappings. ++ * 4) Execute SGX function. ++ * ++ * Context: It is required to call this function after ENCLS[ETRACK]. ++ * This will ensure that if any new mm appears (racing with ++ * sgx_encl_mm_add()) then the new mm will enter into the ++ * enclave with fresh linear-to-physical address mappings. ++ * ++ * It is required that all IPIs are completed before a new ++ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 ++ * of the above flow with the enclave's mutex. 
++ * ++ * Return: cpumask of CPUs that might be accessing @encl ++ */ ++const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) ++{ ++ cpumask_t *cpumask = &encl->cpumask; ++ struct sgx_encl_mm *encl_mm; ++ int idx; ++ ++ cpumask_clear(cpumask); ++ ++ idx = srcu_read_lock(&encl->srcu); ++ ++ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { ++ if (!mmget_not_zero(encl_mm->mm)) ++ continue; ++ ++ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); ++ ++ mmput_async(encl_mm->mm); ++ } ++ ++ srcu_read_unlock(&encl->srcu, idx); ++ ++ return cpumask; ++} ++ + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) + { +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index 6b34efba1602..d2acb4debde5 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -105,6 +105,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + + void sgx_encl_release(struct kref *ref); + int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); ++const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); + int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c +index 8e4bc6453d26..2de85f459492 100644 +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -203,35 +203,6 @@ static void sgx_ipi_cb(void *info) + { + } + +-static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +-{ +- cpumask_t *cpumask = &encl->cpumask; +- struct sgx_encl_mm *encl_mm; +- int idx; +- +- /* +- * Can race with sgx_encl_mm_add(), but ETRACK has already been +- * executed, which means that the CPUs running in the new mm will enter +- * into the enclave with a fresh epoch. 
+- */ +- cpumask_clear(cpumask); +- +- idx = srcu_read_lock(&encl->srcu); +- +- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { +- if (!mmget_not_zero(encl_mm->mm)) +- continue; +- +- cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); +- +- mmput_async(encl_mm->mm); +- } +- +- srcu_read_unlock(&encl->srcu, idx); +- +- return cpumask; +-} +- + /* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). +-- +2.36.1 + + +From 096a500b9d6e01e33349eb3b919f86e3e3d5cc00 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:43 -0700 +Subject: [PATCH 64/90] x86/sgx: Rename sgx_encl_ewb_cpumask() as + sgx_encl_cpumask() + +sgx_encl_ewb_cpumask() is no longer unique to the reclaimer where it +is used during the EWB ENCLS leaf function when EPC pages are written +out to main memory and sgx_encl_ewb_cpumask() is used to learn which +CPUs might have executed the enclave to ensure that TLBs are cleared. + +Upcoming SGX2 enabling will use sgx_encl_ewb_cpumask() during the +EMODPR and EMODT ENCLS leaf functions that make changes to enclave +pages. The function is needed for the same reason it is used now: to +learn which CPUs might have executed the enclave to ensure that TLBs +no longer point to the changed pages. + +Rename sgx_encl_ewb_cpumask() to sgx_encl_cpumask() to reflect the +broader usage. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 6 +++--- + arch/x86/kernel/cpu/sgx/encl.h | 2 +- + arch/x86/kernel/cpu/sgx/main.c | 2 +- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index c6525eba74e8..8de9bebc4d81 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -614,7 +614,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) + } + + /** +- * sgx_encl_ewb_cpumask() - Query which CPUs might be accessing the enclave ++ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address +@@ -639,7 +639,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. +- * 2) Use this function (sgx_encl_ewb_cpumask()) to query which CPUs might be ++ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. 
+@@ -656,7 +656,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) + * + * Return: cpumask of CPUs that might be accessing @encl + */ +-const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) ++const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) + { + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index d2acb4debde5..e59c2cbf71e2 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -105,7 +105,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + + void sgx_encl_release(struct kref *ref); + int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +-const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl); ++const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); + int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c +index 2de85f459492..fa33922879bf 100644 +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -249,7 +249,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. + */ +- on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), ++ on_each_cpu_mask(sgx_encl_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } +-- +2.36.1 + + +From a69bef1e179b035e7bafddbb486cec79fd5e501e Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:44 -0700 +Subject: [PATCH 65/90] x86/sgx: Move PTE zap code to new + sgx_zap_enclave_ptes() + +The SGX reclaimer removes page table entries pointing to pages that are +moved to swap. 
+ +SGX2 enables changes to pages belonging to an initialized enclave, thus +enclave pages may have their permission or type changed while the page +is being accessed by an enclave. Supporting SGX2 requires page table +entries to be removed so that any cached mappings to changed pages +are removed. For example, with the ability to change enclave page types +a regular enclave page may be changed to a Thread Control Structure +(TCS) page that may not be accessed by an enclave. + +Factor out the code removing page table entries to a separate function +sgx_zap_enclave_ptes(), fixing accuracy of comments in the process, +and make it available to the upcoming SGX2 code. + +Place sgx_zap_enclave_ptes() with the rest of the enclave code in +encl.c interacting with the page table since this code is no longer +unique to the reclaimer. + +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 45 +++++++++++++++++++++++++++++++++- + arch/x86/kernel/cpu/sgx/encl.h | 2 +- + arch/x86/kernel/cpu/sgx/main.c | 31 ++--------------------- + 3 files changed, 47 insertions(+), 31 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 8de9bebc4d81..c77a62432862 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -605,7 +605,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); +- /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ ++ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). 
*/ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); +@@ -792,6 +792,49 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, + return ret; + } + ++/** ++ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave ++ * @encl: the enclave ++ * @addr: page aligned pointer to single page for which PTEs will be removed ++ * ++ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping ++ * @addr from each VMA. Ensure that page fault handler is ready to handle ++ * new mappings of @addr before calling this function. ++ */ ++void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) ++{ ++ unsigned long mm_list_version; ++ struct sgx_encl_mm *encl_mm; ++ struct vm_area_struct *vma; ++ int idx, ret; ++ ++ do { ++ mm_list_version = encl->mm_list_version; ++ ++ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ ++ smp_rmb(); ++ ++ idx = srcu_read_lock(&encl->srcu); ++ ++ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { ++ if (!mmget_not_zero(encl_mm->mm)) ++ continue; ++ ++ mmap_read_lock(encl_mm->mm); ++ ++ ret = sgx_encl_find(encl_mm->mm, addr, &vma); ++ if (!ret && encl == vma->vm_private_data) ++ zap_vma_ptes(vma, addr, PAGE_SIZE); ++ ++ mmap_read_unlock(encl_mm->mm); ++ ++ mmput_async(encl_mm->mm); ++ } ++ ++ srcu_read_unlock(&encl->srcu, idx); ++ } while (unlikely(encl->mm_list_version != mm_list_version)); ++} ++ + /** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index e59c2cbf71e2..1b15d22f6757 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -111,7 +111,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); + int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); +- ++void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); 
+ struct sgx_epc_page *sgx_alloc_va_page(void); + unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); + void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c +index fa33922879bf..ce9e87d5f8ec 100644 +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; +- unsigned long mm_list_version; +- struct sgx_encl_mm *encl_mm; +- struct vm_area_struct *vma; +- int idx, ret; +- +- do { +- mm_list_version = encl->mm_list_version; +- +- /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ +- smp_rmb(); +- +- idx = srcu_read_lock(&encl->srcu); +- +- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { +- if (!mmget_not_zero(encl_mm->mm)) +- continue; +- +- mmap_read_lock(encl_mm->mm); +- +- ret = sgx_encl_find(encl_mm->mm, addr, &vma); +- if (!ret && encl == vma->vm_private_data) +- zap_vma_ptes(vma, addr, PAGE_SIZE); +- +- mmap_read_unlock(encl_mm->mm); +- +- mmput_async(encl_mm->mm); +- } ++ int ret; + +- srcu_read_unlock(&encl->srcu, idx); +- } while (unlikely(encl->mm_list_version != mm_list_version)); ++ sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + +-- +2.36.1 + + +From cb88e0696de61b98b6b150790fc0c0b750a4011c Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:45 -0700 +Subject: [PATCH 66/90] x86/sgx: Make sgx_ipi_cb() available internally + +The ETRACK function followed by an IPI to all CPUs within an enclave +is a common pattern with more frequent use in support of SGX2. + +Make the (empty) IPI callback function available internally in +preparation for usage by SGX2. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/main.c | 2 +- + arch/x86/kernel/cpu/sgx/sgx.h | 2 ++ + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c +index ce9e87d5f8ec..6e2cb7564080 100644 +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -172,7 +172,7 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + return ret; + } + +-static void sgx_ipi_cb(void *info) ++void sgx_ipi_cb(void *info) + { + } + +diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h +index 0f17def9fe6f..b30cee4de903 100644 +--- a/arch/x86/kernel/cpu/sgx/sgx.h ++++ b/arch/x86/kernel/cpu/sgx/sgx.h +@@ -90,6 +90,8 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); + int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); + struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + ++void sgx_ipi_cb(void *info); ++ + #ifdef CONFIG_X86_SGX_KVM + int __init sgx_vepc_init(void); + #else +-- +2.36.1 + + +From 27df52d087dd9c7d9c9390070c42f968a7ce3c2b Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:46 -0700 +Subject: [PATCH 67/90] x86/sgx: Create utility to validate user provided + offset and length + +User provided offset and length is validated when parsing the parameters +of the SGX_IOC_ENCLAVE_ADD_PAGES ioctl(). Extract this validation +(with consistent use of IS_ALIGNED) into a utility that can be used +by the SGX2 ioctl()s that will also provide these values. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/ioctl.c | 28 ++++++++++++++++++++++------ + 1 file changed, 22 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 83df20e3e633..a66795e0b685 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -372,6 +372,26 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + return ret; + } + ++/* ++ * Ensure user provided offset and length values are valid for ++ * an enclave. ++ */ ++static int sgx_validate_offset_length(struct sgx_encl *encl, ++ unsigned long offset, ++ unsigned long length) ++{ ++ if (!IS_ALIGNED(offset, PAGE_SIZE)) ++ return -EINVAL; ++ ++ if (!length || !IS_ALIGNED(length, PAGE_SIZE)) ++ return -EINVAL; ++ ++ if (offset + length - PAGE_SIZE >= encl->size) ++ return -EINVAL; ++ ++ return 0; ++} ++ + /** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer +@@ -425,14 +445,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + +- if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || +- !IS_ALIGNED(add_arg.src, PAGE_SIZE)) +- return -EINVAL; +- +- if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) ++ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + +- if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) ++ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, +-- +2.36.1 + + +From b8a8af3dc3bfd63d6967e11163824ad35ca751e7 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:47 -0700 +Subject: [PATCH 68/90] x86/sgx: Keep record of SGX page type + +SGX2 functions are not 
allowed on all page types. For example, +ENCLS[EMODPR] is only allowed on regular SGX enclave pages and +ENCLS[EMODT] is only allowed on TCS and regular pages. If these +functions are attempted on another type of page the hardware would +trigger a fault. + +Keep a record of the SGX page type so that there is more +certainty whether an SGX2 instruction can succeed and faults +can be treated as real failures. + +The page type is a property of struct sgx_encl_page +and thus does not cover the VA page type. VA pages are maintained +in separate structures and their type can be determined in +a different way. The SGX2 instructions needing the page type do not +operate on VA pages and this is thus not a scenario needing to +be covered at this time. + +struct sgx_encl_page hosting this information is maintained for each +enclave page so the space consumed by the struct is important. +The existing sgx_encl_page->vm_max_prot_bits is already unsigned long +while only using three bits. Transition to a bitfield for the two +members to support the additional information without increasing +the space consumed by the struct. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/include/asm/sgx.h | 3 +++ + arch/x86/kernel/cpu/sgx/encl.h | 3 ++- + arch/x86/kernel/cpu/sgx/ioctl.c | 2 ++ + 3 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h +index d67810b50a81..eae20fa52b93 100644 +--- a/arch/x86/include/asm/sgx.h ++++ b/arch/x86/include/asm/sgx.h +@@ -239,6 +239,9 @@ struct sgx_pageinfo { + * %SGX_PAGE_TYPE_REG: a regular page + * %SGX_PAGE_TYPE_VA: a VA page + * %SGX_PAGE_TYPE_TRIM: a page in trimmed state ++ * ++ * Make sure when making changes to this enum that its values can still fit ++ * in the bitfield within &struct sgx_encl_page + */ + enum sgx_page_type { + SGX_PAGE_TYPE_SECS, +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index 1b15d22f6757..07abfc70c8e3 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -27,7 +27,8 @@ + + struct sgx_encl_page { + unsigned long desc; +- unsigned long vm_max_prot_bits; ++ unsigned long vm_max_prot_bits:8; ++ enum sgx_page_type type:16; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index a66795e0b685..21078c6643f7 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; ++ encl->secs.type = SGX_PAGE_TYPE_SECS; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; +@@ -344,6 +345,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; ++ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; + 
encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { +-- +2.36.1 + + +From e5cd94eb947cb17b78633aceb483a9a1d0d3a862 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:48 -0700 +Subject: [PATCH 69/90] x86/sgx: Export sgx_encl_{grow,shrink}() + +In order to use sgx_encl_{grow,shrink}() in the page augmentation code +located in encl.c, export these functions. + +Suggested-by: Jarkko Sakkinen <jarkko@kernel.org> +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.h | 2 ++ + arch/x86/kernel/cpu/sgx/ioctl.c | 4 ++-- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index 07abfc70c8e3..9d673d9531f0 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -120,5 +120,7 @@ bool sgx_va_page_full(struct sgx_va_page *va_page); + void sgx_encl_free_epc_page(struct sgx_epc_page *page); + struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); ++struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); ++void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); + + #endif /* _X86_ENCL_H */ +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 21078c6643f7..2df27dd8b30d 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -17,7 +17,7 @@ + #include "encl.h" + #include "encls.h" + +-static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) ++struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) + { + struct sgx_va_page *va_page = NULL; + void *err; +@@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) + return va_page; + } + +-static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) ++void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page 
*va_page) + { + encl->page_cnt--; + +-- +2.36.1 + + +From 13bb2caefd398f2672185c83da186382f47f387a Mon Sep 17 00:00:00 2001 +From: Jarkko Sakkinen <jarkko@kernel.org> +Date: Tue, 10 May 2022 11:08:49 -0700 +Subject: [PATCH 70/90] x86/sgx: Export sgx_encl_page_alloc() + +Move sgx_encl_page_alloc() to encl.c and export it so that it can be +used in the implementation for support of adding pages to initialized +enclaves, which requires to allocate new enclave pages. + +Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 32 ++++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/sgx/encl.h | 3 +++ + arch/x86/kernel/cpu/sgx/ioctl.c | 32 -------------------------------- + 3 files changed, 35 insertions(+), 32 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index c77a62432862..546423753e4c 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -792,6 +792,38 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, + return ret; + } + ++struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, ++ unsigned long offset, ++ u64 secinfo_flags) ++{ ++ struct sgx_encl_page *encl_page; ++ unsigned long prot; ++ ++ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); ++ if (!encl_page) ++ return ERR_PTR(-ENOMEM); ++ ++ encl_page->desc = encl->base + offset; ++ encl_page->encl = encl; ++ ++ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | ++ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | ++ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); ++ ++ /* ++ * TCS pages must always RW set for CPU access while the SECINFO ++ * permissions are *always* zero - the CPU ignores the user provided ++ * values and silently overwrites them with zero permissions. 
++ */ ++ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) ++ prot |= PROT_READ | PROT_WRITE; ++ ++ /* Calculate maximum of the VM flags for the page. */ ++ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); ++ ++ return encl_page; ++} ++ + /** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index 9d673d9531f0..253ebdd1c5be 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -112,6 +112,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); + int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); ++struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, ++ unsigned long offset, ++ u64 secinfo_flags); + void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); + struct sgx_epc_page *sgx_alloc_va_page(void); + unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 2df27dd8b30d..bb8cdb2ad0d1 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -169,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) + return ret; + } + +-static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, +- unsigned long offset, +- u64 secinfo_flags) +-{ +- struct sgx_encl_page *encl_page; +- unsigned long prot; +- +- encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); +- if (!encl_page) +- return ERR_PTR(-ENOMEM); +- +- encl_page->desc = encl->base + offset; +- encl_page->encl = encl; +- +- prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | +- _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | +- _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); +- +- /* 
+- * TCS pages must always RW set for CPU access while the SECINFO +- * permissions are *always* zero - the CPU ignores the user provided +- * values and silently overwrites them with zero permissions. +- */ +- if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) +- prot |= PROT_READ | PROT_WRITE; +- +- /* Calculate maximum of the VM flags for the page. */ +- encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); +- +- return encl_page; +-} +- + static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) + { + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; +-- +2.36.1 + + +From 5c1dabc893aa0c39417d8aa30956acbcd042ea79 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:50 -0700 +Subject: [PATCH 71/90] x86/sgx: Support VA page allocation without reclaiming + +struct sgx_encl should be protected with the mutex +sgx_encl->lock. One exception is sgx_encl->page_cnt that +is incremented (in sgx_encl_grow()) when an enclave page +is added to the enclave. The reason the mutex is not held +is to allow the reclaimer to be called directly if there are +no EPC pages (in support of a new VA page) available at the time. + +Incrementing sgx_encl->page_cnt without sgx_encl->lock held +is currently (before SGX2) safe from concurrent updates because +all paths in which sgx_encl_grow() is called occur before +enclave initialization and are protected with an atomic +operation on SGX_ENCL_IOCTL. + +SGX2 includes support for dynamically adding pages after +enclave initialization where the protection of SGX_ENCL_IOCTL +is not available. + +Make direct reclaim of EPC pages optional when new VA pages +are added to the enclave. Essentially the existing "reclaim" +flag used when regular EPC pages are added to an enclave +becomes available to the caller when used to allocate VA pages +instead of always being "true". 
+ +When adding pages without invoking the reclaimer it is possible +to do so with sgx_encl->lock held, gaining its protection against +concurrent updates to sgx_encl->page_cnt after enclave +initialization. + +No functional change. + +Reported-by: Haitao Huang <haitao.huang@intel.com> +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 6 ++++-- + arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- + arch/x86/kernel/cpu/sgx/ioctl.c | 8 ++++---- + 3 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 546423753e4c..92516aeca405 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -869,6 +869,8 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) + + /** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page ++ * @reclaim: Reclaim EPC pages directly if none available. Enclave ++ * mutex should not be held if this is set. + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. 
+ * +@@ -876,12 +878,12 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) + * a VA page, + * -errno otherwise + */ +-struct sgx_epc_page *sgx_alloc_va_page(void) ++struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) + { + struct sgx_epc_page *epc_page; + int ret; + +- epc_page = sgx_alloc_epc_page(NULL, true); ++ epc_page = sgx_alloc_epc_page(NULL, reclaim); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + +diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h +index 253ebdd1c5be..66adb8faec45 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.h ++++ b/arch/x86/kernel/cpu/sgx/encl.h +@@ -116,14 +116,14 @@ struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); + void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +-struct sgx_epc_page *sgx_alloc_va_page(void); ++struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); + unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); + void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); + bool sgx_va_page_full(struct sgx_va_page *va_page); + void sgx_encl_free_epc_page(struct sgx_epc_page *page); + struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +-struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl); ++struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); + void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); + + #endif /* _X86_ENCL_H */ +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index bb8cdb2ad0d1..5d41aa204761 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -17,7 +17,7 @@ + #include "encl.h" + #include "encls.h" + +-struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) ++struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) + { + struct sgx_va_page *va_page = NULL; + void *err; +@@ -30,7 +30,7 @@ struct 
sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) + if (!va_page) + return ERR_PTR(-ENOMEM); + +- va_page->epc_page = sgx_alloc_va_page(); ++ va_page->epc_page = sgx_alloc_va_page(reclaim); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); +@@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) + struct file *backing; + long ret; + +- va_page = sgx_encl_grow(encl); ++ va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) +@@ -275,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + return PTR_ERR(epc_page); + } + +- va_page = sgx_encl_grow(encl); ++ va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; +-- +2.36.1 + + +From d526d6e277816c4c2ef55bd932cd644341fac866 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:51 -0700 +Subject: [PATCH 72/90] x86/sgx: Support restricting of enclave page + permissions + +In the initial (SGX1) version of SGX, pages in an enclave need to be +created with permissions that support all usages of the pages, from the +time the enclave is initialized until it is unloaded. For example, +pages used by a JIT compiler or when code needs to otherwise be +relocated need to always have RWX permissions. + +SGX2 includes a new function ENCLS[EMODPR] that is run from the kernel +and can be used to restrict the EPCM permissions of regular enclave +pages within an initialized enclave. + +Introduce ioctl() SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS to support +restricting EPCM permissions. With this ioctl() the user specifies +a page range and the EPCM permissions to be applied to all pages in +the provided range. 
ENCLS[EMODPR] is run to restrict the EPCM +permissions followed by the ENCLS[ETRACK] flow that will ensure +no cached linear-to-physical address mappings to the changed +pages remain. + +It is possible for the permission change request to fail on any +page within the provided range, either with an error encountered +by the kernel or by the SGX hardware while running +ENCLS[EMODPR]. To support partial success the ioctl() returns an +error code based on failures encountered by the kernel as well +as two result output parameters: one for the number of pages +that were successfully changed and one for the SGX return code. + +The page table entry permissions are not impacted by the EPCM +permission changes. VMAs and PTEs will continue to allow the +maximum vetted permissions determined at the time the pages +are added to the enclave. The SGX error code in a page fault +will indicate if it was an EPCM permission check that prevented +an access attempt. + +No checking is done to ensure that the permissions are actually +being restricted. This is because the enclave may have relaxed +the EPCM permissions from within the enclave without the kernel +knowing. An attempt to relax permissions using this call will +be ignored by the hardware. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Haitao Huang <haitao.huang@intel.com> +Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/include/uapi/asm/sgx.h | 21 ++++ + arch/x86/kernel/cpu/sgx/ioctl.c | 216 ++++++++++++++++++++++++++++++++ + 2 files changed, 237 insertions(+) + +diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h +index f4b81587e90b..82648c006470 100644 +--- a/arch/x86/include/uapi/asm/sgx.h ++++ b/arch/x86/include/uapi/asm/sgx.h +@@ -29,6 +29,8 @@ enum sgx_page_flags { + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) + #define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) ++#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ ++ _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) + + /** + * struct sgx_enclave_create - parameter structure for the +@@ -76,6 +78,25 @@ struct sgx_enclave_provision { + __u64 fd; + }; + ++/** ++ * struct sgx_enclave_restrict_permissions - parameters for ioctl ++ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ++ * @offset: starting page offset (page aligned relative to enclave base ++ * address defined in SECS) ++ * @length: length of memory (multiple of the page size) ++ * @permissions:new permission bits for pages in range described by @offset ++ * and @length ++ * @result: (output) SGX result code of ENCLS[EMODPR] function ++ * @count: (output) bytes successfully changed (multiple of page size) ++ */ ++struct sgx_enclave_restrict_permissions { ++ __u64 offset; ++ __u64 length; ++ __u64 permissions; ++ __u64 result; ++ __u64 count; ++}; ++ + struct sgx_enclave_run; + + /** +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 5d41aa204761..720188d86ed4 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -660,6 +660,218 @@ static long sgx_ioc_enclave_provision(struct sgx_encl 
*encl, void __user *arg) + return sgx_set_attribute(&encl->attributes_mask, params.fd); + } + ++/* ++ * Ensure enclave is ready for SGX2 functions. Readiness is checked ++ * by ensuring the hardware supports SGX2 and the enclave is initialized ++ * and thus able to handle requests to modify pages within it. ++ */ ++static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) ++{ ++ if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) ++ return -ENODEV; ++ ++ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * Some SGX functions require that no cached linear-to-physical address ++ * mappings are present before they can succeed. Collaborate with ++ * hardware via ENCLS[ETRACK] to ensure that all cached ++ * linear-to-physical address mappings belonging to all threads of ++ * the enclave are cleared. See sgx_encl_cpumask() for details. ++ * ++ * Must be called with enclave's mutex held from the time the ++ * SGX function requiring that no cached linear-to-physical mappings ++ * are present is executed until this ETRACK flow is complete. ++ */ ++static int sgx_enclave_etrack(struct sgx_encl *encl) ++{ ++ void *epc_virt; ++ int ret; ++ ++ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); ++ ret = __etrack(epc_virt); ++ if (ret) { ++ /* ++ * ETRACK only fails when there is an OS issue. For ++ * example, two consecutive ETRACK was sent without ++ * completed IPI between. ++ */ ++ pr_err_once("ETRACK returned %d (0x%x)", ret, ret); ++ /* ++ * Send IPIs to kick CPUs out of the enclave and ++ * try ETRACK again. ++ */ ++ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ++ ret = __etrack(epc_virt); ++ if (ret) { ++ pr_err_once("ETRACK repeat returned %d (0x%x)", ++ ret, ret); ++ return -EFAULT; ++ } ++ } ++ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ++ ++ return 0; ++} ++ ++/** ++ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions ++ * @encl: Enclave to which the pages belong. 
++ * @modp: Checked parameters from user on which pages need modifying and ++ * their new permissions. ++ * ++ * Return: ++ * - 0: Success. ++ * - -errno: Otherwise. ++ */ ++static long ++sgx_enclave_restrict_permissions(struct sgx_encl *encl, ++ struct sgx_enclave_restrict_permissions *modp) ++{ ++ struct sgx_encl_page *entry; ++ struct sgx_secinfo secinfo; ++ unsigned long addr; ++ unsigned long c; ++ void *epc_virt; ++ int ret; ++ ++ memset(&secinfo, 0, sizeof(secinfo)); ++ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; ++ ++ for (c = 0 ; c < modp->length; c += PAGE_SIZE) { ++ addr = encl->base + modp->offset + c; ++ ++ mutex_lock(&encl->lock); ++ ++ entry = sgx_encl_load_page(encl, addr); ++ if (IS_ERR(entry)) { ++ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; ++ goto out_unlock; ++ } ++ ++ /* ++ * Changing EPCM permissions is only supported on regular ++ * SGX pages. Attempting this change on other pages will ++ * result in #PF. ++ */ ++ if (entry->type != SGX_PAGE_TYPE_REG) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ /* ++ * Apart from ensuring that read-access remains, do not verify ++ * the permission bits requested. Kernel has no control over ++ * how EPCM permissions can be relaxed from within the enclave. ++ * ENCLS[EMODPR] can only remove existing EPCM permissions, ++ * attempting to set new permissions will be ignored by the ++ * hardware. ++ */ ++ ++ /* Change EPCM permissions. */ ++ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); ++ ret = __emodpr(&secinfo, epc_virt); ++ if (encls_faulted(ret)) { ++ /* ++ * All possible faults should be avoidable: ++ * parameters have been checked, will only change ++ * permissions of a regular page, and no concurrent ++ * SGX1/SGX2 ENCLS instructions since these ++ * are protected with mutex. 
++ */ ++ pr_err_once("EMODPR encountered exception %d\n", ++ ENCLS_TRAPNR(ret)); ++ ret = -EFAULT; ++ goto out_unlock; ++ } ++ if (encls_failed(ret)) { ++ modp->result = ret; ++ ret = -EFAULT; ++ goto out_unlock; ++ } ++ ++ ret = sgx_enclave_etrack(encl); ++ if (ret) { ++ ret = -EFAULT; ++ goto out_unlock; ++ } ++ ++ mutex_unlock(&encl->lock); ++ } ++ ++ ret = 0; ++ goto out; ++ ++out_unlock: ++ mutex_unlock(&encl->lock); ++out: ++ modp->count = c; ++ ++ return ret; ++} ++ ++/** ++ * sgx_ioc_enclave_restrict_permissions() - handler for ++ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ++ * @encl: an enclave pointer ++ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions ++ * instance ++ * ++ * SGX2 distinguishes between relaxing and restricting the enclave page ++ * permissions maintained by the hardware (EPCM permissions) of pages ++ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). ++ * ++ * EPCM permissions cannot be restricted from within the enclave, the enclave ++ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] ++ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call ++ * will be ignored by the hardware. ++ * ++ * Return: ++ * - 0: Success ++ * - -errno: Otherwise ++ */ ++static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, ++ void __user *arg) ++{ ++ struct sgx_enclave_restrict_permissions params; ++ long ret; ++ ++ ret = sgx_ioc_sgx2_ready(encl); ++ if (ret) ++ return ret; ++ ++ if (copy_from_user(¶ms, arg, sizeof(params))) ++ return -EFAULT; ++ ++ if (sgx_validate_offset_length(encl, params.offset, params.length)) ++ return -EINVAL; ++ ++ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) ++ return -EINVAL; ++ ++ /* ++ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] ++ * from faulting later when the CPU does the same check. 
++ */ ++ if ((params.permissions & SGX_SECINFO_W) && ++ !(params.permissions & SGX_SECINFO_R)) ++ return -EINVAL; ++ ++ if (params.result || params.count) ++ return -EINVAL; ++ ++ ret = sgx_enclave_restrict_permissions(encl, ¶ms); ++ ++ if (copy_to_user(arg, ¶ms, sizeof(params))) ++ return -EFAULT; ++ ++ return ret; ++} ++ + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + { + struct sgx_encl *encl = filep->private_data; +@@ -681,6 +893,10 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; ++ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: ++ ret = sgx_ioc_enclave_restrict_permissions(encl, ++ (void __user *)arg); ++ break; + default: + ret = -ENOIOCTLCMD; + break; +-- +2.36.1 + + +From 43602c796c42d5b0891e509e2a959f0ce5b1d9f2 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:52 -0700 +Subject: [PATCH 73/90] x86/sgx: Support adding of pages to an initialized + enclave + +With SGX1 an enclave needs to be created with its maximum memory demands +allocated. Pages cannot be added to an enclave after it is initialized. +SGX2 introduces a new function, ENCLS[EAUG], that can be used to add +pages to an initialized enclave. With SGX2 the enclave still needs to +set aside address space for its maximum memory demands during enclave +creation, but all pages need not be added before enclave initialization. +Pages can be added during enclave runtime. + +Add support for dynamically adding pages to an initialized enclave, +architecturally limited to RW permission at creation but allowed to +obtain RWX permissions after trusted enclave runs EMODPE. Add pages +via the page fault handler at the time an enclave address without a +backing enclave page is accessed, potentially directly reclaiming +pages if no free pages are available. 
+ +The enclave is still required to run ENCLU[EACCEPT] on the page before +it can be used. A useful flow is for the enclave to run ENCLU[EACCEPT] +on an uninitialized address. This will trigger the page fault handler +that will add the enclave page and return execution to the enclave to +repeat the ENCLU[EACCEPT] instruction, this time successfully. + +If the enclave accesses an uninitialized address in another way, for +example by expanding the enclave stack to a page that has not yet been +added, then the page fault handler would add the page on the first +write, but upon returning to the enclave the instruction that triggered +the page fault would be repeated and, since ENCLU[EACCEPT] was not run +yet, it would trigger a second page fault, this time with the SGX flag +set in the page fault error code. This can only be recovered by entering +the enclave again and directly running the ENCLU[EACCEPT] instruction on +the now initialized address. + +Accessing an uninitialized address from outside the enclave also +triggers this flow, but the page will remain inaccessible (access will +result in #PF) until accepted from within the enclave via +ENCLU[EACCEPT]. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Haitao Huang <haitao.huang@intel.com> +Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 117 +++++++++++++++++++++++++++++++++ + 1 file changed, 117 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 92516aeca405..7f2b83df99e1 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -194,6 +194,112 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + return __sgx_encl_load_page(encl, entry); + } + ++/** ++ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave ++ * @vma: VMA obtained from fault info from where page is accessed ++ * @encl: enclave accessing the page ++ * @addr: address that triggered the page fault ++ * ++ * When an initialized enclave accesses a page with no backing EPC page ++ * on a SGX2 system then the EPC can be added dynamically via the SGX2 ++ * ENCLS[EAUG] instruction. ++ * ++ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed ++ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. ++ */ ++static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, ++ struct sgx_encl *encl, unsigned long addr) ++{ ++ vm_fault_t vmret = VM_FAULT_SIGBUS; ++ struct sgx_pageinfo pginfo = {0}; ++ struct sgx_encl_page *encl_page; ++ struct sgx_epc_page *epc_page; ++ struct sgx_va_page *va_page; ++ unsigned long phys_addr; ++ u64 secinfo_flags; ++ int ret; ++ ++ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) ++ return VM_FAULT_SIGBUS; ++ ++ /* ++ * Ignore internal permission checking for dynamically added pages. ++ * They matter only for data added during the pre-initialization ++ * phase. The enclave decides the permissions by the means of ++ * EACCEPT, EACCEPTCOPY and EMODPE. 
++ */ ++ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; ++ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); ++ if (IS_ERR(encl_page)) ++ return VM_FAULT_OOM; ++ ++ mutex_lock(&encl->lock); ++ ++ epc_page = sgx_alloc_epc_page(encl_page, false); ++ if (IS_ERR(epc_page)) { ++ if (PTR_ERR(epc_page) == -EBUSY) ++ vmret = VM_FAULT_NOPAGE; ++ goto err_out_unlock; ++ } ++ ++ va_page = sgx_encl_grow(encl, false); ++ if (IS_ERR(va_page)) ++ goto err_out_epc; ++ ++ if (va_page) ++ list_add(&va_page->list, &encl->va_pages); ++ ++ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), ++ encl_page, GFP_KERNEL); ++ /* ++ * If ret == -EBUSY then page was created in another flow while ++ * running without encl->lock ++ */ ++ if (ret) ++ goto err_out_shrink; ++ ++ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); ++ pginfo.addr = encl_page->desc & PAGE_MASK; ++ pginfo.metadata = 0; ++ ++ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); ++ if (ret) ++ goto err_out; ++ ++ encl_page->encl = encl; ++ encl_page->epc_page = epc_page; ++ encl_page->type = SGX_PAGE_TYPE_REG; ++ encl->secs_child_cnt++; ++ ++ sgx_mark_page_reclaimable(encl_page->epc_page); ++ ++ phys_addr = sgx_get_epc_phys_addr(epc_page); ++ /* ++ * Do not undo everything when creating PTE entry fails - next #PF ++ * would find page ready for a PTE. 
++ */ ++ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); ++ if (vmret != VM_FAULT_NOPAGE) { ++ mutex_unlock(&encl->lock); ++ return VM_FAULT_SIGBUS; ++ } ++ mutex_unlock(&encl->lock); ++ return VM_FAULT_NOPAGE; ++ ++err_out: ++ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); ++ ++err_out_shrink: ++ sgx_encl_shrink(encl, va_page); ++err_out_epc: ++ sgx_encl_free_epc_page(epc_page); ++err_out_unlock: ++ mutex_unlock(&encl->lock); ++ kfree(encl_page); ++ ++ return vmret; ++} ++ + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) + { + unsigned long addr = (unsigned long)vmf->address; +@@ -213,6 +319,17 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + ++ /* ++ * The page_array keeps track of all enclave pages, whether they ++ * are swapped out or not. If there is no entry for this page and ++ * the system supports SGX2 then it is possible to dynamically add ++ * a new enclave page. This is only possible for an initialized ++ * enclave that will be checked for right away. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_SGX2) && ++ (!xa_load(&encl->page_array, PFN_DOWN(addr)))) ++ return sgx_encl_eaug_page(vma, encl, addr); ++ + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); +-- +2.36.1 + + +From 546d1d85368fa8400e3348c4a1bf7e842004c545 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:53 -0700 +Subject: [PATCH 74/90] x86/sgx: Tighten accessible memory range after enclave + initialization + +Before an enclave is initialized the enclave's memory range is unknown. +The enclave's memory range is learned at the time it is created via the +SGX_IOC_ENCLAVE_CREATE ioctl() where the provided memory range is +obtained from an earlier mmap() of /dev/sgx_enclave. After an enclave +is initialized its memory can be mapped into user space (mmap()) from +where it can be entered at its defined entry points. 
+ +With the enclave's memory range known after it is initialized there is +no reason why it should be possible to map memory outside this range. + +Lock down access to the initialized enclave's memory range by denying +any attempt to map memory outside its memory range. + +Locking down the memory range also makes adding pages to an initialized +enclave more efficient. Pages are added to an initialized enclave by +accessing memory that belongs to the enclave's memory range but not yet +backed by an enclave page. If it is possible for user space to map +memory that does not form part of the enclave then an access to this +memory would eventually fail. Failures range from a prompt general +protection fault if the access was an ENCLU[EACCEPT] from within the +enclave, or a page fault via the vDSO if it was another access from +within the enclave, or a SIGBUS (also resulting from a page fault) if +the access was from outside the enclave. + +Disallowing invalid memory to be mapped in the first place avoids +preventable failures. + +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/encl.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c +index 7f2b83df99e1..1a2cbe44b8d9 100644 +--- a/arch/x86/kernel/cpu/sgx/encl.c ++++ b/arch/x86/kernel/cpu/sgx/encl.c +@@ -402,6 +402,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + ++ /* Disallow mapping outside enclave's address range. */ ++ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && ++ (start < encl->base || end > encl->base + encl->size)) ++ return -EACCES; ++ + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. 
+-- +2.36.1 + + +From fdd9787b3dc392abe5e030810cb115821a9cf134 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:54 -0700 +Subject: [PATCH 75/90] x86/sgx: Support modifying SGX page type + +Every enclave contains one or more Thread Control Structures (TCS). The +TCS contains meta-data used by the hardware to save and restore thread +specific information when entering/exiting the enclave. With SGX1 an +enclave needs to be created with enough TCSs to support the largest +number of threads expecting to use the enclave and enough enclave pages +to meet all its anticipated memory demands. In SGX1 all pages remain in +the enclave until the enclave is unloaded. + +SGX2 introduces a new function, ENCLS[EMODT], that is used to change +the type of an enclave page from a regular (SGX_PAGE_TYPE_REG) enclave +page to a TCS (SGX_PAGE_TYPE_TCS) page or change the type from a +regular (SGX_PAGE_TYPE_REG) or TCS (SGX_PAGE_TYPE_TCS) +page to a trimmed (SGX_PAGE_TYPE_TRIM) page (setting it up for later +removal). + +With the existing support of dynamically adding regular enclave pages +to an initialized enclave and changing the page type to TCS it is +possible to dynamically increase the number of threads supported by an +enclave. + +Changing the enclave page type to SGX_PAGE_TYPE_TRIM is the first step +of dynamically removing pages from an initialized enclave. The complete +page removal flow is: +1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM + using the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() introduced here. +2) Approve the page removal by running ENCLU[EACCEPT] from within + the enclave. +3) Initiate actual page removal using the ioctl() introduced in the + following patch. + +Add ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES to support changing SGX +enclave page types within an initialized enclave. 
With +SGX_IOC_ENCLAVE_MODIFY_TYPES the user specifies a page range and the +enclave page type to be applied to all pages in the provided range. +The ioctl() itself can return an error code based on failures +encountered by the kernel. It is also possible for SGX specific +failures to be encountered. Add a result output parameter to +communicate the SGX return code. It is possible for the enclave page +type change request to fail on any page within the provided range. +Support partial success by returning the number of pages that were +successfully changed. + +After the page type is changed the page continues to be accessible +from the kernel perspective with page table entries and internal +state. The page may be moved to swap. Any access until ENCLU[EACCEPT] +will encounter a page fault with SGX flag set in error code. + +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Haitao Huang <haitao.huang@intel.com> +Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/include/uapi/asm/sgx.h | 20 ++++ + arch/x86/kernel/cpu/sgx/ioctl.c | 202 ++++++++++++++++++++++++++++++++ + 2 files changed, 222 insertions(+) + +diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h +index 82648c006470..567f6166c24a 100644 +--- a/arch/x86/include/uapi/asm/sgx.h ++++ b/arch/x86/include/uapi/asm/sgx.h +@@ -31,6 +31,8 @@ enum sgx_page_flags { + _IO(SGX_MAGIC, 0x04) + #define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) ++#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ ++ _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) + + /** + * struct sgx_enclave_create - parameter structure for the +@@ -97,6 +99,24 @@ struct sgx_enclave_restrict_permissions { + __u64 count; + }; + ++/** ++ * struct sgx_enclave_modify_types - parameters for ioctl ++ * %SGX_IOC_ENCLAVE_MODIFY_TYPES ++ * @offset: 
starting page offset (page aligned relative to enclave base ++ * address defined in SECS) ++ * @length: length of memory (multiple of the page size) ++ * @page_type: new type for pages in range described by @offset and @length ++ * @result: (output) SGX result code of ENCLS[EMODT] function ++ * @count: (output) bytes successfully changed (multiple of page size) ++ */ ++struct sgx_enclave_modify_types { ++ __u64 offset; ++ __u64 length; ++ __u64 page_type; ++ __u64 result; ++ __u64 count; ++}; ++ + struct sgx_enclave_run; + + /** +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 720188d86ed4..9ccafbfc4811 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -872,6 +872,205 @@ static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + return ret; + } + ++/** ++ * sgx_enclave_modify_types() - Modify type of SGX enclave pages ++ * @encl: Enclave to which the pages belong. ++ * @modt: Checked parameters from user about which pages need modifying ++ * and their new page type. ++ * ++ * Return: ++ * - 0: Success ++ * - -errno: Otherwise ++ */ ++static long sgx_enclave_modify_types(struct sgx_encl *encl, ++ struct sgx_enclave_modify_types *modt) ++{ ++ unsigned long max_prot_restore; ++ enum sgx_page_type page_type; ++ struct sgx_encl_page *entry; ++ struct sgx_secinfo secinfo; ++ unsigned long prot; ++ unsigned long addr; ++ unsigned long c; ++ void *epc_virt; ++ int ret; ++ ++ page_type = modt->page_type & SGX_PAGE_TYPE_MASK; ++ ++ /* ++ * The only new page types allowed by hardware are PT_TCS and PT_TRIM. 
++ */ ++ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) ++ return -EINVAL; ++ ++ memset(&secinfo, 0, sizeof(secinfo)); ++ ++ secinfo.flags = page_type << 8; ++ ++ for (c = 0 ; c < modt->length; c += PAGE_SIZE) { ++ addr = encl->base + modt->offset + c; ++ ++ mutex_lock(&encl->lock); ++ ++ entry = sgx_encl_load_page(encl, addr); ++ if (IS_ERR(entry)) { ++ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; ++ goto out_unlock; ++ } ++ ++ /* ++ * Borrow the logic from the Intel SDM. Regular pages ++ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS ++ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. ++ * CET pages not supported yet. ++ */ ++ if (!(entry->type == SGX_PAGE_TYPE_REG || ++ (entry->type == SGX_PAGE_TYPE_TCS && ++ page_type == SGX_PAGE_TYPE_TRIM))) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ max_prot_restore = entry->vm_max_prot_bits; ++ ++ /* ++ * Once a regular page becomes a TCS page it cannot be ++ * changed back. So the maximum allowed protection reflects ++ * the TCS page that is always RW from kernel perspective but ++ * will be inaccessible from within enclave. Before doing ++ * so, do make sure that the new page type continues to ++ * respect the originally vetted page permissions. ++ */ ++ if (entry->type == SGX_PAGE_TYPE_REG && ++ page_type == SGX_PAGE_TYPE_TCS) { ++ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { ++ ret = -EPERM; ++ goto out_unlock; ++ } ++ prot = PROT_READ | PROT_WRITE; ++ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); ++ ++ /* ++ * Prevent page from being reclaimed while mutex ++ * is released. ++ */ ++ if (sgx_unmark_page_reclaimable(entry->epc_page)) { ++ ret = -EAGAIN; ++ goto out_entry_changed; ++ } ++ ++ /* ++ * Do not keep encl->lock because of dependency on ++ * mmap_lock acquired in sgx_zap_enclave_ptes(). 
++ */ ++ mutex_unlock(&encl->lock); ++ ++ sgx_zap_enclave_ptes(encl, addr); ++ ++ mutex_lock(&encl->lock); ++ ++ sgx_mark_page_reclaimable(entry->epc_page); ++ } ++ ++ /* Change EPC type */ ++ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); ++ ret = __emodt(&secinfo, epc_virt); ++ if (encls_faulted(ret)) { ++ /* ++ * All possible faults should be avoidable: ++ * parameters have been checked, will only change ++ * valid page types, and no concurrent ++ * SGX1/SGX2 ENCLS instructions since these are ++ * protected with mutex. ++ */ ++ pr_err_once("EMODT encountered exception %d\n", ++ ENCLS_TRAPNR(ret)); ++ ret = -EFAULT; ++ goto out_entry_changed; ++ } ++ if (encls_failed(ret)) { ++ modt->result = ret; ++ ret = -EFAULT; ++ goto out_entry_changed; ++ } ++ ++ ret = sgx_enclave_etrack(encl); ++ if (ret) { ++ ret = -EFAULT; ++ goto out_unlock; ++ } ++ ++ entry->type = page_type; ++ ++ mutex_unlock(&encl->lock); ++ } ++ ++ ret = 0; ++ goto out; ++ ++out_entry_changed: ++ entry->vm_max_prot_bits = max_prot_restore; ++out_unlock: ++ mutex_unlock(&encl->lock); ++out: ++ modt->count = c; ++ ++ return ret; ++} ++ ++/** ++ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES ++ * @encl: an enclave pointer ++ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance ++ * ++ * Ability to change the enclave page type supports the following use cases: ++ * ++ * * It is possible to add TCS pages to an enclave by changing the type of ++ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. ++ * With this support the number of threads supported by an initialized ++ * enclave can be increased dynamically. ++ * ++ * * Regular or TCS pages can dynamically be removed from an initialized ++ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. 
Changing the ++ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual ++ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called ++ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the ++ * enclave. ++ * ++ * Return: ++ * - 0: Success ++ * - -errno: Otherwise ++ */ ++static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, ++ void __user *arg) ++{ ++ struct sgx_enclave_modify_types params; ++ long ret; ++ ++ ret = sgx_ioc_sgx2_ready(encl); ++ if (ret) ++ return ret; ++ ++ if (copy_from_user(¶ms, arg, sizeof(params))) ++ return -EFAULT; ++ ++ if (sgx_validate_offset_length(encl, params.offset, params.length)) ++ return -EINVAL; ++ ++ if (params.page_type & ~SGX_PAGE_TYPE_MASK) ++ return -EINVAL; ++ ++ if (params.result || params.count) ++ return -EINVAL; ++ ++ ret = sgx_enclave_modify_types(encl, ¶ms); ++ ++ if (copy_to_user(arg, ¶ms, sizeof(params))) ++ return -EFAULT; ++ ++ return ret; ++} ++ + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + { + struct sgx_encl *encl = filep->private_data; +@@ -897,6 +1096,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; ++ case SGX_IOC_ENCLAVE_MODIFY_TYPES: ++ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); ++ break; + default: + ret = -ENOIOCTLCMD; + break; +-- +2.36.1 + + +From 8547db7c5848f96142d5e9c63081f8d4d69308cc Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:55 -0700 +Subject: [PATCH 76/90] x86/sgx: Support complete page removal + +The SGX2 page removal flow was introduced in previous patch and is +as follows: +1) Change the type of the pages to be removed to SGX_PAGE_TYPE_TRIM + using the ioctl() SGX_IOC_ENCLAVE_MODIFY_TYPES introduced in + previous patch. +2) Approve the page removal by running ENCLU[EACCEPT] from within + the enclave. 
+3) Initiate actual page removal using the ioctl() + SGX_IOC_ENCLAVE_REMOVE_PAGES introduced here. + +Support the final step of the SGX2 page removal flow with ioctl() +SGX_IOC_ENCLAVE_REMOVE_PAGES. With this ioctl() the user specifies +a page range that should be removed. All pages in the provided +range should have the SGX_PAGE_TYPE_TRIM page type and the request +will fail with EPERM (Operation not permitted) if a page that does +not have the correct type is encountered. Page removal can fail +on any page within the provided range. Support partial success by +returning the number of pages that were successfully removed. + +Since actual page removal will succeed even if ENCLU[EACCEPT] was not +run from within the enclave the ENCLS[EMODPR] instruction with RWX +permissions is used as a no-op mechanism to ensure ENCLU[EACCEPT] was +successfully run from within the enclave before the enclave page is +removed. + +If the user omits running SGX_IOC_ENCLAVE_REMOVE_PAGES the pages will +still be removed when the enclave is unloaded. 
+ +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Tested-by: Haitao Huang <haitao.huang@intel.com> +Tested-by: Vijay Dhanraj <vijay.dhanraj@intel.com> +Tested-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/include/uapi/asm/sgx.h | 21 +++++ + arch/x86/kernel/cpu/sgx/ioctl.c | 145 ++++++++++++++++++++++++++++++++ + 2 files changed, 166 insertions(+) + +diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h +index 567f6166c24a..2dd35bbdc822 100644 +--- a/arch/x86/include/uapi/asm/sgx.h ++++ b/arch/x86/include/uapi/asm/sgx.h +@@ -33,6 +33,8 @@ enum sgx_page_flags { + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) + #define SGX_IOC_ENCLAVE_MODIFY_TYPES \ + _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) ++#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ ++ _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) + + /** + * struct sgx_enclave_create - parameter structure for the +@@ -117,6 +119,25 @@ struct sgx_enclave_modify_types { + __u64 count; + }; + ++/** ++ * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters ++ * @offset: starting page offset (page aligned relative to enclave base ++ * address defined in SECS) ++ * @length: length of memory (multiple of the page size) ++ * @count: (output) bytes successfully changed (multiple of page size) ++ * ++ * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized ++ * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES ++ * ioctl() should be used to change the page type to PT_TRIM. After that ++ * succeeds ENCLU[EACCEPT] should be run from within the enclave and then ++ * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. 
++ */ ++struct sgx_enclave_remove_pages { ++ __u64 offset; ++ __u64 length; ++ __u64 count; ++}; ++ + struct sgx_enclave_run; + + /** +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 9ccafbfc4811..1a2595f261d3 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -1071,6 +1071,148 @@ static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + return ret; + } + ++/** ++ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave ++ * @encl: Enclave to which the pages belong ++ * @params: Checked parameters from user on which pages need to be removed ++ * ++ * Return: ++ * - 0: Success. ++ * - -errno: Otherwise. ++ */ ++static long sgx_encl_remove_pages(struct sgx_encl *encl, ++ struct sgx_enclave_remove_pages *params) ++{ ++ struct sgx_encl_page *entry; ++ struct sgx_secinfo secinfo; ++ unsigned long addr; ++ unsigned long c; ++ void *epc_virt; ++ int ret; ++ ++ memset(&secinfo, 0, sizeof(secinfo)); ++ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; ++ ++ for (c = 0 ; c < params->length; c += PAGE_SIZE) { ++ addr = encl->base + params->offset + c; ++ ++ mutex_lock(&encl->lock); ++ ++ entry = sgx_encl_load_page(encl, addr); ++ if (IS_ERR(entry)) { ++ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; ++ goto out_unlock; ++ } ++ ++ if (entry->type != SGX_PAGE_TYPE_TRIM) { ++ ret = -EPERM; ++ goto out_unlock; ++ } ++ ++ /* ++ * ENCLS[EMODPR] is a no-op instruction used to inform if ++ * ENCLU[EACCEPT] was run from within the enclave. If ++ * ENCLS[EMODPR] is run with RWX on a trimmed page that is ++ * not yet accepted then it will return ++ * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is ++ * accepted the instruction will encounter a page fault. 
++ */ ++ epc_virt = sgx_get_epc_virt_addr(entry->epc_page); ++ ret = __emodpr(&secinfo, epc_virt); ++ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { ++ ret = -EPERM; ++ goto out_unlock; ++ } ++ ++ if (sgx_unmark_page_reclaimable(entry->epc_page)) { ++ ret = -EBUSY; ++ goto out_unlock; ++ } ++ ++ /* ++ * Do not keep encl->lock because of dependency on ++ * mmap_lock acquired in sgx_zap_enclave_ptes(). ++ */ ++ mutex_unlock(&encl->lock); ++ ++ sgx_zap_enclave_ptes(encl, addr); ++ ++ mutex_lock(&encl->lock); ++ ++ sgx_encl_free_epc_page(entry->epc_page); ++ encl->secs_child_cnt--; ++ entry->epc_page = NULL; ++ xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); ++ sgx_encl_shrink(encl, NULL); ++ kfree(entry); ++ ++ mutex_unlock(&encl->lock); ++ } ++ ++ ret = 0; ++ goto out; ++ ++out_unlock: ++ mutex_unlock(&encl->lock); ++out: ++ params->count = c; ++ ++ return ret; ++} ++ ++/** ++ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES ++ * @encl: an enclave pointer ++ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance ++ * ++ * Final step of the flow removing pages from an initialized enclave. The ++ * complete flow is: ++ * ++ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM ++ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). ++ * 2) User approves the page removal by running ENCLU[EACCEPT] from within ++ * the enclave. ++ * 3) User initiates actual page removal using the ++ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. ++ * ++ * First remove any page table entries pointing to the page and then proceed ++ * with the actual removal of the enclave page and data in support of it. ++ * ++ * VA pages are not affected by this removal. It is thus possible that the ++ * enclave may end up with more VA pages than needed to support all its ++ * pages. 
++ * ++ * Return: ++ * - 0: Success ++ * - -errno: Otherwise ++ */ ++static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, ++ void __user *arg) ++{ ++ struct sgx_enclave_remove_pages params; ++ long ret; ++ ++ ret = sgx_ioc_sgx2_ready(encl); ++ if (ret) ++ return ret; ++ ++ if (copy_from_user(¶ms, arg, sizeof(params))) ++ return -EFAULT; ++ ++ if (sgx_validate_offset_length(encl, params.offset, params.length)) ++ return -EINVAL; ++ ++ if (params.count) ++ return -EINVAL; ++ ++ ret = sgx_encl_remove_pages(encl, ¶ms); ++ ++ if (copy_to_user(arg, ¶ms, sizeof(params))) ++ return -EFAULT; ++ ++ return ret; ++} ++ + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + { + struct sgx_encl *encl = filep->private_data; +@@ -1099,6 +1241,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; ++ case SGX_IOC_ENCLAVE_REMOVE_PAGES: ++ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); ++ break; + default: + ret = -ENOIOCTLCMD; + break; +-- +2.36.1 + + +From acf00160f02092862d70b5b7a0c996e456c93341 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:56 -0700 +Subject: [PATCH 77/90] x86/sgx: Free up EPC pages directly to support large + page ranges + +The page reclaimer ensures availability of EPC pages across all +enclaves. In support of this it runs independently from the +individual enclaves in order to take locks from the different +enclaves as it writes pages to swap. + +When needing to load a page from swap an EPC page needs to be +available for its contents to be loaded into. Loading an existing +enclave page from swap does not reclaim EPC pages directly if +none are available, instead the reclaimer is woken when the +available EPC pages are found to be below a watermark. 
+ +When iterating over a large number of pages in an oversubscribed +environment there is a race between the reclaimer woken up and +EPC pages reclaimed fast enough for the page operations to proceed. + +Ensure there are EPC pages available before attempting to load +a page that may potentially be pulled from swap into an available +EPC page. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + arch/x86/kernel/cpu/sgx/ioctl.c | 6 ++++++ + arch/x86/kernel/cpu/sgx/main.c | 11 +++++++++++ + arch/x86/kernel/cpu/sgx/sgx.h | 1 + + 3 files changed, 18 insertions(+) + +diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c +index 1a2595f261d3..ebe79d60619f 100644 +--- a/arch/x86/kernel/cpu/sgx/ioctl.c ++++ b/arch/x86/kernel/cpu/sgx/ioctl.c +@@ -745,6 +745,8 @@ sgx_enclave_restrict_permissions(struct sgx_encl *encl, + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + ++ sgx_reclaim_direct(); ++ + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); +@@ -910,6 +912,8 @@ static long sgx_enclave_modify_types(struct sgx_encl *encl, + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + ++ sgx_reclaim_direct(); ++ + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); +@@ -1096,6 +1100,8 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl, + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + ++ sgx_reclaim_direct(); ++ + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); +diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c +index 6e2cb7564080..0e8741a80cf3 100644 +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -370,6 +370,17 @@ static bool sgx_should_reclaim(unsigned long watermark) + !list_empty(&sgx_active_page_list); + } + ++/* ++ * sgx_reclaim_direct() 
should be called (without enclave's mutex held) ++ * in locations where SGX memory resources might be low and might be ++ * needed in order to make forward progress. ++ */ ++void sgx_reclaim_direct(void) ++{ ++ if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) ++ sgx_reclaim_pages(); ++} ++ + static int ksgxd(void *p) + { + set_freezable(); +diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h +index b30cee4de903..0f2020653fba 100644 +--- a/arch/x86/kernel/cpu/sgx/sgx.h ++++ b/arch/x86/kernel/cpu/sgx/sgx.h +@@ -86,6 +86,7 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) + struct sgx_epc_page *__sgx_alloc_epc_page(void); + void sgx_free_epc_page(struct sgx_epc_page *page); + ++void sgx_reclaim_direct(void); + void sgx_mark_page_reclaimable(struct sgx_epc_page *page); + int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); + struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +-- +2.36.1 + + +From 80f9ee71e24f681e3c7b7d30fc05c16ebb3b16f8 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:57 -0700 +Subject: [PATCH 78/90] Documentation/x86: Introduce enclave runtime management + section + +Enclave runtime management is introduced following the pattern +of the section describing enclave building. Provide a brief +summary of enclave runtime management, pointing to the functions +implementing the ioctl()s that will contain details within their +kernel-doc. + +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + Documentation/x86/sgx.rst | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst +index 265568a9292c..2bcbffacbed5 100644 +--- a/Documentation/x86/sgx.rst ++++ b/Documentation/x86/sgx.rst +@@ -100,6 +100,21 @@ pages and establish enclave page permissions. 
+ sgx_ioc_enclave_init + sgx_ioc_enclave_provision + ++Enclave runtime management ++-------------------------- ++ ++Systems supporting SGX2 additionally support changes to initialized ++enclaves: modifying enclave page permissions and type, and dynamically ++adding and removing of enclave pages. When an enclave accesses an address ++within its address range that does not have a backing page then a new ++regular page will be dynamically added to the enclave. The enclave is ++still required to run EACCEPT on the new page before it can be used. ++ ++.. kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c ++ :functions: sgx_ioc_enclave_restrict_permissions ++ sgx_ioc_enclave_modify_types ++ sgx_ioc_enclave_remove_pages ++ + Enclave vDSO + ------------ + +-- +2.36.1 + + +From 19fbd1d2facae80e246deb6f46c896fb222fb3c2 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:58 -0700 +Subject: [PATCH 79/90] selftests/sgx: Add test for EPCM permission changes + +EPCM permission changes could be made from within (to relax +permissions) or out (to restrict permissions) the enclave. Kernel +support is needed when permissions are restricted to be able to +call the privileged ENCLS[EMODPR] instruction. EPCM permissions +can be relaxed via ENCLU[EMODPE] from within the enclave but the +enclave still depends on the kernel to install PTEs with the needed +permissions. + +Add a test that exercises a few of the enclave page permission flows: +1) Test starts with a RW (from enclave and kernel perspective) + enclave page that is mapped via a RW VMA. +2) Use the SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl() to restrict + the enclave (EPCM) page permissions to read-only. +3) Run ENCLU[EACCEPT] from within the enclave to accept the new page + permissions. +4) Attempt to write to the enclave page from within the enclave - this + should fail with a page fault on the EPCM permissions since the page + table entry continues to allow RW access. 
+5) Restore EPCM permissions to RW by running ENCLU[EMODPE] from within + the enclave. +6) Attempt to write to the enclave page from within the enclave - this + should succeed since both EPCM and PTE permissions allow this access. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/defines.h | 15 ++ + tools/testing/selftests/sgx/main.c | 214 ++++++++++++++++++++++++ + tools/testing/selftests/sgx/test_encl.c | 38 +++++ + 3 files changed, 267 insertions(+) + +diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h +index 02d775789ea7..b638eb98c80c 100644 +--- a/tools/testing/selftests/sgx/defines.h ++++ b/tools/testing/selftests/sgx/defines.h +@@ -24,6 +24,8 @@ enum encl_op_type { + ENCL_OP_PUT_TO_ADDRESS, + ENCL_OP_GET_FROM_ADDRESS, + ENCL_OP_NOP, ++ ENCL_OP_EACCEPT, ++ ENCL_OP_EMODPE, + ENCL_OP_MAX, + }; + +@@ -53,4 +55,17 @@ struct encl_op_get_from_addr { + uint64_t addr; + }; + ++struct encl_op_eaccept { ++ struct encl_op_header header; ++ uint64_t epc_addr; ++ uint64_t flags; ++ uint64_t ret; ++}; ++ ++struct encl_op_emodpe { ++ struct encl_op_header header; ++ uint64_t epc_addr; ++ uint64_t flags; ++}; ++ + #endif /* DEFINES_H */ +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index dd74fa42302e..46eac09cd955 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -25,6 +25,18 @@ static const uint64_t MAGIC = 0x1122334455667788ULL; + static const uint64_t MAGIC2 = 0x8877665544332211ULL; + vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; + ++/* ++ * Security Information (SECINFO) data structure needed by a few SGX ++ * instructions (eg. ENCLU[EACCEPT] and ENCLU[EMODPE]) holds meta-data ++ * about an enclave page. &enum sgx_secinfo_page_state specifies the ++ * secinfo flags used for page state. 
++ */ ++enum sgx_secinfo_page_state { ++ SGX_SECINFO_PENDING = (1 << 3), ++ SGX_SECINFO_MODIFIED = (1 << 4), ++ SGX_SECINFO_PR = (1 << 5), ++}; ++ + struct vdso_symtab { + Elf64_Sym *elf_symtab; + const char *elf_symstrtab; +@@ -555,4 +567,206 @@ TEST_F(enclave, pte_permissions) + EXPECT_EQ(self->run.exception_addr, 0); + } + ++/* ++ * Enclave page permission test. ++ * ++ * Modify and restore enclave page's EPCM (enclave) permissions from ++ * outside enclave (ENCLS[EMODPR] via kernel) as well as from within ++ * enclave (via ENCLU[EMODPE]). Check for page fault if ++ * VMA allows access but EPCM permissions do not. ++ */ ++TEST_F(enclave, epcm_permissions) ++{ ++ struct sgx_enclave_restrict_permissions restrict_ioc; ++ struct encl_op_get_from_addr get_addr_op; ++ struct encl_op_put_to_addr put_addr_op; ++ struct encl_op_eaccept eaccept_op; ++ struct encl_op_emodpe emodpe_op; ++ unsigned long data_start; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Ensure kernel supports needed ioctl() and system supports needed ++ * commands. ++ */ ++ memset(&restrict_ioc, 0, sizeof(restrict_ioc)); ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, ++ &restrict_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ ASSERT_EQ(ret, -1); ++ ++ /* ret == -1 */ ++ if (errno_save == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); ++ else if (errno_save == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ ++ /* ++ * Page that will have its permissions changed is the second data ++ * page in the .data segment. This forms part of the local encl_buffer ++ * within the enclave. 
++ * ++ * At start of test @data_start should have EPCM as well as PTE and ++ * VMA permissions of RW. ++ */ ++ ++ data_start = self->encl.encl_base + ++ encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ++ /* ++ * Sanity check that page at @data_start is writable before making ++ * any changes to page permissions. ++ * ++ * Start by writing MAGIC to test page. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = data_start; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory that was just written to, confirming that ++ * page is writable. ++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = data_start; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Change EPCM permissions to read-only. Kernel still considers ++ * the page writable. ++ */ ++ memset(&restrict_ioc, 0, sizeof(restrict_ioc)); ++ ++ restrict_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ restrict_ioc.length = PAGE_SIZE; ++ restrict_ioc.permissions = SGX_SECINFO_R; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, ++ &restrict_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(restrict_ioc.result, 0); ++ EXPECT_EQ(restrict_ioc.count, 4096); ++ ++ /* ++ * EPCM permissions changed from kernel, need to EACCEPT from enclave. 
++ */ ++ eaccept_op.epc_addr = data_start; ++ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_REG | SGX_SECINFO_PR; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* ++ * EPCM permissions of page is now read-only, expect #PF ++ * on EPCM when attempting to write to page from within enclave. ++ */ ++ put_addr_op.value = MAGIC2; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(self->run.function, ERESUME); ++ EXPECT_EQ(self->run.exception_vector, 14); ++ EXPECT_EQ(self->run.exception_error_code, 0x8007); ++ EXPECT_EQ(self->run.exception_addr, data_start); ++ ++ self->run.exception_vector = 0; ++ self->run.exception_error_code = 0; ++ self->run.exception_addr = 0; ++ ++ /* ++ * Received AEX but cannot return to enclave at same entrypoint, ++ * need different TCS from where EPCM permission can be made writable ++ * again. ++ */ ++ self->run.tcs = self->encl.encl_base + PAGE_SIZE; ++ ++ /* ++ * Enter enclave at new TCS to change EPCM permissions to be ++ * writable again and thus fix the page fault that triggered the ++ * AEX. ++ */ ++ ++ emodpe_op.epc_addr = data_start; ++ emodpe_op.flags = SGX_SECINFO_R | SGX_SECINFO_W; ++ emodpe_op.header.type = ENCL_OP_EMODPE; ++ ++ EXPECT_EQ(ENCL_CALL(&emodpe_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Attempt to return to main TCS to resume execution at faulting ++ * instruction, PTE should continue to allow writing to the page. 
++ */ ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Wrong page permissions that caused original fault has ++ * now been fixed via EPCM permissions. ++ * Resume execution in main TCS to re-attempt the memory access. ++ */ ++ self->run.tcs = self->encl.encl_base; ++ ++ EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, ++ ERESUME, 0, 0, ++ &self->run), ++ 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ get_addr_op.value = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC2); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.user_data, 0); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++} ++ + TEST_HARNESS_MAIN +diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c +index 4fca01cfd898..5b6c65331527 100644 +--- a/tools/testing/selftests/sgx/test_encl.c ++++ b/tools/testing/selftests/sgx/test_encl.c +@@ -11,6 +11,42 @@ + */ + static uint8_t encl_buffer[8192] = { 1 }; + ++enum sgx_enclu_function { ++ EACCEPT = 0x5, ++ EMODPE = 0x6, ++}; ++ ++static void do_encl_emodpe(void *_op) ++{ ++ struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; ++ struct encl_op_emodpe *op = _op; ++ ++ secinfo.flags = op->flags; ++ ++ asm volatile(".byte 0x0f, 0x01, 0xd7" ++ : ++ : "a" (EMODPE), ++ "b" (&secinfo), ++ "c" (op->epc_addr)); ++} ++ ++static void do_encl_eaccept(void *_op) ++{ ++ struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; ++ struct encl_op_eaccept *op = _op; ++ int rax; ++ ++ secinfo.flags = op->flags; ++ ++ asm volatile(".byte 0x0f, 0x01, 0xd7" ++ : "=a" (rax) ++ : "a" (EACCEPT), ++ "b" (&secinfo), ++ "c" (op->epc_addr)); ++ ++ op->ret = rax; ++} ++ + static void *memcpy(void *dest, const void 
*src, size_t n) + { + size_t i; +@@ -62,6 +98,8 @@ void encl_body(void *rdi, void *rsi) + do_encl_op_put_to_addr, + do_encl_op_get_from_addr, + do_encl_op_nop, ++ do_encl_eaccept, ++ do_encl_emodpe, + }; + + struct encl_op_header *op = (struct encl_op_header *)rdi; +-- +2.36.1 + + +From 0284bd50e496e8b76ad485dbfd675769ef8a39d4 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:08:59 -0700 +Subject: [PATCH 80/90] selftests/sgx: Add test for TCS page permission changes + +Kernel should not allow permission changes on TCS pages. Add test to +confirm this behavior. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 71 ++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 46eac09cd955..016ae3e5f398 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -121,6 +121,24 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) + return NULL; + } + ++/* ++ * Return the offset in the enclave where the TCS segment can be found. ++ * The first RW segment loaded is the TCS. ++ */ ++static off_t encl_get_tcs_offset(struct encl *encl) ++{ ++ int i; ++ ++ for (i = 0; i < encl->nr_segments; i++) { ++ struct encl_segment *seg = &encl->segment_tbl[i]; ++ ++ if (i == 0 && seg->prot == (PROT_READ | PROT_WRITE)) ++ return seg->offset; ++ } ++ ++ return -1; ++} ++ + /* + * Return the offset in the enclave where the data segment can be found. + * The first RW segment loaded is the TCS, skip that to get info on the +@@ -567,6 +585,59 @@ TEST_F(enclave, pte_permissions) + EXPECT_EQ(self->run.exception_addr, 0); + } + ++/* ++ * Modifying permissions of TCS page should not be possible. 
++ */ ++TEST_F(enclave, tcs_permissions) ++{ ++ struct sgx_enclave_restrict_permissions ioc; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ memset(&ioc, 0, sizeof(ioc)); ++ ++ /* ++ * Ensure kernel supports needed ioctl() and system supports needed ++ * commands. ++ */ ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ ASSERT_EQ(ret, -1); ++ ++ /* ret == -1 */ ++ if (errno_save == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); ++ else if (errno_save == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ ++ /* ++ * Attempt to make TCS page read-only. This is not allowed and ++ * should be prevented by the kernel. ++ */ ++ ioc.offset = encl_get_tcs_offset(&self->encl); ++ ioc.length = PAGE_SIZE; ++ ioc.permissions = SGX_SECINFO_R; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, -1); ++ EXPECT_EQ(errno_save, EINVAL); ++ EXPECT_EQ(ioc.result, 0); ++ EXPECT_EQ(ioc.count, 0); ++} ++ + /* + * Enclave page permission test. + * +-- +2.36.1 + + +From 8a1c597bd4a809100be72883e16abf2e4820ecc7 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:00 -0700 +Subject: [PATCH 81/90] selftests/sgx: Test two different SGX2 EAUG flows + +Enclave pages can be added to an initialized enclave when an address +belonging to the enclave but without a backing page is accessed from +within the enclave. + +Accessing memory without a backing enclave page from within an enclave +can be in different ways: +1) Pre-emptively run ENCLU[EACCEPT]. 
Since the addition of a page + always needs to be accepted by the enclave via ENCLU[EACCEPT] this + flow is efficient since the first execution of ENCLU[EACCEPT] + triggers the addition of the page and when execution returns to the + same instruction the second execution would be successful as an + acceptance of the page. + +2) A direct read or write. The flow where a direct read or write + triggers the page addition execution cannot resume from the + instruction (read/write) that triggered the fault but instead + the enclave needs to be entered at a different entry point to + run needed ENCLU[EACCEPT] before execution can return to the + original entry point and the read/write instruction that faulted. + +Add tests for both flows. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 250 +++++++++++++++++++++++++++++ + 1 file changed, 250 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 016ae3e5f398..79c08e347112 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -86,6 +86,15 @@ static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) + return true; + } + ++static inline int sgx2_supported(void) ++{ ++ unsigned int eax, ebx, ecx, edx; ++ ++ __cpuid_count(SGX_CPUID, 0x0, eax, ebx, ecx, edx); ++ ++ return eax & 0x2; ++} ++ + static unsigned long elf_sym_hash(const char *name) + { + unsigned long h = 0, high; +@@ -840,4 +849,245 @@ TEST_F(enclave, epcm_permissions) + EXPECT_EQ(self->run.exception_addr, 0); + } + ++/* ++ * Test the addition of pages to an initialized enclave via writing to ++ * a page belonging to the enclave's address space but was not added ++ * during enclave creation. 
++ */ ++TEST_F(enclave, augment) ++{ ++ struct encl_op_get_from_addr get_addr_op; ++ struct encl_op_put_to_addr put_addr_op; ++ struct encl_op_eaccept eaccept_op; ++ size_t total_size = 0; ++ void *addr; ++ int i; ++ ++ if (!sgx2_supported()) ++ SKIP(return, "SGX2 not supported"); ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ for (i = 0; i < self->encl.nr_segments; i++) { ++ struct encl_segment *seg = &self->encl.segment_tbl[i]; ++ ++ total_size += seg->size; ++ } ++ ++ /* ++ * Actual enclave size is expected to be larger than the loaded ++ * test enclave since enclave size must be a power of 2 in bytes ++ * and test_encl does not consume it all. ++ */ ++ EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); ++ ++ /* ++ * Create memory mapping for the page that will be added. New ++ * memory mapping is for one page right after all existing ++ * mappings. ++ * Kernel will allow new mapping using any permissions if it ++ * falls into the enclave's address range but not backed ++ * by existing enclave pages. ++ */ ++ addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, ++ PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_SHARED | MAP_FIXED, self->encl.fd, 0); ++ EXPECT_NE(addr, MAP_FAILED); ++ ++ self->run.exception_vector = 0; ++ self->run.exception_error_code = 0; ++ self->run.exception_addr = 0; ++ ++ /* ++ * Attempt to write to the new page from within enclave. ++ * Expected to fail since page is not (yet) part of the enclave. ++ * The first #PF will trigger the addition of the page to the ++ * enclave, but since the new page needs an EACCEPT from within the ++ * enclave before it can be used it would not be possible ++ * to successfully return to the failing instruction. This is the ++ * cause of the second #PF captured here having the SGX bit set, ++ * it is from hardware preventing the page from being used. 
++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = (unsigned long)addr; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(self->run.function, ERESUME); ++ EXPECT_EQ(self->run.exception_vector, 14); ++ EXPECT_EQ(self->run.exception_addr, (unsigned long)addr); ++ ++ if (self->run.exception_error_code == 0x6) { ++ munmap(addr, PAGE_SIZE); ++ SKIP(return, "Kernel does not support adding pages to initialized enclave"); ++ } ++ ++ EXPECT_EQ(self->run.exception_error_code, 0x8007); ++ ++ self->run.exception_vector = 0; ++ self->run.exception_error_code = 0; ++ self->run.exception_addr = 0; ++ ++ /* Handle AEX by running EACCEPT from new entry point. */ ++ self->run.tcs = self->encl.encl_base + PAGE_SIZE; ++ ++ eaccept_op.epc_addr = self->encl.encl_base + total_size; ++ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* Can now return to main TCS to resume execution. */ ++ self->run.tcs = self->encl.encl_base; ++ ++ EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, ++ ERESUME, 0, 0, ++ &self->run), ++ 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory from newly added page that was just written to, ++ * confirming that data previously written (MAGIC) is present. 
++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = (unsigned long)addr; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ munmap(addr, PAGE_SIZE); ++} ++ ++/* ++ * Test for the addition of pages to an initialized enclave via a ++ * pre-emptive run of EACCEPT on page to be added. ++ */ ++TEST_F(enclave, augment_via_eaccept) ++{ ++ struct encl_op_get_from_addr get_addr_op; ++ struct encl_op_put_to_addr put_addr_op; ++ struct encl_op_eaccept eaccept_op; ++ size_t total_size = 0; ++ void *addr; ++ int i; ++ ++ if (!sgx2_supported()) ++ SKIP(return, "SGX2 not supported"); ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ for (i = 0; i < self->encl.nr_segments; i++) { ++ struct encl_segment *seg = &self->encl.segment_tbl[i]; ++ ++ total_size += seg->size; ++ } ++ ++ /* ++ * Actual enclave size is expected to be larger than the loaded ++ * test enclave since enclave size must be a power of 2 in bytes while ++ * test_encl does not consume it all. ++ */ ++ EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); ++ ++ /* ++ * mmap() a page at end of existing enclave to be used for dynamic ++ * EPC page. ++ * ++ * Kernel will allow new mapping using any permissions if it ++ * falls into the enclave's address range but not backed ++ * by existing enclave pages. 
++ */ ++ ++ addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, ++ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_FIXED, ++ self->encl.fd, 0); ++ EXPECT_NE(addr, MAP_FAILED); ++ ++ self->run.exception_vector = 0; ++ self->run.exception_error_code = 0; ++ self->run.exception_addr = 0; ++ ++ /* ++ * Run EACCEPT on new page to trigger the #PF->EAUG->EACCEPT(again ++ * without a #PF). All should be transparent to userspace. ++ */ ++ eaccept_op.epc_addr = self->encl.encl_base + total_size; ++ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ if (self->run.exception_vector == 14 && ++ self->run.exception_error_code == 4 && ++ self->run.exception_addr == self->encl.encl_base + total_size) { ++ munmap(addr, PAGE_SIZE); ++ SKIP(return, "Kernel does not support adding pages to initialized enclave"); ++ } ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* ++ * New page should be accessible from within enclave - attempt to ++ * write to it. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = (unsigned long)addr; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory from newly added page that was just written to, ++ * confirming that data previously written (MAGIC) is present. 
++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = (unsigned long)addr; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ munmap(addr, PAGE_SIZE); ++} ++ + TEST_HARNESS_MAIN +-- +2.36.1 + + +From 1cf6f52eedd1fabdf05e45228b0d3e79e0159060 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:01 -0700 +Subject: [PATCH 82/90] selftests/sgx: Introduce dynamic entry point + +The test enclave (test_encl.elf) is built with two initialized +Thread Control Structures (TCS) included in the binary. Both TCS are +initialized with the same entry point, encl_entry, that correctly +computes the absolute address of the stack based on the stack of each +TCS that is also built into the binary. + +A new TCS can be added dynamically to the enclave and requires to be +initialized with an entry point used to enter the enclave. Since the +existing entry point, encl_entry, assumes that the TCS and its stack +exists at particular offsets within the binary it is not able to handle +a dynamically added TCS and its stack. + +Introduce a new entry point, encl_dyn_entry, that initializes the +absolute address of that thread's stack to the address immediately +preceding the TCS itself. It is now possible to dynamically add a +contiguous memory region to the enclave with the new stack preceding +the new TCS. With the new TCS initialized with encl_dyn_entry as entry +point the absolute address of the stack is computed correctly on entry. 
+ +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/test_encl_bootstrap.S | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S +index 82fb0dfcbd23..03ae0f57e29d 100644 +--- a/tools/testing/selftests/sgx/test_encl_bootstrap.S ++++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S +@@ -45,6 +45,12 @@ encl_entry: + # TCS #2. By adding the value of encl_stack to it, we get + # the absolute address for the stack. + lea (encl_stack)(%rbx), %rax ++ jmp encl_entry_core ++encl_dyn_entry: ++ # Entry point for dynamically created TCS page expected to follow ++ # its stack directly. ++ lea -1(%rbx), %rax ++encl_entry_core: + xchg %rsp, %rax + push %rax + +-- +2.36.1 + + +From 93165003e27a941b08ae68e257478f499fd2794b Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:02 -0700 +Subject: [PATCH 83/90] selftests/sgx: Introduce TCS initialization enclave + operation + +The Thread Control Structure (TCS) contains meta-data used by the +hardware to save and restore thread specific information when +entering/exiting the enclave. A TCS can be added to an initialized +enclave by first adding a new regular enclave page, initializing the +content of the new page from within the enclave, and then changing that +page's type to a TCS. + +Support the initialization of a TCS from within the enclave. +The variable information needed that should be provided from outside +the enclave is the address of the TCS, address of the State Save Area +(SSA), and the entry point that the thread should use to enter the +enclave. With this information provided all needed fields of a TCS +can be initialized. 
+ +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/defines.h | 8 +++++++ + tools/testing/selftests/sgx/test_encl.c | 30 +++++++++++++++++++++++++ + 2 files changed, 38 insertions(+) + +diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h +index b638eb98c80c..d8587c971941 100644 +--- a/tools/testing/selftests/sgx/defines.h ++++ b/tools/testing/selftests/sgx/defines.h +@@ -26,6 +26,7 @@ enum encl_op_type { + ENCL_OP_NOP, + ENCL_OP_EACCEPT, + ENCL_OP_EMODPE, ++ ENCL_OP_INIT_TCS_PAGE, + ENCL_OP_MAX, + }; + +@@ -68,4 +69,11 @@ struct encl_op_emodpe { + uint64_t flags; + }; + ++struct encl_op_init_tcs_page { ++ struct encl_op_header header; ++ uint64_t tcs_page; ++ uint64_t ssa; ++ uint64_t entry; ++}; ++ + #endif /* DEFINES_H */ +diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c +index 5b6c65331527..c0d6397295e3 100644 +--- a/tools/testing/selftests/sgx/test_encl.c ++++ b/tools/testing/selftests/sgx/test_encl.c +@@ -57,6 +57,35 @@ static void *memcpy(void *dest, const void *src, size_t n) + return dest; + } + ++static void *memset(void *dest, int c, size_t n) ++{ ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ ((char *)dest)[i] = c; ++ ++ return dest; ++} ++ ++static void do_encl_init_tcs_page(void *_op) ++{ ++ struct encl_op_init_tcs_page *op = _op; ++ void *tcs = (void *)op->tcs_page; ++ uint32_t val_32; ++ ++ memset(tcs, 0, 16); /* STATE and FLAGS */ ++ memcpy(tcs + 16, &op->ssa, 8); /* OSSA */ ++ memset(tcs + 24, 0, 4); /* CSSA */ ++ val_32 = 1; ++ memcpy(tcs + 28, &val_32, 4); /* NSSA */ ++ memcpy(tcs + 32, &op->entry, 8); /* OENTRY */ ++ memset(tcs + 40, 0, 24); /* AEP, OFSBASE, OGSBASE */ ++ val_32 = 0xFFFFFFFF; ++ memcpy(tcs + 64, &val_32, 4); /* FSLIMIT */ ++ memcpy(tcs + 68, &val_32, 4); /* GSLIMIT */ ++ memset(tcs + 72, 0, 4024); /* Reserved */ ++} ++ + static void 
do_encl_op_put_to_buf(void *op) + { + struct encl_op_put_to_buf *op2 = op; +@@ -100,6 +129,7 @@ void encl_body(void *rdi, void *rsi) + do_encl_op_nop, + do_encl_eaccept, + do_encl_emodpe, ++ do_encl_init_tcs_page, + }; + + struct encl_op_header *op = (struct encl_op_header *)rdi; +-- +2.36.1 + + +From b707c425ee3b8ededc9aaf8be0343c4387eacafc Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:03 -0700 +Subject: [PATCH 84/90] selftests/sgx: Test complete changing of page type flow + +Support for changing an enclave page's type enables an initialized +enclave to be expanded with support for more threads by changing the +type of a regular enclave page to that of a Thread Control Structure +(TCS). Additionally, changing a TCS or regular enclave +page's type to trimmed (SGX_PAGE_TYPE_TRIM) initiates the removal +of the page from the enclave. + +Test changing page type to TCS as well as page removal flows +in two phases: In the first phase support for a new thread is +dynamically added to an initialized enclave and in the second phase +the pages associated with the new thread are removed from the enclave. +As an additional sanity check after the second phase the page used as +a TCS page during the first phase is added back as a regular page and +verified to be writable (which would not be possible if it were a +TCS page). 
+ +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/load.c | 41 ++++ + tools/testing/selftests/sgx/main.c | 343 +++++++++++++++++++++++++++++ + tools/testing/selftests/sgx/main.h | 1 + + 3 files changed, 385 insertions(+) + +diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c +index 006b464c8fc9..94bdeac1cf04 100644 +--- a/tools/testing/selftests/sgx/load.c ++++ b/tools/testing/selftests/sgx/load.c +@@ -130,6 +130,47 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) + return true; + } + ++/* ++ * Parse the enclave code's symbol table to locate and return address of ++ * the provided symbol ++ */ ++uint64_t encl_get_entry(struct encl *encl, const char *symbol) ++{ ++ Elf64_Shdr *sections; ++ Elf64_Sym *symtab; ++ Elf64_Ehdr *ehdr; ++ char *sym_names; ++ int num_sym; ++ int i; ++ ++ ehdr = encl->bin; ++ sections = encl->bin + ehdr->e_shoff; ++ ++ for (i = 0; i < ehdr->e_shnum; i++) { ++ if (sections[i].sh_type == SHT_SYMTAB) { ++ symtab = (Elf64_Sym *)((char *)encl->bin + sections[i].sh_offset); ++ num_sym = sections[i].sh_size / sections[i].sh_entsize; ++ break; ++ } ++ } ++ ++ for (i = 0; i < ehdr->e_shnum; i++) { ++ if (sections[i].sh_type == SHT_STRTAB) { ++ sym_names = (char *)encl->bin + sections[i].sh_offset; ++ break; ++ } ++ } ++ ++ for (i = 0; i < num_sym; i++) { ++ Elf64_Sym *sym = &symtab[i]; ++ ++ if (!strcmp(symbol, sym_names + sym->st_name)) ++ return (uint64_t)sym->st_value; ++ } ++ ++ return 0; ++} ++ + bool encl_load(const char *path, struct encl *encl, unsigned long heap_size) + { + const char device_path[] = "/dev/sgx_enclave"; +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 79c08e347112..8bf43646e0bb 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -1090,4 +1090,347 @@ TEST_F(enclave, augment_via_eaccept) 
+ munmap(addr, PAGE_SIZE); + } + ++/* ++ * SGX2 page type modification test in two phases: ++ * Phase 1: ++ * Create a new TCS, consisting out of three new pages (stack page with regular ++ * page type, SSA page with regular page type, and TCS page with TCS page ++ * type) in an initialized enclave and run a simple workload within it. ++ * Phase 2: ++ * Remove the three pages added in phase 1, add a new regular page at the ++ * same address that previously hosted the TCS page and verify that it can ++ * be modified. ++ */ ++TEST_F(enclave, tcs_create) ++{ ++ struct encl_op_init_tcs_page init_tcs_page_op; ++ struct sgx_enclave_remove_pages remove_ioc; ++ struct encl_op_get_from_addr get_addr_op; ++ struct sgx_enclave_modify_types modt_ioc; ++ struct encl_op_put_to_addr put_addr_op; ++ struct encl_op_get_from_buf get_buf_op; ++ struct encl_op_put_to_buf put_buf_op; ++ void *addr, *tcs, *stack_end, *ssa; ++ struct encl_op_eaccept eaccept_op; ++ size_t total_size = 0; ++ uint64_t val_64; ++ int errno_save; ++ int ret, i; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, ++ _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. ++ */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* ++ * Add three regular pages via EAUG: one will be the TCS stack, one ++ * will be the TCS SSA, and one will be the new TCS. 
The stack and ++ * SSA will remain as regular pages, the TCS page will need its ++ * type changed after populated with needed data. ++ */ ++ for (i = 0; i < self->encl.nr_segments; i++) { ++ struct encl_segment *seg = &self->encl.segment_tbl[i]; ++ ++ total_size += seg->size; ++ } ++ ++ /* ++ * Actual enclave size is expected to be larger than the loaded ++ * test enclave since enclave size must be a power of 2 in bytes while ++ * test_encl does not consume it all. ++ */ ++ EXPECT_LT(total_size + 3 * PAGE_SIZE, self->encl.encl_size); ++ ++ /* ++ * mmap() three pages at end of existing enclave to be used for the ++ * three new pages. ++ */ ++ addr = mmap((void *)self->encl.encl_base + total_size, 3 * PAGE_SIZE, ++ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, ++ self->encl.fd, 0); ++ EXPECT_NE(addr, MAP_FAILED); ++ ++ self->run.exception_vector = 0; ++ self->run.exception_error_code = 0; ++ self->run.exception_addr = 0; ++ ++ stack_end = (void *)self->encl.encl_base + total_size; ++ tcs = (void *)self->encl.encl_base + total_size + PAGE_SIZE; ++ ssa = (void *)self->encl.encl_base + total_size + 2 * PAGE_SIZE; ++ ++ /* ++ * Run EACCEPT on each new page to trigger the ++ * EACCEPT->(#PF)->EAUG->EACCEPT(again without a #PF) flow. 
++ */ ++ ++ eaccept_op.epc_addr = (unsigned long)stack_end; ++ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ if (self->run.exception_vector == 14 && ++ self->run.exception_error_code == 4 && ++ self->run.exception_addr == (unsigned long)stack_end) { ++ munmap(addr, 3 * PAGE_SIZE); ++ SKIP(return, "Kernel does not support adding pages to initialized enclave"); ++ } ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ eaccept_op.epc_addr = (unsigned long)ssa; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ eaccept_op.epc_addr = (unsigned long)tcs; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* ++ * Three new pages added to enclave. Now populate the TCS page with ++ * needed data. This should be done from within enclave. Provide ++ * the function that will do the actual data population with needed ++ * data. ++ */ ++ ++ /* ++ * New TCS will use the "encl_dyn_entry" entrypoint that expects ++ * stack to begin in page before TCS page. 
++ */ ++ val_64 = encl_get_entry(&self->encl, "encl_dyn_entry"); ++ EXPECT_NE(val_64, 0); ++ ++ init_tcs_page_op.tcs_page = (unsigned long)tcs; ++ init_tcs_page_op.ssa = (unsigned long)total_size + 2 * PAGE_SIZE; ++ init_tcs_page_op.entry = val_64; ++ init_tcs_page_op.header.type = ENCL_OP_INIT_TCS_PAGE; ++ ++ EXPECT_EQ(ENCL_CALL(&init_tcs_page_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* Change TCS page type to TCS. */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ++ modt_ioc.offset = total_size + PAGE_SIZE; ++ modt_ioc.length = PAGE_SIZE; ++ modt_ioc.page_type = SGX_PAGE_TYPE_TCS; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(modt_ioc.result, 0); ++ EXPECT_EQ(modt_ioc.count, 4096); ++ ++ /* EACCEPT new TCS page from enclave. */ ++ eaccept_op.epc_addr = (unsigned long)tcs; ++ eaccept_op.flags = SGX_SECINFO_TCS | SGX_SECINFO_MODIFIED; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* Run workload from new TCS. */ ++ self->run.tcs = (unsigned long)tcs; ++ ++ /* ++ * Simple workload to write to data buffer and read value back. 
++ */ ++ put_buf_op.header.type = ENCL_OP_PUT_TO_BUFFER; ++ put_buf_op.value = MAGIC; ++ ++ EXPECT_EQ(ENCL_CALL(&put_buf_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ get_buf_op.header.type = ENCL_OP_GET_FROM_BUFFER; ++ get_buf_op.value = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&get_buf_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_buf_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Phase 2 of test: ++ * Remove pages associated with new TCS, create a regular page ++ * where TCS page used to be and verify it can be used as a regular ++ * page. ++ */ ++ ++ /* Start page removal by requesting change of page type to PT_TRIM. */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ++ modt_ioc.offset = total_size; ++ modt_ioc.length = 3 * PAGE_SIZE; ++ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(modt_ioc.result, 0); ++ EXPECT_EQ(modt_ioc.count, 3 * PAGE_SIZE); ++ ++ /* ++ * Enter enclave via TCS #1 and approve page removal by sending ++ * EACCEPT for each of three removed pages. 
++ */ ++ self->run.tcs = self->encl.encl_base; ++ ++ eaccept_op.epc_addr = (unsigned long)stack_end; ++ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ eaccept_op.epc_addr = (unsigned long)tcs; ++ eaccept_op.ret = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ eaccept_op.epc_addr = (unsigned long)ssa; ++ eaccept_op.ret = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* Send final ioctl() to complete page removal. */ ++ memset(&remove_ioc, 0, sizeof(remove_ioc)); ++ ++ remove_ioc.offset = total_size; ++ remove_ioc.length = 3 * PAGE_SIZE; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(remove_ioc.count, 3 * PAGE_SIZE); ++ ++ /* ++ * Enter enclave via TCS #1 and access location where TCS #3 was to ++ * trigger dynamic add of regular page at that location. 
++ */ ++ eaccept_op.epc_addr = (unsigned long)tcs; ++ eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* ++ * New page should be accessible from within enclave - write to it. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = (unsigned long)tcs; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory from newly added page that was just written to, ++ * confirming that data previously written (MAGIC) is present. 
++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = (unsigned long)tcs; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ munmap(addr, 3 * PAGE_SIZE); ++} ++ + TEST_HARNESS_MAIN +diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h +index b45c52ec7ab3..fc585be97e2f 100644 +--- a/tools/testing/selftests/sgx/main.h ++++ b/tools/testing/selftests/sgx/main.h +@@ -38,6 +38,7 @@ void encl_delete(struct encl *ctx); + bool encl_load(const char *path, struct encl *encl, unsigned long heap_size); + bool encl_measure(struct encl *encl); + bool encl_build(struct encl *encl); ++uint64_t encl_get_entry(struct encl *encl, const char *symbol); + + int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, + struct sgx_enclave_run *run); +-- +2.36.1 + + +From 4fb1714c57b4d6c02cbd9e444b2edc156565c705 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:04 -0700 +Subject: [PATCH 85/90] selftests/sgx: Test faulty enclave behavior + +Removing a page from an initialized enclave involves three steps: +first the user requests changing the page type to SGX_PAGE_TYPE_TRIM +via an ioctl(), on success the ENCLU[EACCEPT] instruction needs to be +run from within the enclave to accept the page removal, finally the +user requests page removal to be completed via an ioctl(). Only after +acceptance (ENCLU[EACCEPT]) from within the enclave can the kernel +remove the page from a running enclave. + +Test the behavior when the user's request to change the page type +succeeds, but the ENCLU[EACCEPT] instruction is not run before the +ioctl() requesting page removal is run. This should not be permitted. 
+ +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 114 +++++++++++++++++++++++++++++ + 1 file changed, 114 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 8bf43646e0bb..3a82bae915d1 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -1433,4 +1433,118 @@ TEST_F(enclave, tcs_create) + munmap(addr, 3 * PAGE_SIZE); + } + ++/* ++ * Ensure sane behavior if user requests page removal, does not run ++ * EACCEPT from within enclave but still attempts to finalize page removal ++ * with the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). The latter should fail ++ * because the removal was not EACCEPTed from within the enclave. ++ */ ++TEST_F(enclave, remove_added_page_no_eaccept) ++{ ++ struct sgx_enclave_remove_pages remove_ioc; ++ struct encl_op_get_from_addr get_addr_op; ++ struct sgx_enclave_modify_types modt_ioc; ++ struct encl_op_put_to_addr put_addr_op; ++ unsigned long data_start; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. ++ */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* ++ * Page that will be removed is the second data page in the .data ++ * segment. 
This forms part of the local encl_buffer within the ++ * enclave. ++ */ ++ data_start = self->encl.encl_base + ++ encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ++ /* ++ * Sanity check that page at @data_start is writable before ++ * removing it. ++ * ++ * Start by writing MAGIC to test page. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = data_start; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory that was just written to, confirming that data ++ * previously written (MAGIC) is present. ++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = data_start; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* Start page removal by requesting change of page type to PT_TRIM */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ++ modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ modt_ioc.length = PAGE_SIZE; ++ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ errno_save = ret == -1 ? 
errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(modt_ioc.result, 0); ++ EXPECT_EQ(modt_ioc.count, 4096); ++ ++ /* Skip EACCEPT */ ++ ++ /* Send final ioctl() to complete page removal */ ++ memset(&remove_ioc, 0, sizeof(remove_ioc)); ++ ++ remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ remove_ioc.length = PAGE_SIZE; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ /* Operation not permitted since EACCEPT was omitted. */ ++ EXPECT_EQ(ret, -1); ++ EXPECT_EQ(errno_save, EPERM); ++ EXPECT_EQ(remove_ioc.count, 0); ++} ++ + TEST_HARNESS_MAIN +-- +2.36.1 + + +From da7c21307303bb250954b52fe84b4fd6a00c7cea Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:05 -0700 +Subject: [PATCH 86/90] selftests/sgx: Test invalid access to removed enclave + page + +Removing a page from an initialized enclave involves three steps: +(1) the user requests changing the page type to SGX_PAGE_TYPE_TRIM +via the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(), (2) on success the +ENCLU[EACCEPT] instruction is run from within the enclave to accept +the page removal, (3) the user initiates the actual removal of the +page via the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). + +Test two possible invalid accesses during the page removal flow: +* Test the behavior when a request to remove the page by changing its + type to SGX_PAGE_TYPE_TRIM completes successfully but instead of + executing ENCLU[EACCEPT] from within the enclave the enclave attempts + to read from the page. Even though the page is accessible from the + page table entries its type is SGX_PAGE_TYPE_TRIM and thus not + accessible according to SGX. The expected behavior is a page fault + with the SGX flag set in the error code. +* Test the behavior when the page type is changed successfully and + ENCLU[EACCEPT] was run from within the enclave. 
The final ioctl(), + SGX_IOC_ENCLAVE_REMOVE_PAGES, is omitted and replaced with an + attempt to access the page. Even though the page is accessible + from the page table entries its type is SGX_PAGE_TYPE_TRIM and + thus not accessible according to SGX. The expected behavior is + a page fault with the SGX flag set in the error code. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 243 +++++++++++++++++++++++++++++ + 1 file changed, 243 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 3a82bae915d1..2c69045253b2 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -1547,4 +1547,247 @@ TEST_F(enclave, remove_added_page_no_eaccept) + EXPECT_EQ(remove_ioc.count, 0); + } + ++/* ++ * Request enclave page removal but instead of correctly following with ++ * EACCEPT a read attempt to page is made from within the enclave. ++ */ ++TEST_F(enclave, remove_added_page_invalid_access) ++{ ++ struct encl_op_get_from_addr get_addr_op; ++ struct encl_op_put_to_addr put_addr_op; ++ struct sgx_enclave_modify_types ioc; ++ unsigned long data_start; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. ++ */ ++ memset(&ioc, 0, sizeof(ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. 
++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* ++ * Page that will be removed is the second data page in the .data ++ * segment. This forms part of the local encl_buffer within the ++ * enclave. ++ */ ++ data_start = self->encl.encl_base + ++ encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ++ /* ++ * Sanity check that page at @data_start is writable before ++ * removing it. ++ * ++ * Start by writing MAGIC to test page. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = data_start; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory that was just written to, confirming that data ++ * previously written (MAGIC) is present. ++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = data_start; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* Start page removal by requesting change of page type to PT_TRIM. */ ++ memset(&ioc, 0, sizeof(ioc)); ++ ++ ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ioc.length = PAGE_SIZE; ++ ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(ioc.result, 0); ++ EXPECT_EQ(ioc.count, 4096); ++ ++ /* ++ * Read from page that was just removed. 
++ */ ++ get_addr_op.value = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ /* ++ * From kernel perspective the page is present but according to SGX the ++ * page should not be accessible so a #PF with SGX bit set is ++ * expected. ++ */ ++ ++ EXPECT_EQ(self->run.function, ERESUME); ++ EXPECT_EQ(self->run.exception_vector, 14); ++ EXPECT_EQ(self->run.exception_error_code, 0x8005); ++ EXPECT_EQ(self->run.exception_addr, data_start); ++} ++ ++/* ++ * Request enclave page removal and correctly follow with ++ * EACCEPT but do not follow with removal ioctl() but instead a read attempt ++ * to removed page is made from within the enclave. ++ */ ++TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) ++{ ++ struct encl_op_get_from_addr get_addr_op; ++ struct encl_op_put_to_addr put_addr_op; ++ struct sgx_enclave_modify_types ioc; ++ struct encl_op_eaccept eaccept_op; ++ unsigned long data_start; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. ++ */ ++ memset(&ioc, 0, sizeof(ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* ++ * Page that will be removed is the second data page in the .data ++ * segment. This forms part of the local encl_buffer within the ++ * enclave. 
++ */ ++ data_start = self->encl.encl_base + ++ encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ++ /* ++ * Sanity check that page at @data_start is writable before ++ * removing it. ++ * ++ * Start by writing MAGIC to test page. ++ */ ++ put_addr_op.value = MAGIC; ++ put_addr_op.addr = data_start; ++ put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* ++ * Read memory that was just written to, confirming that data ++ * previously written (MAGIC) is present. ++ */ ++ get_addr_op.value = 0; ++ get_addr_op.addr = data_start; ++ get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ EXPECT_EQ(get_addr_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ++ /* Start page removal by requesting change of page type to PT_TRIM. */ ++ memset(&ioc, 0, sizeof(ioc)); ++ ++ ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ioc.length = PAGE_SIZE; ++ ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc); ++ errno_save = ret == -1 ? 
errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(ioc.result, 0); ++ EXPECT_EQ(ioc.count, 4096); ++ ++ eaccept_op.epc_addr = (unsigned long)data_start; ++ eaccept_op.ret = 0; ++ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ /* Skip ioctl() to remove page. */ ++ ++ /* ++ * Read from page that was just removed. ++ */ ++ get_addr_op.value = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); ++ ++ /* ++ * From kernel perspective the page is present but according to SGX the ++ * page should not be accessible so a #PF with SGX bit set is ++ * expected. ++ */ ++ ++ EXPECT_EQ(self->run.function, ERESUME); ++ EXPECT_EQ(self->run.exception_vector, 14); ++ EXPECT_EQ(self->run.exception_error_code, 0x8005); ++ EXPECT_EQ(self->run.exception_addr, data_start); ++} ++ + TEST_HARNESS_MAIN +-- +2.36.1 + + +From ff50849ecb369de6c13ab2c8d26e23f34e961aab Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:06 -0700 +Subject: [PATCH 87/90] selftests/sgx: Test reclaiming of untouched page + +Removing a page from an initialized enclave involves three steps: +(1) the user requests changing the page type to PT_TRIM via the + SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl() +(2) on success the ENCLU[EACCEPT] instruction is run from within + the enclave to accept the page removal +(3) the user initiates the actual removal of the page via the + SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). + +Remove a page that has never been accessed. 
This means that when the +first ioctl() requesting page removal arrives, there will be no page +table entry, yet a valid page table entry needs to exist for the +ENCLU[EACCEPT] function to succeed. In this test it is verified that +a page table entry can still be installed for a page that is in the +process of being removed. + +Suggested-by: Haitao Huang <haitao.huang@intel.com> +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 80 ++++++++++++++++++++++++++++++ + 1 file changed, 80 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index 2c69045253b2..ba16671aef79 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -1790,4 +1790,84 @@ TEST_F(enclave, remove_added_page_invalid_access_after_eaccept) + EXPECT_EQ(self->run.exception_addr, data_start); + } + ++TEST_F(enclave, remove_untouched_page) ++{ ++ struct sgx_enclave_remove_pages remove_ioc; ++ struct sgx_enclave_modify_types modt_ioc; ++ struct encl_op_eaccept eaccept_op; ++ unsigned long data_start; ++ int ret, errno_save; ++ ++ ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. ++ */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* SGX2 is supported by kernel and hardware, test can proceed. 
*/ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ data_start = self->encl.encl_base + ++ encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ++ modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ modt_ioc.length = PAGE_SIZE; ++ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(modt_ioc.result, 0); ++ EXPECT_EQ(modt_ioc.count, 4096); ++ ++ /* ++ * Enter enclave via TCS #1 and approve page removal by sending ++ * EACCEPT for removed page. ++ */ ++ ++ eaccept_op.epc_addr = data_start; ++ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; ++ eaccept_op.ret = 0; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ EXPECT_EQ(eaccept_op.ret, 0); ++ ++ memset(&remove_ioc, 0, sizeof(remove_ioc)); ++ ++ remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; ++ remove_ioc.length = PAGE_SIZE; ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(remove_ioc.count, 4096); ++} ++ + TEST_HARNESS_MAIN +-- +2.36.1 + + +From cd481f7f388bd1c0e46d6616340983c91aaa6c35 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre <reinette.chatre@intel.com> +Date: Tue, 10 May 2022 11:09:07 -0700 +Subject: [PATCH 88/90] selftests/sgx: Page removal stress test + +Create enclave with additional heap that consumes all physical SGX +memory and then remove it. 
+ +Depending on the available SGX memory this test could take a +significant time to run (several minutes) as it (1) creates the +enclave, (2) changes the type of every page to be trimmed, +(3) enters the enclave once per page to run EACCEPT, before +(4) the pages are finally removed. + +Acked-by: Jarkko Sakkinen <jarkko@kernel.org> +Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> +--- + tools/testing/selftests/sgx/main.c | 120 +++++++++++++++++++++++++++++ + 1 file changed, 120 insertions(+) + +diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c +index ba16671aef79..9820b3809c69 100644 +--- a/tools/testing/selftests/sgx/main.c ++++ b/tools/testing/selftests/sgx/main.c +@@ -378,7 +378,127 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed) + EXPECT_EQ(get_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); ++} ++ ++TEST_F_TIMEOUT(enclave, unclobbered_vdso_oversubscribed_remove, 900) ++{ ++ struct sgx_enclave_remove_pages remove_ioc; ++ struct sgx_enclave_modify_types modt_ioc; ++ struct encl_op_get_from_buf get_op; ++ struct encl_op_eaccept eaccept_op; ++ struct encl_op_put_to_buf put_op; ++ struct encl_segment *heap; ++ unsigned long total_mem; ++ int ret, errno_save; ++ unsigned long addr; ++ unsigned long i; ++ ++ /* ++ * Create enclave with additional heap that is as big as all ++ * available physical SGX memory. ++ */ ++ total_mem = get_total_epc_mem(); ++ ASSERT_NE(total_mem, 0); ++ TH_LOG("Creating an enclave with %lu bytes heap may take a while ...", ++ total_mem); ++ ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata)); ++ ++ /* ++ * Hardware (SGX2) and kernel support is needed for this test. Start ++ * with check that test has a chance of succeeding. 
++ */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ ++ if (ret == -1) { ++ if (errno == ENOTTY) ++ SKIP(return, ++ "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); ++ else if (errno == ENODEV) ++ SKIP(return, "System does not support SGX2"); ++ } ++ ++ /* ++ * Invalid parameters were provided during sanity check, ++ * expect command to fail. ++ */ ++ EXPECT_EQ(ret, -1); ++ ++ /* SGX2 is supported by kernel and hardware, test can proceed. */ ++ memset(&self->run, 0, sizeof(self->run)); ++ self->run.tcs = self->encl.encl_base; ++ ++ heap = &self->encl.segment_tbl[self->encl.nr_segments - 1]; ++ ++ put_op.header.type = ENCL_OP_PUT_TO_BUFFER; ++ put_op.value = MAGIC; ++ ++ EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0); ++ ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.user_data, 0); ++ ++ get_op.header.type = ENCL_OP_GET_FROM_BUFFER; ++ get_op.value = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0); ++ ++ EXPECT_EQ(get_op.value, MAGIC); ++ EXPECT_EEXIT(&self->run); ++ EXPECT_EQ(self->run.user_data, 0); + ++ /* Trim entire heap. */ ++ memset(&modt_ioc, 0, sizeof(modt_ioc)); ++ ++ modt_ioc.offset = heap->offset; ++ modt_ioc.length = heap->size; ++ modt_ioc.page_type = SGX_PAGE_TYPE_TRIM; ++ ++ TH_LOG("Changing type of %zd bytes to trimmed may take a while ...", ++ heap->size); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(modt_ioc.result, 0); ++ EXPECT_EQ(modt_ioc.count, heap->size); ++ ++ /* EACCEPT all removed pages. 
*/ ++ addr = self->encl.encl_base + heap->offset; ++ ++ eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED; ++ eaccept_op.header.type = ENCL_OP_EACCEPT; ++ ++ TH_LOG("Entering enclave to run EACCEPT for each page of %zd bytes may take a while ...", ++ heap->size); ++ for (i = 0; i < heap->size; i += 4096) { ++ eaccept_op.epc_addr = addr + i; ++ eaccept_op.ret = 0; ++ ++ EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); ++ ++ EXPECT_EQ(self->run.exception_vector, 0); ++ EXPECT_EQ(self->run.exception_error_code, 0); ++ EXPECT_EQ(self->run.exception_addr, 0); ++ ASSERT_EQ(eaccept_op.ret, 0); ++ ASSERT_EQ(self->run.function, EEXIT); ++ } ++ ++ /* Complete page removal. */ ++ memset(&remove_ioc, 0, sizeof(remove_ioc)); ++ ++ remove_ioc.offset = heap->offset; ++ remove_ioc.length = heap->size; ++ ++ TH_LOG("Removing %zd bytes from enclave may take a while ...", ++ heap->size); ++ ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc); ++ errno_save = ret == -1 ? errno : 0; ++ ++ EXPECT_EQ(ret, 0); ++ EXPECT_EQ(errno_save, 0); ++ EXPECT_EQ(remove_ioc.count, heap->size); + } + + TEST_F(enclave, clobbered_vdso) +-- +2.36.1 + + +From 561d60ba75691dee84008682bf422b51cf9f41ae Mon Sep 17 00:00:00 2001 +From: Jarkko Sakkinen <jarkko@kernel.org> +Date: Tue, 28 Jun 2022 03:56:12 +0300 +Subject: [PATCH 89/90] KVM: SVM: Make alloc_apic_backing_page() optional + +Signed-off-by: Ashish Kalra <Ashish.Kalra@amd.com> +Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org> +--- + arch/x86/include/asm/kvm-x86-ops.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h +index e0068e702692..1ca41da8ba13 100644 +--- a/arch/x86/include/asm/kvm-x86-ops.h ++++ b/arch/x86/include/asm/kvm-x86-ops.h +@@ -128,7 +128,7 @@ KVM_X86_OP(msr_filter_changed) + KVM_X86_OP(complete_emulated_msr) + KVM_X86_OP(vcpu_deliver_sipi_vector) + KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); 
+-KVM_X86_OP(alloc_apic_backing_page) ++KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) + KVM_X86_OP_OPTIONAL(rmp_page_level_adjust) + + #undef KVM_X86_OP +-- +2.36.1 + + +From a505adda8dd720d2379019ba335ca00d32755a72 Mon Sep 17 00:00:00 2001 +From: Nathaniel McCallum <nathaniel@profian.com> +Date: Tue, 9 Nov 2021 22:25:33 -0500 +Subject: [PATCH 90/90] Add GitHub Action + +Signed-off-by: Nathaniel McCallum <nathaniel@profian.com> +--- + .github/workflows/build.yml | 47 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 47 insertions(+) + create mode 100644 .github/workflows/build.yml + +diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml +new file mode 100644 +index 000000000000..b732e72923ce +--- /dev/null ++++ b/.github/workflows/build.yml +@@ -0,0 +1,47 @@ ++on: [push] ++name: build ++jobs: ++ main: ++ name: build ++ runs-on: ubuntu-latest ++ container: debian:latest ++ steps: ++ - run: apt update ++ - run: apt install -y build-essential linux-image-generic flex bison libelf-dev libssl-dev bc python3 dwarves ++ ++ # Check out the code ++ - uses: actions/checkout@v2 ++ ++ # Base this kernel config off of the OS's config ++ - run: cp /boot/config-* .config ++ ++ # Don't use the OS's trusted keys ++ - run: sed -i 's|^CONFIG_SYSTEM_TRUSTED_KEYS|#CONFIG_SYSTEM_TRUSTED_KEYS|' .config ++ ++ # Use the defaults for all new values ++ - run: bash -c 'yes "" | make oldconfig' # Work around -o pipefail ++ ++ # However, unconditionally enable SGX and SEV ++ - run: sed -i 's|^.*AMD_MEM_ENCRYPT.*$||' .config ++ - run: sed -i 's|^.*AMD_SEV.*$||' .config ++ - run: sed -i 's|^.*SGX.*$||' .config ++ - run: bash -c 'yes | make oldconfig' # Work around -o pipefail ++ ++ # Build the kernel ++ - run: make EXTRAVERSION=-$GITHUB_REF_NAME -j 5 ++ ++ # Install the kernel ++ - run: mkdir -p foo/boot ++ - run: make EXTRAVERSION=-$GITHUB_REF_NAME INSTALL_PATH=`pwd`/foo/boot install ++ - run: make EXTRAVERSION=-$GITHUB_REF_NAME INSTALL_MOD_PATH=`pwd`/foo 
INSTALL_MOD_STRIP=1 modules_install ++ ++ # Package the kernel ++ - run: rm -f foo/lib/modules/*/source ++ - run: rm -f foo/lib/modules/*/build ++ - run: tar -C foo -cvjf linux.tar.bz2 . ++ ++ # Upload the results ++ - uses: actions/upload-artifact@v2 ++ with: ++ name: linux.tar.bz2 ++ path: linux.tar.bz2 +-- +2.36.1 + |