2025cf9e193de (Thomas Gleixner 2019-05-29 07:18:02 -0700 1) // SPDX-License-Identifier: GPL-2.0-only
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 2) /*
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 3) * fs/dax.c - Direct Access filesystem code
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 4) * Copyright (c) 2013-2014 Intel Corporation
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 5) * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 6) * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 7) */
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 8)
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 9) #include <linux/atomic.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 10) #include <linux/blkdev.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 11) #include <linux/buffer_head.h>
d77e92e270edd (Ross Zwisler 2015-09-09 10:29:40 -0600 12) #include <linux/dax.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 13) #include <linux/fs.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 14) #include <linux/genhd.h>
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 15) #include <linux/highmem.h>
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 16) #include <linux/memcontrol.h>
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 17) #include <linux/mm.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 18) #include <linux/mutex.h>
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 19) #include <linux/pagevec.h>
289c6aedac981 (Matthew Wilcox 2015-02-16 15:58:59 -0800 20) #include <linux/sched.h>
f361bf4a66c9b (Ingo Molnar 2017-02-03 23:47:37 +0100 21) #include <linux/sched/signal.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 22) #include <linux/uio.h>
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 23) #include <linux/vmstat.h>
34c0fd540e79f (Dan Williams 2016-01-15 16:56:14 -0800 24) #include <linux/pfn_t.h>
0e749e54244ee (Dan Williams 2016-01-15 16:55:53 -0800 25) #include <linux/sizes.h>
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 26) #include <linux/mmu_notifier.h>
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 27) #include <linux/iomap.h>
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 28) #include <asm/pgalloc.h>
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 29)
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 30) #define CREATE_TRACE_POINTS
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 31) #include <trace/events/fs_dax.h>
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 32)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 33) static inline unsigned int pe_order(enum page_entry_size pe_size)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 34) {
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 35) if (pe_size == PE_SIZE_PTE)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 36) return PAGE_SHIFT - PAGE_SHIFT;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 37) if (pe_size == PE_SIZE_PMD)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 38) return PMD_SHIFT - PAGE_SHIFT;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 39) if (pe_size == PE_SIZE_PUD)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 40) return PUD_SHIFT - PAGE_SHIFT;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 41) return ~0;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 42) }
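
/*
 * For example, with 4K pages and x86-64 defaults this yields
 * pe_order(PE_SIZE_PTE) == 0, pe_order(PE_SIZE_PMD) == 9 and
 * pe_order(PE_SIZE_PUD) == 18 (values are architecture-dependent).
 */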
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 43)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 44) /* We choose 4096 entries - same as per-zone page wait tables */
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 45) #define DAX_WAIT_TABLE_BITS 12
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 46) #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 47)
917f34526c412 (Ross Zwisler 2017-09-06 16:18:58 -0700 48) /* The 'colour' (i.e. low bits) within a PMD of a page offset. */
917f34526c412 (Ross Zwisler 2017-09-06 16:18:58 -0700 49) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
977fbdcd5986c (Matthew Wilcox 2018-01-31 16:17:36 -0800 50) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
917f34526c412 (Ross Zwisler 2017-09-06 16:18:58 -0700 51)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 52) /* The order of a PMD entry */
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 53) #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 54)
ce95ab0fa6696 (Ross Zwisler 2016-11-08 11:31:44 +1100 55) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 56)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 57) static int __init init_dax_wait_table(void)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 58) {
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 59) int i;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 60)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 61) for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 62) init_waitqueue_head(wait_table + i);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 63) return 0;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 64) }
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 65) fs_initcall(init_dax_wait_table);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 66)
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 67) /*
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 68) * DAX pagecache entries use XArray value entries so they can't be mistaken
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 69) * for pages. We use one bit for locking, one bit for the entry size (PMD)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 70) * and two more to tell us if the entry is a zero page or an empty entry that
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 71) * is just used for locking. In total four special bits.
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 72) *
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 73) * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 74) * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 75) * block allocation.
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 76) */
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 77) #define DAX_SHIFT (4)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 78) #define DAX_LOCKED (1UL << 0)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 79) #define DAX_PMD (1UL << 1)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 80) #define DAX_ZERO_PAGE (1UL << 2)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 81) #define DAX_EMPTY (1UL << 3)
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 82)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 83) static unsigned long dax_to_pfn(void *entry)
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 84) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 85) return xa_to_value(entry) >> DAX_SHIFT;
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 86) }
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 87)
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 88) static void *dax_make_entry(pfn_t pfn, unsigned long flags)
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 89) {
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 90) return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 91) }
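
/*
 * A worked example of the encoding above, using a hypothetical pfn:
 *
 *	void *entry = dax_make_entry(pfn_to_pfn_t(0x1234), DAX_PMD);
 *	// xa_to_value(entry) == (0x1234UL << DAX_SHIFT) | DAX_PMD
 *	// dax_to_pfn(entry)  == 0x1234
 *
 * The low DAX_SHIFT bits carry the flags; the pfn occupies the rest.
 */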
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 92)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 93) static bool dax_is_locked(void *entry)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 94) {
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 95) return xa_to_value(entry) & DAX_LOCKED;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 96) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 97)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 98) static unsigned int dax_entry_order(void *entry)
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 99) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 100) if (xa_to_value(entry) & DAX_PMD)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 101) return PMD_ORDER;
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 102) return 0;
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 103) }
527b19d0808e7 (Ross Zwisler 2017-09-06 16:18:51 -0700 104)
fda490d39fc06 (Matthew Wilcox 2018-11-16 15:07:31 -0500 105) static unsigned long dax_is_pmd_entry(void *entry)
d1a5f2b4d8a12 (Dan Williams 2016-01-28 20:25:31 -0800 106) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 107) return xa_to_value(entry) & DAX_PMD;
d1a5f2b4d8a12 (Dan Williams 2016-01-28 20:25:31 -0800 108) }
d1a5f2b4d8a12 (Dan Williams 2016-01-28 20:25:31 -0800 109)
fda490d39fc06 (Matthew Wilcox 2018-11-16 15:07:31 -0500 110) static bool dax_is_pte_entry(void *entry)
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 111) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 112) return !(xa_to_value(entry) & DAX_PMD);
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 113) }
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 114)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 115) static int dax_is_zero_entry(void *entry)
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 116) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 117) return xa_to_value(entry) & DAX_ZERO_PAGE;
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 118) }
d475c6346a38a (Matthew Wilcox 2015-02-16 15:58:56 -0800 119)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 120) static int dax_is_empty_entry(void *entry)
b2e0d1625e193 (Dan Williams 2016-01-15 16:55:59 -0800 121) {
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 122) return xa_to_value(entry) & DAX_EMPTY;
b2e0d1625e193 (Dan Williams 2016-01-15 16:55:59 -0800 123) }
b2e0d1625e193 (Dan Williams 2016-01-15 16:55:59 -0800 124)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 125) /*
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 126) * Return true if the entry that was found is of a smaller order than the
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 127) * entry we were looking for.
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 128) */
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 129) static bool dax_is_conflict(void *entry)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 130) {
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 131) return entry == XA_RETRY_ENTRY;
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 132) }
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 133)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 134) /*
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 135) * DAX page cache entry locking
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 136) */
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 137) struct exceptional_entry_key {
ec4907ff69fb1 (Matthew Wilcox 2018-03-28 11:01:43 -0400 138) struct xarray *xa;
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 139) pgoff_t entry_start;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 140) };
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 141)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 142) struct wait_exceptional_entry_queue {
ac6424b981bce (Ingo Molnar 2017-06-20 12:06:13 +0200 143) wait_queue_entry_t wait;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 144) struct exceptional_entry_key key;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 145) };
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 146)
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 147) /**
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 148) * enum dax_wake_mode - waitqueue wakeup behaviour
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 149) * @WAKE_ALL: wake all waiters in the waitqueue
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 150) * @WAKE_NEXT: wake only the first waiter in the waitqueue
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 151) */
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 152) enum dax_wake_mode {
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 153) WAKE_ALL,
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 154) WAKE_NEXT,
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 155) };
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 156)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 157) static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 158) void *entry, struct exceptional_entry_key *key)
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 159) {
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 160) unsigned long hash;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 161) unsigned long index = xas->xa_index;
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 162)
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 163) /*
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 164) * If 'entry' is a PMD, align the 'index' that we use for the wait
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 165) * queue to the start of that PMD. This ensures that all offsets in
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 166) * the range covered by the PMD map to the same bit lock.
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 167) */
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 168) if (dax_is_pmd_entry(entry))
917f34526c412 (Ross Zwisler 2017-09-06 16:18:58 -0700 169) index &= ~PG_PMD_COLOUR;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 170) key->xa = xas->xa;
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 171) key->entry_start = index;
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 172)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 173) hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 174) return wait_table + hash;
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 175) }
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 176)
ec4907ff69fb1 (Matthew Wilcox 2018-03-28 11:01:43 -0400 177) static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
ec4907ff69fb1 (Matthew Wilcox 2018-03-28 11:01:43 -0400 178) unsigned int mode, int sync, void *keyp)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 179) {
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 180) struct exceptional_entry_key *key = keyp;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 181) struct wait_exceptional_entry_queue *ewait =
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 182) container_of(wait, struct wait_exceptional_entry_queue, wait);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 183)
ec4907ff69fb1 (Matthew Wilcox 2018-03-28 11:01:43 -0400 184) if (key->xa != ewait->key.xa ||
63e95b5c4f16e (Ross Zwisler 2016-11-08 11:32:20 +1100 185) key->entry_start != ewait->key.entry_start)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 186) return 0;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 187) return autoremove_wake_function(wait, mode, sync, NULL);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 188) }
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 189)
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 190) /*
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 191) * @entry may no longer be the entry at the index in the mapping.
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 192) * The important information it's conveying is whether the entry at
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 193) * this index used to be a PMD entry.
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 194) */
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 195) static void dax_wake_entry(struct xa_state *xas, void *entry,
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 196) enum dax_wake_mode mode)
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 197) {
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 198) struct exceptional_entry_key key;
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 199) wait_queue_head_t *wq;
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 200)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 201) wq = dax_entry_waitqueue(xas, entry, &key);
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 202)
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 203) /*
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 204) * Checking for a locked entry and prepare_to_wait_exclusive() happen
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 205) * under the i_pages lock; ditto for entry handling in our callers.
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 206) * So at this point all tasks that could have seen our entry locked
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 207) * must be in the waitqueue and the following check will see them.
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 208) */
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 209) if (waitqueue_active(wq))
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 210) __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 211) }
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 212)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 213) /*
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 214) * Look up the entry in the page cache; if it is a DAX entry, wait for it
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 215) * to become unlocked and return it. The caller must subsequently call
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 216) * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 217) * if it did. The entry returned may have a larger order than @order.
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 218) * If @order is larger than the order of the entry found in i_pages, this
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 219) * function returns a conflict entry (see dax_is_conflict()).
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 220) *
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 221) * Must be called with the i_pages lock held.
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 222) */
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 223) static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 224) {
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 225) void *entry;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 226) struct wait_exceptional_entry_queue ewait;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 227) wait_queue_head_t *wq;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 228)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 229) init_wait(&ewait.wait);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 230) ewait.wait.func = wake_exceptional_entry_func;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 231)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 232) for (;;) {
0e40de0338d00 (Matthew Wilcox 2018-11-16 15:19:13 -0500 233) entry = xas_find_conflict(xas);
6370740e5f8ef (Dan Williams 2019-10-21 09:29:20 -0700 234) if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
6370740e5f8ef (Dan Williams 2019-10-21 09:29:20 -0700 235) return entry;
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 236) if (dax_entry_order(entry) < order)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 237) return XA_RETRY_ENTRY;
6370740e5f8ef (Dan Williams 2019-10-21 09:29:20 -0700 238) if (!dax_is_locked(entry))
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 239) return entry;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 240)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 241) wq = dax_entry_waitqueue(xas, entry, &ewait.key);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 242) prepare_to_wait_exclusive(wq, &ewait.wait,
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 243) TASK_UNINTERRUPTIBLE);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 244) xas_unlock_irq(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 245) xas_reset(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 246) schedule();
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 247) finish_wait(wq, &ewait.wait);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 248) xas_lock_irq(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 249) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 250) }
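
/*
 * A sketch of the typical lookup pattern built on get_unlocked_entry() and
 * put_unlocked_entry(); __dax_invalidate_entry() below follows this shape:
 *
 *	xas_lock_irq(&xas);
 *	entry = get_unlocked_entry(&xas, 0);
 *	if (entry && !dax_is_conflict(entry))
 *		... inspect or modify the entry under the lock ...
 *	put_unlocked_entry(&xas, entry, WAKE_NEXT);
 *	xas_unlock_irq(&xas);
 */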
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 251)
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 252) /*
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 253) * The only thing keeping the address space around is the i_pages lock
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 254) * (it's cycled in clear_inode() after removing the entries from i_pages).
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 255) * After we call xas_unlock_irq(), we cannot touch xas->xa.
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 256) */
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 257) static void wait_entry_unlocked(struct xa_state *xas, void *entry)
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 258) {
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 259) struct wait_exceptional_entry_queue ewait;
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 260) wait_queue_head_t *wq;
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 261)
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 262) init_wait(&ewait.wait);
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 263) ewait.wait.func = wake_exceptional_entry_func;
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 264)
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 265) wq = dax_entry_waitqueue(xas, entry, &ewait.key);
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 266) /*
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 267) * Unlike get_unlocked_entry() there is no guarantee that this
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 268) * path ever successfully retrieves an unlocked entry before an
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 269) * inode dies. Perform a non-exclusive wait in case this path
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 270) * never successfully performs its own wake up.
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 271) */
d8a706414af48 (Dan Williams 2018-12-21 11:35:53 -0800 272) prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 273) xas_unlock_irq(xas);
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 274) schedule();
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 275) finish_wait(wq, &ewait.wait);
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 276) }
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 277)
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 278) static void put_unlocked_entry(struct xa_state *xas, void *entry,
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 279) enum dax_wake_mode mode)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 280) {
61c30c98ef17e (Jan Kara 2019-07-29 13:57:49 +0200 281) if (entry && !dax_is_conflict(entry))
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 282) dax_wake_entry(xas, entry, mode);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 283) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 284)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 285) /*
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 286) * We used the xa_state to get the entry, but then we locked the entry and
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 287) * dropped the xa_lock, so we know the xa_state is stale and must be reset
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 288) * before use.
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 289) */
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 290) static void dax_unlock_entry(struct xa_state *xas, void *entry)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 291) {
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 292) void *old;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 293)
7ae2ea7dc45e8 (Matthew Wilcox 2018-11-09 20:09:37 -0500 294) BUG_ON(dax_is_locked(entry));
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 295) xas_reset(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 296) xas_lock_irq(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 297) old = xas_store(xas, entry);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 298) xas_unlock_irq(xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 299) BUG_ON(!dax_is_locked(old));
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 300) dax_wake_entry(xas, entry, WAKE_NEXT);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 301) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 302)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 303) /*
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 304) * Return: The entry stored at this location before it was locked.
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 305) */
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 306) static void *dax_lock_entry(struct xa_state *xas, void *entry)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 307) {
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 308) unsigned long v = xa_to_value(entry);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 309) return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 310) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 311)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 312) static unsigned long dax_entry_size(void *entry)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 313) {
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 314) if (dax_is_zero_entry(entry))
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 315) return 0;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 316) else if (dax_is_empty_entry(entry))
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 317) return 0;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 318) else if (dax_is_pmd_entry(entry))
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 319) return PMD_SIZE;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 320) else
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 321) return PAGE_SIZE;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 322) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 323)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 324) static unsigned long dax_end_pfn(void *entry)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 325) {
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 326) return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 327) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 328)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 329) /*
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 330) * Iterate through all mapped pfns represented by an entry, i.e. skip
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 331) * 'empty' and 'zero' entries.
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 332) */
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 333) #define for_each_mapped_pfn(entry, pfn) \
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 334) for (pfn = dax_to_pfn(entry); \
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 335) pfn < dax_end_pfn(entry); pfn++)
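
/*
 * A PTE entry yields a single pfn; a PMD entry yields PMD_SIZE / PAGE_SIZE
 * consecutive pfns. Zero and empty entries have dax_entry_size() == 0, so
 * the loop body never runs for them.
 */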
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 336)
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 337) /*
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 338) * TODO: for reflink+dax we need a way to associate a single page with
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 339) * multiple address_space instances at different linear_page_index()
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 340) * offsets.
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 341) */
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 342) static void dax_associate_entry(void *entry, struct address_space *mapping,
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 343) struct vm_area_struct *vma, unsigned long address)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 344) {
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 345) unsigned long size = dax_entry_size(entry), pfn, index;
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 346) int i = 0;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 347)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 348) if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 349) return;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 350)
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 351) index = linear_page_index(vma, address & ~(size - 1));
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 352) for_each_mapped_pfn(entry, pfn) {
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 353) struct page *page = pfn_to_page(pfn);
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 354)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 355) WARN_ON_ONCE(page->mapping);
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 356) page->mapping = mapping;
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 357) page->index = index + i++;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 358) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 359) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 360)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 361) static void dax_disassociate_entry(void *entry, struct address_space *mapping,
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 362) bool trunc)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 363) {
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 364) unsigned long pfn;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 365)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 366) if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 367) return;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 368)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 369) for_each_mapped_pfn(entry, pfn) {
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 370) struct page *page = pfn_to_page(pfn);
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 371)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 372) WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 373) WARN_ON_ONCE(page->mapping && page->mapping != mapping);
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 374) page->mapping = NULL;
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 375) page->index = 0;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 376) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 377) }
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 378)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 379) static struct page *dax_busy_page(void *entry)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 380) {
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 381) unsigned long pfn;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 382)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 383) for_each_mapped_pfn(entry, pfn) {
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 384) struct page *page = pfn_to_page(pfn);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 385)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 386) if (page_ref_count(page) > 1)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 387) return page;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 388) }
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 389) return NULL;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 390) }
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 391)
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 392) /*
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 393) * dax_lock_page - Lock the DAX entry corresponding to a page
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 394) * @page: The page whose entry we want to lock
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 395) *
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 396) * Context: Process context.
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 397) * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 398) * not be locked.
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 399) */
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 400) dax_entry_t dax_lock_page(struct page *page)
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 401) {
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 402) XA_STATE(xas, NULL, 0);
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 403) void *entry;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 404)
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 405) /* Ensure page->mapping isn't freed while we look at it */
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 406) rcu_read_lock();
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 407) for (;;) {
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 408) struct address_space *mapping = READ_ONCE(page->mapping);
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 409)
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 410) entry = NULL;
c93db7bb6ef32 (Matthew Wilcox 2018-11-27 13:16:33 -0800 411) if (!mapping || !dax_mapping(mapping))
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 412) break;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 413)
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 414) /*
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 415) * In the device-dax case there's no need to lock: a
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 416) * struct dev_pagemap pin is sufficient to keep the
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 417) * inode alive, and we assume we have a dev_pagemap pin;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 418) * otherwise we would not have a valid pfn_to_page()
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 419) * translation.
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 420) */
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 421) entry = (void *)~0UL;
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 422) if (S_ISCHR(mapping->host->i_mode))
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 423) break;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 424)
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 425) xas.xa = &mapping->i_pages;
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 426) xas_lock_irq(&xas);
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 427) if (mapping != page->mapping) {
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 428) xas_unlock_irq(&xas);
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 429) continue;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 430) }
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 431) xas_set(&xas, page->index);
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 432) entry = xas_load(&xas);
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 433) if (dax_is_locked(entry)) {
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 434) rcu_read_unlock();
55e56f06ed71d (Matthew Wilcox 2018-11-27 13:16:34 -0800 435) wait_entry_unlocked(&xas, entry);
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 436) rcu_read_lock();
6d7cd8c137374 (Matthew Wilcox 2018-11-06 13:11:57 -0500 437) continue;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 438) }
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 439) dax_lock_entry(&xas, entry);
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 440) xas_unlock_irq(&xas);
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 441) break;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 442) }
c5bbd4515a05f (Matthew Wilcox 2018-11-16 14:37:06 -0500 443) rcu_read_unlock();
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 444) return (dax_entry_t)entry;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 445) }
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 446)
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 447) void dax_unlock_page(struct page *page, dax_entry_t cookie)
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 448) {
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 449) struct address_space *mapping = page->mapping;
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 450) XA_STATE(xas, &mapping->i_pages, page->index);
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 451)
9f32d221301c3 (Matthew Wilcox 2018-06-12 09:46:30 -0400 452) if (S_ISCHR(mapping->host->i_mode))
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 453) return;
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 454)
27359fd6e5f3c (Matthew Wilcox 2018-11-30 11:05:06 -0500 455) dax_unlock_entry(&xas, (void *)cookie);
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 456) }
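
/*
 * A minimal usage sketch for the cookie-based pair above (hypothetical
 * caller; the memory-failure path is the intended user):
 *
 *	dax_entry_t cookie = dax_lock_page(page);
 *	if (!cookie)
 *		return;			// entry could not be locked
 *	... operate on the page while its DAX entry stays locked ...
 *	dax_unlock_page(page, cookie);
 */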
c2a7d2a115525 (Dan Williams 2018-07-13 21:50:16 -0700 457)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 458) /*
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 459) * Find page cache entry at given index. If it is a DAX entry, return it
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 460) * with the entry locked. If the page cache doesn't contain an entry at
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 461) * that index, add a locked empty entry.
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 462) *
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 463) * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 464) * either return that locked entry or will return VM_FAULT_FALLBACK.
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 465) * This will happen if there are any PTE entries within the PMD range
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 466) * that we are requesting.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 467) *
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 468) * We always favor PTE entries over PMD entries. There isn't a flow where we
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 469) * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 470) * insertion will fail if it finds any PTE entries already in the tree, and a
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 471) * PTE insertion will cause an existing PMD entry to be unmapped and
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 472) * downgraded to PTE entries. This happens for both PMD zero pages as
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 473) * well as PMD empty entries.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 474) *
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 475) * The exception to this downgrade path is for PMD entries that have
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 476) * real storage backing them. We will leave these real PMD entries in
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 477) * the tree, and PTE writes will simply dirty the entire PMD entry.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 478) *
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 479) * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 480) * persistent memory the benefit is doubtful. We can add that later if we can
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 481) * show it helps.
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 482) *
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 483) * On error, this function does not return an ERR_PTR. Instead it returns
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 484) * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 485) * overlap with xarray value entries.
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 486) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 487) static void *grab_mapping_entry(struct xa_state *xas,
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 488) struct address_space *mapping, unsigned int order)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 489) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 490) unsigned long index = xas->xa_index;
a800caba38542 (Jan Kara 2021-06-28 19:35:04 -0700 491) bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 492) void *entry;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 493)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 494) retry:
a800caba38542 (Jan Kara 2021-06-28 19:35:04 -0700 495) pmd_downgrade = false;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 496) xas_lock_irq(xas);
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 497) entry = get_unlocked_entry(xas, order);
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 498)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 499) if (entry) {
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 500) if (dax_is_conflict(entry))
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 501) goto fallback;
0e40de0338d00 (Matthew Wilcox 2018-11-16 15:19:13 -0500 502) if (!xa_is_value(entry)) {
49688e654e48a (Hao Li 2020-07-29 11:44:36 +0800 503) xas_set_err(xas, -EIO);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 504) goto out_unlock;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 505) }
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 506)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 507) if (order == 0) {
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 508) if (dax_is_pmd_entry(entry) &&
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 509) (dax_is_zero_entry(entry) ||
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 510) dax_is_empty_entry(entry))) {
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 511) pmd_downgrade = true;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 512) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 513) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 514) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 515)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 516) if (pmd_downgrade) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 517) /*
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 518) * Make sure 'entry' remains valid while we drop
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 519) * the i_pages lock.
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 520) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 521) dax_lock_entry(xas, entry);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 522)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 523) /*
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 524) * Besides huge zero pages, the only other things that get
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 525) * downgraded are empty entries, which don't need to be
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 526) * unmapped.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 527) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 528) if (dax_is_zero_entry(entry)) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 529) xas_unlock_irq(xas);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 530) unmap_mapping_pages(mapping,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 531) xas->xa_index & ~PG_PMD_COLOUR,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 532) PG_PMD_NR, false);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 533) xas_reset(xas);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 534) xas_lock_irq(xas);
e11f8b7b6c4ea (Ross Zwisler 2017-04-07 16:04:57 -0700 535) }
e11f8b7b6c4ea (Ross Zwisler 2017-04-07 16:04:57 -0700 536)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 537) dax_disassociate_entry(entry, mapping, false);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 538) xas_store(xas, NULL); /* undo the PMD join */
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 539) dax_wake_entry(xas, entry, WAKE_ALL);
7f0e07fb02895 (Matthew Wilcox (Oracle) 2021-05-04 18:32:51 -0700 540) mapping->nrpages -= PG_PMD_NR;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 541) entry = NULL;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 542) xas_set(xas, index);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 543) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 544)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 545) if (entry) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 546) dax_lock_entry(xas, entry);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 547) } else {
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 548) unsigned long flags = DAX_EMPTY;
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 549)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 550) if (order > 0)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 551) flags |= DAX_PMD;
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 552) entry = dax_make_entry(pfn_to_pfn_t(0), flags);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 553) dax_lock_entry(xas, entry);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 554) if (xas_error(xas))
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 555) goto out_unlock;
7f0e07fb02895 (Matthew Wilcox (Oracle) 2021-05-04 18:32:51 -0700 556) mapping->nrpages += 1UL << order;
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 557) }
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 558)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 559) out_unlock:
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 560) xas_unlock_irq(xas);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 561) if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 562) goto retry;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 563) if (xas->xa_node == XA_ERROR(-ENOMEM))
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 564) return xa_mk_internal(VM_FAULT_OOM);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 565) if (xas_error(xas))
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 566) return xa_mk_internal(VM_FAULT_SIGBUS);
e3ad61c64abce (Ross Zwisler 2016-11-08 11:32:12 +1100 567) return entry;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 568) fallback:
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 569) xas_unlock_irq(xas);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 570) return xa_mk_internal(VM_FAULT_FALLBACK);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 571) }
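
/*
 * A sketch of how callers decode the return value above: errors come back
 * as xarray internal entries and are turned back into VM_FAULT codes.
 *
 *	entry = grab_mapping_entry(&xas, mapping, 0);
 *	if (xa_is_internal(entry)) {
 *		ret = xa_to_internal(entry);	// e.g. VM_FAULT_OOM
 *		goto out;
 *	}
 */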
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 572)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 573) /**
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 574) * dax_layout_busy_page_range - find first pinned page in @mapping
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 575) * @mapping: address space to scan for a page with ref count > 1
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 576) * @start: Starting offset. Page containing 'start' is included.
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 577) * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 578) * pages from 'start' till the end of file are included.
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 579) *
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 580) * DAX requires ZONE_DEVICE mapped pages. These pages are never
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 581) * 'onlined' to the page allocator so they are considered idle when
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 582) * page->count == 1. A filesystem uses this interface to determine if
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 583) * any page in the mapping is busy, i.e. for DMA, or other
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 584) * get_user_pages() usages.
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 585) *
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 586) * It is expected that the filesystem is holding locks to block the
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 587) * establishment of new mappings in this address_space. I.e. it expects
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 588) * to be able to run unmap_mapping_range() and subsequently not race
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 589) * mapping_mapped() becoming true.
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 590) */
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 591) struct page *dax_layout_busy_page_range(struct address_space *mapping,
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 592) loff_t start, loff_t end)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 593) {
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 594) void *entry;
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 595) unsigned int scanned = 0;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 596) struct page *page = NULL;
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 597) pgoff_t start_idx = start >> PAGE_SHIFT;
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 598) pgoff_t end_idx;
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 599) XA_STATE(xas, &mapping->i_pages, start_idx);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 600)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 601) /*
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 602) * In the 'limited' case get_user_pages() for dax is disabled.
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 603) */
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 604) if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 605) return NULL;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 606)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 607) if (!dax_mapping(mapping) || !mapping_mapped(mapping))
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 608) return NULL;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 609)
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 610) /* If end == LLONG_MAX, all pages from start till the end of file */
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 611) if (end == LLONG_MAX)
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 612) end_idx = ULONG_MAX;
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 613) else
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 614) end_idx = end >> PAGE_SHIFT;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 615) /*
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 616) * If we race get_user_pages_fast() here, either we'll see the
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 617) * elevated page count in the iteration and wait, or
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 618) * get_user_pages_fast() will see that the page it took a reference
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 619) * against is no longer mapped in the page tables and bail to the
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 620) * get_user_pages() slow path. The slow path is protected by
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 621) * pte_lock() and pmd_lock(). New references are not taken without
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 622) * holding those locks, and unmap_mapping_pages() will not zero the
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 623) * pte or pmd without holding the respective lock, so we are
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 624) * guaranteed to either see new references or prevent new
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 625) * references from being established.
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 626) */
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 627) unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 628)
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 629) xas_lock_irq(&xas);
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 630) xas_for_each(&xas, entry, end_idx) {
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 631) if (WARN_ON_ONCE(!xa_is_value(entry)))
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 632) continue;
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 633) if (unlikely(dax_is_locked(entry)))
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 634) entry = get_unlocked_entry(&xas, 0);
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 635) if (entry)
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 636) page = dax_busy_page(entry);
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 637) put_unlocked_entry(&xas, entry, WAKE_NEXT);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 638) if (page)
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 639) break;
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 640) if (++scanned % XA_CHECK_SCHED)
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 641) continue;
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 642)
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 643) xas_pause(&xas);
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 644) xas_unlock_irq(&xas);
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 645) cond_resched();
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 646) xas_lock_irq(&xas);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 647) }
084a899008cec (Matthew Wilcox 2018-05-17 13:03:48 -0400 648) xas_unlock_irq(&xas);
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 649) return page;
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 650) }
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 651) EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 652)
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 653) struct page *dax_layout_busy_page(struct address_space *mapping)
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 654) {
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 655) return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
6bbdd563ee9a6 (Vivek Goyal 2020-03-03 14:58:21 -0500 656) }
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 657) EXPORT_SYMBOL_GPL(dax_layout_busy_page);
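
/*
 * A filesystem-side sketch (modelled loosely on the XFS break-layouts path;
 * wait_for_page_idle() is a hypothetical helper): hold locks that block new
 * mappings, then wait for any busy page to drop its extra references.
 *
 *	struct page *page = dax_layout_busy_page(inode->i_mapping);
 *	if (page)
 *		return wait_for_page_idle(page);	// then retry the scan
 */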
5fac7408d8287 (Dan Williams 2018-03-09 17:44:31 -0800 658)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 659) static int __dax_invalidate_entry(struct address_space *mapping,
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 660) pgoff_t index, bool trunc)
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 661) {
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 662) XA_STATE(xas, &mapping->i_pages, index);
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 663) int ret = 0;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 664) void *entry;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 665)
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 666) xas_lock_irq(&xas);
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 667) entry = get_unlocked_entry(&xas, 0);
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 668) if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 669) goto out;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 670) if (!trunc &&
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 671) (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 672) xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 673) goto out;
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 674) dax_disassociate_entry(entry, mapping, trunc);
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 675) xas_store(&xas, NULL);
7f0e07fb02895 (Matthew Wilcox (Oracle) 2021-05-04 18:32:51 -0700 676) mapping->nrpages -= 1UL << dax_entry_order(entry);
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 677) ret = 1;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 678) out:
237388320deff (Vivek Goyal 2021-04-28 15:03:14 -0400 679) put_unlocked_entry(&xas, entry, WAKE_ALL);
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 680) xas_unlock_irq(&xas);
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 681) return ret;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 682) }
07f2d89cc2709 (Matthew Wilcox 2018-03-28 15:40:41 -0400 683)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 684) /*
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 685) * Delete DAX entry at @index from @mapping. Wait for it
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 686) * to be unlocked before deleting it.
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 687) */
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 688) int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 689) {
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 690) int ret = __dax_invalidate_entry(mapping, index, true);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 691)
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 692) /*
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 693) * This gets called from the truncate / punch_hole path. As such, the caller
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 694) * must hold locks protecting against concurrent modifications of the
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 695) * page cache (usually the fs-private i_mmap_sem, held for writing). Since the
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 696) * caller has seen a DAX entry for this index, we had better find it
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 697) * at that index as well...
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 698) */
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 699) WARN_ON_ONCE(!ret);
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 700) return ret;
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 701) }
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 702)
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 703) /*
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 704) * Invalidate DAX entry if it is clean.
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 705) */
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 706) int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 707) pgoff_t index)
c6dcf52c23d2d (Jan Kara 2016-08-10 17:22:44 +0200 708) {
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 709) return __dax_invalidate_entry(mapping, index, false);
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 710) }
ac401cc782429 (Jan Kara 2016-05-12 18:29:18 +0200 711)
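/*
 * Example of the contract above: invalidate_inode_pages2_range() calls
 * dax_invalidate_mapping_entry_sync() for each DAX entry it finds and
 * treats a 0 return (the entry was kept because it is dirty or
 * towrite-tagged) as -EBUSY, while dax_delete_mapping_entry() is used on
 * the truncate path, where a clean removal (return 1) is the only
 * acceptable outcome.
 */
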
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 712) static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 713) sector_t sector, struct page *to, unsigned long vaddr)
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 714) {
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 715) void *vto, *kaddr;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 716) pgoff_t pgoff;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 717) long rc;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 718) int id;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 719)
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 720) rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 721) if (rc)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 722) return rc;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 723)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 724) id = dax_read_lock();
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 725) rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 726) if (rc < 0) {
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 727) dax_read_unlock(id);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 728) return rc;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 729) }
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 730) vto = kmap_atomic(to);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 731) copy_user_page(vto, (void __force *)kaddr, vaddr, to);
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 732) kunmap_atomic(vto);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 733) dax_read_unlock(id);
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 734) return 0;
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 735) }
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 736)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 737) /*
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 738) * By this point grab_mapping_entry() has ensured that we have a locked entry
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 739) * of the appropriate size so we don't have to worry about downgrading PMDs to
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 740) * PTEs. If we happen to be trying to insert a PTE and there is a PMD
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 741) * already in the tree, we will skip the insertion and just dirty the PMD as
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 742) * appropriate.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 743) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 744) static void *dax_insert_entry(struct xa_state *xas,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 745) struct address_space *mapping, struct vm_fault *vmf,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 746) void *entry, pfn_t pfn, unsigned long flags, bool dirty)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 747) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 748) void *new_entry = dax_make_entry(pfn, flags);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 749)
f5b7b74876cff (Jan Kara 2017-11-01 16:36:40 +0100 750) if (dirty)
d2b2a28e64048 (Dmitry Monakhov 2016-02-05 15:36:55 -0800 751) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 752)
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 753) if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 754) unsigned long index = xas->xa_index;
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 755) /* we are replacing a zero page with a block mapping */
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 756) if (dax_is_pmd_entry(entry))
977fbdcd5986c (Matthew Wilcox 2018-01-31 16:17:36 -0800 757) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 758) PG_PMD_NR, false);
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 759) else /* pte entry */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 760) unmap_mapping_pages(mapping, index, 1, false);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 761) }
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 762)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 763) xas_reset(xas);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 764) xas_lock_irq(xas);
1571c029a2ff2 (Jan Kara 2019-06-06 11:10:28 +0200 765) if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
1571c029a2ff2 (Jan Kara 2019-06-06 11:10:28 +0200 766) void *old;
1571c029a2ff2 (Jan Kara 2019-06-06 11:10:28 +0200 767)
d2c997c0f1453 (Dan Williams 2017-12-22 22:02:48 -0800 768) dax_disassociate_entry(entry, mapping, false);
73449daf8f0db (Dan Williams 2018-07-13 21:49:50 -0700 769) dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 770) /*
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 771) * Only swap our new entry into the page cache if the current
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 772) * entry is a zero page or an empty entry. If a normal PTE or
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 773) * PMD entry is already in the cache, we leave it alone. This
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 774) * means that if we are trying to insert a PTE and the
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 775) * existing entry is a PMD, we will just leave the PMD in the
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 776) * tree and dirty it if necessary.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 777) */
1571c029a2ff2 (Jan Kara 2019-06-06 11:10:28 +0200 778) old = dax_lock_entry(xas, new_entry);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 779) WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 780) DAX_LOCKED));
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 781) entry = new_entry;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 782) } else {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 783) xas_load(xas); /* Walk the xa_state */
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 784) }
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 785)
f5b7b74876cff (Jan Kara 2017-11-01 16:36:40 +0100 786) if (dirty)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 787) xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 788)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 789) xas_unlock_irq(xas);
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 790) return entry;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 791) }
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 792)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 793) static inline
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 794) unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 795) {
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 796) unsigned long address;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 797)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 798) address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 799) VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 800) return address;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 801) }
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 802)
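/*
 * Worked example for pgoff_address() with illustrative values: a VMA with
 * vm_start == 0x7f0000000000 and vm_pgoff == 16 maps file page 20 four
 * pages into itself, so with PAGE_SHIFT == 12:
 *
 *	address = 0x7f0000000000 + ((20 - 16) << 12) = 0x7f0000004000
 */
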
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 803) /* Walk all mappings of a given index of a file and write-protect them */
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 804) static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 805) unsigned long pfn)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 806) {
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 807) struct vm_area_struct *vma;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 808) pte_t pte, *ptep = NULL;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 809) pmd_t *pmdp = NULL;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 810) spinlock_t *ptl;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 811)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 812) i_mmap_lock_read(mapping);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 813) vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
ac46d4f3c4324 (Jérôme Glisse 2018-12-28 00:38:09 -0800 814) struct mmu_notifier_range range;
ac46d4f3c4324 (Jérôme Glisse 2018-12-28 00:38:09 -0800 815) unsigned long address;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 816)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 817) cond_resched();
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 818)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 819) if (!(vma->vm_flags & VM_SHARED))
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 820) continue;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 821)
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 822) address = pgoff_address(index, vma);
a4d1a88525138 (Jérôme Glisse 2017-08-31 17:17:26 -0400 823)
a4d1a88525138 (Jérôme Glisse 2017-08-31 17:17:26 -0400 824) /*
9fd6dad1261a5 (Paolo Bonzini 2021-02-05 05:07:11 -0500 825) * follow_invalidate_pte() will use the range to call
ff5c19ed4b087 (Christoph Hellwig 2020-12-15 20:47:23 -0800 826) * mmu_notifier_invalidate_range_start() on our behalf before
ff5c19ed4b087 (Christoph Hellwig 2020-12-15 20:47:23 -0800 827) * taking any lock.
a4d1a88525138 (Jérôme Glisse 2017-08-31 17:17:26 -0400 828) */
9fd6dad1261a5 (Paolo Bonzini 2021-02-05 05:07:11 -0500 829) if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
9fd6dad1261a5 (Paolo Bonzini 2021-02-05 05:07:11 -0500 830) &pmdp, &ptl))
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 831) continue;
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 832)
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 833) /*
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 834) * No need to call mmu_notifier_invalidate_range() as we are
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 835) * downgrading the page table protection, not changing it to
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 836) * point to a new page.
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 837) *
ad56b738c5dd2 (Mike Rapoport 2018-03-21 21:22:47 +0200 838) * See Documentation/vm/mmu_notifier.rst
0f10851ea475e (Jérôme Glisse 2017-11-15 17:34:07 -0800 839) */
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 840) if (pmdp) {
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 841) #ifdef CONFIG_FS_DAX_PMD
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 842) pmd_t pmd;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 843)
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 844) if (pfn != pmd_pfn(*pmdp))
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 845) goto unlock_pmd;
f6f3732162b5a (Linus Torvalds 2017-12-15 18:53:22 -0800 846) if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 847) goto unlock_pmd;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 848)
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 849) flush_cache_page(vma, address, pfn);
024eee0e83f0d (Aneesh Kumar K.V 2019-05-13 17:19:11 -0700 850) pmd = pmdp_invalidate(vma, address, pmdp);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 851) pmd = pmd_wrprotect(pmd);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 852) pmd = pmd_mkclean(pmd);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 853) set_pmd_at(vma->vm_mm, address, pmdp, pmd);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 854) unlock_pmd:
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 855) #endif
ee190ca6516bc (Jan H. Schönherr 2018-01-31 16:14:04 -0800 856) spin_unlock(ptl);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 857) } else {
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 858) if (pfn != pte_pfn(*ptep))
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 859) goto unlock_pte;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 860) if (!pte_dirty(*ptep) && !pte_write(*ptep))
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 861) goto unlock_pte;
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 862)
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 863) flush_cache_page(vma, address, pfn);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 864) pte = ptep_clear_flush(vma, address, ptep);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 865) pte = pte_wrprotect(pte);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 866) pte = pte_mkclean(pte);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 867) set_pte_at(vma->vm_mm, address, ptep, pte);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 868) unlock_pte:
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 869) pte_unmap_unlock(ptep, ptl);
f729c8c9b24f0 (Ross Zwisler 2017-01-10 16:57:24 -0800 870) }
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 871)
ac46d4f3c4324 (Jérôme Glisse 2018-12-28 00:38:09 -0800 872) mmu_notifier_invalidate_range_end(&range);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 873) }
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 874) i_mmap_unlock_read(mapping);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 875) }
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 876)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 877) static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 878) struct address_space *mapping, void *entry)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 879) {
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 880) unsigned long pfn, index, count;
3fe0791c295cf (Dan Williams 2017-10-14 17:13:45 -0700 881) long ret = 0;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 882)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 883) /*
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 884) * A page got tagged dirty in the DAX mapping? Something is seriously
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 885) * wrong.
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 886) */
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 887) if (WARN_ON(!xa_is_value(entry)))
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 888) return -EIO;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 889)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 890) if (unlikely(dax_is_locked(entry))) {
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 891) void *old_entry = entry;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 892)
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 893) entry = get_unlocked_entry(xas, 0);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 894)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 895) /* Entry got punched out / reallocated? */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 896) if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 897) goto put_unlocked;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 898) /*
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 899) * Entry got reallocated elsewhere? No need to write it back.
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 900) * We have to compare pfns as we must not bail out due to a
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 901) * difference in lock bit or entry type.
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 902) */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 903) if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 904) goto put_unlocked;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 905) if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 906) dax_is_zero_entry(entry))) {
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 907) ret = -EIO;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 908) goto put_unlocked;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 909) }
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 910)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 911) /* Another fsync thread may have already done this entry */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 912) if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 913) goto put_unlocked;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 914) }
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 915)
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 916) /* Lock the entry to serialize with page faults */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 917) dax_lock_entry(xas, entry);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 918)
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 919) /*
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 920) * We can clear the tag now but we have to be careful that concurrent
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 921) * dax_writeback_one() calls for the same index cannot finish before we
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 922) * actually flush the caches. This works because the calls look
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 923) * at the entry only under the i_pages lock, and once they do that
b93b016313b3b (Matthew Wilcox 2018-04-10 16:36:56 -0700 924) * they will see the entry locked and wait for it to unlock.
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 925) */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 926) xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 927) xas_unlock_irq(xas);
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 928)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 929) /*
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 930) * If dax_writeback_mapping_range() was given a wbc->range_start
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 931) * in the middle of a PMD, the 'index' we use needs to be
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 932) * aligned to the start of the PMD.
3fe0791c295cf (Dan Williams 2017-10-14 17:13:45 -0700 933) * This allows us to flush for PMD_SIZE and not have to worry about
3fe0791c295cf (Dan Williams 2017-10-14 17:13:45 -0700 934) * partial PMD writebacks.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 935) */
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 936) pfn = dax_to_pfn(entry);
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 937) count = 1UL << dax_entry_order(entry);
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 938) index = xas->xa_index & ~(count - 1);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 939)
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 940) dax_entry_mkclean(mapping, index, pfn);
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 941) dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 942) /*
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 943) * After we have flushed the cache, we can clear the dirty tag. There
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 944) * cannot be new dirty data in the pfn after the flush has completed as
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 945) * the pfn mappings are write-protected and the fault handler waits for
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 946) * the mapping entry lock.
4b4bb46d00b38 (Jan Kara 2016-12-14 15:07:53 -0800 947) */
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 948) xas_reset(xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 949) xas_lock_irq(xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 950) xas_store(xas, entry);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 951) xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
698ab77aebffe (Vivek Goyal 2021-04-28 15:03:12 -0400 952) dax_wake_entry(xas, entry, WAKE_NEXT);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 953)
e4b3448bc346f (Matthew Wilcox 2019-03-01 11:12:41 -0800 954) trace_dax_writeback_one(mapping->host, index, count);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 955) return ret;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 956)
a6abc2c0e77b1 (Jan Kara 2016-12-14 15:07:47 -0800 957) put_unlocked:
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 958) put_unlocked_entry(xas, entry, WAKE_NEXT);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 959) return ret;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 960) }
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 961)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 962) /*
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 963) * Flush the mapping to the persistent domain within the byte range of [start,
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 964) * end]. This is required by data integrity operations to ensure file data is
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 965) * on persistent storage prior to completion of the operation.
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 966) */
7f6d5b529b7df (Ross Zwisler 2016-02-26 15:19:55 -0800 967) int dax_writeback_mapping_range(struct address_space *mapping,
3f666c56c6b8c (Vivek Goyal 2020-01-03 13:33:07 -0500 968) struct dax_device *dax_dev, struct writeback_control *wbc)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 969) {
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 970) XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 971) struct inode *inode = mapping->host;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 972) pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 973) void *entry;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 974) int ret = 0;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 975) unsigned int scanned = 0;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 976)
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 977) if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 978) return -EIO;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 979)
7716506adac46 (Matthew Wilcox (Oracle) 2021-05-04 18:32:45 -0700 980) if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
7f6d5b529b7df (Ross Zwisler 2016-02-26 15:19:55 -0800 981) return 0;
7f6d5b529b7df (Ross Zwisler 2016-02-26 15:19:55 -0800 982)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 983) trace_dax_writeback_range(inode, xas.xa_index, end_index);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 984)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 985) tag_pages_for_writeback(mapping, xas.xa_index, end_index);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 986)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 987) xas_lock_irq(&xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 988) xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 989) ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 990) if (ret < 0) {
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 991) mapping_set_error(mapping, ret);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 992) break;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 993) }
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 994) if (++scanned % XA_CHECK_SCHED)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 995) continue;
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 996)
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 997) xas_pause(&xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 998) xas_unlock_irq(&xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 999) cond_resched();
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 1000) xas_lock_irq(&xas);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 1001) }
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 1002) xas_unlock_irq(&xas);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 1003) trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
9fc747f68d49f (Matthew Wilcox 2018-03-28 16:03:45 -0400 1004) return ret;
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 1005) }
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 1006) EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
9973c98ecfda3 (Ross Zwisler 2016-01-22 15:10:47 -0800 1007)
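/*
 * Sketch of a filesystem ->writepages implementation for DAX inodes,
 * modelled on ext4_dax_writepages().  example_daxdev() stands in for
 * however the filesystem looks up its dax_device and is an assumed
 * helper, not a real API.
 */
static int example_dax_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	return dax_writeback_mapping_range(mapping,
			example_daxdev(mapping->host->i_sb), wbc);
}
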
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1008) static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 1009) {
a3841f94c7ecb (Linus Torvalds 2017-11-17 09:51:57 -0800 1010) return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1011) }
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1012)
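/*
 * Example with illustrative values: an iomap with iomap->offset == 0x10000
 * and iomap->addr == 0x200000 maps pos == 0x11800 (4k pages) to the byte
 * address 0x200000 + 0x11000 - 0x10000 == 0x201000, i.e. 512-byte sector
 * 0x1008 after the shift by 9.
 */
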
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1013) static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1014) pfn_t *pfnp)
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 1015) {
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1016) const sector_t sector = dax_iomap_sector(iomap, pos);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1017) pgoff_t pgoff;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1018) int id, rc;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1019) long length;
f7ca90b160307 (Matthew Wilcox 2015-02-16 15:59:02 -0800 1020)
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1021) rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1022) if (rc)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1023) return rc;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1024) id = dax_read_lock();
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1025) length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
86ed913b0e826 (Huaisheng Ye 2018-07-30 15:15:48 +0800 1026) NULL, pfnp);
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1027) if (length < 0) {
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1028) rc = length;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1029) goto out;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1030) }
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1031) rc = -EINVAL;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1032) if (PFN_PHYS(length) < size)
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1033) goto out;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1034) if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1035) goto out;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1036) /* For larger pages we need devmap */
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1037) if (length > 1 && !pfn_t_devmap(*pfnp))
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1038) goto out;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1039) rc = 0;
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1040) out:
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1041) dax_read_unlock(id);
5e161e4066d3e (Jan Kara 2017-11-01 16:36:33 +0100 1042) return rc;
0e3b210ce1722 (Boaz Harrosh 2015-04-15 16:15:14 -0700 1043) }
0e3b210ce1722 (Boaz Harrosh 2015-04-15 16:15:14 -0700 1044)
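/*
 * Example of the checks above (x86-64, 4k base pages): for a PMD fault
 * size == PMD_SIZE == 2M, so PHYS_PFN(size) == 512.  dax_iomap_pfn() then
 * insists that the mapping covers at least 512 pages, that the pfn is
 * 512-page aligned, and that the pfn is devmap-backed; otherwise it
 * returns -EINVAL and the fault falls back to PTE granularity.
 */
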
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1045) /*
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1046) * The user has performed a load from a hole in the file. Allocating a new
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1047) * page in the file would cause excessive storage usage for workloads with
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1048) * sparse files. Instead we insert a read-only mapping of the 4k zero page.
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1049) * If this page is ever written to, we will re-fault and change the mapping to
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1050) * point to real DAX storage instead.
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1051) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1052) static vm_fault_t dax_load_hole(struct xa_state *xas,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1053) struct address_space *mapping, void **entry,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1054) struct vm_fault *vmf)
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1055) {
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1056) struct inode *inode = mapping->host;
91d25ba8a6b0d (Ross Zwisler 2017-09-06 16:18:43 -0700 1057) unsigned long vaddr = vmf->address;
b90ca5cc32f59 (Matthew Wilcox 2018-09-11 21:27:44 -0700 1058) pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
b90ca5cc32f59 (Matthew Wilcox 2018-09-11 21:27:44 -0700 1059) vm_fault_t ret;
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1060)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1061) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 1062) DAX_ZERO_PAGE, false);
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 1063)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1064) ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1065) trace_dax_load_hole(inode, vmf, ret);
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1066) return ret;
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1067) }
e30331ff05f68 (Ross Zwisler 2017-09-06 16:18:39 -0700 1068)
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1069) s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
679c8bd3b2942 (Christoph Hellwig 2016-05-09 10:47:04 +0200 1070) {
4f3b4f161d7a0 (Vivek Goyal 2020-02-28 11:34:56 -0500 1071) sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1072) pgoff_t pgoff;
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1073) long rc, id;
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1074) void *kaddr;
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1075) bool page_aligned = false;
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1076) unsigned offset = offset_in_page(pos);
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1077) unsigned size = min_t(u64, PAGE_SIZE - offset, length);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1078)
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1079) if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1080) (size == PAGE_SIZE))
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1081) page_aligned = true;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1082)
4f3b4f161d7a0 (Vivek Goyal 2020-02-28 11:34:56 -0500 1083) rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1084) if (rc)
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1085) return rc;
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1086)
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1087) id = dax_read_lock();
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1088)
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1089) if (page_aligned)
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1090) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1091) else
4f3b4f161d7a0 (Vivek Goyal 2020-02-28 11:34:56 -0500 1092) rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1093) if (rc < 0) {
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1094) dax_read_unlock(id);
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1095) return rc;
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1096) }
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1097)
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1098) if (!page_aligned) {
81f558701ae8d (Dan Williams 2017-05-29 13:12:20 -0700 1099) memset(kaddr + offset, 0, size);
4f3b4f161d7a0 (Vivek Goyal 2020-02-28 11:34:56 -0500 1100) dax_flush(iomap->dax_dev, kaddr + offset, size);
4b0228fa1d753 (Vishal Verma 2016-04-21 15:13:46 -0400 1101) }
0a23f9ffa5ac2 (Vivek Goyal 2020-02-28 11:34:55 -0500 1102) dax_read_unlock(id);
81ee8e52a71c7 (Matthew Wilcox (Oracle) 2020-09-21 08:58:42 -0700 1103) return size;
679c8bd3b2942 (Christoph Hellwig 2016-05-09 10:47:04 +0200 1104) }
679c8bd3b2942 (Christoph Hellwig 2016-05-09 10:47:04 +0200 1105)
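/*
 * Sketch of the caller's loop, modelled on the iomap zeroing code: a
 * positive return value is the number of bytes zeroed within the current
 * page, so the caller advances and repeats until the range is exhausted:
 *
 *	while (length > 0) {
 *		s64 bytes = dax_iomap_zero(pos, length, iomap);
 *
 *		if (bytes < 0)
 *			return bytes;
 *		pos += bytes;
 *		length -= bytes;
 *	}
 */
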
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1106) static loff_t
11c59c92f44d9 (Ross Zwisler 2016-11-08 11:32:46 +1100 1107) dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1108) struct iomap *iomap, struct iomap *srcmap)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1109) {
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1110) struct block_device *bdev = iomap->bdev;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1111) struct dax_device *dax_dev = iomap->dax_dev;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1112) struct iov_iter *iter = data;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1113) loff_t end = pos + length, done = 0;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1114) ssize_t ret = 0;
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1115) size_t xfer;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1116) int id;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1117)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1118) if (iov_iter_rw(iter) == READ) {
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1119) end = min(end, i_size_read(inode));
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1120) if (pos >= end)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1121) return 0;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1122)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1123) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1124) return iov_iter_zero(min(length, end - pos), iter);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1125) }
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1126)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1127) if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1128) return -EIO;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1129)
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1130) /*
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1131) * A write can allocate a block for an area which has a hole page mapped
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1132) * into the page tables. We have to tear down these mappings so that data
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1133) * written by write(2) is visible via mmap.
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1134) */
cd656375f9463 (Jan Kara 2017-05-12 15:46:50 -0700 1135) if (iomap->flags & IOMAP_F_NEW) {
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1136) invalidate_inode_pages2_range(inode->i_mapping,
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1137) pos >> PAGE_SHIFT,
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1138) (end - 1) >> PAGE_SHIFT);
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1139) }
e3fce68cdbed2 (Jan Kara 2016-08-10 17:10:28 +0200 1140)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1141) id = dax_read_lock();
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1142) while (pos < end) {
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1143) unsigned offset = pos & (PAGE_SIZE - 1);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1144) const size_t size = ALIGN(length + offset, PAGE_SIZE);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1145) const sector_t sector = dax_iomap_sector(iomap, pos);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1146) ssize_t map_len;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1147) pgoff_t pgoff;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1148) void *kaddr;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1149)
d1908f52557b3 (Michal Hocko 2017-02-03 13:13:26 -0800 1150) if (fatal_signal_pending(current)) {
d1908f52557b3 (Michal Hocko 2017-02-03 13:13:26 -0800 1151) ret = -EINTR;
d1908f52557b3 (Michal Hocko 2017-02-03 13:13:26 -0800 1152) break;
d1908f52557b3 (Michal Hocko 2017-02-03 13:13:26 -0800 1153) }
d1908f52557b3 (Michal Hocko 2017-02-03 13:13:26 -0800 1154)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1155) ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1156) if (ret)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1157) break;
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1158)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1159) map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
86ed913b0e826 (Huaisheng Ye 2018-07-30 15:15:48 +0800 1160) &kaddr, NULL);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1161) if (map_len < 0) {
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1162) ret = map_len;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1163) break;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1164) }
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1165)
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1166) map_len = PFN_PHYS(map_len);
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1167) kaddr += offset;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1168) map_len -= offset;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1169) if (map_len > end - pos)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1170) map_len = end - pos;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1171)
a2e050f5a9a9b (Ross Zwisler 2017-09-06 16:18:54 -0700 1172) /*
a2e050f5a9a9b (Ross Zwisler 2017-09-06 16:18:54 -0700 1173) * The userspace address for the memory copy has already been
a2e050f5a9a9b (Ross Zwisler 2017-09-06 16:18:54 -0700 1174) * validated via access_ok() in either vfs_read() or
a2e050f5a9a9b (Ross Zwisler 2017-09-06 16:18:54 -0700 1175) * vfs_write(), depending on which operation we are doing.
a2e050f5a9a9b (Ross Zwisler 2017-09-06 16:18:54 -0700 1176) */
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1177) if (iov_iter_rw(iter) == WRITE)
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1178) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
fec53774fd043 (Dan Williams 2017-05-29 21:56:49 -0700 1179) map_len, iter);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1180) else
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1181) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
b3a9a0c36e1f7 (Dan Williams 2018-05-02 06:46:33 -0700 1182) map_len, iter);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1183)
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1184) pos += xfer;
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1185) length -= xfer;
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1186) done += xfer;
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1187)
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1188) if (xfer == 0)
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1189) ret = -EFAULT;
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1190) if (xfer < map_len)
a77d478642f12 (Dan Williams 2018-03-16 17:36:44 -0700 1191) break;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1192) }
cccbce6715829 (Dan Williams 2017-01-27 13:31:42 -0800 1193) dax_read_unlock(id);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1194)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1195) return done ? done : ret;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1196) }
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1197)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1198) /**
11c59c92f44d9 (Ross Zwisler 2016-11-08 11:32:46 +1100 1199) * dax_iomap_rw - Perform I/O to a DAX file
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1200) * @iocb: The control block for this I/O
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1201) * @iter: The addresses to do I/O from or to
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1202) * @ops: iomap ops passed from the file system
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1203) *
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1204) * This function performs read and write operations to directly mapped
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1205) * persistent memory. The caller needs to take care of read/write exclusion
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1206) * and of evicting any page cache pages in the region under I/O.
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1207) */
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1208) ssize_t
11c59c92f44d9 (Ross Zwisler 2016-11-08 11:32:46 +1100 1209) dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
8ff6daa17b6a6 (Christoph Hellwig 2017-01-27 23:20:26 -0800 1210) const struct iomap_ops *ops)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1211) {
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1212) struct address_space *mapping = iocb->ki_filp->f_mapping;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1213) struct inode *inode = mapping->host;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1214) loff_t pos = iocb->ki_pos, ret = 0, done = 0;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1215) unsigned flags = 0;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1216)
168316db35832 (Christoph Hellwig 2017-02-08 14:43:13 -0500 1217) if (iov_iter_rw(iter) == WRITE) {
9ffbe8ac05dbb (Nikolay Borisov 2019-05-31 13:06:51 +0300 1218) lockdep_assert_held_write(&inode->i_rwsem);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1219) flags |= IOMAP_WRITE;
168316db35832 (Christoph Hellwig 2017-02-08 14:43:13 -0500 1220) } else {
168316db35832 (Christoph Hellwig 2017-02-08 14:43:13 -0500 1221) lockdep_assert_held(&inode->i_rwsem);
168316db35832 (Christoph Hellwig 2017-02-08 14:43:13 -0500 1222) }
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1223)
96222d53842df (Jeff Moyer 2020-02-05 14:15:58 -0500 1224) if (iocb->ki_flags & IOCB_NOWAIT)
96222d53842df (Jeff Moyer 2020-02-05 14:15:58 -0500 1225) flags |= IOMAP_NOWAIT;
96222d53842df (Jeff Moyer 2020-02-05 14:15:58 -0500 1226)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1227) while (iov_iter_count(iter)) {
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1228) ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
11c59c92f44d9 (Ross Zwisler 2016-11-08 11:32:46 +1100 1229) iter, dax_iomap_actor);
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1230) if (ret <= 0)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1231) break;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1232) pos += ret;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1233) done += ret;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1234) }
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1235)
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1236) iocb->ki_pos += done;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1237) return done ? done : ret;
a254e56812880 (Christoph Hellwig 2016-09-19 11:24:49 +1000 1238) }
11c59c92f44d9 (Ross Zwisler 2016-11-08 11:32:46 +1100 1239) EXPORT_SYMBOL_GPL(dax_iomap_rw);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1240)
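/*
 * Sketch of a filesystem ->read_iter built on dax_iomap_rw(), modelled on
 * ext4_dax_read_iter().  example_iomap_ops is an assumed iomap_ops
 * instance provided by the filesystem; real callers also handle
 * IOCB_NOWAIT with inode_trylock_shared().
 */
static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
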
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1241) static vm_fault_t dax_fault_return(int error)
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1242) {
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1243) if (error == 0)
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1244) return VM_FAULT_NOPAGE;
c9aed74e6a276 (Souptick Joarder 2019-01-05 00:54:11 +0530 1245) return vmf_error(error);
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1246) }
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1247)
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1248) /*
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1249) * MAP_SYNC on a dax mapping guarantees dirty metadata is
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1250) * flushed on write-faults (non-cow), but not read-faults.
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1251) */
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1252) static bool dax_fault_is_synchronous(unsigned long flags,
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1253) struct vm_area_struct *vma, struct iomap *iomap)
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1254) {
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1255) return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1256) && (iomap->flags & IOMAP_F_DIRTY);
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1257) }
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1258)
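/*
 * Example: a write fault on a mapping created with
 * mmap(..., MAP_SHARED_VALIDATE | MAP_SYNC, ...) into a freshly allocated
 * block makes the filesystem report IOMAP_F_DIRTY, so this returns true.
 * The fault handler then hands the pfn back via @pfnp and returns
 * VM_FAULT_NEEDDSYNC instead of installing a writeable PTE; the
 * filesystem flushes its metadata and completes the fault with
 * dax_finish_sync_fault().
 */
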
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1259) static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1260) int *iomap_errp, const struct iomap_ops *ops)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1261) {
a0987ad5c5766 (Jan Kara 2017-11-01 16:36:34 +0100 1262) struct vm_area_struct *vma = vmf->vma;
a0987ad5c5766 (Jan Kara 2017-11-01 16:36:34 +0100 1263) struct address_space *mapping = vma->vm_file->f_mapping;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1264) XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1265) struct inode *inode = mapping->host;
1a29d85eb0f19 (Jan Kara 2016-12-14 15:07:01 -0800 1266) unsigned long vaddr = vmf->address;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1267) loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1268) struct iomap iomap = { .type = IOMAP_HOLE };
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1269) struct iomap srcmap = { .type = IOMAP_HOLE };
9484ab1bf4464 (Jan Kara 2016-11-10 10:26:50 +1100 1270) unsigned flags = IOMAP_FAULT;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1271) int error, major = 0;
d2c43ef133274 (Jan Kara 2017-11-01 16:36:35 +0100 1272) bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1273) bool sync;
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1274) vm_fault_t ret = 0;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1275) void *entry;
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1276) pfn_t pfn;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1277)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1278) trace_dax_pte_fault(inode, vmf, ret);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1279) /*
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1280) * Check whether the offset is beyond the end of the file now. The caller
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1281) * is supposed to hold locks serializing us against truncate / punch hole,
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1282) * so this is a reliable test.
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1283) */
a9c42b33ed809 (Ross Zwisler 2017-05-08 16:00:00 -0700 1284) if (pos >= i_size_read(inode)) {
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1285) ret = VM_FAULT_SIGBUS;
a9c42b33ed809 (Ross Zwisler 2017-05-08 16:00:00 -0700 1286) goto out;
a9c42b33ed809 (Ross Zwisler 2017-05-08 16:00:00 -0700 1287) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1288)
d2c43ef133274 (Jan Kara 2017-11-01 16:36:35 +0100 1289) if (write && !vmf->cow_page)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1290) flags |= IOMAP_WRITE;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1291)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1292) entry = grab_mapping_entry(&xas, mapping, 0);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1293) if (xa_is_internal(entry)) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1294) ret = xa_to_internal(entry);
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1295) goto out;
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1296) }
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1297)
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1298) /*
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1299) * It is possible, particularly with mixed reads & writes to private
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1300) * mappings, that we have raced with a PMD fault that overlaps with
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1301) * the PTE we need to set up. If so, just return and the fault will be
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1302) * retried.
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1303) */
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1304) if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1305) ret = VM_FAULT_NOPAGE;
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1306) goto unlock_entry;
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1307) }
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1308)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1309) /*
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1310) * Note that we don't bother to use iomap_apply here: DAX requires
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1311) * the file system block size to be equal to the page size, which means
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1312) * that we never have to deal with more than a single extent here.
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1313) */
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1314) error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1315) if (iomap_errp)
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1316) *iomap_errp = error;
a9c42b33ed809 (Ross Zwisler 2017-05-08 16:00:00 -0700 1317) if (error) {
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1318) ret = dax_fault_return(error);
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1319) goto unlock_entry;
a9c42b33ed809 (Ross Zwisler 2017-05-08 16:00:00 -0700 1320) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1321) if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1322) error = -EIO; /* fs corruption? */
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1323) goto error_finish_iomap;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1324) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1325)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1326) if (vmf->cow_page) {
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1327) sector_t sector = dax_iomap_sector(&iomap, pos);
31a6f1a6e5a4a (Jan Kara 2017-11-01 16:36:32 +0100 1328)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1329) switch (iomap.type) {
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1330) case IOMAP_HOLE:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1331) case IOMAP_UNWRITTEN:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1332) clear_user_highpage(vmf->cow_page, vaddr);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1333) break;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1334) case IOMAP_MAPPED:
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 1335) error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
c7fe193f1877e (Ira Weiny 2020-07-17 00:20:49 -0700 1336) sector, vmf->cow_page, vaddr);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1337) break;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1338) default:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1339) WARN_ON_ONCE(1);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1340) error = -EIO;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1341) break;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1342) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1343)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1344) if (error)
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1345) goto error_finish_iomap;
b1aa812b21084 (Jan Kara 2016-12-14 15:07:24 -0800 1346)
b1aa812b21084 (Jan Kara 2016-12-14 15:07:24 -0800 1347) __SetPageUptodate(vmf->cow_page);
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1348) ret = finish_fault(vmf);
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1349) if (!ret)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1350) ret = VM_FAULT_DONE_COW;
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1351) goto finish_iomap;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1352) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1353)
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1354) sync = dax_fault_is_synchronous(flags, vma, &iomap);
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1355)
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1356) switch (iomap.type) {
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1357) case IOMAP_MAPPED:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1358) if (iomap.flags & IOMAP_F_NEW) {
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1359) count_vm_event(PGMAJFAULT);
a0987ad5c5766 (Jan Kara 2017-11-01 16:36:34 +0100 1360) count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1361) major = VM_FAULT_MAJOR;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1362) }
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1363) error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1364) if (error < 0)
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1365) goto error_finish_iomap;
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1366)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1367) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1368) 0, write && !sync);
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1369)
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1370) /*
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1371) 	 * If we are doing a synchronous page fault and the inode needs fsync,
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1372) 	 * we can insert the PTE into the page tables only after that happens.
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1373) 	 * Skip the insertion for now and return the pfn so that the caller
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1374) 	 * can insert it after fsync is done.
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1375) */
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1376) if (sync) {
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1377) if (WARN_ON_ONCE(!pfnp)) {
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1378) error = -EIO;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1379) goto error_finish_iomap;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1380) }
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1381) *pfnp = pfn;
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1382) ret = VM_FAULT_NEEDDSYNC | major;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1383) goto finish_iomap;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1384) }
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1385) trace_dax_insert_mapping(inode, vmf, entry);
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1386) if (write)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1387) ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1388) else
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1389) ret = vmf_insert_mixed(vma, vaddr, pfn);
1b5a1cb21e0cd (Jan Kara 2017-11-01 16:36:36 +0100 1390)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1391) goto finish_iomap;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1392) case IOMAP_UNWRITTEN:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1393) case IOMAP_HOLE:
d2c43ef133274 (Jan Kara 2017-11-01 16:36:35 +0100 1394) if (!write) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1395) ret = dax_load_hole(&xas, mapping, &entry, vmf);
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1396) goto finish_iomap;
1550290b08012 (Ross Zwisler 2016-11-08 11:33:26 +1100 1397) }
df561f6688fef (Gustavo A. R. Silva 2020-08-23 17:36:59 -0500 1398) fallthrough;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1399) default:
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1400) WARN_ON_ONCE(1);
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1401) error = -EIO;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1402) break;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1403) }
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1404)
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1405) error_finish_iomap:
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1406) ret = dax_fault_return(error);
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1407) finish_iomap:
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1408) if (ops->iomap_end) {
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1409) int copied = PAGE_SIZE;
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1410)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1411) if (ret & VM_FAULT_ERROR)
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1412) copied = 0;
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1413) /*
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1414) 	 * The fault is done by now and there's no way back (another
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1415) 	 * thread may already be happily using the PTE we have installed).
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1416) 	 * Just ignore any error from ->iomap_end since we cannot do much
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1417) 	 * with it.
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1418) */
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1419) ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1550290b08012 (Ross Zwisler 2016-11-08 11:33:26 +1100 1420) }
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1421) unlock_entry:
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1422) dax_unlock_entry(&xas, entry);
13e451fdc1af0 (Jan Kara 2017-05-12 15:46:57 -0700 1423) out:
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1424) trace_dax_pte_fault_done(inode, vmf, ret);
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1425) return ret | major;
a7d73fe6c538f (Christoph Hellwig 2016-09-19 11:24:50 +1000 1426) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1427)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1428) #ifdef CONFIG_FS_DAX_PMD
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1429) static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1430) struct iomap *iomap, void **entry)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1431) {
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1432) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1433) unsigned long pmd_addr = vmf->address & PMD_MASK;
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1434) struct vm_area_struct *vma = vmf->vma;
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1435) struct inode *inode = mapping->host;
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1436) pgtable_t pgtable = NULL;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1437) struct page *zero_page;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1438) spinlock_t *ptl;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1439) pmd_t pmd_entry;
3fe0791c295cf (Dan Williams 2017-10-14 17:13:45 -0700 1440) pfn_t pfn;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1441)
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1442) zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1443)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1444) if (unlikely(!zero_page))
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1445) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1446)
3fe0791c295cf (Dan Williams 2017-10-14 17:13:45 -0700 1447) pfn = page_to_pfn_t(zero_page);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1448) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 1449) DAX_PMD | DAX_ZERO_PAGE, false);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1450)
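	/*
	 * Some architectures (e.g. ppc64 with the hash MMU) need a page
	 * table deposited even for a huge zero-page mapping, so that a
	 * later split of this PMD has a pgtable to withdraw.
	 */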
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1451) if (arch_needs_pgtable_deposit()) {
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1452) pgtable = pte_alloc_one(vma->vm_mm);
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1453) if (!pgtable)
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1454) return VM_FAULT_OOM;
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1455) }
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1456)
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1457) ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1458) if (!pmd_none(*(vmf->pmd))) {
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1459) spin_unlock(ptl);
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1460) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1461) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1462)
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1463) if (pgtable) {
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1464) pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1465) mm_inc_nr_ptes(vma->vm_mm);
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1466) }
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1467) pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1468) pmd_entry = pmd_mkhuge(pmd_entry);
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1469) set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1470) spin_unlock(ptl);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1471) trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1472) return VM_FAULT_NOPAGE;
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1473)
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1474) fallback:
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1475) if (pgtable)
11cf9d863dcb5 (Aneesh Kumar K.V 2019-03-09 17:37:21 +0530 1476) pte_free(vma->vm_mm, pgtable);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1477) trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
653b2ea3396fd (Ross Zwisler 2017-02-22 15:39:57 -0800 1478) return VM_FAULT_FALLBACK;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1479) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1480)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1481) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1482) const struct iomap_ops *ops)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1483) {
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1484) struct vm_area_struct *vma = vmf->vma;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1485) struct address_space *mapping = vma->vm_file->f_mapping;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1486) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
d8a849e1bc123 (Dave Jiang 2017-02-22 15:40:03 -0800 1487) unsigned long pmd_addr = vmf->address & PMD_MASK;
d8a849e1bc123 (Dave Jiang 2017-02-22 15:40:03 -0800 1488) bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1489) bool sync;
9484ab1bf4464 (Jan Kara 2016-11-10 10:26:50 +1100 1490) unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1491) struct inode *inode = mapping->host;
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1492) vm_fault_t result = VM_FAULT_FALLBACK;
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1493) struct iomap iomap = { .type = IOMAP_HOLE };
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1494) struct iomap srcmap = { .type = IOMAP_HOLE };
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1495) pgoff_t max_pgoff;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1496) void *entry;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1497) loff_t pos;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1498) int error;
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1499) pfn_t pfn;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1500)
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1501) /*
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1502) 	 * Check now whether the offset is beyond the end of the file. The
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1503) 	 * caller is supposed to hold locks serializing us with truncate /
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1504) 	 * punch hole, so this is a reliable test.
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1505) */
957ac8c421ad8 (Jeff Moyer 2017-11-14 20:37:27 -0500 1506) max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1507)
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1508) trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1509)
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1510) /*
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1511) * Make sure that the faulting address's PMD offset (color) matches
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1512) * the PMD offset from the start of the file. This is necessary so
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1513) * that a PMD range in the page table overlaps exactly with a PMD
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 1514) * range in the page cache.
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1515) */
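	/*
	 * For example, with 4k pages and 2M PMDs (PG_PMD_COLOUR == 511 in
	 * that configuration), a fault at an address 1M into its PMD can
	 * only be served by a PMD entry if the faulting file offset is
	 * also 1M into a 2M-aligned range of the file.
	 */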
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1516) if ((vmf->pgoff & PG_PMD_COLOUR) !=
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1517) ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1518) goto fallback;
fffa281b48a91 (Ross Zwisler 2017-08-25 15:55:36 -0700 1519)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1520) /* Fall back to PTEs if we're going to COW */
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1521) if (write && !(vma->vm_flags & VM_SHARED))
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1522) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1523)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1524) /* If the PMD would extend outside the VMA */
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1525) if (pmd_addr < vma->vm_start)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1526) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1527) if ((pmd_addr + PMD_SIZE) > vma->vm_end)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1528) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1529)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1530) if (xas.xa_index >= max_pgoff) {
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1531) result = VM_FAULT_SIGBUS;
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1532) goto out;
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1533) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1534)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1535) /* If the PMD would extend beyond the file size */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1536) if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1537) goto fallback;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1538)
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1539) /*
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1540) * grab_mapping_entry() will make sure we get an empty PMD entry,
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1541) * a zero PMD entry or a DAX PMD. If it can't (because a PTE
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1542) 	 * entry is already in the array, for instance), it will return an
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1543) 	 * internal entry encoding VM_FAULT_FALLBACK.
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1544) */
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 1545) entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1546) if (xa_is_internal(entry)) {
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1547) result = xa_to_internal(entry);
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1548) goto fallback;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1549) }
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1550)
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1551) /*
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1552) 	 * It is possible, particularly with mixed reads & writes to private
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1553) 	 * mappings, that we have raced with a PTE fault that overlaps with
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1554) 	 * the PMD we need to set up.  If so, just return and the fault will
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1555) 	 * be retried.
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1556) */
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1557) if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1558) !pmd_devmap(*vmf->pmd)) {
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1559) result = 0;
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1560) goto unlock_entry;
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1561) }
e2093926a098a (Ross Zwisler 2017-06-02 14:46:37 -0700 1562)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1563) /*
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1564) * Note that we don't use iomap_apply here. We aren't doing I/O, only
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1565) * setting up a mapping, so really we're using iomap_begin() as a way
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1566) * to look up our filesystem block.
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1567) */
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1568) pos = (loff_t)xas.xa_index << PAGE_SHIFT;
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1569) error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
c039b99792726 (Goldwyn Rodrigues 2019-10-18 16:44:10 -0700 1570) &srcmap);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1571) if (error)
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1572) goto unlock_entry;
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1573)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1574) if (iomap.offset + iomap.length < pos + PMD_SIZE)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1575) goto finish_iomap;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1576)
aaa422c4c3f6e (Dan Williams 2017-11-13 16:38:44 -0800 1577) sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1578)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1579) switch (iomap.type) {
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1580) case IOMAP_MAPPED:
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1581) error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1582) if (error < 0)
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1583) goto finish_iomap;
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1584)
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1585) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
3159f943aafdb (Matthew Wilcox 2017-11-03 13:30:42 -0400 1586) DAX_PMD, write && !sync);
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1587)
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1588) /*
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1589) 	 * If we are doing a synchronous page fault and the inode needs fsync,
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1590) 	 * we can insert the PMD into the page tables only after that happens.
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1591) 	 * Skip the insertion for now and return the pfn so that the caller
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1592) 	 * can insert it after fsync is done.
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1593) */
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1594) if (sync) {
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1595) if (WARN_ON_ONCE(!pfnp))
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1596) goto finish_iomap;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1597) *pfnp = pfn;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1598) result = VM_FAULT_NEEDDSYNC;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1599) goto finish_iomap;
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1600) }
caa51d26f85c2 (Jan Kara 2017-11-01 16:36:42 +0100 1601)
302a5e312b3a1 (Jan Kara 2017-11-01 16:36:37 +0100 1602) trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
fce86ff5802ba (Dan Williams 2019-05-13 17:15:33 -0700 1603) result = vmf_insert_pfn_pmd(vmf, pfn, write);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1604) break;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1605) case IOMAP_UNWRITTEN:
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1606) case IOMAP_HOLE:
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1607) if (WARN_ON_ONCE(write))
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1608) break;
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1609) result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1610) break;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1611) default:
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1612) WARN_ON_ONCE(1);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1613) break;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1614) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1615)
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1616) finish_iomap:
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1617) if (ops->iomap_end) {
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1618) int copied = PMD_SIZE;
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1619)
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1620) if (result == VM_FAULT_FALLBACK)
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1621) copied = 0;
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1622) /*
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1623) 	 * The fault is done by now and there's no way back (another
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1624) 	 * thread may already be happily using the PMD we have installed).
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1625) 	 * Just ignore any error from ->iomap_end since we cannot do much
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1626) 	 * with it.
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1627) */
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1628) ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
9f141d6ef6258 (Jan Kara 2016-10-19 14:34:31 +0200 1629) &iomap);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1630) }
876f29460cbd4 (Ross Zwisler 2017-05-12 15:47:00 -0700 1631) unlock_entry:
b15cd800682fc (Matthew Wilcox 2018-03-29 22:58:27 -0400 1632) dax_unlock_entry(&xas, entry);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1633) fallback:
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1634) if (result == VM_FAULT_FALLBACK) {
d8a849e1bc123 (Dave Jiang 2017-02-22 15:40:03 -0800 1635) split_huge_pmd(vma, vmf->pmd, vmf->address);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1636) count_vm_event(THP_FAULT_FALLBACK);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1637) }
282a8e0391c37 (Ross Zwisler 2017-02-22 15:39:50 -0800 1638) out:
f42003917b456 (Dave Jiang 2017-02-22 15:40:06 -0800 1639) trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1640) return result;
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1641) }
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1642) #else
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1643) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
01cddfe99008d (Arnd Bergmann 2017-02-27 14:26:44 -0800 1644) const struct iomap_ops *ops)
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1645) {
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1646) return VM_FAULT_FALLBACK;
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1647) }
642261ac995e0 (Ross Zwisler 2016-11-08 11:34:45 +1100 1648) #endif /* CONFIG_FS_DAX_PMD */
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1649)
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1650) /**
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1651) * dax_iomap_fault - handle a page fault on a DAX file
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1652) * @vmf: The description of the fault
cec04e8c825ea (Jan Kara 2017-11-01 16:36:38 +0100 1653) * @pe_size: Size of the page to fault in
9a0dd42251439 (Jan Kara 2017-11-01 16:36:39 +0100 1654) * @pfnp: PFN to insert for synchronous faults if fsync is required
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1655) * @iomap_errp: Storage for detailed error code in case of error
cec04e8c825ea (Jan Kara 2017-11-01 16:36:38 +0100 1656) * @ops: Iomap ops passed from the file system
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1657) *
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1658) * When a page fault occurs, filesystems may call this helper in
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1659) * their fault handler for DAX files. dax_iomap_fault() assumes the caller
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1660)  * has done all the necessary locking for the page fault to proceed
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1661) * successfully.
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1662) */
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1663) vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1664) pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1665) {
c791ace1e7473 (Dave Jiang 2017-02-24 14:57:08 -0800 1666) switch (pe_size) {
c791ace1e7473 (Dave Jiang 2017-02-24 14:57:08 -0800 1667) case PE_SIZE_PTE:
c0b2462597928 (Jan Kara 2018-01-07 16:38:43 -0500 1668) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
c791ace1e7473 (Dave Jiang 2017-02-24 14:57:08 -0800 1669) case PE_SIZE_PMD:
9a0dd42251439 (Jan Kara 2017-11-01 16:36:39 +0100 1670) return dax_iomap_pmd_fault(vmf, pfnp, ops);
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1671) default:
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1672) return VM_FAULT_FALLBACK;
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1673) }
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1674) }
a2d581675d485 (Dave Jiang 2017-02-24 14:56:59 -0800 1675) EXPORT_SYMBOL_GPL(dax_iomap_fault);
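
/*
 * A minimal, illustrative sketch of how a filesystem might wire this
 * helper into its ->fault / ->huge_fault handlers, loosely modelled on
 * the ext4 and XFS callers.  example_iomap_ops and example_dax_sem are
 * hypothetical stand-ins for the filesystem's own iomap ops and for its
 * lock serializing faults against truncate / hole punch.
 */
#if 0	/* example only, not compiled */
static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	vm_fault_t ret;
	pfn_t pfn;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}
	/* Serialize the fault against truncate / hole punch. */
	down_read(&example_dax_sem);
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_iomap_ops);
	up_read(&example_dax_sem);
	if (write)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
#endif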
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1676)
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 1677) /*
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1678) * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1679) * @vmf: The description of the fault
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1680) * @pfn: PFN to insert
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1681) * @order: Order of entry to insert.
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1682) *
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 1683)  * This function inserts a writeable PTE or PMD entry into the page tables
a77d19f46a37c (Matthew Wilcox 2018-03-27 13:39:38 -0400 1684)  * for an mmapped DAX file.  It also marks the page cache entry as dirty.
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1685) */
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1686) static vm_fault_t
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1687) dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1688) {
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1689) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1690) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1691) void *entry;
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1692) vm_fault_t ret;
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1693)
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1694) xas_lock_irq(&xas);
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 1695) entry = get_unlocked_entry(&xas, order);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1696) 	/* Did we race with someone splitting or removing the entry? */
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 1697) if (!entry || dax_is_conflict(entry) ||
23c84eb783751 (Matthew Wilcox (Oracle) 2019-07-03 23:21:25 -0400 1698) (order == 0 && !dax_is_pte_entry(entry))) {
4c3d043d271d4 (Vivek Goyal 2021-04-28 15:03:13 -0400 1699) put_unlocked_entry(&xas, entry, WAKE_NEXT);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1700) xas_unlock_irq(&xas);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1701) trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1702) VM_FAULT_NOPAGE);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1703) return VM_FAULT_NOPAGE;
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1704) }
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1705) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1706) dax_lock_entry(&xas, entry);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1707) xas_unlock_irq(&xas);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1708) if (order == 0)
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1709) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1710) #ifdef CONFIG_FS_DAX_PMD
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1711) else if (order == PMD_ORDER)
fce86ff5802ba (Dan Williams 2019-05-13 17:15:33 -0700 1712) ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1713) #endif
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1714) else
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1715) ret = VM_FAULT_FALLBACK;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1716) dax_unlock_entry(&xas, entry);
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1717) trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1718) return ret;
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1719) }
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1720)
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1721) /**
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1722) * dax_finish_sync_fault - finish synchronous page fault
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1723) * @vmf: The description of the fault
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1724) * @pe_size: Size of entry to be inserted
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1725) * @pfn: PFN to insert
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1726) *
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1727) * This function ensures that the file range touched by the page fault is
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1728)  * stored persistently on the media and handles inserting the appropriate
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1729)  * page table entry.
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1730) */
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1731) vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
ab77dab46210b (Souptick Joarder 2018-06-07 17:04:29 -0700 1732) enum page_entry_size pe_size, pfn_t pfn)
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1733) {
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1734) int err;
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1735) loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1736) unsigned int order = pe_order(pe_size);
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1737) size_t len = PAGE_SIZE << order;
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1738)
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1739) err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1740) if (err)
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1741) return VM_FAULT_SIGBUS;
cfc93c6c6c963 (Matthew Wilcox 2018-03-28 11:48:03 -0400 1742) return dax_insert_pfn_mkwrite(vmf, pfn, order);
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1743) }
71eab6dfd91ea (Jan Kara 2017-11-01 16:36:43 +0100 1744) EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
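
/*
 * Illustrative sketch of the synchronous fault protocol from the
 * filesystem side (hypothetical caller code; example_iomap_ops as
 * above): a write fault that came back with VM_FAULT_NEEDDSYNC is
 * completed by calling dax_finish_sync_fault() once the filesystem has
 * committed its own transaction for the allocating write.
 */
#if 0	/* example only, not compiled */
static vm_fault_t example_dax_write_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	vm_fault_t ret;
	pfn_t pfn;

	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC) {
		/*
		 * dax_finish_sync_fault() fsyncs the faulted range and
		 * only then maps the pfn saved by dax_iomap_fault().
		 */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	}
	return ret;
}
#endif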