// SPDX-License-Identifier: GPL-2.0
/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>

/*
 * Scheduling domains: the device is divided into multiple domains based on the
 * request type.
 */
enum {
	KYBER_READ,
	KYBER_WRITE,
	KYBER_DISCARD,
	KYBER_OTHER,
	KYBER_NUM_DOMAINS,
};
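
/*
 * Note that the enum above is order-sensitive: KYBER_OTHER doubles as the
 * number of domains with latency targets (READ, WRITE, and DISCARD), which is
 * why the latency-tracking arrays below are sized with [KYBER_OTHER].
 */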

static const char *kyber_domain_names[] = {
	[KYBER_READ] = "READ",
	[KYBER_WRITE] = "WRITE",
	[KYBER_DISCARD] = "DISCARD",
	[KYBER_OTHER] = "OTHER",
};

enum {
	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Maximum device-wide depth for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate the
 * device with only a fraction of the maximum possible queue depth. So, we cap
 * these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_WRITE] = 128,
	[KYBER_DISCARD] = 64,
	[KYBER_OTHER] = 16,
};

/*
 * Default latency targets for each scheduling domain.
 */
static const u64 kyber_latency_targets[] = {
	[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
	[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
	[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};

/*
 * Batch size (number of requests we'll dispatch in a row) for each scheduling
 * domain.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_WRITE] = 8,
	[KYBER_DISCARD] = 1,
	[KYBER_OTHER] = 1,
};

/*
 * Request latencies are recorded in a histogram with buckets defined relative
 * to the target latency:
 *
 * <= 1/4 * target latency
 * <= 1/2 * target latency
 * <= 3/4 * target latency
 * <= target latency
 * <= 1 1/4 * target latency
 * <= 1 1/2 * target latency
 * <= 1 3/4 * target latency
 * > 1 3/4 * target latency
 */
enum {
	/*
	 * The width of the latency histogram buckets is
	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
	 */
	KYBER_LATENCY_SHIFT = 2,
	/*
	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
	 * thus, "good".
	 */
	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
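
/*
 * For example, with the default 2 ms read target each bucket is 500 us wide:
 * buckets 0-3 ("good") cover latencies up to 2 ms, and buckets 4-7 ("bad")
 * cover 2 ms up to and beyond 3.5 ms.
 */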

/*
 * We measure both the total latency and the I/O latency (i.e., latency after
 * submitting to the device).
 */
enum {
	KYBER_TOTAL_LATENCY,
	KYBER_IO_LATENCY,
};

static const char *kyber_latency_type_names[] = {
	[KYBER_TOTAL_LATENCY] = "total",
	[KYBER_IO_LATENCY] = "I/O",
};

/*
 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
 * domain except for KYBER_OTHER.
 */
struct kyber_cpu_latency {
	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};

/*
 * There is a one-to-one mapping between ctx & hctx and between kcq & khd, so
 * we use request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merging.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
	struct request_queue *q;

	/*
	 * Each scheduling domain has a limited number of in-flight requests
	 * device-wide, enforced by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	struct kyber_cpu_latency __percpu *cpu_latency;

	/* Timer for stats aggregation and adjusting domain tokens. */
	struct timer_list timer;

	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];

	unsigned long latency_timeout[KYBER_OTHER];

	int domain_p99[KYBER_OTHER];

	/* Target latencies in nanoseconds. */
	u64 latency_targets[KYBER_OTHER];
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
	switch (op & REQ_OP_MASK) {
	case REQ_OP_READ:
		return KYBER_READ;
	case REQ_OP_WRITE:
		return KYBER_WRITE;
	case REQ_OP_DISCARD:
		return KYBER_DISCARD;
	default:
		return KYBER_OTHER;
	}
}

static void flush_latency_buckets(struct kyber_queue_data *kqd,
				  struct kyber_cpu_latency *cpu_latency,
				  unsigned int sched_domain, unsigned int type)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
	unsigned int bucket;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}

/*
 * Calculate the histogram bucket with the given percentile rank, or -1 if there
 * aren't enough samples yet.
 */
static int calculate_percentile(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int type,
				unsigned int percentile)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];

	if (!samples)
		return -1;

	/*
	 * We do the calculation once we have 500 samples or one second passes
	 * since the first sample was recorded, whichever comes first.
	 */
	if (!kqd->latency_timeout[sched_domain])
		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
	if (samples < 500 &&
	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
		return -1;
	}
	kqd->latency_timeout[sched_domain] = 0;

	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
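	/*
	 * Walk the buckets from fastest to slowest, subtracting each bucket's
	 * count until we reach the bucket that contains the percentile sample.
	 * E.g., for 500 samples at p90, we stop in the bucket holding the
	 * 450th-fastest sample.
	 */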
	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));

	trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
			    kyber_latency_type_names[type], percentile,
			    bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);

	return bucket;
}

static void kyber_resize_domain(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int depth)
{
	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
		trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
				   depth);
	}
}

static void kyber_timer_fn(struct timer_list *t)
{
	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
	unsigned int sched_domain;
	int cpu;
	bool bad = false;

	/* Sum all of the per-cpu latency histograms. */
	for_each_online_cpu(cpu) {
		struct kyber_cpu_latency *cpu_latency;

		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_TOTAL_LATENCY);
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_IO_LATENCY);
		}
	}

	/*
	 * Check if any domains have a high I/O latency, which might indicate
	 * congestion in the device. Note that we use the p90; we don't want to
	 * be too sensitive to outliers here.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		int p90;

		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
					   90);
		if (p90 >= KYBER_GOOD_BUCKETS)
			bad = true;
	}

	/*
	 * Adjust the scheduling domain depths. If we determined that there was
	 * congestion, we throttle all domains with good latencies. Either way,
	 * we ease up on throttling domains with bad latencies.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		unsigned int orig_depth, depth;
		int p99;

		p99 = calculate_percentile(kqd, sched_domain,
					   KYBER_TOTAL_LATENCY, 99);
		/*
		 * This is kind of subtle: different domains will not
		 * necessarily have enough samples to calculate the latency
		 * percentiles during the same window, so we have to remember
		 * the p99 for the next time we observe congestion; once we do,
		 * we don't want to throttle again until we get more data, so we
		 * reset it to -1.
		 */
		if (bad) {
			if (p99 < 0)
				p99 = kqd->domain_p99[sched_domain];
			kqd->domain_p99[sched_domain] = -1;
		} else if (p99 >= 0) {
			kqd->domain_p99[sched_domain] = p99;
		}
		if (p99 < 0)
			continue;

		/*
		 * If this domain has bad latency, throttle less. Otherwise,
		 * throttle more iff we determined that there is congestion.
		 *
		 * The new depth is scaled linearly with the p99 latency vs the
		 * latency target. E.g., if the p99 is 3/4 of the target, then
		 * we throttle down to 3/4 of the current depth, and if the p99
		 * is 2x the target, then we double the depth.
		 */
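		/*
		 * In bucket terms: p99 is an index in [0, 7], and bucket i
		 * corresponds to a latency of roughly (i + 1) / 4 of the
		 * target, so (p99 + 1) >> KYBER_LATENCY_SHIFT applies that
		 * latency-to-target ratio to the depth. E.g., p99 == 1
		 * (<= 1/2 of the target) halves the depth, and p99 == 7
		 * (> 7/4 of the target) doubles it.
		 */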
		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
			kyber_resize_domain(kqd, sched_domain, depth);
		}
	}
}

static unsigned int kyber_sched_tags_shift(struct request_queue *q)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;

	kqd->q = q;

	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
					    GFP_KERNEL | __GFP_ZERO);
	if (!kqd->cpu_latency)
		goto err_kqd;

	timer_setup(&kqd->timer, kyber_timer_fn, 0);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      kyber_depth[i], -1, false,
					      GFP_KERNEL, q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_buckets;
		}
	}

	for (i = 0; i < KYBER_OTHER; i++) {
		kqd->domain_p99[i] = -1;
		kqd->latency_targets[i] = kyber_latency_targets[i];
	}

	shift = kyber_sched_tags_shift(q);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
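	/*
	 * For example, with a per-word shift of 6 (64 bits per sbitmap word),
	 * async_depth is 64 * 75 / 100 = 48, so asynchronous requests can take
	 * at most 48 of the 64 tags in each sbitmap word.
	 */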

	return kqd;

err_buckets:
	free_percpu(kqd->cpu_latency);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	blk_stat_enable_accounting(q);

	eq->elevator_data = kqd;
	q->elevator = eq;

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	int i;

	del_timer_sync(&kqd->timer);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	free_percpu(kqd->cpu_latency);
	kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		khd->domain_wait[i].sbq = NULL;
		init_waitqueue_func_entry(&khd->domain_wait[i].wait,
					  kyber_domain_wake);
		khd->domain_wait[i].wait.private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
					kqd->async_depth);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}

static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
			    unsigned int nr_segs)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
	spin_unlock(&kcq->lock);

	return merged;
}

static void kyber_prepare_request(struct request *rq)
{
	rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list, bool at_head)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		if (at_head)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw[hctx->type]);
		blk_mq_sched_request_inserted(rq);
		spin_unlock(&kcq->lock);
	}
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
			       unsigned int sched_domain, unsigned int type,
			       u64 target, u64 latency)
{
	unsigned int bucket;
	u64 divisor;

	if (latency > 0) {
		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
			       KYBER_LATENCY_BUCKETS - 1);
	} else {
		bucket = 0;
	}
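	/*
	 * E.g., with a 2 ms target the divisor is 500 us: a 1.2 ms sample
	 * lands in bucket 2, and anything past 3.5 ms is clamped to bucket 7.
	 */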

	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}

static void kyber_completed_request(struct request *rq, u64 now)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
	struct kyber_cpu_latency *cpu_latency;
	unsigned int sched_domain;
	u64 target;

	sched_domain = kyber_sched_domain(rq->cmd_flags);
	if (sched_domain == KYBER_OTHER)
		return;

	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
	target = kqd->latency_targets[sched_domain];
	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
			   target, now - rq->start_time_ns);
	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
			   now - rq->io_start_time_ns);
	put_cpu_ptr(kqd->cpu_latency);

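	/*
	 * Arm the stats aggregation timer to fire within 100 ms; timer_reduce()
	 * only ever moves the expiry earlier, so an already-pending sooner
	 * deadline is left alone.
	 */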
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) timer_reduce(&kqd->timer, jiffies + HZ / 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) struct flush_kcq_data {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) struct kyber_hctx_data *khd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) unsigned int sched_domain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) struct list_head *list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) struct flush_kcq_data *flush_data = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) spin_lock(&kcq->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) flush_data->list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) sbitmap_clear_bit(sb, bitnr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) spin_unlock(&kcq->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) unsigned int sched_domain,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) struct flush_kcq_data data = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) .khd = khd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) .sched_domain = sched_domain,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) .list = list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) sbitmap_for_each_set(&khd->kcq_map[sched_domain],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) flush_busy_kcq, &data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) void *key)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) sbitmap_del_wait_queue(wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) blk_mq_run_hw_queue(hctx, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) static int kyber_get_domain_token(struct kyber_queue_data *kqd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) struct kyber_hctx_data *khd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) struct blk_mq_hw_ctx *hctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) unsigned int sched_domain = khd->cur_domain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) struct sbq_wait *wait = &khd->domain_wait[sched_domain];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) struct sbq_wait_state *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) int nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) nr = __sbitmap_queue_get(domain_tokens);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) * If we failed to get a domain token, make sure the hardware queue is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) * run when one becomes available. Note that this is serialized on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) * khd->lock, but we still need to be careful about the waker.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) ws = sbq_wait_ptr(domain_tokens,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) &khd->wait_index[sched_domain]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) khd->domain_ws[sched_domain] = ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) sbitmap_add_wait_queue(domain_tokens, ws, wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) * Try again in case a token was freed before we got on the wait
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) * queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) nr = __sbitmap_queue_get(domain_tokens);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) * If we got a token while we were on the wait queue, remove ourselves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) * from the wait queue to ensure that all wake ups make forward
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) * progress. It's possible that the waker already deleted the entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) * between the !list_empty_careful() check and us grabbing the lock, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) * list_del_init() is okay with that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) ws = khd->domain_ws[sched_domain];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) spin_lock_irq(&ws->wait.lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) sbitmap_del_wait_queue(wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) spin_unlock_irq(&ws->wait.lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) return nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) static struct request *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) struct kyber_hctx_data *khd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) struct blk_mq_hw_ctx *hctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) struct list_head *rqs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) struct request *rq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) int nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) rqs = &khd->rqs[khd->cur_domain];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) * If we already have a flushed request, then we just need to get a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) * token for it. Otherwise, if there are pending requests in the kcqs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) * flush the kcqs, but only if we can get a token. If not, we should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) * leave the requests in the kcqs so that they can be merged. Note that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) * khd->lock serializes the flushes, so if we observed any bit set in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) * the kcq_map, we will always get a request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) rq = list_first_entry_or_null(rqs, struct request, queuelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) if (rq) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) nr = kyber_get_domain_token(kqd, khd, hctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) if (nr >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) khd->batching++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) rq_set_domain_token(rq, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) list_del_init(&rq->queuelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) return rq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) trace_kyber_throttled(kqd->q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) kyber_domain_names[khd->cur_domain]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) nr = kyber_get_domain_token(kqd, khd, hctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) if (nr >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) rq = list_first_entry(rqs, struct request, queuelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) khd->batching++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) rq_set_domain_token(rq, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) list_del_init(&rq->queuelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) return rq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) trace_kyber_throttled(kqd->q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) kyber_domain_names[khd->cur_domain]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) /* There were either no pending requests or no tokens. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) }

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
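
/*
 * Dispatch-order sketch (scenario illustrative): suppose cur_domain is
 * KYBER_WRITE and the write batch is exhausted. The loop above advances
 * through KYBER_DISCARD, KYBER_OTHER, and KYBER_READ, and finally retries
 * KYBER_WRITE itself, dispatching from the first domain that has both a
 * pending request and a free token.
 */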
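/*
 * This is a lockless peek: khd->lock is not taken, list_empty_careful() is
 * safe against a concurrent list update, and the answer is only used as a
 * hint for whether a dispatch run is worthwhile.
 */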
static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}

#define KYBER_LAT_SHOW_STORE(domain, name)				\
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,	\
				       char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->latency_targets[domain]);	\
}									\
									\
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,	\
					const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->latency_targets[domain] = nsec;				\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op)	__ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
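
/*
 * Example of tuning the latency targets from userspace (device name and
 * output value are illustrative):
 *
 *   # cat /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 *   2000000
 *   # echo 10000000 > /sys/block/nvme0n1/queue/iosched/write_lat_nsec
 *
 * The targets are in nanoseconds; the store side accepts any value that
 * kstrtoull() can parse, with no further range checking.
 */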

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
{									\
	struct request_queue *q = data;					\
	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
	return 0;							\
}									\
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
	__acquires(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_lock(&khd->lock);						\
	return seq_list_start(&khd->rqs[domain], *pos);			\
}									\
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
				     loff_t *pos)			\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	return seq_list_next(v, &khd->rqs[domain], pos);		\
}									\
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
	__releases(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_unlock(&khd->lock);					\
}									\
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
	.start = kyber_##name##_rqs_start,				\
	.next = kyber_##name##_rqs_next,				\
	.stop = kyber_##name##_rqs_stop,				\
	.show = blk_mq_debugfs_rq_show,					\
};									\
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
{									\
	struct blk_mq_hw_ctx *hctx = data;				\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
	wait_queue_entry_t *wait = &khd->domain_wait[domain].wait;	\
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
	return 0;							\
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(write),
	KYBER_QUEUE_DOMAIN_ATTRS(discard),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(write),
	KYBER_HCTX_DOMAIN_ATTRS(discard),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif
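
/*
 * With CONFIG_BLK_DEBUG_FS enabled, the attributes above appear under debugfs
 * (paths assume the usual mount point; <dev> and <n> are placeholders):
 *
 *   /sys/kernel/debug/block/<dev>/sched/read_tokens
 *   /sys/kernel/debug/block/<dev>/sched/async_depth
 *   /sys/kernel/debug/block/<dev>/hctx<n>/sched/read_rqs
 *   /sys/kernel/debug/block/<dev>/hctx<n>/sched/cur_domain
 */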

static struct elevator_type kyber_sched = {
	.ops = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_features = ELEVATOR_F_MQ_AWARE,
	.elevator_owner = THIS_MODULE,
};
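
/*
 * Note the deliberate aliasing above: ->requeue_request points at
 * kyber_finish_request(), so a requeued request releases its domain token
 * just like a completed one and must acquire a fresh token when it is
 * dispatched again.
 */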

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");
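
/*
 * Example of selecting the scheduler at runtime once it is registered
 * (device name and the exact list of available schedulers are illustrative):
 *
 *   # echo kyber > /sys/block/nvme0n1/queue/scheduler
 *   # cat /sys/block/nvme0n1/queue/scheduler
 *   mq-deadline [kyber] none
 */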