/*
 * livepatch_bsc1231943
 *
 * Fix for CVE-2024-47706, bsc#1231943
 *
 *  Copyright (c) 2025 SUSE
 *  Author: Marcos Paulo de Souza <mpdesouza@suse.com>
 *
 *  Based on the original Linux kernel code. Other copyrights apply.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/* klp-ccp: from block/bfq-iosched.c */
#include <linux/module.h>

/* klp-ccp: from block/bfq-iosched.c */
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>

#include <linux/backing-dev.h>

#include <trace/events/block.h>

#include "../klp_trace.h"

/* klp-ccp: from include/trace/events/block.h */
/*
 * Livepatch-side declaration of the block_rq_insert trace event, taking a
 * single struct request pointer.
 * NOTE(review): KLPR_TRACE_EVENT presumably comes from "../klp_trace.h"
 * (its definition is not visible in this file) - confirm.
 */
KLPR_TRACE_EVENT(block_rq_insert,
                 TP_PROTO(struct request *rq),
                 TP_ARGS(rq)
);

/* klp-ccp: from block/blk.h */
#include <linux/idr.h>

/* klp-ccp: from include/linux/compiler_types.h */
#define inline inline __gnu_inline __inline_maybe_unused notrace

/* klp-ccp: from block/blk.h */
#include <linux/blk-mq.h>

/* klp-ccp: from block/blk-crypto-internal.h */
#include <linux/bio.h>
#include <linux/blkdev.h>

/* klp-ccp: from block/blk-stat.h */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/ktime.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>

/* klp-ccp: from block/blk-mq.h */
/*
 * Drain @list: take the request at the head, unlink it from the list
 * through its queuelist member and hand it to blk_mq_free_request(),
 * repeating until the list is empty.
 */
static inline void blk_mq_free_requests(struct list_head *list)
{
	struct request *head_rq;

	for (;;) {
		if (list_empty(list))
			break;

		head_rq = list_entry_rq(list->next);
		list_del_init(&head_rq->queuelist);
		blk_mq_free_request(head_rq);
	}
}

/* klp-ccp: from block/blk-mq-sched.h */
/*
 * Prototype only; no definition is visible in this part of the file.
 * NOTE(review): presumably binds to the kernel's own
 * blk_mq_sched_try_insert_merge() - confirm the symbol is reachable from
 * the livepatch module.
 */
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
				   struct list_head *free);

/* klp-ccp: from block/bfq-iosched.h */
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>

/* klp-ccp: from block/blk-cgroup-rwstat.h */
#include <linux/blk-cgroup.h>

/*
 * Index into the counter arrays of struct blkg_rwstat.
 * BLKG_RWSTAT_NR is the number of real counters (it sizes those arrays);
 * BLKG_RWSTAT_TOTAL is an alias with the same value as BLKG_RWSTAT_NR.
 */
enum blkg_rwstat_type {
	BLKG_RWSTAT_READ,
	BLKG_RWSTAT_WRITE,
	BLKG_RWSTAT_SYNC,
	BLKG_RWSTAT_ASYNC,
	BLKG_RWSTAT_DISCARD,

	BLKG_RWSTAT_NR,
	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
};

/*
 * One set of counters per enum blkg_rwstat_type value below
 * BLKG_RWSTAT_NR: a per-cpu counter plus an atomic 64-bit auxiliary
 * counter for each type.
 */
struct blkg_rwstat {
	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
};

/* klp-ccp: from block/bfq-iosched.h */
/* Number of ioprio classes; sizes bfq_sched_data::service_tree[]. */
#define BFQ_IOPRIO_CLASSES	3

/* NOTE(review): not referenced in the visible part of this file. */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100

/*
 * Per-class service tree: an active and an idle rbtree of entities, cached
 * pointers to the extreme idle entities, and the tree's virtual time and
 * weight sum. struct bfq_sched_data holds one of these per ioprio class.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V < F_i)*/
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};

/*
 * Scheduling state for one level of the hierarchy: the entity currently in
 * service, the next candidate, and BFQ_IOPRIO_CLASSES service trees.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;

};

/*
 * Schedulable entity. It is embedded in struct bfq_queue and in
 * struct bfq_group (both have an 'entity' member) and linked to its parent
 * through @parent, which is what for_each_entity() walks.
 *
 * NOTE(review): this is a livepatch-local copy of a kernel structure; its
 * layout presumably has to stay identical to the running kernel's
 * definition, so fields must not be reordered or resized.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st_or_in_serv;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* Number of requests allocated in the subtree of this entity */
	int allocated;

	/* device weight, if non-zero, it overrides the default weight of
	 * bfq_group_data */
	int dev_weight;
	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change  */
	int prio_changed;

	/* flag, set if the entity is counted in groups_with_pending_reqs */
	bool in_groups_with_pending_reqs;

	/* last child queue of entity created (for non-leaf entities) */
	struct bfq_queue *last_bfqq_created;
};

/*
 * Think-time statistics. Embedded in struct bfq_queue (@ttime) and saved
 * in struct bfq_io_cq (@saved_ttime).
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};

/*
 * Per-queue state of the BFQ scheduler: reference counts, priority,
 * pending requests (sort_list/fifo), the embedded scheduling entity,
 * weight-raising bookkeeping and waker/woken relationships.
 *
 * NOTE(review): this is a livepatch-local copy of a kernel structure; its
 * layout presumably has to stay identical to the running kernel's
 * definition, so fields must not be reordered or resized.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* counter of references from other queues for delayed stable merge */
	int stable_ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* last total-service-time sample, see bfq_update_inject_limit() */
	u64 last_serv_time_ns;
	/* limit for request injection */
	unsigned int inject_limit;
	/* last time the inject limit has been decreased, in jiffies */
	unsigned long decrease_time_jif;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* pointer to the weight counter associated with this entity */
	struct bfq_weight_counter *weight_counter;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* when bfqq started to do I/O within the last observation window */
	u64 io_start_time;
	/* how long bfqq has remained empty during the last observ. window */
	u64 tot_idle_time;

	/* bit vector: a 1 for each seeky requests in history */
	u32 seek_history;

	/* node for the device's burst list */
	struct hlist_node burst_list_node;

	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue it is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;
	/*
	 * Cumulative service received from the @bfq_queue since its
	 * last transition to weight-raised state.
	 */
	unsigned long service_from_wr;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */

	unsigned long first_IO_time; /* time of first I/O for this queue */

	unsigned long creation_time; /* when this queue is created */

	/* max service rate measured so far */
	u32 max_service_rate;

	/*
	 * Pointer to the waker queue for this queue, i.e., to the
	 * queue Q such that this queue happens to get new I/O right
	 * after some I/O request of Q is completed. For details, see
	 * the comments on the choice of the queue for injection in
	 * bfq_select_queue().
	 */
	struct bfq_queue *waker_bfqq;
	/* pointer to the curr. tentative waker queue, see bfq_check_waker() */
	struct bfq_queue *tentative_waker_bfqq;
	/* number of times the same tentative waker has been detected */
	unsigned int num_waker_detections;
	/* time when we started considering this waker */
	u64 waker_detection_started;

	/* node for woken_list, see below */
	struct hlist_node woken_list_node;
	/*
	 * Head of the list of the woken queues for this queue, i.e.,
	 * of the list of the queues for which this queue is a waker
	 * queue. This list is used to reset the waker_bfqq pointer in
	 * the woken queues when this queue exits.
	 */
	struct hlist_head woken_list;
};

/*
 * BFQ's per-process I/O context: wraps a struct io_cq (which must stay the
 * first member, see icq_to_bic()) and keeps the sync/async queues plus a
 * set of saved_* fields.
 *
 * This copy only builds with CONFIG_BFQ_GROUP_IOSCHED enabled; the other
 * preprocessor branch is an #error.
 *
 * NOTE(review): this is a livepatch-local copy of a kernel structure; its
 * layout presumably has to stay identical to the running kernel's
 * definition, so fields must not be reordered or resized.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
	/*
	 * NOTE(review): presumably a snapshot of the queue's has_short_ttime
	 * flag, taken like the other saved_* fields below - confirm.
	 */
	bool saved_has_short_ttime;
	/*
	 * Same purpose as the previous two fields for the I/O bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	u64 saved_io_start_time;
	u64 saved_tot_idle_time;

	/*
	 * Same purpose as the previous fields for the value of the
	 * field keeping the queue's belonging to a large burst
	 */
	bool saved_in_large_burst;
	/*
	 * True if the queue belonged to a burst list before its merge
	 * with another cooperating queue.
	 */
	bool was_in_burst_list;

	/*
	 * Save the weight when a merge occurs, to be able
	 * to restore it in case of split. If the weight is not
	 * correctly resumed when the queue is recycled,
	 * then the weight of the recycled queue could differ
	 * from the weight of the original queue.
	 */
	unsigned int saved_weight;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_service_from_wr;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;

	/* Save also injection state */
	u64 saved_last_serv_time_ns;
	unsigned int saved_inject_limit;
	unsigned long saved_decrease_time_jif;

	/* candidate queue for a stable merge (due to close creation time) */
	struct bfq_queue *stable_merge_bfqq;

	bool stably_merged;	/* non splittable if true */
	unsigned int requests;	/* Number of requests this process has in flight */
};

/*
 * Per-device BFQ scheduler state: the request queue it serves, dispatch
 * list, root group, busy/queued counters, peak-rate estimation state,
 * tunables (bfq_* fields), burst detection, weight-raising parameters and
 * the scheduler lock.
 *
 * NOTE(review): this is a livepatch-local copy of a kernel structure; its
 * layout presumably has to stay identical to the running kernel's
 * definition, so fields must not be reordered or resized.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * rbtree of weight counters of @bfq_queues, sorted by
	 * weight. Used to keep track of whether all @bfq_queues have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active and not
	 * weight-raised @bfq_queue (see the comments to the functions
	 * bfq_weights_tree_[add|remove] for further details).
	 */
	struct rb_root_cached queue_weights_tree;

	/*
	 * Number of groups with at least one descendant process that
	 * has at least one request waiting for completion. Note that
	 * this accounts for also requests already dispatched, but not
	 * yet completed. Therefore this number of groups may differ
	 * (be larger) than the number of active groups, as a group is
	 * considered active only if its corresponding entity has
	 * descendant queues with at least one request queued. This
	 * number is used to decide whether a scenario is symmetric.
	 * For a detailed explanation see comments on the computation
	 * of the variable asymmetric_scenario in the function
	 * bfq_better_to_idle().
	 *
	 * However, it is hard to compute this number exactly, for
	 * groups with multiple descendant processes. Consider a group
	 * that is inactive, i.e., that has no descendant process with
	 * pending I/O inside BFQ queues. Then suppose that
	 * num_groups_with_pending_reqs is still accounting for this
	 * group, because the group has descendant processes with some
	 * I/O request still in flight. num_groups_with_pending_reqs
	 * should be decremented when the in-flight request of the
	 * last descendant process is finally completed (assuming that
	 * nothing else has changed for the group in the meantime, in
	 * terms of composition of the group and active/inactive state of child
	 * groups and processes). To accomplish this, an additional
	 * pending-request counter must be added to entities, and must
	 * be updated correctly. To avoid this additional field and operations,
	 * we resort to the following tradeoff between simplicity and
	 * accuracy: for an inactive group that is still counted in
	 * num_groups_with_pending_reqs, we decrement
	 * num_groups_with_pending_reqs when the first descendant
	 * process of the group remains with no request waiting for
	 * completion.
	 *
	 * Even this simpler decrement strategy requires a little
	 * carefulness: to avoid multiple decrements, we flag a group,
	 * more precisely an entity representing a group, as still
	 * counted in num_groups_with_pending_reqs when it becomes
	 * inactive. Then, when the first descendant queue of the
	 * entity remains with no request waiting for completion,
	 * num_groups_with_pending_reqs is decremented, and this flag
	 * is reset. After this flag is reset for the entity,
	 * num_groups_with_pending_reqs won't be decremented any
	 * longer in case a new descendant queue of the entity remains
	 * with no request waiting for completion.
	 */
	unsigned int num_groups_with_pending_reqs;

	/*
	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
	 * requests (including the queue in service, even if it is
	 * idling).
	 */
	unsigned int busy_queues[3];
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/* true if the device is non rotational and performs queueing */
	bool nonrot_with_queueing;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* position of the last served request for the in-service queue */
	sector_t in_serv_last_pos;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* bfqq owning the last completed rq */
	struct bfq_queue *last_completed_rq_bfqq;

	/* last bfqq created, among those in the root group */
	struct bfq_queue *last_bfqq_created;

	/* time of last transition from empty to non-empty (ns) */
	u64 last_empty_occupied_ns;

	/*
	 * Flag set to activate the sampling of the total service time
	 * of a just-arrived first I/O request (see
	 * bfq_update_inject_limit()). This will cause the setting of
	 * waited_rq when the request is finally dispatched.
	 */
	bool wait_dispatch;
	/*
	 *  If set, then bfq_update_inject_limit() is invoked when
	 *  waited_rq is eventually completed.
	 */
	struct request *waited_rq;
	/*
	 * True if some request has been injected during the last service hole.
	 */
	bool rqs_injected;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;
	unsigned long last_idling_start_jiffies;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/*
	 * Last time at which a queue entered the current burst of
	 * queues being activated shortly after each other; for more
	 * details about this and the following parameters related to
	 * a burst of activations, see the comments on the function
	 * bfq_handle_burst.
	 */
	unsigned long last_ins_in_burst;
	/*
	 * Reference time interval used to decide whether a queue has
	 * been activated shortly after @last_ins_in_burst.
	 */
	unsigned long bfq_burst_interval;
	/* number of queues in the current burst of queue activations */
	int burst_size;

	/* common parent entity for the queues in the burst */
	struct bfq_entity *burst_parent_entity;
	/* Maximum burst size above which the current queue-activation
	 * burst is deemed as 'large'.
	 */
	unsigned long bfq_large_burst_thresh;
	/* true if a large queue-activation burst is in progress */
	bool large_burst;
	/*
	 * Head of the burst list (as for the above fields, more
	 * details in the comments on the function bfq_handle_burst).
	 */
	struct hlist_head burst_list;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product ref_rate*ref_wr_duration, used
	 * for computing the maximum duration of weight raising
	 * automatically.
	 */
	u64 rate_dur_prod;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	/* NOTE(review): presumably the scheduler lock mentioned below - confirm */
	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * Depth limits used in bfq_limit_depth (see comments on the
	 * function)
	 */
	unsigned int word_depths[2][2];
	unsigned int full_depth_shift;
};

/*
 * Function pointers to per-queue flag helpers. Three shapes appear:
 *   - klpe_bfq_mark_bfqq_<flag>():  void, takes a mutable bfq_queue
 *   - klpe_bfq_clear_bfqq_<flag>(): void, takes a mutable bfq_queue
 *   - klpe_bfq_bfqq_<flag>():       returns int, takes a const bfq_queue
 *
 * NOTE(review): the klpe_ pointers are presumably resolved to the kernel's
 * own bfq_* symbols when the livepatch is loaded; the resolution code is
 * not visible in this part of the file. They are called as
 * (*klpe_name)(...) and must be non-NULL by then - confirm.
 */
static void (*klpe_bfq_clear_bfqq_just_created)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_just_created)(const struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_busy)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_wait_request)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_wait_request)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_non_blocking_wait_rq)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_non_blocking_wait_rq)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_mark_bfqq_has_short_ttime)(struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_has_short_ttime)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_has_short_ttime)(const struct bfq_queue *bfqq);

/* Used by klpr_bfq_log_bfqq() and klpr_bfq_serv_to_charge() in this file. */
static int (*klpe_bfq_bfqq_sync)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_mark_bfqq_IO_bound)(struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_IO_bound)(struct bfq_queue *bfqq);

static void (*klpe_bfq_mark_bfqq_in_large_burst)(struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_in_large_burst)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_in_large_burst)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_coop)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_coop)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_split_coop)(struct bfq_queue *bfqq);

static int (*klpe_bfq_bfqq_split_coop)(const struct bfq_queue *bfqq);

static void (*klpe_bfq_clear_bfqq_softrt_update)(struct bfq_queue *bfqq);

/*
 * Reason for expiring a bfq_queue; this is the type of the 'reason'
 * argument of (*klpe_bfq_bfqq_expire)().
 */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};

/*
 * Per-group statistics. This copy only carries the basic byte and I/O
 * counters; building with CONFIG_BFQ_CGROUP_DEBUG defined is rejected by
 * the #error below.
 */
struct bfqg_stats {
	/* basic stats */
	struct blkg_rwstat		bytes;
	struct blkg_rwstat		ios;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
#error "klp-ccp: non-taken branch"
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
};

#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * BFQ's per-cgroup state: a blkg policy-data header (which must stay the
 * first member), the group's own scheduling entity and sched_data, its
 * async queues, and statistics.
 *
 * NOTE(review): this is a livepatch-local copy of a kernel structure; its
 * layout presumably has to stay identical to the running kernel's
 * definition, so fields must not be reordered or resized.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
	char blkg_path[128];

	/* reference counter (see comments in bfq_bic_update_cgroup) */
	int ref;
	/* Is bfq_group still online? */
	bool online;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	/* NOTE(review): untyped here; presumably points to a struct bfq_data */
	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	int active_entities;

	/*
	 * request-position tree that the comments on bfq_queue's @pos_node
	 * and @pos_root refer to
	 */
	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};

#else
#error "klp-ccp: non-taken branch"
#endif

/*
 * Function pointers to further BFQ helpers, plus the for_each_entity()
 * iterator macro.
 *
 * NOTE(review): the klpe_ pointers are presumably resolved to the kernel's
 * own symbols when the livepatch is loaded; the resolution code is not
 * visible in this part of the file - confirm.
 */
static struct bfq_queue *(*klpe_bic_to_bfqq)(struct bfq_io_cq *bic, bool is_sync);
static void (*klpe_bic_set_bfqq)(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
static struct bfq_data *(*klpe_bic_to_bfqd)(struct bfq_io_cq *bic);
static void (*klpe_bfq_pos_tree_add_move)(struct bfq_data *bfqd, struct bfq_queue *bfqq);

static void (*klpe_bfq_bfqq_expire)(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     bool compensate, enum bfqq_expiration reason);
static void (*klpe_bfq_put_queue)(struct bfq_queue *bfqq);
static void (*klpe_bfq_put_cooperator)(struct bfq_queue *bfqq);

static void (*klpe_bfq_release_process_ref)(struct bfq_data *bfqd, struct bfq_queue *bfqq);

static void (*klpe_bfqg_stats_update_legacy_io)(struct request_queue *q, struct request *rq);

static void (*klpe_bfq_bic_update_cgroup)(struct bfq_io_cq *bic, struct bio *bio);

/* Both are used by klpr_bfq_log_bfqq() to reach the queue's blkcg. */
static struct blkcg_gq *(*klpe_bfqg_to_blkg)(struct bfq_group *bfqg);
static struct bfq_group *(*klpe_bfqq_group)(struct bfq_queue *bfqq);

/*
 * Walk from @entity up through ->parent until it becomes NULL.
 * The macro has no initialiser clause and advances @entity itself, so the
 * caller's variable is NULL once the loop runs to completion.
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

static struct bfq_group *(*klpe_bfq_bfqq_to_bfqg)(struct bfq_queue *bfqq);

static unsigned int (*klpe_bfq_tot_busy_queues)(struct bfq_data *bfqd);

static unsigned short (*klpe_bfq_ioprio_to_weight)(int ioprio);

static bool (*klpe_next_queue_may_preempt)(struct bfq_data *bfqd);

static void (*klpe_bfq_add_bfqq_busy)(struct bfq_data *bfqd, struct bfq_queue *bfqq);

/*
 * Render @pid into @str (at most @len bytes, NUL included) for log
 * messages. A pid of -1 is written as the literal "SHARED-"; any other
 * value is written in decimal.
 */
static inline void bfq_pid_to_str(int pid, char *str, int len)
{
	if (pid == -1) {
		snprintf(str, len, "SHARED-");
		return;
	}

	snprintf(str, len, "%d", pid);
}

/*
 * Emit a blktrace message on @bfqd's request queue, prefixed with "bfq ".
 */
#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

/* Size of the on-stack buffer klpr_bfq_log_bfqq() formats the pid into. */
#define MAX_PID_STR_LENGTH 12

/*
 * Per-queue variant of bfq_log(). Does nothing unless
 * blk_trace_note_message_enabled() is true for the queue. Otherwise it
 * formats @bfqq's pid with bfq_pid_to_str() and emits a cgroup trace
 * message prefixed with "bfq<pid><S|A> ", where the letter is 'S' when
 * (*klpe_bfq_bfqq_sync)() returns non-zero and 'A' otherwise. The blkcg
 * is obtained through (*klpe_bfqq_group)() and (*klpe_bfqg_to_blkg)().
 *
 * The 'break' leaves the enclosing do { } while (0), not a loop in the
 * caller.
 */
#define klpr_bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char pid_str[MAX_PID_STR_LENGTH];	\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH);	\
	blk_add_cgroup_trace_msg((bfqd)->queue,				\
			(*klpe_bfqg_to_blkg)((*klpe_bfqq_group)(bfqq))->blkcg,		\
			"bfq%s%c " fmt, pid_str,			\
			(*klpe_bfq_bfqq_sync)((bfqq)) ? 'S' : 'A', ##args);	\
} while (0)

/* klp-ccp: from block/blk-wbt.h */
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/ktime.h>

/* klp-ccp: from block/blk-rq-qos.h */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/blk-mq.h>

/* klp-ccp: from block/blk-mq-debugfs.h */
#ifdef CONFIG_BLK_DEBUG_FS

#include <linux/seq_file.h>

#else
#error "klp-ccp: non-taken branch"
#endif

/* NOTE(review): not referenced in the visible part of this file. */
static const int bfq_stats_min_budgets = 194;

/* NOTE(review): not referenced in the visible part of this file. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Multiplier applied to a request's sector count by
 * klpr_bfq_serv_to_charge() when the full-size charge does not apply.
 */
static const int bfq_async_charge_factor = 3;

/*
 * Window, in jiffies (HZ/10), added to a queue's first_IO_time by
 * bfq_too_late_for_merging().
 */
static const unsigned long bfq_merge_time_limit = HZ/10;

/*
 * Seekiness thresholds, as sector_t values.
 * BFQ_RQ_SEEKY() is true when get_sdist(last_pos, rq) exceeds BFQQ_SEEK_THR
 * and either the queue is not flagged non-rotational or the request is
 * smaller than BFQQ_SECT_THR_NONROT sectors. get_sdist() is not defined in
 * the visible part of this file.
 */
#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
	(get_sdist(last_pos, rq) >			\
	 BFQQ_SEEK_THR &&				\
	 (!blk_queue_nonrot(bfqd->queue) ||		\
	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)

/*
 * True when every bit of the u32 seek_history is set (the -1 is converted
 * to the all-ones unsigned value by the comparison).
 */
#define BFQQ_TOTALLY_SEEKY(bfqq)	(bfqq->seek_history == -1)

/* NOTE(review): not referenced in the visible part of this file. */
static const unsigned long bfq_late_stable_merging = 600;

/*
 * Scheduler-private slots of a request: elv.priv[0] holds the io_cq
 * (converted to its bfq_io_cq by icq_to_bic()), elv.priv[1] the bfq_queue.
 */
#define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

/*
 * Forward declaration; the definition is not in the visible part of this
 * file (presumably further down - confirm).
 */
static void klpr_bfq_put_stable_ref(struct bfq_queue *bfqq);

/*
 * Convert an io_cq pointer into the bfq_io_cq that embeds it.
 *
 * 'icq' is the first member of struct bfq_io_cq, so the conversion does
 * not move the pointer and a %NULL @icq yields a %NULL result.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	struct bfq_io_cq *bic = container_of(icq, struct bfq_io_cq, icq);

	return bic;
}

/* True when @bfqq's current ioprio class is IOPRIO_CLASS_IDLE. */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)

/* True once more than 80 samples have been collected. */
#define bfq_sample_valid(samples)	((samples) > 80)

/*
 * Pick which of two requests should be served first, given that the head
 * is at sector @last.
 *
 * Order of the checks:
 *   1. If @rq1 is NULL or both pointers are equal, @rq2 is returned; if
 *      only @rq2 is NULL, @rq1 is returned.
 *   2. A sync request wins over an async one.
 *   3. A REQ_META request wins over a non-meta one.
 *   4. Otherwise the choice is made on seek distance from @last. Forward
 *      distances count as is; backward distances within
 *      bfqd->bfq_back_max KiB are multiplied by bfqd->bfq_back_penalty;
 *      requests further behind than that are flagged as "wrapped".
 *
 * Returns @rq1 or @rq2 (possibly NULL when both are NULL).
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t pos1, pos2, dist1 = 0, dist2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
	bool sync1, sync2, meta1, meta2;

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	/* Exactly one sync request: it wins. */
	sync1 = rq_is_sync(rq1);
	sync2 = rq_is_sync(rq2);
	if (sync1 != sync2)
		return sync1 ? rq1 : rq2;

	/* Exactly one metadata request: it wins. */
	meta1 = !!(rq1->cmd_flags & REQ_META);
	meta2 = !!(rq2->cmd_flags & REQ_META);
	if (meta1 != meta2)
		return meta1 ? rq1 : rq2;

	pos1 = blk_rq_pos(rq1);
	pos2 = blk_rq_pos(rq2);

	/* bfq_back_max is in KiB; by definition, 1KiB is 2 sectors. */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * One-way elevator, except that short backward seeks are
	 * tolerated at a penalty relative to a forward seek of the same
	 * length. Anything further behind the head counts as wrapped.
	 */
	if (pos1 >= last)
		dist1 = pos1 - last;
	else if (pos1 + back_max >= last)
		dist1 = (last - pos1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (pos2 >= last)
		dist2 = pos2 - last;
	else if (pos2 + back_max >= last)
		dist2 = (last - pos2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Neither request wrapped: smaller distance wins, ties go by position. */
	if (wrap == 0) {
		if (dist1 != dist2)
			return dist1 < dist2 ? rq1 : rq2;

		return pos1 >= pos2 ? rq1 : rq2;
	}

	/* Only one request wrapped: serve the other one. */
	if (wrap == BFQ_RQ2_WRAP)
		return rq1;
	if (wrap == BFQ_RQ1_WRAP)
		return rq2;

	/*
	 * Both wrapped: start with the one further behind the head, so
	 * that only one backward seek is needed.
	 */
	return pos1 <= pos2 ? rq1 : rq2;
}

/*
 * Search the position tree rooted at "root" for a queue whose next_rq
 * starts exactly at "sector".
 *
 * Returns the matching queue, or NULL if there is none. In both cases
 * *ret_parent receives the last node visited and, if rb_link is not
 * NULL, *rb_link receives the link slot where the walk stopped.
 */
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *visited = NULL;
	struct bfq_queue *found = NULL;

	while (*link) {
		struct bfq_queue *cur;
		sector_t cur_pos;

		visited = *link;
		cur = rb_entry(visited, struct bfq_queue, pos_node);
		cur_pos = blk_rq_pos(cur->next_rq);

		/* Tree is ordered by sector: smaller left, larger right. */
		if (sector > cur_pos) {
			link = &visited->rb_right;
		} else if (sector < cur_pos) {
			link = &visited->rb_left;
		} else {
			found = cur;
			break;
		}
	}

	*ret_parent = visited;
	if (rb_link)
		*rb_link = link;

	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
		(unsigned long long)sector,
		found ? found->pid : 0);

	return found;
}

/*
 * Tell whether bfqq has already received some service while backlogged
 * and more than bfq_merge_time_limit has passed since its first I/O.
 */
static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
{
	unsigned long merge_deadline =
		bfqq->first_IO_time + bfq_merge_time_limit;

	return bfqq->service_from_backlogged > 0 &&
		time_is_before_jiffies(merge_deadline);
}

/*
 * Indirect-call pointer used by klpr_bfq_serv_to_charge(). It is not
 * assigned in this part of the file; presumably it is bound to the
 * kernel's bfq_asymmetric_scenario() elsewhere -- TODO confirm.
 */
static bool (*klpe_bfq_asymmetric_scenario)(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq);

/*
 * Return the amount of service to charge bfqq for rq, in sectors.
 *
 * Sync queues, weight-raised queues (wr_coeff > 1) and queues for which
 * the asymmetric-scenario check holds are charged the plain request
 * size. Any other queue is charged bfq_async_charge_factor times that.
 */
static unsigned long klpr_bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	int plain_charge;

	plain_charge = (*klpe_bfq_bfqq_sync)(bfqq) || bfqq->wr_coeff > 1 ||
		(*klpe_bfq_asymmetric_scenario)(bfqq->bfqd, bfqq);

	if (!plain_charge)
		return blk_rq_sectors(rq) * bfq_async_charge_factor;

	return blk_rq_sectors(rq);
}

/*
 * Indirect-call pointer used by klpr_bfq_add_request() when the next
 * request of a busy queue changes. It is not assigned in this part of
 * the file; presumably it is bound to the kernel's
 * bfq_updated_next_req() elsewhere -- TODO confirm.
 */
static void (*klpe_bfq_updated_next_req)(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq);

/*
 * Return the duration, in jiffies, of an interactive weight-raising
 * period for bfqd.
 *
 * A non-zero bfq_wr_max_time is returned as is. Otherwise the duration
 * is rate_dur_prod / peak_rate, limited to the range [3 s, 25 s].
 */
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
	u64 dur, shortest, longest;

	if (bfqd->bfq_wr_max_time > 0)
		return bfqd->bfq_wr_max_time;

	dur = bfqd->rate_dur_prod;
	do_div(dur, bfqd->peak_rate);

	/*
	 * Keep the duration between 3 and 25 seconds.
	 *
	 * The upper bound was chosen conservatively from this worst
	 * case: a QEMU/KVM virtual machine on a slow PC, with a virtual
	 * disk stacked on a slow low-end 5400rpm HDD, serving a heavy
	 * I/O workload such as the sequential reading of several files.
	 * There mplayer took 23 seconds to start while constantly
	 * weight-raised. Tests show that going above that would often
	 * worsen responsiveness, by letting non-interactive applications
	 * keep weight raising for too long.
	 *
	 * Below 3 seconds, most interactive tasks struggle to complete
	 * their jobs before weight raising finishes.
	 */
	shortest = msecs_to_jiffies(3000);
	longest = msecs_to_jiffies(25000);

	if (dur < shortest)
		dur = shortest;
	else if (dur > longest)
		dur = longest;

	return dur;
}

/*
 * Put bfqq back into interactive weight raising: interactive
 * coefficient and duration, with the period considered as started at
 * the time saved in wr_start_at_switch_to_srt.
 */
static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
					  struct bfq_data *bfqd)
{
	unsigned int interactive_dur = bfq_wr_duration(bfqd);

	bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
	bfqq->wr_cur_max_time = interactive_dur;
	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
}

/*
 * Copy back into bfqq the per-queue state stashed in the saved_* fields
 * of bic, re-validate the restored weight-raising state, and keep
 * bfqd->wr_busy_queues consistent with the outcome.
 *
 * bfq_already_existing gates the busy check: when it is false, bfqq is
 * treated as not busy and wr_busy_queues is left untouched.
 */
static void
klpr_bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
		      struct bfq_io_cq *bic, bool bfq_already_existing)
{
	unsigned int old_wr_coeff = 1;
	/* Sampled up front, before any field of bfqq is overwritten. */
	bool busy = bfq_already_existing && (*klpe_bfq_bfqq_busy)(bfqq);

	if (bic->saved_has_short_ttime)
		(*klpe_bfq_mark_bfqq_has_short_ttime)(bfqq);
	else
		(*klpe_bfq_clear_bfqq_has_short_ttime)(bfqq);

	if (bic->saved_IO_bound)
		(*klpe_bfq_mark_bfqq_IO_bound)(bfqq);
	else
		(*klpe_bfq_clear_bfqq_IO_bound)(bfqq);

	bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns;
	bfqq->inject_limit = bic->saved_inject_limit;
	bfqq->decrease_time_jif = bic->saved_decrease_time_jif;

	bfqq->entity.new_weight = bic->saved_weight;
	bfqq->ttime = bic->saved_ttime;
	bfqq->io_start_time = bic->saved_io_start_time;
	bfqq->tot_idle_time = bic->saved_tot_idle_time;
	/*
	 * Restore weight coefficient only if low_latency is on
	 */
	if (bfqd->low_latency) {
		old_wr_coeff = bfqq->wr_coeff;
		bfqq->wr_coeff = bic->saved_wr_coeff;
	}
	bfqq->service_from_wr = bic->saved_service_from_wr;
	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;

	/*
	 * The restored weight raising is stale if bfqq is in a large
	 * burst or the raising period has already run out.
	 */
	if (bfqq->wr_coeff > 1 && ((*klpe_bfq_bfqq_in_large_burst)(bfqq) ||
	    time_is_before_jiffies(bfqq->last_wr_start_finish +
				   bfqq->wr_cur_max_time))) {
		/*
		 * If the stale period used the soft real-time duration,
		 * bfqq is not in a large burst, and an interactive
		 * period started at wr_start_at_switch_to_srt would not
		 * have expired yet, fall back to interactive raising.
		 * Otherwise weight raising is simply switched off.
		 */
		if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
		    !(*klpe_bfq_bfqq_in_large_burst)(bfqq) &&
		    time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
					     bfq_wr_duration(bfqd))) {
			switch_back_to_interactive_wr(bfqq, bfqd);
		} else {
			bfqq->wr_coeff = 1;
			klpr_bfq_log_bfqq(bfqq->bfqd, bfqq,
				     "resume state: switching off wr");
		}
	}

	/* make sure weight will be updated, however we got here */
	bfqq->entity.prio_changed = 1;

	if (likely(!busy))
		return;

	/*
	 * bfqq is busy: account for it entering or leaving the set of
	 * weight-raised busy queues.
	 */
	if (old_wr_coeff == 1 && bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues++;
	else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1)
		bfqd->wr_busy_queues--;
}

/*
 * Return the part of bfqq->ref that is left after subtracting the
 * references accounted for by allocated requests, by the entity being
 * on a service tree or in service, by the weight counter (one, if
 * present) and by stable_ref.
 */
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
	int refs = bfqq->ref;

	refs -= bfqq->entity.allocated;
	refs -= bfqq->entity.on_st_or_in_serv;
	if (bfqq->weight_counter != NULL)
		refs -= 1;
	refs -= bfqq->stable_ref;

	return refs;
}

/*
 * Empty the burst list of bfqd and, if no queue is busy, start a new
 * burst containing only bfqq. The burst parent entity is set to the
 * parent of bfqq in either case.
 */
static void klpr_bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_queue *member;
	struct hlist_node *next;

	hlist_for_each_entry_safe(member, next, &bfqd->burst_list,
				  burst_list_node)
		hlist_del_init(&member->burst_list_node);

	/*
	 * A new burst list is started only when there is no busy queue;
	 * see the comments on the conditional invocation of
	 * bfq_handle_burst().
	 */
	if ((*klpe_bfq_tot_busy_queues)(bfqd) != 0) {
		bfqd->burst_size = 0;
	} else {
		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
		bfqd->burst_size = 1;
	}

	bfqd->burst_parent_entity = bfqq->entity.parent;
}

/*
 * Account bfqq in the current burst. If the burst thereby reaches
 * bfq_large_burst_thresh, flag the burst and all its queues (bfqq
 * included) as large and drop the burst list; otherwise just link bfqq
 * into the burst list.
 */
static void klpr_bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_queue *member, *victim;
	struct hlist_node *next;

	/* The burst size must also count bfqq. */
	bfqd->burst_size++;

	if (bfqd->burst_size != bfqd->bfq_large_burst_thresh) {
		/*
		 * Burst not large yet: add bfqq to the burst list. No
		 * reference is taken on bfqq, because bfqq is removed
		 * from the burst list before being freed in put_queue.
		 */
		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
		return;
	}

	/*
	 * Enough queues have been activated shortly after each other:
	 * the burst is now considered large.
	 */
	bfqd->large_burst = true;

	/* Every queue collected so far, and bfqq, belongs to a large burst. */
	hlist_for_each_entry(member, &bfqd->burst_list, burst_list_node)
		(*klpe_bfq_mark_bfqq_in_large_burst)(member);
	(*klpe_bfq_mark_bfqq_in_large_burst)(bfqq);

	/*
	 * Until the current burst ends, any queue activated shortly
	 * after the last one can be marked as in a large burst right
	 * away, so the list itself is no longer needed: empty it.
	 */
	hlist_for_each_entry_safe(victim, next, &bfqd->burst_list,
				  burst_list_node)
		hlist_del_init(&victim->burst_list_node);
}

/*
 * Update the burst-detection state of bfqd on the activation of bfqq:
 * bfqq either terminates the current burst (possibly starting a new
 * one), is marked as belonging to the current large burst, or is added
 * to the current, not yet large, burst.
 */
static void klpr_bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * Nothing to do if bfqq is already in the burst list, is already
	 * part of a large burst, or has been split within the last 10 ms.
	 */
	if (!hlist_unhashed(&bfqq->burst_list_node) ||
	    (*klpe_bfq_bfqq_in_large_burst)(bfqq) ||
	    time_is_after_eq_jiffies(bfqq->split_time +
				     msecs_to_jiffies(10)))
		return;

	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
	    bfqd->bfq_burst_interval) ||
	    bfqq->entity.parent != bfqd->burst_parent_entity) {
		/*
		 * bfqq shows up too late after the last insertion, or
		 * belongs to a group other than the burst group: the
		 * current burst is over and its data must be reset.
		 *
		 * Special case: bfqq is the very first queue created
		 * after BFQ is selected for this device. Then
		 * last_ins_in_burst and burst_parent_entity are not yet
		 * significant, but whichever branch is taken bfqq ends
		 * up alone in the burst list, which is what is wanted,
		 * as bfqq may be the first queue of the first burst.
		 */
		bfqd->large_burst = false;
		klpr_bfq_reset_burst_list(bfqd, bfqq);
	} else if (bfqd->large_burst) {
		/*
		 * bfqq is activated shortly after the last queue and the
		 * current burst is already large: mark bfqq right away.
		 */
		(*klpe_bfq_mark_bfqq_in_large_burst)(bfqq);
	} else {
		/*
		 * bfqq is activated shortly after the last queue, but
		 * the burst is not large yet: add bfqq to it.
		 */
		klpr_bfq_add_to_burst(bfqd, bfqq);
	}

	/*
	 * bfqq has either joined the current burst or ended it and
	 * become the first queue of a possible new one. In both cases
	 * the time of the last insertion moves forward.
	 */
	bfqd->last_ins_in_burst = jiffies;
}

/* Return the budget of bfqq's entity minus the service it received. */
static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	return bfqq->entity.budget - bfqq->entity.service;
}

/*
 * Return the minimum budget: 1/32 of the default maximum budget until
 * bfq_stats_min_budgets budgets have been assigned, 1/32 of the current
 * bfq_max_budget afterwards.
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	return bfqd->budgets_assigned < bfq_stats_min_budgets ?
		bfq_default_max_budget / 32 :
		bfqd->bfq_max_budget / 32;
}

/*
 * Set the budget of bfqq for its upcoming activation and reset the
 * service it has received.
 *
 * Returns true if bfqq goes on with what was left of its previous
 * budget, which happens when its non_blocking_wait_rq flag is set, the
 * request arrived in time and some budget is left. Returns false if
 * bfqq gets a fresh budget instead.
 */
static bool klpr_bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
						struct bfq_queue *bfqq,
						bool arrived_in_time)
{
	struct bfq_entity *ent = &bfqq->entity;

	if ((*klpe_bfq_bfqq_non_blocking_wait_rq)(bfqq) && arrived_in_time) {
		int left = bfq_bfqq_budget_left(bfqq);

		/*
		 * Going on with the same budget makes sense only if some
		 * of it is left: otherwise bfqq would be expired right
		 * after being selected for service, which is pure
		 * overhead.
		 */
		if (left > 0) {
			/*
			 * The non_blocking_wait_rq flag is deliberately
			 * not cleared here: bfq_activate_bfqq uses it to
			 * learn that timestamps need to be back-shifted,
			 * and clears it right after.
			 *
			 * This relies on entity->service and
			 * entity->budget not being updated on expiration
			 * when bfqq is empty (see
			 * __bfq_bfqq_recalc_budget), so that "left" is
			 * the budget remaining at that expiration.
			 */
			ent->budget = min_t(unsigned long, left,
					    bfqq->max_budget);

			/*
			 * The service has served its purpose of giving
			 * the budget left. It must now be reset, or
			 * bfqq would be charged again for the service
			 * received in its previous service slot(s).
			 */
			ent->service = 0;

			return true;
		}
	}

	/* Complete the expiration: no service received, fresh budget. */
	ent->service = 0;
	ent->budget = max_t(unsigned long, bfqq->max_budget,
			    klpr_bfq_serv_to_charge(bfqq->next_rq, bfqq));
	(*klpe_bfq_clear_bfqq_non_blocking_wait_rq)(bfqq);

	return false;
}

/* Return the jiffies value lying MAX_JIFFY_OFFSET before the current one. */
static unsigned long bfq_smallest_from_now(void)
{
	unsigned long farthest_past = jiffies - MAX_JIFFY_OFFSET;

	return farthest_past;
}

/*
 * Update the weight-raising state of bfqq on the arrival of a request.
 *
 * old_wr_coeff is the coefficient bfqq had before the arrival. If it
 * was 1 and wr_or_deserves_wr is set, a weight-raising period starts.
 * If it was greater than 1, the ongoing period is refreshed
 * (interactive), terminated (in_burst) or turned into or recharged as
 * a soft real-time one (soft_rt). Nothing is done in any other case.
 */
static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     unsigned int old_wr_coeff,
					     bool wr_or_deserves_wr,
					     bool interactive,
					     bool in_burst,
					     bool soft_rt)
{
	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
		/* A weight-raising period starts now. */
		if (!interactive) {
			/*
			 * No interactive weight raising is in progress.
			 * Assign minus infinity to
			 * wr_start_at_switch_to_srt, so that when the
			 * soft real-time period starting now ends, no
			 * interactive period can be wrongly seen as
			 * still in progress (and thus started by
			 * mistake).
			 */
			bfqq->wr_start_at_switch_to_srt =
				bfq_smallest_from_now();
			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
				BFQ_SOFTRT_WEIGHT_FACTOR;
			bfqq->wr_cur_max_time =
				bfqd->bfq_wr_rt_max_time;
		} else {
			bfqq->service_from_wr = 0;
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
		}

		/*
		 * Cap the budget at twice the minimum budget, so that it
		 * stays close to bfqq's backlog and the scheduling error
		 * due to an oversized budget shrinks. Only latency
		 * matters here, not throughput. The cap is not lower
		 * than that, to avoid raising latency through too
		 * frequent expirations.
		 */
		bfqq->entity.budget = min_t(unsigned long,
					    bfqq->entity.budget,
					    2 * bfq_min_budget(bfqd));
		return;
	}

	/* From here on, only already weight-raised queues are handled. */
	if (!(old_wr_coeff > 1))
		return;

	if (interactive) {
		/* refresh coefficient and duration */
		bfqq->wr_coeff = bfqd->bfq_wr_coeff;
		bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
		return;
	}

	if (in_burst) {
		bfqq->wr_coeff = 1;
		return;
	}

	if (!soft_rt)
		return;

	/*
	 * The application now meets, or still meets, the requirements
	 * for being deemed soft rt, so its weight-raising duration can
	 * be safely (re)charged with the soft rt one.
	 *
	 * Doing this before the current weight-raising period finishes
	 * makes the following bad scenario less likely:
	 * 1) the weight of a soft rt application is raised at startup
	 *    (as for any newly created application),
	 * 2) the application is not interactive, so at some point
	 *    weight raising is stopped for it,
	 * 3) at that time the application still has pending requests,
	 *    and so has no chance of being deemed soft rt before they
	 *    are completed (see the comments to the function
	 *    bfq_bfqq_softrt_next_start() for details on soft rt
	 *    detection),
	 * 4) these pending requests suffer a high latency, because the
	 *    application is not weight-raised while they are pending.
	 */
	if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time) {
		bfqq->wr_start_at_switch_to_srt = bfqq->last_wr_start_finish;

		bfqq->wr_cur_max_time = bfqd->bfq_wr_rt_max_time;
		bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR;
	}
	bfqq->last_wr_start_finish = jiffies;
}

/*
 * Tell whether bfqq has no dispatched requests and more than
 * bfq_wr_min_idle_time has elapsed since its budget_timeout.
 */
static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
					struct bfq_queue *bfqq)
{
	unsigned long idle_deadline =
		bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time;

	return bfqq->dispatched == 0 &&
		time_is_before_jiffies(idle_deadline);
}

/*
 * Tell whether bfqq is in a higher-priority class (lower ioprio_class
 * value) than in_serv_bfqq or, failing that, has a higher weight.
 *
 * Queues with the same parent are compared by their own weights. With
 * different parents, each side is represented by the weight of its
 * parent entity, or by its own weight if it has no parent.
 */
static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
					    struct bfq_queue *in_serv_bfqq)
{
	int bfqq_weight = bfqq->entity.weight;
	int in_serv_weight = in_serv_bfqq->entity.weight;

	if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
		return true;

	if (in_serv_bfqq->entity.parent != bfqq->entity.parent) {
		if (bfqq->entity.parent)
			bfqq_weight = bfqq->entity.parent->weight;
		if (in_serv_bfqq->entity.parent)
			in_serv_weight = in_serv_bfqq->entity.parent->weight;
	}

	return bfqq_weight > in_serv_weight;
}

/*
 * Indirect-call pointer used by klpr_bfq_bfqq_handle_idle_busy_switch().
 * It is not assigned in this part of the file; presumably it is bound
 * to the kernel's bfq_better_to_idle() elsewhere -- TODO confirm.
 */
static bool (*klpe_bfq_better_to_idle)(struct bfq_queue *bfqq);

/*
 * Handle the switch of bfqq from idle to busy caused by the arrival of
 * rq: decide whether bfqq deserves weight raising, set its budget,
 * update its burst and weight-raising state, add it to the busy queues
 * and, if appropriate, expire the in-service queue to let bfqq preempt
 * it.
 *
 * old_wr_coeff is the weight-raising coefficient bfqq had before the
 * arrival of rq. On return, *interactive tells whether bfqq has been
 * deemed interactive.
 */
static void klpr_bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     int old_wr_coeff,
					     struct request *rq,
					     bool *interactive)
{
	bool soft_rt, in_burst,	wr_or_deserves_wr,
		bfqq_wants_to_preempt,
		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time =  ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;


	/*
	 * bfqq deserves to be weight-raised if:
	 * - it is sync,
	 * - it does not belong to a large burst,
	 * - it has been idle for enough time or is soft real-time,
	 * - is linked to a bfq_io_cq (it is not shared in any sense),
	 * - has a default weight (otherwise we assume the user wanted
	 *   to control its weight explicitly)
	 *
	 * NOTE(review): the literal 40 compared with new_weight below is
	 * presumably that default weight -- confirm against its
	 * definition.
	 */
	in_burst = (*klpe_bfq_bfqq_in_large_burst)(bfqq);
	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
		!BFQQ_TOTALLY_SEEKY(bfqq) &&
		!in_burst &&
		time_is_before_jiffies(bfqq->soft_rt_next_start) &&
		bfqq->dispatched == 0 &&
		bfqq->entity.new_weight == 40;
	*interactive = !in_burst && idle_for_long_time &&
		bfqq->entity.new_weight == 40;
	/*
	 * Merged bfq_queues are kept out of weight-raising
	 * (low-latency) mechanisms. The reason is that these queues
	 * are usually created for non-interactive and
	 * non-soft-real-time tasks. Yet this is not the case for
	 * stably-merged queues. These queues are merged just because
	 * they are created shortly after each other. So they may
	 * easily serve the I/O of an interactive or soft-real time
	 * application, if the application happens to spawn multiple
	 * processes. So let also stably-merged queues enjoy weight
	 * raising.
	 */
	wr_or_deserves_wr = bfqd->low_latency &&
		(bfqq->wr_coeff > 1 ||
		 ((*klpe_bfq_bfqq_sync)(bfqq) &&
		  (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
		   (*interactive || soft_rt)));

	/*
	 * Using the last flag, update budget and check whether bfqq
	 * may want to preempt the in-service queue.
	 */
	bfqq_wants_to_preempt =
		klpr_bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time);

	/*
	 * If bfqq happened to be activated in a burst, but has been
	 * idle for much more than an interactive queue, then we
	 * assume that, in the overall I/O initiated in the burst, the
	 * I/O associated with bfqq is finished. So bfqq does not need
	 * to be treated as a queue belonging to a burst
	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
	 * if set, and remove bfqq from the burst list if it's
	 * there. We do not decrement burst_size, because the fact
	 * that bfqq does not need to belong to the burst list any
	 * more does not invalidate the fact that bfqq was created in
	 * a burst.
	 */
	if (likely(!(*klpe_bfq_bfqq_just_created)(bfqq)) &&
	    idle_for_long_time &&
	    time_is_before_jiffies(
		    bfqq->budget_timeout +
		    msecs_to_jiffies(10000))) {
		hlist_del_init(&bfqq->burst_list_node);
		(*klpe_bfq_clear_bfqq_in_large_burst)(bfqq);
	}

	(*klpe_bfq_clear_bfqq_just_created)(bfqq);

	if (bfqd->low_latency) {
		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
			/* wraparound */
			bfqq->split_time =
				jiffies - bfqd->bfq_wr_min_idle_time - 1;

		/*
		 * The weight-raising state is updated only if more than
		 * bfq_wr_min_idle_time has elapsed since split_time.
		 */
		if (time_is_before_jiffies(bfqq->split_time +
					   bfqd->bfq_wr_min_idle_time)) {
			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
							 old_wr_coeff,
							 wr_or_deserves_wr,
							 *interactive,
							 in_burst,
							 soft_rt);

			if (old_wr_coeff != bfqq->wr_coeff)
				bfqq->entity.prio_changed = 1;
		}
	}

	bfqq->last_idle_bklogged = jiffies;
	bfqq->service_from_backlogged = 0;
	(*klpe_bfq_clear_bfqq_softrt_update)(bfqq);

	(*klpe_bfq_add_bfqq_busy)(bfqd, bfqq);

	/*
	 * Expire in-service queue if preemption may be needed for
	 * guarantees or throughput. As for guarantees, we care
	 * explicitly about two cases. The first is that bfqq has to
	 * recover a service hole, as explained in the comments on
	 * bfq_bfqq_update_budg_for_activation(), i.e., that
	 * bfqq_wants_to_preempt is true. However, if bfqq does not
	 * carry time-critical I/O, then bfqq's bandwidth is less
	 * important than that of queues that carry time-critical I/O.
	 * So, as a further constraint, we consider this case only if
	 * bfqq is at least as weight-raised, i.e., at least as time
	 * critical, as the in-service queue.
	 *
	 * The second case is that bfqq is in a higher priority class,
	 * or has a higher weight than the in-service queue. If this
	 * condition does not hold, we don't care because, even if
	 * bfqq does not start to be served immediately, the resulting
	 * delay for bfqq's I/O is however lower or much lower than
	 * the ideal completion time to be guaranteed to bfqq's I/O.
	 *
	 * In both cases, preemption is needed only if, according to
	 * the timestamps of both bfqq and of the in-service queue,
	 * bfqq actually is the next queue to serve. So, to reduce
	 * useless preemptions, the return value of
	 * next_queue_may_preempt() is considered in the next compound
	 * condition too. Yet next_queue_may_preempt() just checks a
	 * simple, necessary condition for bfqq to be the next queue
	 * to serve. In fact, to evaluate a sufficient condition, the
	 * timestamps of the in-service queue would need to be
	 * updated, and this operation is quite costly (see the
	 * comments on bfq_bfqq_update_budg_for_activation()).
	 *
	 * As for throughput, we ask bfq_better_to_idle() whether we
	 * still need to plug I/O dispatching. If bfq_better_to_idle()
	 * says no, then plugging is not needed any longer, either to
	 * boost throughput or to preserve service guarantees. Then
	 * the best option is to stop plugging I/O, as not doing so
	 * would certainly lower throughput. We may end up in this
	 * case if: (1) upon a dispatch attempt, we detected that it
	 * was better to plug I/O dispatch, and to wait for a new
	 * request to arrive for the currently in-service queue, but
	 * (2) this switch of bfqq to busy changes the scenario.
	 */
	if (bfqd->in_service_queue &&
	    ((bfqq_wants_to_preempt &&
	      bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
	     bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) ||
	     !(*klpe_bfq_better_to_idle)(bfqd->in_service_queue)) &&
	    (*klpe_next_queue_may_preempt)(bfqd))
		(*klpe_bfq_bfqq_expire)(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}

/*
 * Restart the inject-limit machinery for bfqq: drop the baseline total
 * service time and any request being waited for, set the inject limit
 * to its starting value and record the time of the reset.
 */
static void klpr_bfq_reset_inject_limit(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	/* The baseline total service time is no longer valid. */
	bfqq->last_serv_time_ns = 0;

	/* Forget any request whose completion was being waited for. */
	bfqd->waited_rq = NULL;

	/*
	 * Starting value of the limit.
	 *
	 * Short think time: start prudentially from 0. The service time
	 * of an injected request may exceed the think time of bfqq, so
	 * a request injected while bfqq is empty might significantly
	 * delay the next request of bfqq. If bfqq can tolerate some
	 * injection, the adaptive update will raise the limit soon
	 * anyway: having a short think time, bfqq is likely to get new
	 * I/O enqueued, and then completed, before being expired. That
	 * is the very pattern that lets the limit-update algorithm
	 * measure the effect of injection on request service times and
	 * update the limit accordingly.
	 *
	 * There is a special case in which the limit is left to 1 even
	 * with a short think time: the I/O of bfqq is synchronized with
	 * that of some other queue, i.e., bfqq may receive new I/O only
	 * after the I/O of the other queue is completed. A limit of 1
	 * lets the blocking I/O be served while bfqq is in service,
	 * which is convenient both for bfqq and for overall throughput,
	 * as explained in the comments in bfq_update_has_short_ttime().
	 *
	 * Long think time: start directly from 1, because
	 * a) keeping at most one request in service in the drive is
	 *    unlikely to harm the latency of bfqq's requests, as the
	 *    service time of a single request is likely to be lower
	 *    than the think time of bfqq;
	 * b) after becoming empty, bfqq is likely to expire before
	 *    getting its next request. With this arrival pattern it is
	 *    very hard to sample total service times and update the
	 *    limit (see comments on bfq_update_inject_limit()), so the
	 *    limit is likely to be never, or seldom, updated. Starting
	 *    from 1 avoids that no injection ever occurs with bfqq.
	 *    On the downside, this further reduces the chances to
	 *    compute the baseline total service time, and hence to run
	 *    the limit-update algorithm and raise the limit above 1.
	 */
	bfqq->inject_limit =
		(*klpe_bfq_bfqq_has_short_ttime)(bfqq) ? 0 : 1;

	bfqq->decrease_time_jif = jiffies;
}

/*
 * Update the idle-time accounting of bfqq at time now_ns and, unless
 * bfqq has just been created, re-evaluate its IO_bound flag and trim
 * the observation window.
 */
static void klpr_bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
{
	u64 window_ns = now_ns - bfqq->io_start_time;

	/*
	 * With no queued and no dispatched requests, the time elapsed
	 * since the last request completion is accounted as idle.
	 */
	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0)
		bfqq->tot_idle_time +=
			now_ns - bfqq->ttime.last_end_request;

	if (unlikely((*klpe_bfq_bfqq_just_created)(bfqq)))
		return;

	/*
	 * bfqq is I/O bound if it has been busy for at least about 80%
	 * of the time, i.e., idle for at most one fifth of it.
	 */
	if (bfqq->tot_idle_time * 5 <= window_ns)
		(*klpe_bfq_mark_bfqq_IO_bound)(bfqq);
	else
		(*klpe_bfq_clear_bfqq_IO_bound)(bfqq);

	/*
	 * The observation window reaches at most 200 ms into the past:
	 * beyond that, halve both the window and the idle time.
	 */
	if (window_ns > 200 * NSEC_PER_MSEC) {
		bfqq->tot_idle_time >>= 1;
		bfqq->io_start_time = now_ns - (window_ns >> 1);
	}
}

/*
 * Check whether the queue whose request was completed last looks like
 * a waker of bfqq, i.e., a queue whose I/O completions are shortly
 * followed by new I/O for bfqq. After three detections of the same
 * candidate, that queue becomes bfqq->waker_bfqq.
 */
static void klpr_bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    u64 now_ns)
{
	/* There must be a last-completed queue, other than bfqq. */
	if (!bfqd->last_completed_rq_bfqq)
		return;
	if (bfqd->last_completed_rq_bfqq == bfqq)
		return;

	/* Queues with a short think time are not considered. */
	if ((*klpe_bfq_bfqq_has_short_ttime)(bfqq))
		return;

	/* The last completion must be less than 4 ms old. */
	if (now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC)
		return;

	/*
	 * The detection logic is restarted also if too much time has
	 * passed since the first detection. If wakeups are rare,
	 * pointless idling doesn't hurt throughput that much. The time
	 * bound below makes sure we do not uselessly idle blocking the
	 * waker in more than 1/64 cases.
	 */
	if (bfqd->last_completed_rq_bfqq == bfqq->tentative_waker_bfqq &&
	    now_ns <= bfqq->waker_detection_started +
					128 * (u64)bfqd->bfq_slice_idle) {
		/* Same tentative waker queue detected again */
		bfqq->num_waker_detections++;
	} else {
		/*
		 * First synchronization detected with a candidate waker
		 * queue, or with a candidate different from the current
		 * one, or detection window expired: start over.
		 */
		bfqq->tentative_waker_bfqq =
			bfqd->last_completed_rq_bfqq;
		bfqq->num_waker_detections = 1;
		bfqq->waker_detection_started = now_ns;
	}

	if (bfqq->num_waker_detections != 3)
		return;

	bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
	bfqq->tentative_waker_bfqq = NULL;

	/*
	 * If the waker queue disappears, bfqq->waker_bfqq must be
	 * reset. To this end each waker queue keeps a list, woken_list,
	 * of all the queues that reference it through their waker_bfqq
	 * pointer; when the waker queue exits, the waker_bfqq pointer
	 * of every queue in that list is reset.
	 *
	 * Moreover, if bfqq already sits in the woken_list of a waker
	 * queue, it must be removed from there before being inserted
	 * into the woken_list of the new waker queue.
	 */
	if (!hlist_unhashed(&bfqq->woken_list_node))
		hlist_del_init(&bfqq->woken_list_node);
	hlist_add_head(&bfqq->woken_list_node,
		       &bfqd->last_completed_rq_bfqq->woken_list);
}

/*
 * Add rq to the queue it belongs to (RQ_BFQQ(rq)): update the queued
 * counters, run waker detection and inject-limit bookkeeping for sync
 * queues, insert rq into the sort list, refresh next_rq, and either
 * handle the idle-to-busy switch of the queue or, if the queue was
 * already busy, possibly start weight raising for async I/O. Finally
 * update last_wr_start_finish as explained at the end of the function.
 */
static void klpr_bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;
	/* Coefficient before this request has any effect on bfqq. */
	unsigned int old_wr_coeff = bfqq->wr_coeff;
	bool interactive = false;
	u64 now_ns = ktime_get_ns();

	klpr_bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));

	bfqq->queued[rq_is_sync(rq)]++;
	bfqd->queued++;

	if ((*klpe_bfq_bfqq_sync)(bfqq) && RQ_BIC(rq)->requests <= 1) {
		klpr_bfq_check_waker(bfqd, bfqq, now_ns);

		/*
		 * Periodically reset inject limit, to make sure that
		 * the latter eventually drops in case workload
		 * changes, see step (3) in the comments on
		 * bfq_update_inject_limit().
		 */
		if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
					     msecs_to_jiffies(1000)))
			klpr_bfq_reset_inject_limit(bfqd, bfqq);

		/*
		 * The following conditions must hold to setup a new
		 * sampling of total service time, and then a new
		 * update of the inject limit:
		 * - bfqq is in service, because the total service
		 *   time is evaluated only for the I/O requests of
		 *   the queues in service;
		 * - this is the right occasion to compute or to
		 *   lower the baseline total service time, because
		 *   there are actually no requests in the drive,
		 *   or
		 *   the baseline total service time is available, and
		 *   this is the right occasion to compute the other
		 *   quantity needed to update the inject limit, i.e.,
		 *   the total service time caused by the amount of
		 *   injection allowed by the current value of the
		 *   limit. It is the right occasion because injection
		 *   has actually been performed during the service
		 *   hole, and there are still in-flight requests,
		 *   which are very likely to be exactly the injected
		 *   requests, or part of them;
		 * - the minimum interval for sampling the total
		 *   service time and updating the inject limit has
		 *   elapsed.
		 */
		if (bfqq == bfqd->in_service_queue &&
		    (bfqd->rq_in_driver == 0 ||
		     (bfqq->last_serv_time_ns > 0 &&
		      bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
		    time_is_before_eq_jiffies(bfqq->decrease_time_jif +
					      msecs_to_jiffies(10))) {
			bfqd->last_empty_occupied_ns = ktime_get_ns();
			/*
			 * Start the state machine for measuring the
			 * total service time of rq: setting
			 * wait_dispatch will cause bfqd->waited_rq to
			 * be set when rq will be dispatched.
			 */
			bfqd->wait_dispatch = true;
			/*
			 * If there is no I/O in service in the drive,
			 * then possible injection occurred before the
			 * arrival of rq will not affect the total
			 * service time of rq. So the injection limit
			 * must not be updated as a function of such
			 * total service time, unless new injection
			 * occurs before rq is completed. To have the
			 * injection limit updated only in the latter
			 * case, reset rqs_injected here (rqs_injected
			 * will be set in case injection is performed
			 * on bfqq before rq is completed).
			 */
			if (bfqd->rq_in_driver == 0)
				bfqd->rqs_injected = false;
		}
	}

	if ((*klpe_bfq_bfqq_sync)(bfqq))
		klpr_bfq_update_io_intensity(bfqq, now_ns);

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	/*
	 * Adjust priority tree position, if next_rq changes.
	 * See comments on bfq_pos_tree_add_move() for the unlikely().
	 */
	if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
		(*klpe_bfq_pos_tree_add_move)(bfqd, bfqq);

	if (!(*klpe_bfq_bfqq_busy)(bfqq)) /* switching to busy ... */
		klpr_bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
						 rq, &interactive);
	else {
		/*
		 * bfqq was already busy. Start weight raising if low
		 * latency is on, bfqq was not weight-raised, rq is
		 * async, and more than bfq_wr_min_inter_arr_async has
		 * passed since last_wr_start_finish.
		 */
		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
		    time_is_before_jiffies(
				bfqq->last_wr_start_finish +
				bfqd->bfq_wr_min_inter_arr_async)) {
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

			bfqd->wr_busy_queues++;
			bfqq->entity.prio_changed = 1;
		}
		if (prev != bfqq->next_rq)
			(*klpe_bfq_updated_next_req)(bfqd, bfqq);
	}

	/*
	 * Assign jiffies to last_wr_start_finish in the following
	 * cases:
	 *
	 * . if bfqq is not going to be weight-raised, because, for
	 *   non weight-raised queues, last_wr_start_finish stores the
	 *   arrival time of the last request; as of now, this piece
	 *   of information is used only for deciding whether to
	 *   weight-raise async queues
	 *
	 * . if bfqq is not weight-raised, because, if bfqq is now
	 *   switching to weight-raised, then last_wr_start_finish
	 *   stores the time when weight-raising starts
	 *
	 * . if bfqq is interactive, because, regardless of whether
	 *   bfqq is currently weight-raised, the weight-raising
	 *   period must start or restart (this case is considered
	 *   separately because it is not detected by the above
	 *   conditions, if bfqq is already weight-raised)
	 *
	 * last_wr_start_finish has to be updated also if bfqq is soft
	 * real-time, because the weight-raising period is constantly
	 * restarted on idle-to-busy transitions for these queues, but
	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
	 * needed.
	 */
	if (bfqd->low_latency &&
		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
		bfqq->last_wr_start_finish = jiffies;
}

/*
 * Absolute distance, in sectors, between the start of @rq and @last_pos.
 * A @last_pos of zero yields a distance of zero.
 */
static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	if (!last_pos)
		return 0;

	return abs(blk_rq_pos(rq) - last_pos);
}

static void klpr_bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
	/*
	 * If bfqq has been enjoying interactive weight-raising, then
	 * reset soft_rt_next_start. We do it for the following
	 * reason. bfqq may have been conveying the I/O needed to load
	 * a soft real-time application. Such an application actually
	 * exhibits a soft real-time I/O pattern after it finishes
	 * loading, and finally starts doing its job. But, if bfqq has
	 * been receiving a lot of bandwidth so far (likely to happen
	 * on a fast device), then soft_rt_next_start now contains a
	 * high value that. So, without this reset, bfqq would be
	 * prevented from being possibly considered as soft_rt for a
	 * very long time.
	 */

	if (bfqq->wr_cur_max_time !=
	    bfqq->bfqd->bfq_wr_rt_max_time)
		bfqq->soft_rt_next_start = jiffies;

	if ((*klpe_bfq_bfqq_busy)(bfqq))
		bfqq->bfqd->wr_busy_queues--;
	bfqq->wr_coeff = 1;
	bfqq->wr_cur_max_time = 0;
	bfqq->last_wr_start_finish = jiffies;
	/*
	 * Trigger a weight change on the next invocation of
	 * __bfq_entity_update_weight_prio.
	 */
	bfqq->entity.prio_changed = 1;
}

/*
 * Start sector of @io_struct, which is handed to blk_rq_pos() when
 * @request is true and read as a struct bio otherwise.
 */
static sector_t bfq_io_struct_pos(void *io_struct, bool request)
{
	struct bio *bio;

	if (request)
		return blk_rq_pos(io_struct);

	bio = io_struct;
	return bio->bi_iter.bi_sector;
}

/*
 * Non-zero if the position of @io_struct (a request or a bio, depending
 * on @request) lies within BFQQ_CLOSE_THR sectors of @sector.
 */
static int bfq_rq_close_to_sector(void *io_struct, bool request,
				  sector_t sector)
{
	sector_t pos = bfq_io_struct_pos(io_struct, request);

	return abs(pos - sector) <= BFQQ_CLOSE_THR;
}

/*
 * Search the rq_pos_tree of the group @bfqq belongs to for a queue whose
 * next request is at, or close to, @sector. Returns that queue, or NULL
 * if the tree is empty or no close enough queue is found.
 */
static struct bfq_queue *klpr_bfqq_find_close(struct bfq_data *bfqd,
					 struct bfq_queue *bfqq,
					 sector_t sector)
{
	struct rb_root *root = &(*klpe_bfq_bfqq_to_bfqg)(bfqq)->rq_pos_tree;
	struct rb_node *parent, *neighbour;
	struct bfq_queue *candidate;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * Best case: a request starting exactly at the end of the
	 * last request.
	 */
	candidate = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
	if (candidate)
		return candidate;

	/*
	 * No exact match: the parent of the NULL leaf holds the
	 * closest sector (rq_pos_tree is sorted by next_request
	 * position).
	 */
	candidate = rb_entry(parent, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(candidate->next_rq, true, sector))
		return candidate;

	/* Last chance: the neighbour lying on the side of @sector. */
	neighbour = blk_rq_pos(candidate->next_rq) < sector ?
			rb_next(&candidate->pos_node) :
			rb_prev(&candidate->pos_node);
	if (!neighbour)
		return NULL;

	candidate = rb_entry(neighbour, struct bfq_queue, pos_node);

	return bfq_rq_close_to_sector(candidate->next_rq, true, sector) ?
		candidate : NULL;
}

/*
 * Look for a queue, other than @cur_bfqq, whose next request is close to
 * @sector. Returns it, or NULL if there is none.
 */
static struct bfq_queue *klpr_bfq_find_close_cooperator(struct bfq_data *bfqd,
						   struct bfq_queue *cur_bfqq,
						   sector_t sector)
{
	/*
	 * Queues that cooperate, e.g., that work closely on the same
	 * area of the device, are worth noticing: they can be grouped
	 * together so as to 1) not waste time idling, and 2) serve
	 * the union of their requests in the best possible order for
	 * throughput.
	 */
	struct bfq_queue *found = klpr_bfqq_find_close(bfqd, cur_bfqq, sector);

	return (found && found != cur_bfqq) ? found : NULL;
}

/*
 * Function pointers through which this file calls three functions it
 * does not define itself.
 * NOTE(review): going by the klpe_ naming used in klp_funcs[], the
 * targets are presumably bfq_setup_merge(),
 * bfq_may_be_close_cooperator() and idling_boosts_thr_without_issues();
 * their klp_funcs[] entries are outside the visible part of the table -
 * confirm.
 */
static struct bfq_queue *
(*klpe_bfq_setup_merge)(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq);

static bool (*klpe_bfq_may_be_close_cooperator)(struct bfq_queue *bfqq,
					struct bfq_queue *new_bfqq);

static bool (*klpe_idling_boosts_thr_without_issues)(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq);

/*
 * Decide whether @bfqq has to be merged with some other queue and, if
 * so, schedule the merge.
 *
 * @bfqd:      scheduler data of the device
 * @bfqq:      queue the I/O is destined to
 * @io_struct: request or bio being handled; may be NULL, in which case
 *             no search for a close cooperator is made
 * @request:   true if @io_struct is a request, false if it is a bio
 * @bic:       io context whose stable_merge_bfqq/stably_merged fields
 *             are consulted and updated
 *
 * Returns the queue to merge into, or NULL. In order:
 *  - an already scheduled merge (bfqq->new_bfqq) is returned as is;
 *  - on devices without nonrot_with_queueing, a pending delayed stable
 *    merge is either performed or dropped; the stable reference is
 *    released and bic->stable_merge_bfqq cleared in both cases;
 *  - NULL is returned for nonrot_with_queueing devices, for queues
 *    created too long ago, when @io_struct is NULL, for the oom queue
 *    and when only one queue is busy;
 *  - otherwise the in-service queue is tried first, then the queue found
 *    by klpr_bfq_find_close_cooperator().
 */
static struct bfq_queue *
klpr_bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     void *io_struct, bool request, struct bfq_io_cq *bic)
{
	struct bfq_queue *in_service_bfqq, *new_bfqq;

	/* if a merge has already been setup, then proceed with that first */
	if (bfqq->new_bfqq)
		return bfqq->new_bfqq;

	/*
	 * Check delayed stable merge for rotational or non-queueing
	 * devs. For this branch to be executed, bfqq must not be
	 * currently merged with some other queue (i.e., bfqq->bic
	 * must be non null). If we considered also merged queues,
	 * then we should also check whether bfqq has already been
	 * merged with bic->stable_merge_bfqq. But this would be
	 * costly and complicated.
	 */
	if (unlikely(!bfqd->nonrot_with_queueing)) {
		/*
		 * Make sure also that bfqq is sync, because
		 * bic->stable_merge_bfqq may point to some queue (for
		 * stable merging) also if bic is associated with a
		 * sync queue, but this bfqq is async
		 */
		if ((*klpe_bfq_bfqq_sync)(bfqq) && bic->stable_merge_bfqq &&
		    !(*klpe_bfq_bfqq_just_created)(bfqq) &&
		    time_is_before_jiffies(bfqq->split_time +
					  msecs_to_jiffies(bfq_late_stable_merging)) &&
		    time_is_before_jiffies(bfqq->creation_time +
					   msecs_to_jiffies(bfq_late_stable_merging))) {
			struct bfq_queue *stable_merge_bfqq =
				bic->stable_merge_bfqq;
			/*
			 * Sampled before the stable reference is dropped
			 * just below.
			 */
			int proc_ref = min(bfqq_process_refs(bfqq),
					   bfqq_process_refs(stable_merge_bfqq));

			/* deschedule stable merge, because done or aborted here */
			klpr_bfq_put_stable_ref(stable_merge_bfqq);

			bic->stable_merge_bfqq = NULL;

			if (!(*klpe_idling_boosts_thr_without_issues)(bfqd, bfqq) &&
			    proc_ref > 0) {
				/* next function will take at least one ref */
				/* This new_bfqq shadows the function-level one. */
				struct bfq_queue *new_bfqq =
					(*klpe_bfq_setup_merge)(bfqq, stable_merge_bfqq);

				if (new_bfqq) {
					bic->stably_merged = true;
					if (new_bfqq->bic)
						new_bfqq->bic->stably_merged =
									true;
				}
				return new_bfqq;
			} else
				return NULL;
		}
	}

	/*
	 * Do not perform queue merging if the device is non
	 * rotational and performs internal queueing. In fact, such a
	 * device reaches a high speed through internal parallelism
	 * and pipelining. This means that, to reach a high
	 * throughput, it must have many requests enqueued at the same
	 * time. But, in this configuration, the internal scheduling
	 * algorithm of the device does exactly the job of queue
	 * merging: it reorders requests so as to obtain as much as
	 * possible a sequential I/O pattern. As a consequence, with
	 * the workload generated by processes doing interleaved I/O,
	 * the throughput reached by the device is likely to be the
	 * same, with and without queue merging.
	 *
	 * Disabling merging also provides a remarkable benefit in
	 * terms of throughput. Merging tends to make many workloads
	 * artificially more uneven, because of shared queues
	 * remaining non empty for incomparably more time than
	 * non-merged queues. This may accentuate workload
	 * asymmetries. For example, if one of the queues in a set of
	 * merged queues has a higher weight than a normal queue, then
	 * the shared queue may inherit such a high weight and, by
	 * staying almost always active, may force BFQ to perform I/O
	 * plugging most of the time. This evidently makes it harder
	 * for BFQ to let the device reach a high throughput.
	 *
	 * Finally, the likely() macro below is not used because one
	 * of the two branches is more likely than the other, but to
	 * have the code path after the following if() executed as
	 * fast as possible for the case of a non rotational device
	 * with queueing. We want it because this is the fastest kind
	 * of device. On the opposite end, the likely() may lengthen
	 * the execution time of BFQ for the case of slower devices
	 * (rotational or at least without queueing). But in this case
	 * the execution time of BFQ matters very little, if not at
	 * all.
	 */
	if (likely(bfqd->nonrot_with_queueing))
		return NULL;

	/*
	 * Prevent bfqq from being merged if it has been created too
	 * long ago. The idea is that true cooperating processes, and
	 * thus their associated bfq_queues, are supposed to be
	 * created shortly after each other. This is the case, e.g.,
	 * for KVM/QEMU and dump I/O threads. Basing on this
	 * assumption, the following filtering greatly reduces the
	 * probability that two non-cooperating processes, which just
	 * happen to do close I/O for some short time interval, have
	 * their queues merged by mistake.
	 */
	if (bfq_too_late_for_merging(bfqq))
		return NULL;

	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
		return NULL;

	/* If there is only one backlogged queue, don't search. */
	if ((*klpe_bfq_tot_busy_queues)(bfqd) == 1)
		return NULL;

	in_service_bfqq = bfqd->in_service_queue;

	/*
	 * First candidate: the in-service queue, provided that the new
	 * I/O is close to bfqd->in_serv_last_pos and that both queues
	 * have the same parent entity.
	 */
	if (in_service_bfqq && in_service_bfqq != bfqq &&
	    likely(in_service_bfqq != &bfqd->oom_bfqq) &&
	    bfq_rq_close_to_sector(io_struct, request,
				   bfqd->in_serv_last_pos) &&
	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
	    (*klpe_bfq_may_be_close_cooperator)(bfqq, in_service_bfqq)) {
		new_bfqq = (*klpe_bfq_setup_merge)(bfqq, in_service_bfqq);
		if (new_bfqq)
			return new_bfqq;
	}
	/*
	 * Check whether there is a cooperator among currently scheduled
	 * queues. The only thing we need is that the bio/request is not
	 * NULL, as we need it to establish whether a cooperator exists.
	 */
	new_bfqq = klpr_bfq_find_close_cooperator(bfqd, bfqq,
			bfq_io_struct_pos(io_struct, request));

	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
	    (*klpe_bfq_may_be_close_cooperator)(bfqq, new_bfqq))
		return (*klpe_bfq_setup_merge)(bfqq, new_bfqq);

	return NULL;
}

/*
 * Function pointer used by klpr___bfq_insert_request() to complete a
 * queue merge.
 * NOTE(review): presumably points to bfq_merge_bfqqs(); its klp_funcs[]
 * entry is outside the visible part of the table - confirm.
 */
static void
(*klpe_bfq_merge_bfqqs)(struct bfq_data *bfqd, struct bfq_io_cq *bic,
		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq);

static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}

/*
 * Repeated declaration of klpe_idling_boosts_thr_without_issues: the
 * same pointer, with the same type, is also declared ahead of
 * klpr_bfq_setup_cooperator().
 */
static bool (*klpe_idling_boosts_thr_without_issues)(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq);

/* Paired with the symbol name "bfq_better_to_idle" in klp_funcs[]. */
static bool (*klpe_bfq_better_to_idle)(struct bfq_queue *bfqq);

/*
 * Drop one stable-merge reference on @bfqq: decrement its stable_ref
 * counter, then put the queue.
 * NOTE(review): the counter is updated before the put, presumably because
 * the put may release the queue - confirm against bfq_put_queue().
 */
static void klpr_bfq_put_stable_ref(struct bfq_queue *bfqq)
{
	bfqq->stable_ref -= 1;

	(*klpe_bfq_put_queue)(bfqq);
}

static void
klpr_bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		pr_err("bdi %s: bfq: bad prio class %d\n",
				bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
				ioprio_class);
		fallthrough;
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
	}

	bfqq->entity.new_weight = (*klpe_bfq_ioprio_to_weight)(bfqq->new_ioprio);
	klpr_bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d",
		     bfqq->new_ioprio, bfqq->entity.new_weight);
	bfqq->entity.prio_changed = 1;
}

/*
 * Function pointer paired with the symbol name "bfq_get_queue" in
 * klp_funcs[].
 * NOTE(review): presumably resolved at livepatch load time by the code
 * behind ../kallsyms_relocs.h - confirm.
 */
static struct bfq_queue *(*klpe_bfq_get_queue)(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic,
				       bool respawn);

/*
 * Propagate a change of the io context's ioprio to the queues of @bic:
 * the async queue, if any, is replaced by the one returned by
 * bfq_get_queue, and the sync queue, if any, gets its next priority data
 * recomputed. Nothing is done when @bic has no bfq_data or when the
 * cached ioprio is already up to date.
 */
static void klpr_bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = (*klpe_bic_to_bfqd)(bic);
	int ioprio = bic->icq.ioc->ioprio;
	struct bfq_queue *async_bfqq, *sync_bfqq;

	/*
	 * This condition may trigger on a newly created bic. (The
	 * original comment also asks to drop the lock before
	 * returning; no lock is taken in this function.)
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	async_bfqq = (*klpe_bic_to_bfqq)(bic, false);
	if (async_bfqq) {
		struct bfq_queue *fresh_bfqq;

		/* Install the replacement before releasing the old queue. */
		fresh_bfqq = (*klpe_bfq_get_queue)(bfqd, bio, false, bic, true);
		(*klpe_bic_set_bfqq)(bic, fresh_bfqq, false);
		(*klpe_bfq_release_process_ref)(bfqd, async_bfqq);
	}

	sync_bfqq = (*klpe_bic_to_bfqq)(bic, true);
	if (sync_bfqq)
		klpr_bfq_set_next_ioprio_data(sync_bfqq, bic);
}

/*
 * Repeated declaration of klpe_bfq_get_queue: the same pointer, with the
 * same type, is also declared ahead of klpr_bfq_check_ioprio_change().
 */
static struct bfq_queue *(*klpe_bfq_get_queue)(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic,
				       bool respawn);

/*
 * Feed a new think-time sample into the statistics kept in bfqq->ttime.
 * The sample is the time elapsed since ttime.last_end_request, capped at
 * twice bfqd->bfq_slice_idle. Each update keeps 7/8 of the previous
 * samples/total values and adds the new contribution scaled by 256.
 */
static void klpr_bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *tt = &bfqq->ttime;
	u64 delta;

	/*
	 * What matters is how long the queue takes to become busy
	 * when it has no outstanding IO, so skip the update if the
	 * bfq queue already has IO dispatched or queued.
	 */
	if (bfqq->dispatched || (*klpe_bfq_bfqq_busy)(bfqq))
		return;

	delta = ktime_get_ns() - tt->last_end_request;
	delta = min_t(u64, delta, 2ULL * bfqd->bfq_slice_idle);

	tt->ttime_samples = (7*tt->ttime_samples + 256) / 8;
	tt->ttime_total = div_u64(7*tt->ttime_total + 256*delta, 8);
	tt->ttime_mean = div64_ul(tt->ttime_total + 128, tt->ttime_samples);
}

/*
 * Shift the seekiness of @rq into bfqq->seek_history and, if a queue in
 * soft real-time weight raising has become totally seeky, stop that
 * raising.
 */
static void
klpr_bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);

	/* Only totally seeky queues in soft_rt weight raising matter here. */
	if (bfqq->wr_coeff <= 1 ||
	    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
	    !BFQQ_TOTALLY_SEEKY(bfqq))
		return;

	if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
				   bfq_wr_duration(bfqd))) {
		/*
		 * In soft_rt weight raising with the
		 * interactive-weight-raising period elapsed (so no
		 * switch back to interactive weight raising).
		 */
		klpr_bfq_bfqq_end_wr(bfqq);
		return;
	}

	/*
	 * Stopping soft_rt weight raising while still in the
	 * interactive period: switch back to interactive weight
	 * raising.
	 */
	switch_back_to_interactive_wr(bfqq, bfqd);
	bfqq->entity.prio_changed = 1;
}

/*
 * Re-evaluate the has_short_ttime flag of @bfqq from its think-time
 * statistics and, when the flag changes under the conditions detailed
 * below, reset the inject limit of the queue.
 */
static void klpr_bfq_update_has_short_ttime(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq,
				       struct bfq_io_cq *bic)
{
	bool has_short_ttime, state_changed;

	/*
	 * Nothing to update if bfqq is async or in the idle io prio
	 * class, or if bfq_slice_idle is zero: no device idling is
	 * performed for bfqq in these cases.
	 */
	if (!(*klpe_bfq_bfqq_sync)(bfqq) || bfq_class_idle(bfqq) ||
	    bfqd->bfq_slice_idle == 0)
		return;

	/* Idle window just restored, statistics are meaningless. */
	if (time_is_after_eq_jiffies(bfqq->split_time +
				     bfqd->bfq_wr_min_idle_time))
		return;

	/*
	 * Think time is infinite if no process is linked to bfqq.
	 * Otherwise the average think time decides: it is compared
	 * with half the I/O-plugging timeout.
	 */
	has_short_ttime =
		atomic_read(&bic->icq.ioc->active_ref) != 0 &&
		!(bfq_sample_valid(bfqq->ttime.ttime_samples) &&
		  bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1);

	state_changed = has_short_ttime != (*klpe_bfq_bfqq_has_short_ttime)(bfqq);

	if (has_short_ttime)
		(*klpe_bfq_mark_bfqq_has_short_ttime)(bfqq);
	else
		(*klpe_bfq_clear_bfqq_has_short_ttime)(bfqq);

	/*
	 * As long as the base value for the total service time has
	 * not been computed for bfqq, the inject limit depends on the
	 * think-time state: it is 0 if the think time is deemed
	 * short, 1 if it is deemed long (details in the comments in
	 * bfq_update_inject_limit()). So the inject limit is reset
	 * if the think-time state has changed and the base value is
	 * still to be computed.
	 *
	 * The reset is performed, however, only if more than 100 ms
	 * have elapsed since the last update of the inject limit, or
	 * (inclusive) if the change is from short to long think
	 * time. The reason for this waiting is as follows.
	 *
	 * bfqq may have a long think time because it is synchronized
	 * with some other queue, i.e., because the I/O of some other
	 * queue may need to be completed for bfqq to receive new
	 * I/O. Details in the comments on the choice of the queue
	 * for injection in bfq_select_queue().
	 *
	 * As stressed in those comments, if such a synchronization is
	 * in place then, without injection on bfqq, the blocking I/O
	 * cannot be served while bfqq is in service. Hence, if bfqq
	 * is granted I/O-dispatch-plugging, bfqq remains empty and
	 * no I/O is dispatched until the idle timeout fires. This is
	 * likely to result in lower bandwidth and higher latencies
	 * for bfqq, and in a severe loss of total throughput.
	 *
	 * A non-zero inject limit, instead, may allow the I/O that
	 * blocks bfqq to be executed soon, and therefore bfqq to
	 * receive new I/O soon.
	 *
	 * But, once the blocking is eliminated, the next think-time
	 * sample for bfqq may be very low, and bfqq's think time may
	 * then be deemed short. Without the 100 ms barrier, this new
	 * state change would trigger the reset below immediately,
	 * setting the inject limit to 0. Without injection, the
	 * blocking I/O would make the think time of bfqq long again,
	 * the inject limit would be raised again, and so on. The
	 * only effect of such a steady oscillation between the two
	 * think-time states would be to prevent effective injection
	 * on bfqq.
	 *
	 * If, in contrast, the inject limit is not reset during a
	 * time interval as long as 100 ms, the number of short
	 * think-time samples can grow significantly before the
	 * reset, and the think-time state can become stable before
	 * it. Then there is no state change when the 100 ms elapse,
	 * and no reset of the inject limit, which stays equal to 1
	 * both during and after the 100 ms. So injection can be
	 * performed at all times, and throughput gets boosted.
	 *
	 * An inject limit equal to 1 is, in general, in conflict
	 * with a short think time, because injection may be likely
	 * to delay bfqq's I/O (as explained in the comments in
	 * bfq_update_inject_limit()). This does not happen in this
	 * special case: bfqq's low think time is due to an effective
	 * handling of a synchronization, through injection, so
	 * bfqq's I/O is not delayed by injection; on the contrary,
	 * it is brought forward, because it is not blocked for
	 * milliseconds.
	 *
	 * In addition, serving the blocking I/O much sooner, and
	 * much more frequently than once per I/O-plugging timeout,
	 * makes it much quicker to detect a waker queue (a concept
	 * defined in the comments in bfq_add_request()). This makes
	 * it possible to start sooner to boost throughput more
	 * effectively, by injecting the I/O of the waker queue
	 * unconditionally on every bfq_dispatch_request().
	 *
	 * One last, important benefit of not resetting the inject
	 * limit before 100 ms: during this interval the base value
	 * for the total service time is likely to get finally
	 * computed for bfqq, freeing the inject limit from its
	 * relation with the think time.
	 */
	if (!state_changed || bfqq->last_serv_time_ns != 0)
		return;

	if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
				      msecs_to_jiffies(100)) ||
	    !has_short_ttime)
		klpr_bfq_reset_inject_limit(bfqd, bfqq);
}

/*
 * Bookkeeping done once @rq has been queued in @bfqq: update the count of
 * pending metadata requests and the last request position and, if @bfqq
 * is the in-service queue waiting for a request, decide whether to keep
 * idling, stop idling, or expire the queue.
 */
static void klpr_bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	bool small_req, budget_timeout;

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	/* The rest only concerns an in-service queue being idled on. */
	if (bfqq != bfqd->in_service_queue ||
	    !(*klpe_bfq_bfqq_wait_request)(bfqq))
		return;

	small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
		    blk_rq_sectors(rq) < 32;
	budget_timeout = bfq_bfqq_budget_timeout(bfqq);

	/*
	 * There is just this request queued: if
	 * - the request is small, and
	 * - we are idling to boost throughput, and
	 * - the queue is not to be expired,
	 * then just exit.
	 *
	 * In this way, if the device is being idled to wait for a
	 * new request from the in-service queue, we avoid unplugging
	 * the device and committing it to serve just a small
	 * request. We rather wait for the block layer to decide when
	 * to unplug the device: hopefully, new requests will be
	 * merged to this one quickly, then the device will be
	 * unplugged and larger requests will be dispatched.
	 */
	if (small_req && (*klpe_idling_boosts_thr_without_issues)(bfqd, bfqq) &&
	    !budget_timeout)
		return;

	/*
	 * A large enough request arrived, or idling is being
	 * performed to preserve service guarantees, or finally the
	 * queue is to be expired: in all these cases disk idling is
	 * to be stopped, so clear the wait_request flag and reset
	 * the timer.
	 */
	(*klpe_bfq_clear_bfqq_wait_request)(bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);

	/*
	 * The queue is not empty, because a new request just
	 * arrived. Hence the queue can be safely expired, in case of
	 * budget timeout, without risking that its timestamps are
	 * not updated correctly. See [1] for more details.
	 */
	if (budget_timeout)
		(*klpe_bfq_bfqq_expire)(bfqd, bfqq, false,
				BFQQE_BUDGET_TIMEOUT);
}

static void bfqq_request_allocated(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	for_each_entity(entity)
		entity->allocated++;
}

static void bfqq_request_freed(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	for_each_entity(entity)
		entity->allocated--;
}

/*
 * Insert @rq into the bfq_queue recorded in the request, or into the
 * queue selected by klpr_bfq_setup_cooperator() if a merge is due. In the
 * latter case the request's allocation count and reference are moved from
 * the old queue to the new one and rq->elv.priv[1] is redirected.
 *
 * Returns true if the destination queue had its wait_request flag set
 * before klpr_bfq_add_request() and no longer has it afterwards.
 */
static bool klpr___bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq),
		*new_bfqq = klpr_bfq_setup_cooperator(bfqd, bfqq, rq, true,
						 RQ_BIC(rq));
	bool waiting, idle_timer_disabled = false;

	if (new_bfqq) {
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.
		 */
		bfqq_request_allocated(new_bfqq);
		bfqq_request_freed(bfqq);
		new_bfqq->ref++;
		/*
		 * If the bic associated with the process
		 * issuing this request still points to bfqq
		 * (and thus has not been already redirected
		 * to new_bfqq or even some other bfq_queue),
		 * then complete the merge and redirect it to
		 * new_bfqq.
		 */
		if ((*klpe_bic_to_bfqq)(RQ_BIC(rq), 1) == bfqq)
			(*klpe_bfq_merge_bfqqs)(bfqd, RQ_BIC(rq),
					bfqq, new_bfqq);

		(*klpe_bfq_clear_bfqq_just_created)(bfqq);
		/*
		 * rq is about to be enqueued into new_bfqq,
		 * release rq reference on bfqq
		 */
		(*klpe_bfq_put_queue)(bfqq);
		rq->elv.priv[1] = new_bfqq;
		bfqq = new_bfqq;
	}

	/* Refresh the statistics of the queue the request ends up in. */
	klpr_bfq_update_io_thinktime(bfqd, bfqq);
	klpr_bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
	klpr_bfq_update_io_seektime(bfqd, bfqq, rq);

	/*
	 * Sample the wait_request flag around klpr_bfq_add_request() to
	 * detect whether adding the request cleared it. bfqq has already
	 * been dereferenced above, so the NULL test here is redundant.
	 */
	waiting = bfqq && (*klpe_bfq_bfqq_wait_request)(bfqq);
	klpr_bfq_add_request(rq);
	idle_timer_disabled = waiting && !(*klpe_bfq_bfqq_wait_request)(bfqq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	klpr_bfq_rq_enqueued(bfqd, bfqq, rq);

	return idle_timer_disabled;
}

/*
 * Only the !CONFIG_BFQ_CGROUP_DEBUG variant is present in this file: a
 * build with CONFIG_BFQ_CGROUP_DEBUG defined stops at the #error below,
 * otherwise bfq_update_insert_stats() is an empty stub.
 */
#ifdef CONFIG_BFQ_CGROUP_DEBUG
#error "klp-ccp: non-taken branch"
#else
static inline void bfq_update_insert_stats(struct request_queue *q,
					   struct bfq_queue *bfqq,
					   bool idle_timer_disabled,
					   unsigned int cmd_flags) {}
#endif /* CONFIG_BFQ_CGROUP_DEBUG */

static struct bfq_queue *klpp_bfq_init_rq(struct request *rq);

/*
 * Insert a single request into the scheduler.
 *
 * @hctx:    hardware context; only hctx->queue is used
 * @rq:      request to insert
 * @at_head: if true, @rq goes to the head of bfqd->dispatch instead of
 *           into a bfq_queue
 *
 * With bfqd->lock held, the request is initialized through
 * klpp_bfq_init_rq() and an insert merge is attempted; if the merge
 * succeeds, the lock is dropped, the requests collected on the local
 * "free" list are freed and the function returns. Otherwise @rq is put
 * either on the dispatch list (no bfq_queue, or @at_head) or into its
 * bfq_queue through klpr___bfq_insert_request().
 */
static void klpr_bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_queue *bfqq;
	bool idle_timer_disabled = false;
	unsigned int cmd_flags;
	LIST_HEAD(free);

	/* Done before bfqd->lock is taken. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
		(*klpe_bfqg_stats_update_legacy_io)(q, rq);
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
	spin_lock_irq(&bfqd->lock);
	bfqq = klpp_bfq_init_rq(rq);
	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
		spin_unlock_irq(&bfqd->lock);
		blk_mq_free_requests(&free);
		return;
	}

	klpr_trace_block_rq_insert(rq);

	if (!bfqq || at_head) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		idle_timer_disabled = klpr___bfq_insert_request(bfqd, rq);
		/*
		 * Update bfqq, because, if a queue merge has occurred
		 * in __bfq_insert_request, then rq has been
		 * redirected into a new queue.
		 */
		bfqq = RQ_BFQQ(rq);

		/* Make a mergeable request findable for later merges. */
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	/*
	 * Cache cmd_flags before releasing scheduler lock, because rq
	 * may disappear afterwards (for example, because of a request
	 * merge).
	 */
	cmd_flags = rq->cmd_flags;
	spin_unlock_irq(&bfqd->lock);

	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
				cmd_flags);
}

/*
 * Drain @list: each request is unlinked from the list, in order, and
 * handed to klpr_bfq_insert_request() together with @hctx and @at_head.
 */
void klpp_bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	struct request *rq;

	while ((rq = list_first_entry_or_null(list, struct request,
					      queuelist))) {
		list_del_init(&rq->queuelist);
		klpr_bfq_insert_request(hctx, rq, at_head);
	}
}

/*
 * Detach the process owning @bic from the shared queue @bfqq.
 *
 * If @bfqq has exactly one process reference, it is kept: it takes the
 * pid of the current task, its coop and split_coop flags are cleared and
 * it is returned. Otherwise the sync queue slot of @bic is emptied, the
 * process reference on @bfqq is released and NULL is returned.
 */
static struct bfq_queue *
klpr_bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
	klpr_bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

	if (bfqq_process_refs(bfqq) != 1) {
		(*klpe_bic_set_bfqq)(bic, NULL, 1);

		(*klpe_bfq_put_cooperator)(bfqq);

		(*klpe_bfq_release_process_ref)(bfqq->bfqd, bfqq);
		return NULL;
	}

	/* Sole process reference: the queue can simply be reused. */
	bfqq->pid = current->pid;
	(*klpe_bfq_clear_bfqq_coop)(bfqq);
	(*klpe_bfq_clear_bfqq_split_coop)(bfqq);

	return bfqq;
}

/*
 * Return the sync or async queue (depending on @is_sync) associated with
 * @bic. If that queue is missing or is the oom queue, get a queue through
 * bfq_get_queue, install it in @bic and, when @new_queue is non-NULL,
 * set *@new_queue to true. After a split of a sync queue, the burst
 * state saved in @bic is also restored and split_time is set.
 */
static struct bfq_queue *klpr_bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
						   struct bfq_io_cq *bic,
						   struct bio *bio,
						   bool split, bool is_sync,
						   bool *new_queue)
{
	struct bfq_queue *bfqq = (*klpe_bic_to_bfqq)(bic, is_sync);

	if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
		return bfqq;

	if (new_queue)
		*new_queue = true;

	/* Here a non-NULL bfqq can only be the oom queue: drop it. */
	if (bfqq)
		(*klpe_bfq_put_queue)(bfqq);

	bfqq = (*klpe_bfq_get_queue)(bfqd, bio, is_sync, bic, split);
	(*klpe_bic_set_bfqq)(bic, bfqq, is_sync);

	if (!split || !is_sync)
		return bfqq;

	if ((bic->was_in_burst_list && bfqd->large_burst) ||
	    bic->saved_in_large_burst) {
		(*klpe_bfq_mark_bfqq_in_large_burst)(bfqq);
	} else {
		(*klpe_bfq_clear_bfqq_in_large_burst)(bfqq);
		/*
		 * If bfqq was in the current burst list before being
		 * merged, it has to be added back. burst_size needs
		 * no increase, as it was not decremented when bfqq
		 * was removed from the burst list as a consequence
		 * of a merge (see comments in bfq_put_queue). In
		 * this respect, it would be rather costly to know
		 * whether the current burst list is still the same
		 * burst list bfqq was removed from on the merge. To
		 * avoid this cost, a bfqq that was in a burst list
		 * is added to the current burst list without any
		 * further check. This can cause inappropriate
		 * insertions, but rarely enough to not harm the
		 * detection of large bursts significantly.
		 */
		if (bic->was_in_burst_list)
			hlist_add_head(&bfqq->burst_list_node,
				       &bfqd->burst_list);
	}
	bfqq->split_time = jiffies;

	return bfqq;
}

/*
 * Associate @rq with a bfq_io_cq and a bfq_queue.
 *
 * Returns NULL if @rq has no icq, the queue already stored in
 * rq->elv.priv[1] if there is one, and otherwise the queue selected for
 * the request. On that last path the queue's allocation count and
 * reference count and bic->requests are incremented, and rq->elv.priv[0]
 * and rq->elv.priv[1] are set to the bic and the queue.
 *
 * A shared queue that has its coop and split_coop flags set, and whose
 * bic is not stably merged, is split here before the request is
 * attached.
 */
static struct bfq_queue *klpp_bfq_init_rq(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct bio *bio = rq->bio;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic;
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;
	bool new_queue = false;
	bool bfqq_already_existing = false, split = false;

	if (unlikely(!rq->elv.icq))
		return NULL;

	/*
	 * Assuming that elv.priv[1] is set only if everything is set
	 * for this rq. This holds true, because this function is
	 * invoked only for insertion or merging, and, after such
	 * events, a request cannot be manipulated any longer before
	 * being removed from bfq.
	 */
	if (rq->elv.priv[1])
		return rq->elv.priv[1];

	bic = icq_to_bic(rq->elv.icq);

	klpr_bfq_check_ioprio_change(bic, bio);

	(*klpe_bfq_bic_update_cgroup)(bic, bio);

	bfqq = klpr_bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
					 &new_queue);

	if (likely(!new_queue)) {
		/* If the queue was seeky for too long, break it apart. */
		if ((*klpe_bfq_bfqq_coop)(bfqq) && (*klpe_bfq_bfqq_split_coop)(bfqq) &&
			!bic->stably_merged) {
			struct bfq_queue *old_bfqq = bfqq;

			/* Update bic before losing reference to bfqq */
			if ((*klpe_bfq_bfqq_in_large_burst)(bfqq))
				bic->saved_in_large_burst = true;

			bfqq = klpr_bfq_split_bfqq(bic, bfqq);
			split = true;

			/*
			 * A NULL result means the shared queue was left
			 * behind and a queue must be obtained for this bic;
			 * a non-NULL result is the old queue being reused.
			 */
			if (!bfqq) {
				bfqq = klpr_bfq_get_bfqq_handle_split(bfqd, bic, bio,
								 true, is_sync,
								 NULL);
				if (unlikely(bfqq == &bfqd->oom_bfqq))
					bfqq_already_existing = true;
			} else
				bfqq_already_existing = true;

			if (!bfqq_already_existing) {
				bfqq->waker_bfqq = old_bfqq->waker_bfqq;
				bfqq->tentative_waker_bfqq = NULL;

				/*
				 * If the waker queue disappears, then
				 * new_bfqq->waker_bfqq must be
				 * reset. So insert new_bfqq into the
				 * woken_list of the waker. See
				 * bfq_check_waker for details.
				 */
				if (bfqq->waker_bfqq)
					hlist_add_head(&bfqq->woken_list_node,
						       &bfqq->waker_bfqq->woken_list);
			}
		}
	}

	bfqq_request_allocated(bfqq);
	bfqq->ref++;
	bic->requests++;
	klpr_bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	/*
	 * If a bfq_queue has only one process reference, it is owned
	 * by only this bic: we can then set bfqq->bic = bic. in
	 * addition, if the queue has also just been split, we have to
	 * resume its state.
	 *
	 * A queue with a scheduled merge (bfqq->new_bfqq set) is left
	 * alone here.
	 * NOTE(review): the !bfqq->new_bfqq test is presumably the change
	 * carried by this livepatch for CVE-2024-47706 - confirm against
	 * the upstream fix.
	 */
	if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq &&
	    bfqq_process_refs(bfqq) == 1) {
		bfqq->bic = bic;
		if (split) {
			/*
			 * The queue has just been split from a shared
			 * queue: restore the idle window and the
			 * possible weight raising period.
			 */
			klpr_bfq_bfqq_resume_state(bfqq, bfqd, bic,
					      bfqq_already_existing);
		}
	}

	/*
	 * Consider bfqq as possibly belonging to a burst of newly
	 * created queues only if:
	 * 1) A burst is actually happening (bfqd->burst_size > 0)
	 * or
	 * 2) There is no other active queue. In fact, if, in
	 *    contrast, there are active queues not belonging to the
	 *    possible burst bfqq may belong to, then there is no gain
	 *    in considering bfqq as belonging to a burst, and
	 *    therefore in not weight-raising bfqq. See comments on
	 *    bfq_handle_burst().
	 *
	 * This filtering also helps eliminating false positives,
	 * occurring when bfqq does not belong to an actual large
	 * burst, but some background task (e.g., a service) happens
	 * to trigger the creation of new queues very close to when
	 * bfqq and its possible companion queues are created. See
	 * comments on bfq_handle_burst() for further details also on
	 * this issue.
	 */
	if (unlikely((*klpe_bfq_bfqq_just_created)(bfqq) &&
		     (bfqd->burst_size > 0 ||
		      (*klpe_bfq_tot_busy_queues)(bfqd) == 0)))
		klpr_bfq_handle_burst(bfqd, bfqq);

	return bfqq;
}


#include "livepatch_bsc1231943.h"

#include <linux/kernel.h>
#include "../kallsyms_relocs.h"

static struct klp_kallsyms_reloc klp_funcs[] = {
	{ "__traceiter_block_rq_insert",
	  (void *)&klpe___traceiter_block_rq_insert },
	{ "__tracepoint_block_rq_insert",
	  (void *)&klpe___tracepoint_block_rq_insert },
	{ "bfq_add_bfqq_busy", (void *)&klpe_bfq_add_bfqq_busy },
	{ "bfq_asymmetric_scenario", (void *)&klpe_bfq_asymmetric_scenario },
	{ "bfq_better_to_idle", (void *)&klpe_bfq_better_to_idle },
	{ "bfq_bfqq_busy", (void *)&klpe_bfq_bfqq_busy },
	{ "bfq_bfqq_coop", (void *)&klpe_bfq_bfqq_coop },
	{ "bfq_bfqq_expire", (void *)&klpe_bfq_bfqq_expire },
	{ "bfq_bfqq_has_short_ttime", (void *)&klpe_bfq_bfqq_has_short_ttime },
	{ "bfq_bfqq_in_large_burst", (void *)&klpe_bfq_bfqq_in_large_burst },
	{ "bfq_bfqq_just_created", (void *)&klpe_bfq_bfqq_just_created },
	{ "bfq_bfqq_non_blocking_wait_rq",
	  (void *)&klpe_bfq_bfqq_non_blocking_wait_rq },
	{ "bfq_bfqq_split_coop", (void *)&klpe_bfq_bfqq_split_coop },
	{ "bfq_bfqq_sync", (void *)&klpe_bfq_bfqq_sync },
	{ "bfq_bfqq_to_bfqg", (void *)&klpe_bfq_bfqq_to_bfqg },
	{ "bfq_bfqq_wait_request", (void *)&klpe_bfq_bfqq_wait_request },
	{ "bfq_bic_update_cgroup", (void *)&klpe_bfq_bic_update_cgroup },
	{ "bfq_clear_bfqq_IO_bound", (void *)&klpe_bfq_clear_bfqq_IO_bound },
	{ "bfq_clear_bfqq_coop", (void *)&klpe_bfq_clear_bfqq_coop },
	{ "bfq_clear_bfqq_has_short_ttime",
	  (void *)&klpe_bfq_clear_bfqq_has_short_ttime },
	{ "bfq_clear_bfqq_in_large_burst",
	  (void *)&klpe_bfq_clear_bfqq_in_large_burst },
	{ "bfq_clear_bfqq_just_created",
	  (void *)&klpe_bfq_clear_bfqq_just_created },
	{ "bfq_clear_bfqq_non_blocking_wait_rq",
	  (void *)&klpe_bfq_clear_bfqq_non_blocking_wait_rq },
	{ "bfq_clear_bfqq_softrt_update",
	  (void *)&klpe_bfq_clear_bfqq_softrt_update },
	{ "bfq_clear_bfqq_split_coop",
	  (void *)&klpe_bfq_clear_bfqq_split_coop },
	{ "bfq_clear_bfqq_wait_request",
	  (void *)&klpe_bfq_clear_bfqq_wait_request },
	{ "bfq_get_queue", (void *)&klpe_bfq_get_queue },
	{ "bfq_ioprio_to_weight", (void *)&klpe_bfq_ioprio_to_weight },
	{ "bfq_mark_bfqq_IO_bound", (void *)&klpe_bfq_mark_bfqq_IO_bound },
	{ "bfq_mark_bfqq_has_short_ttime",
	  (void *)&klpe_bfq_mark_bfqq_has_short_ttime },
	{ "bfq_mark_bfqq_in_large_burst",
	  (void *)&klpe_bfq_mark_bfqq_in_large_burst },
	{ "bfq_may_be_close_cooperator",
	  (void *)&klpe_bfq_may_be_close_cooperator },
	{ "bfq_merge_bfqqs", (void *)&klpe_bfq_merge_bfqqs },
	{ "bfq_pos_tree_add_move", (void *)&klpe_bfq_pos_tree_add_move },
	{ "bfq_put_cooperator", (void *)&klpe_bfq_put_cooperator },
	{ "bfq_put_queue", (void *)&klpe_bfq_put_queue },
	{ "bfq_release_process_ref", (void *)&klpe_bfq_release_process_ref },
	{ "bfq_setup_merge", (void *)&klpe_bfq_setup_merge },
	{ "bfq_tot_busy_queues", (void *)&klpe_bfq_tot_busy_queues },
	{ "bfq_updated_next_req", (void *)&klpe_bfq_updated_next_req },
	{ "bfqg_stats_update_legacy_io",
	  (void *)&klpe_bfqg_stats_update_legacy_io },
	{ "bfqg_to_blkg", (void *)&klpe_bfqg_to_blkg },
	{ "bfqq_group", (void *)&klpe_bfqq_group },
	{ "bic_set_bfqq", (void *)&klpe_bic_set_bfqq },
	{ "bic_to_bfqd", (void *)&klpe_bic_to_bfqd },
	{ "bic_to_bfqq", (void *)&klpe_bic_to_bfqq },
	{ "idling_boosts_thr_without_issues",
	  (void *)&klpe_idling_boosts_thr_without_issues },
	{ "next_queue_may_preempt", (void *)&klpe_next_queue_may_preempt },
};

int livepatch_bsc1231943_init(void)
{
	return klp_resolve_kallsyms_relocs(klp_funcs, ARRAY_SIZE(klp_funcs));
}
