VisionFive2 Linux kernel

StarFive Tech Linux Kernel for VisionFive (JH7110) boards (mirror)

author:    Linus Torvalds <torvalds@linux-foundation.org> 2021-07-10 10:46:14 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2021-07-10 10:46:14 -0700
commit:    e98e03d075537a14928661ebfbfcde34b0eced1a
parent:    379cf80a9861e4356792185bc3fcdd7d4133f2f7
Commit Summary:
Merge tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Diffstat:
55 files changed, 1791 insertions, 1497 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 07b2328f29e0..a0e2130f0100 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -163,7 +163,6 @@ config S390
 	select HAVE_GCC_PLUGINS
 	select HAVE_GENERIC_VDSO
 	select HAVE_IOREMAP_PROT if PCI
-	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZ4
@@ -438,6 +437,7 @@ config COMPAT
 	select COMPAT_OLD_SIGACTION
 	select HAVE_UID16
 	depends on MULTIUSER
+	depends on !CC_IS_CLANG
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 098abe3a56f3..95c75e653e43 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -166,6 +166,19 @@ archheaders:
 archprepare:
 	$(Q)$(MAKE) $(build)=$(syscalls) kapi
 	$(Q)$(MAKE) $(build)=$(tools) kapi
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+	$(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+	$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+		$(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+endif
 
 # Don't use tabs in echo arguments
 define archhelp
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index bbe4df6c2f8b..d0cf21641e3a 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -23,6 +23,7 @@ unsigned long __bootdata_preserved(vmemmap_size);
 unsigned long __bootdata_preserved(MODULES_VADDR);
 unsigned long __bootdata_preserved(MODULES_END);
 unsigned long __bootdata(ident_map_size);
+int __bootdata(is_full_image) = 1;
 
 u64 __bootdata_preserved(stfle_fac_list[16]);
 u64 __bootdata_preserved(alt_stfle_fac_list[16]);
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 82b99b916243..f6b0c4f43c99 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -36,6 +36,7 @@ void uv_query_info(void)
 		uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
 		uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
 		uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
+		uv_info.uv_feature_indications = uvcb.uv_feature_indications;
 	}
 
 #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index 837d1699b109..3afbee21dc1f 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -53,18 +53,20 @@ struct ap_queue_status {
  */
 static inline bool ap_instructions_available(void)
 {
-	register unsigned long reg0 asm ("0") = AP_MKQID(0, 0);
-	register unsigned long reg1 asm ("1") = 0;
-	register unsigned long reg2 asm ("2") = 0;
+	unsigned long reg0 = AP_MKQID(0, 0);
+	unsigned long reg1 = 0;
 
 	asm volatile(
-		"   .long 0xb2af0000\n"		/* PQAP(TAPQ) */
-		"0: la    %0,1\n"
+		"	lgr	0,%[reg0]\n"   /* qid into gr0 */
+		"	lghi	1,0\n"	       /* 0 into gr1 */
+		"	lghi	2,0\n"	       /* 0 into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(TAPQ) */
+		"0:	la	%[reg1],1\n"   /* 1 into reg1 */
 		"1:\n"
 		EX_TABLE(0b, 1b)
-		: "+d" (reg1), "+d" (reg2)
-		: "d" (reg0)
-		: "cc");
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1", "2");
 	return reg1 != 0;
 }
 
@@ -77,14 +79,18 @@ static inline bool ap_instructions_available(void)
  */
 static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info)
 {
-	register unsigned long reg0 asm ("0") = qid;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm ("2");
-
-	asm volatile(".long 0xb2af0000"		/* PQAP(TAPQ) */
-		     : "=d" (reg1), "=d" (reg2)
-		     : "d" (reg0)
-		     : "cc");
+	struct ap_queue_status reg1;
+	unsigned long reg2;
+
+	asm volatile(
+		"	lgr	0,%[qid]\n"    /* qid into gr0 */
+		"	lghi	2,0\n"	       /* 0 into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(TAPQ) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* gr2 into reg2 */
+		: [reg1] "=&d" (reg1), [reg2] "=&d" (reg2)
+		: [qid] "d" (qid)
+		: "cc", "0", "1", "2");
 	if (info)
 		*info = reg2;
 	return reg1;
@@ -115,14 +121,16 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid,
  */
 static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
 {
-	register unsigned long reg0 asm ("0") = qid | (1UL << 24);
-	register struct ap_queue_status reg1 asm ("1");
+	unsigned long reg0 = qid | (1UL << 24);  /* fc 1UL is RAPQ */
+	struct ap_queue_status reg1;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(RAPQ) */
-		: "=d" (reg1)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"  /* qid arg into gr0 */
+		"	.long	0xb2af0000\n" /* PQAP(RAPQ) */
+		"	lgr	%[reg1],1\n"  /* gr1 (status) into reg1 */
+		: [reg1] "=&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1");
 	return reg1;
 }
 
@@ -134,14 +142,16 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
  */
 static inline struct ap_queue_status ap_zapq(ap_qid_t qid)
 {
-	register unsigned long reg0 asm ("0") = qid | (2UL << 24);
-	register struct ap_queue_status reg1 asm ("1");
+	unsigned long reg0 = qid | (2UL << 24);  /* fc 2UL is ZAPQ */
+	struct ap_queue_status reg1;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(ZAPQ) */
-		: "=d" (reg1)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid arg into gr0 */
+		"	.long	0xb2af0000\n"  /* PQAP(ZAPQ) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		: [reg1] "=&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1");
 	return reg1;
 }
 
@@ -172,18 +182,20 @@ struct ap_config_info {
  */
 static inline int ap_qci(struct ap_config_info *config)
 {
-	register unsigned long reg0 asm ("0") = 4UL << 24;
-	register unsigned long reg1 asm ("1") = -EOPNOTSUPP;
-	register struct ap_config_info *reg2 asm ("2") = config;
+	unsigned long reg0 = 4UL << 24;  /* fc 4UL is QCI */
+	unsigned long reg1 = -EOPNOTSUPP;
+	struct ap_config_info *reg2 = config;
 
 	asm volatile(
-		".long 0xb2af0000\n"		/* PQAP(QCI) */
-		"0: la    %0,0\n"
+		"	lgr	0,%[reg0]\n"   /* QCI fc into gr0 */
+		"	lgr	2,%[reg2]\n"   /* ptr to config into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(QCI) */
+		"0:	la	%[reg1],0\n"   /* good case, QCI fc available */
 		"1:\n"
 		EX_TABLE(0b, 1b)
-		: "+d" (reg1)
-		: "d" (reg0), "d" (reg2)
-		: "cc", "memory");
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0), [reg2] "d" (reg2)
+		: "cc", "memory", "0", "2");
 
 	return reg1;
 }
@@ -220,21 +232,25 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid,
 					     struct ap_qirq_ctrl qirqctrl,
 					     void *ind)
 {
-	register unsigned long reg0 asm ("0") = qid | (3UL << 24);
-	register union {
+	unsigned long reg0 = qid | (3UL << 24);  /* fc 3UL is AQIC */
+	union {
 		unsigned long value;
 		struct ap_qirq_ctrl qirqctrl;
 		struct ap_queue_status status;
-	} reg1 asm ("1");
-	register void *reg2 asm ("2") = ind;
+	} reg1;
+	void *reg2 = ind;
 
 	reg1.qirqctrl = qirqctrl;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(AQIC) */
-		: "+d" (reg1)
-		: "d" (reg0), "d" (reg2)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lgr	1,%[reg1]\n"   /* irq ctrl into gr1 */
+		"	lgr	2,%[reg2]\n"   /* ni addr into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(AQIC) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0), [reg2] "d" (reg2)
+		: "cc", "0", "1", "2");
 
 	return reg1.status;
 }
@@ -268,21 +284,24 @@ union ap_qact_ap_info {
 static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit,
 					     union ap_qact_ap_info *apinfo)
 {
-	register unsigned long reg0 asm ("0") = qid | (5UL << 24)
-		| ((ifbit & 0x01) << 22);
-	register union {
+	unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22);
+	union {
 		unsigned long value;
 		struct ap_queue_status status;
-	} reg1 asm ("1");
-	register unsigned long reg2 asm ("2");
+	} reg1;
+	unsigned long reg2;
 
 	reg1.value = apinfo->val;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(QACT) */
-		: "+d" (reg1), "=d" (reg2)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lgr	1,%[reg1]\n"   /* qact in info into gr1 */
+		"	.long	0xb2af0000\n"  /* PQAP(QACT) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* qact out info into reg2 */
+		: [reg1] "+&d" (reg1), [reg2] "=&d" (reg2)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1", "2");
 	apinfo->val = reg2;
 	return reg1.status;
 }
@@ -303,19 +322,24 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
 					     unsigned long long psmid,
 					     void *msg, size_t length)
 {
-	register unsigned long reg0 asm ("0") = qid | 0x40000000UL;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm ("2") = (unsigned long) msg;
-	register unsigned long reg3 asm ("3") = (unsigned long) length;
-	register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32);
-	register unsigned long reg5 asm ("5") = psmid & 0xffffffff;
+	unsigned long reg0 = qid | 0x40000000UL;  /* 0x4... is last msg part */
+	union register_pair nqap_r1, nqap_r2;
+	struct ap_queue_status reg1;
+
+	nqap_r1.even = (unsigned int)(psmid >> 32);
+	nqap_r1.odd  = psmid & 0xffffffff;
+	nqap_r2.even = (unsigned long)msg;
+	nqap_r2.odd  = (unsigned long)length;
 
 	asm volatile (
-		"0: .long 0xb2ad0042\n"		/* NQAP */
-		"   brc   2,0b"
-		: "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3)
-		: "d" (reg4), "d" (reg5)
-		: "cc", "memory");
+		"	lgr	0,%[reg0]\n"  /* qid param in gr0 */
+		"0:	.insn	rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n"
+		"	brc	2,0b\n"       /* handle partial completion */
+		"	lgr	%[reg1],1\n"  /* gr1 (status) into reg1 */
+		: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1),
+		  [nqap_r2] "+&d" (nqap_r2.pair)
+		: [nqap_r1] "d" (nqap_r1.pair)
+		: "cc", "memory", "0", "1");
 	return reg1;
 }
 
@@ -325,6 +349,8 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
  * @psmid: Pointer to program supplied message identifier
  * @msg: The message text
  * @length: The message length
+ * @reslength: Residual length on return
+ * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content
  *
  * Returns AP queue status structure.
  * Condition code 1 on DQAP means the receive has taken place
@@ -336,27 +362,65 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
  * Note that gpr2 is used by the DQAP instruction to keep track of
  * any 'residual' length, in case the instruction gets interrupted.
  * Hence it gets zeroed before the instruction.
+ * If the message does not fit into the buffer, this function will
+ * return with a truncated message and the reply in the firmware queue
+ * is not removed. This is indicated to the caller with an
+ * ap_queue_status response_code value of all bits on (0xFF) and (if
+ * the reslength ptr is given) the remaining length is stored in
+ * *reslength and (if the resgr0 ptr is given) the updated gr0 value
+ * for further processing of this msg entry is stored in *resgr0. The
+ * caller needs to detect this situation and should invoke ap_dqap
+ * with a valid resgr0 ptr and a value in there != 0 to indicate that
+ * *resgr0 is to be used instead of qid to further process this entry.
  */
 static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
 					     unsigned long long *psmid,
-					     void *msg, size_t length)
+					     void *msg, size_t length,
+					     size_t *reslength,
+					     unsigned long *resgr0)
 {
-	register unsigned long reg0 asm("0") = qid | 0x80000000UL;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm("2") = 0UL;
-	register unsigned long reg4 asm("4") = (unsigned long) msg;
-	register unsigned long reg5 asm("5") = (unsigned long) length;
-	register unsigned long reg6 asm("6") = 0UL;
-	register unsigned long reg7 asm("7") = 0UL;
+	unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL;
+	struct ap_queue_status reg1;
+	unsigned long reg2;
+	union register_pair rp1, rp2;
 
+	rp1.even = 0UL;
+	rp1.odd  = 0UL;
+	rp2.even = (unsigned long)msg;
+	rp2.odd  = (unsigned long)length;
 
 	asm volatile(
-		"0: .long 0xb2ae0064\n"		/* DQAP */
-		"   brc   6,0b\n"
-		: "+d" (reg0), "=d" (reg1), "+d" (reg2),
-		  "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7)
-		: : "cc", "memory");
-	*psmid = (((unsigned long long) reg6) << 32) + reg7;
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lghi	2,0\n"	       /* 0 into gr2 (res length) */
+		"0:	ltgr	%N[rp2],%N[rp2]\n" /* check buf len */
+		"	jz	2f\n"	       /* go out if buf len is 0 */
+		"1:	.insn	rre,0xb2ae0000,%[rp1],%[rp2]\n"
+		"	brc	6,0b\n"        /* handle partial complete */
+		"2:	lgr	%[reg0],0\n"   /* gr0 (qid + info) into reg0 */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* gr2 (res length) into reg2 */
+		: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2),
+		  [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair)
+		:
+		: "cc", "memory", "0", "1", "2");
+
+	if (reslength)
+		*reslength = reg2;
+	if (reg2 != 0 && rp2.odd == 0) {
+		/*
+		 * Partially complete, status in gr1 is not set.
+		 * Signal the caller that this dqap is only partially received
+		 * with a special status response code 0xFF and *resgr0 updated
+		 */
+		reg1.response_code = 0xFF;
+		if (resgr0)
+			*resgr0 = reg0;
+	} else {
+		*psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd;
+		if (resgr0)
+			*resgr0 = 0;
+	}
+
 	return reg1;
 }
 
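The resgr0 protocol documented above implies a specific retry pattern on the caller's side. A minimal caller sketch follows; dqap_example() and its buffer policy are illustrative assumptions, while ap_dqap() and struct ap_queue_status come from this patch:

/*
 * Sketch of an ap_dqap() caller honoring the resgr0 protocol.
 * dqap_example() and its buffer handling are assumptions for
 * illustration only.
 */
static struct ap_queue_status dqap_example(ap_qid_t qid,
					   unsigned long long *psmid,
					   void *buf, size_t buflen)
{
	unsigned long resgr0 = 0;	/* 0: first call, gr0 derived from qid */
	struct ap_queue_status status;
	size_t reslen = 0;

	status = ap_dqap(qid, psmid, buf, buflen, &reslen, &resgr0);
	if (status.response_code == 0xFF && resgr0) {
		/*
		 * Truncated reply: it remains in the firmware queue and
		 * reslen bytes are still outstanding. Retry with a buffer
		 * of at least reslen bytes and pass the saved resgr0
		 * (!= 0) so ap_dqap() resumes this entry instead of
		 * starting over from the qid.
		 */
	}
	return status;
}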
diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
index 4dcefddb7751..ca0e0e5ddbc4 100644
--- a/arch/s390/include/asm/cpu_mcf.h
+++ b/arch/s390/include/asm/cpu_mcf.h
@@ -32,39 +32,22 @@ static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
 	[CPUMF_CTR_SET_MT_DIAG] = 0x20,
 };
 
-static inline void ctr_set_enable(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-static inline void ctr_set_disable(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-static inline void ctr_set_start(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-static inline void ctr_set_stop(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-
-static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
 }
 
-static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
 }
 
-static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
 }
 
-static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
 }
@@ -92,8 +75,15 @@ struct cpu_cf_events {
 	struct cpumf_ctr_info	info;
 	atomic_t		ctr_set[CPUMF_CTR_SET_MAX];
 	atomic64_t		alert;
-	u64			state;
+	u64			state;		/* For perf_event_open SVC */
+	u64			dev_state;	/* For /dev/hwctr */
 	unsigned int		flags;
+	size_t used;			/* Bytes used in data */
+	size_t usedss;			/* Bytes used in start/stop */
+	unsigned char start[PAGE_SIZE];	/* Counter set at event add */
+	unsigned char stop[PAGE_SIZE];	/* Counter set at event delete */
+	unsigned char data[PAGE_SIZE];	/* Counter set at /dev/hwctr */
+	unsigned int sets;		/* # Counter set saved in memory */
 };
 DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
 
@@ -124,4 +114,6 @@ static inline int stccm_avail(void)
 
 size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
 			   struct cpumf_ctr_info *info);
+int cfset_online_cpu(unsigned int cpu);
+int cfset_offline_cpu(unsigned int cpu);
 #endif /* _ASM_S390_CPU_MCF_H */
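After the consolidation above, the ctr_set_* helpers take a mask of counter-set control bits rather than a single set index, so several sets can be manipulated in one call. A usage sketch (the local state variable is illustrative; lcctl() is assumed to be the existing control-load helper from this header):

/*
 * Sketch only: enable and start two counter sets with one call each
 * by OR-ing their control bits from cpumf_ctr_ctl[].
 */
static void demo_two_ctr_sets(void)
{
	u64 sets = cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
		   cpumf_ctr_ctl[CPUMF_CTR_SET_USER];
	u64 state = 0;

	ctr_set_enable(&state, sets);	/* enable bits for both sets */
	ctr_set_start(&state, sets);	/* activation bits for both sets */
	lcctl(state);			/* program the counter-set controls */
}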
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index ed5efbb531c4..adc0179fa34e 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -21,8 +21,6 @@
 #define CR0_INTERRUPT_KEY_SUBMASK	BIT(63 - 57)
 #define CR0_MEASUREMENT_ALERT_SUBMASK	BIT(63 - 58)
 
-#define CR2_GUARDED_STORAGE		BIT(63 - 59)
-
 #define CR14_UNUSED_32			BIT(63 - 32)
 #define CR14_UNUSED_33			BIT(63 - 33)
 #define CR14_CHANNEL_REPORT_SUBMASK	BIT(63 - 35)
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 66d51ad090ab..bd00c94620d3 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -144,10 +144,6 @@ typedef s390_compat_regs compat_elf_gregset_t;
 #include <linux/sched/mm.h>	/* for task_struct */
 #include <asm/mmu_context.h>
 
-#include <asm/vdso.h>
-
-extern unsigned int vdso_enabled;
-
 /*
  * This is used to ensure we don't load something for the wrong architecture.
  */
@@ -176,7 +172,7 @@ struct arch_elf_state {
 	    !current->mm->context.alloc_pgste) {		\
 		set_thread_flag(TIF_PGSTE);			\
 		set_pt_regs_flag(task_pt_regs(current),		\
-				 PIF_SYSCALL_RESTART);		\
+				 PIF_EXECVE_PGSTE_RESTART);	\
 		_state->rc = -EAGAIN;				\
 	}							\
 	_state->rc;						\
@@ -268,11 +264,10 @@ do {								\
 #define STACK_RND_MASK	MMAP_RND_MASK
 
 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO							    \
-do {									    \
-	if (vdso_enabled)						    \
-		NEW_AUX_ENT(AT_SYSINFO_EHDR,				    \
-			    (unsigned long)current->mm->context.vdso_base); \
+#define ARCH_DLINFO							\
+do {									\
+	NEW_AUX_ENT(AT_SYSINFO_EHDR,					\
+		    (unsigned long)current->mm->context.vdso_base);	\
 } while (0)
 
 struct linux_binprm;
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index baa8005090c3..17aead80aadb 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -14,7 +14,6 @@
 #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
 
 void do_per_trap(struct pt_regs *regs);
-void do_syscall(struct pt_regs *regs);
 
 #ifdef CONFIG_DEBUG_ENTRY
 static __always_inline void arch_check_user_regs(struct pt_regs *regs)
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index a0a7a2c72bd4..24e8fed150cf 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -5,7 +5,7 @@
 #include <asm/asm-const.h>
 #include <linux/stringify.h>
 
-#define __ALIGN .align 4, 0x07
+#define __ALIGN .align 16, 0x07
 #define __ALIGN_STR __stringify(__ALIGN)
 
 /*
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 20e51c9ff240..2db45d7e68aa 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -23,12 +23,16 @@
 #define MCCK_CODE_SYSTEM_DAMAGE		BIT(63)
 #define MCCK_CODE_EXT_DAMAGE		BIT(63 - 5)
 #define MCCK_CODE_CP			BIT(63 - 9)
-#define MCCK_CODE_CPU_TIMER_VALID	BIT(63 - 46)
+#define MCCK_CODE_STG_ERROR		BIT(63 - 16)
+#define MCCK_CODE_STG_KEY_ERROR		BIT(63 - 18)
+#define MCCK_CODE_STG_DEGRAD		BIT(63 - 19)
 #define MCCK_CODE_PSW_MWP_VALID		BIT(63 - 20)
 #define MCCK_CODE_PSW_IA_VALID		BIT(63 - 23)
+#define MCCK_CODE_STG_FAIL_ADDR		BIT(63 - 24)
 #define MCCK_CODE_CR_VALID		BIT(63 - 29)
 #define MCCK_CODE_GS_VALID		BIT(63 - 36)
 #define MCCK_CODE_FC_VALID		BIT(63 - 43)
+#define MCCK_CODE_CPU_TIMER_VALID	BIT(63 - 46)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index 23ff51be7e29..d9d5350cc3ec 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc)
 				  old, new) != old);
 }
 
-#define init_task_preempt_count(p)	do { } while (0)
-
-#define init_idle_preempt_count(p, cpu)	do { \
-	S390_lowcore.preempt_count = PREEMPT_DISABLED; \
-} while (0)
-
 static inline void set_preempt_need_resched(void)
 {
 	__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
@@ -88,12 +82,6 @@ static inline void preempt_count_set(int pc)
 	S390_lowcore.preempt_count = pc;
 }
 
-#define init_task_preempt_count(p)	do { } while (0)
-
-#define init_idle_preempt_count(p, cpu)	do { \
-	S390_lowcore.preempt_count = PREEMPT_DISABLED; \
-} while (0)
-
 static inline void set_preempt_need_resched(void)
 {
 }
@@ -130,6 +118,10 @@ static inline bool should_resched(int preempt_offset)
 
 #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
 
+#define init_task_preempt_count(p)	do { } while (0)
+/* Deferred to CPU bringup time */
+#define init_idle_preempt_count(p, cpu)	do { } while (0)
+
 #ifdef CONFIG_PREEMPTION
 extern void preempt_schedule(void);
 #define __preempt_schedule() preempt_schedule()
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index c7850d649373..61b22aa990e7 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -11,15 +11,15 @@
 #include <uapi/asm/ptrace.h>
 #include <asm/tpi.h>
 
-#define PIF_SYSCALL		0	/* inside a system call */
-#define PIF_SYSCALL_RESTART	1	/* restart the current system call */
-#define PIF_SYSCALL_RET_SET	2	/* return value was set via ptrace */
-#define PIF_GUEST_FAULT		3	/* indicates program check in sie64a */
+#define PIF_SYSCALL			0	/* inside a system call */
+#define PIF_EXECVE_PGSTE_RESTART	1	/* restart execve for PGSTE binaries */
+#define PIF_SYSCALL_RET_SET		2	/* return value was set via ptrace */
+#define PIF_GUEST_FAULT			3	/* indicates program check in sie64a */
 
-#define _PIF_SYSCALL		BIT(PIF_SYSCALL)
-#define _PIF_SYSCALL_RESTART	BIT(PIF_SYSCALL_RESTART)
-#define _PIF_SYSCALL_RET_SET	BIT(PIF_SYSCALL_RET_SET)
-#define _PIF_GUEST_FAULT	BIT(PIF_GUEST_FAULT)
+#define _PIF_SYSCALL			BIT(PIF_SYSCALL)
+#define _PIF_EXECVE_PGSTE_RESTART	BIT(PIF_EXECVE_PGSTE_RESTART)
+#define _PIF_SYSCALL_RET_SET		BIT(PIF_SYSCALL_RET_SET)
+#define _PIF_GUEST_FAULT		BIT(PIF_GUEST_FAULT)
 
 #ifndef __ASSEMBLY__
 
@@ -162,6 +162,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
 	return !!(regs->flags & (1UL << flag));
 }
 
+static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
+{
+	int ret = test_pt_regs_flag(regs, flag);
+
+	clear_pt_regs_flag(regs, flag);
+	return ret;
+}
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index a8b75da3c1b8..3a77aa96d092 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -159,6 +159,8 @@ static inline unsigned long kaslr_offset(void)
 	return __kaslr_offset;
 }
 
+extern int is_full_image;
+
 static inline u32 gen_lpswe(unsigned long addr)
 {
 	BUILD_BUG_ON(addr > 0xfff);
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
new file mode 100644
index 000000000000..fd17f25704bd
--- /dev/null
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_S390_SOFTIRQ_STACK_H
+#define __ASM_S390_SOFTIRQ_STACK_H
+
+#include <asm/lowcore.h>
+#include <asm/stacktrace.h>
+
+static inline void do_softirq_own_stack(void)
+{
+	call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
+}
+
+#endif /* __ASM_S390_SOFTIRQ_STACK_H */
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 76c6034428be..3d8a4b94c620 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -74,23 +74,6 @@ struct stack_frame {
 	((unsigned long)__builtin_frame_address(0) -			\
 	 offsetof(struct stack_frame, back_chain))
 
-#define CALL_ARGS_0()							\
-	register unsigned long r2 asm("2")
-#define CALL_ARGS_1(arg1)						\
-	register unsigned long r2 asm("2") = (unsigned long)(arg1)
-#define CALL_ARGS_2(arg1, arg2)						\
-	CALL_ARGS_1(arg1);						\
-	register unsigned long r3 asm("3") = (unsigned long)(arg2)
-#define CALL_ARGS_3(arg1, arg2, arg3)					\
-	CALL_ARGS_2(arg1, arg2);					\
-	register unsigned long r4 asm("4") = (unsigned long)(arg3)
-#define CALL_ARGS_4(arg1, arg2, arg3, arg4)				\
-	CALL_ARGS_3(arg1, arg2, arg3);					\
-	register unsigned long r4 asm("5") = (unsigned long)(arg4)
-#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5)			\
-	CALL_ARGS_4(arg1, arg2, arg3, arg4);				\
-	register unsigned long r4 asm("6") = (unsigned long)(arg5)
-
 /*
  * To keep this simple mark register 2-6 as being changed (volatile)
  * by the called function, even though register 6 is saved/nonvolatile.
@@ -109,34 +92,113 @@ struct stack_frame {
 #define CALL_CLOBBER_1 CALL_CLOBBER_2, "3"
 #define CALL_CLOBBER_0 CALL_CLOBBER_1
 
-#define CALL_ON_STACK(fn, stack, nr, args...)				\
+#define CALL_LARGS_0(...)						\
+	long dummy = 0
+#define CALL_LARGS_1(t1, a1)						\
+	long arg1  = (long)(t1)(a1)
+#define CALL_LARGS_2(t1, a1, t2, a2)					\
+	CALL_LARGS_1(t1, a1);						\
+	long arg2 = (long)(t2)(a2)
+#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3)				\
+	CALL_LARGS_2(t1, a1, t2, a2);					\
+	long arg3 = (long)(t3)(a3)
+#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4)			\
+	CALL_LARGS_3(t1, a1, t2, a2, t3, a3);				\
+	long arg4  = (long)(t4)(a4)
+#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5)		\
+	CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4);			\
+	long arg5 = (long)(t5)(a5)
+
+#define CALL_REGS_0							\
+	register long r2 asm("2") = dummy
+#define CALL_REGS_1							\
+	register long r2 asm("2") = arg1
+#define CALL_REGS_2							\
+	CALL_REGS_1;							\
+	register long r3 asm("3") = arg2
+#define CALL_REGS_3							\
+	CALL_REGS_2;							\
+	register long r4 asm("4") = arg3
+#define CALL_REGS_4							\
+	CALL_REGS_3;							\
+	register long r5 asm("5") = arg4
+#define CALL_REGS_5							\
+	CALL_REGS_4;							\
+	register long r6 asm("6") = arg5
+
+#define CALL_TYPECHECK_0(...)
+#define CALL_TYPECHECK_1(t, a, ...)					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_2(t, a, ...)					\
+	CALL_TYPECHECK_1(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_3(t, a, ...)					\
+	CALL_TYPECHECK_2(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_4(t, a, ...)					\
+	CALL_TYPECHECK_3(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_5(t, a, ...)					\
+	CALL_TYPECHECK_4(__VA_ARGS__);					\
+	typecheck(t, a)
+
+#define CALL_PARM_0(...) void
+#define CALL_PARM_1(t, a, ...) t
+#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__)
+#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__)
+#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__)
+#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__)
+#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__)
+
+/*
+ * Use call_on_stack() to call a function switching to a specified
+ * stack. Proper sign and zero extension of function arguments is
+ * done. Usage:
+ *
+ * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...)
+ *
+ * - nr specifies the number of function arguments of fn.
+ * - stack specifies the stack to be used.
+ * - fn is the function to be called.
+ * - rettype is the return type of fn.
+ * - t1, a1, ... are pairs, where t1 must match the type of the first
+ *   argument of fn, t2 the second, etc. a1 is the corresponding
+ *   first function argument (not name), etc.
+ */
+#define call_on_stack(nr, stack, rettype, fn, ...)			\
 ({									\
+	rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn;		\
 	unsigned long frame = current_frame_address();			\
-	CALL_ARGS_##nr(args);						\
+	unsigned long __stack = stack;					\
 	unsigned long prev;						\
+	CALL_LARGS_##nr(__VA_ARGS__);					\
+	CALL_REGS_##nr;							\
 									\
+	CALL_TYPECHECK_##nr(__VA_ARGS__);				\
 	asm volatile(							\
-		"	la	%[_prev],0(15)\n"			\
+		"	lgr	%[_prev],15\n"				\
 		"	lg	15,%[_stack]\n"				\
 		"	stg	%[_frame],%[_bc](15)\n"			\
 		"	brasl	14,%[_fn]\n"				\
-		"	la	15,0(%[_prev])\n"			\
-		: [_prev] "=&a" (prev), CALL_FMT_##nr			\
-		: [_stack] "R" (stack),					\
+		"	lgr	15,%[_prev]\n"				\
+		: [_prev] "=&d" (prev), CALL_FMT_##nr			\
+		: [_stack] "R" (__stack),				\
 		  [_bc] "i" (offsetof(struct stack_frame, back_chain)),	\
 		  [_frame] "d" (frame),					\
-		  [_fn] "X" (fn) : CALL_CLOBBER_##nr);			\
-	r2;								\
+		  [_fn] "X" (__fn) : CALL_CLOBBER_##nr);		\
+	(rettype)r2;							\
 })
 
-#define CALL_ON_STACK_NORETURN(fn, stack)				\
+#define call_on_stack_noreturn(fn, stack)				\
 ({									\
+	void (*__fn)(void) = fn;					\
+									\
 	asm volatile(							\
 		"	la	15,0(%[_stack])\n"			\
 		"	xc	%[_bc](8,15),%[_bc](15)\n"		\
 		"	brasl	14,%[_fn]\n"				\
 		::[_bc] "i" (offsetof(struct stack_frame, back_chain)),	\
-		  [_stack] "a" (stack), [_fn] "X" (fn));		\
+		  [_stack] "a" (stack), [_fn] "X" (__fn));		\
 	BUG();								\
 })
 
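The typed (type, argument) pairs make call_on_stack() self-checking, as the documentation above describes. A usage sketch (sum() is made up; the real call sites introduced by this commit appear in the irq.c and machine_kexec.c hunks below):

/*
 * call_on_stack() usage sketch: run a made-up two-argument function
 * on the nodat stack, following the documented pair syntax.
 */
static long sum(long a, long b)
{
	return a + b;
}

static long sum_on_nodat_stack(long a, long b)
{
	return call_on_stack(2, S390_lowcore.nodat_stack, long, sum,
			     long, a, long, b);
}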
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 7b98d4caee77..12c5f006c136 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -73,6 +73,10 @@ enum uv_cmds_inst {
 	BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
 };
 
+enum uv_feat_ind {
+	BIT_UV_FEAT_MISC = 0,
+};
+
 struct uv_cb_header {
 	u16 len;
 	u16 cmd;	/* Command Code */
@@ -97,7 +101,8 @@ struct uv_cb_qui {
 	u64 max_guest_stor_addr;
 	u8  reserved88[158 - 136];
 	u16 max_guest_cpu_id;
-	u8  reserveda0[200 - 160];
+	u64 uv_feature_indications;
+	u8  reserveda0[200 - 168];
 } __packed __aligned(8);
 
 /* Initialize Ultravisor */
@@ -274,6 +279,7 @@ struct uv_info {
 	unsigned long max_sec_stor_addr;
 	unsigned int max_num_sec_conf;
 	unsigned short max_guest_cpu_id;
+	unsigned long uv_feature_indications;
 };
 
 extern struct uv_info uv_info;
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index b45e3dddd2c2..53165aa7813a 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -4,18 +4,31 @@
 
 #include <vdso/datapage.h>
 
-/* Default link address for the vDSO */
-#define VDSO64_LBASE	0
+#ifndef __ASSEMBLY__
 
-#define __VVAR_PAGES	2
+#include <generated/vdso64-offsets.h>
+#ifdef CONFIG_COMPAT
+#include <generated/vdso32-offsets.h>
+#endif
 
-#define VDSO_VERSION_STRING	LINUX_2.6.29
-
-#ifndef __ASSEMBLY__
+#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
+#ifdef CONFIG_COMPAT
+#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
+#else
+#define VDSO32_SYMBOL(tsk, name) (-1UL)
+#endif
 
 extern struct vdso_data *vdso_data;
 
 int vdso_getcpu_init(void);
 
 #endif /* __ASSEMBLY__ */
+
+/* Default link address for the vDSO */
+#define VDSO_LBASE	0
+
+#define __VVAR_PAGES	2
+
+#define VDSO_VERSION_STRING	LINUX_2.6.29
+
 #endif /* __S390_VDSO_H__ */
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
index 383c53c3dddd..d6465b22ffe3 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -8,7 +8,6 @@
 
 #include <asm/timex.h>
 #include <asm/unistd.h>
-#include <asm/vdso.h>
 #include <linux/compiler.h>
 
 #define vdso_calc_delta __arch_vdso_calc_delta
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 68ca1834316f..4a44ba5a2d73 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -71,10 +71,10 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)	+= ima_arch.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf_common.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_diag.o
 
 obj-$(CONFIG_TRACEPOINTS)	+= trace.o
 obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 
 # vdso
 obj-y				+= vdso64/
+obj-$(CONFIG_COMPAT)		+= vdso32/
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index f53605a3dfcd..77ff2130cb04 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -14,8 +14,6 @@
 #include <linux/pgtable.h>
 #include <asm/idle.h>
 #include <asm/gmap.h>
-#include <asm/nmi.h>
-#include <asm/setup.h>
 #include <asm/stacktrace.h>
 
 int main(void)
@@ -108,7 +106,6 @@ int main(void)
 	OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
 	OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
 	OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
-	OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
 	OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
 	OFFSET(__LC_CURRENT, lowcore, current_task);
 	OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
@@ -145,9 +142,6 @@ int main(void)
 	OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
 	OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
 	BLANK();
-	/* extended machine check save area */
-	OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
-	BLANK();
 	/* gmap/sie offsets */
 	OFFSET(__GMAP_ASCE, gmap, asce);
 	OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 1d0e17ec93eb..cca142fbb516 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -28,6 +28,7 @@
 #include <linux/uaccess.h>
 #include <asm/lowcore.h>
 #include <asm/switch_to.h>
+#include <asm/vdso.h>
 #include "compat_linux.h"
 #include "compat_ptrace.h"
 #include "entry.h"
@@ -118,7 +119,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
 	fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu);
 
 	clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
-	clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
 	return 0;
 }
 
@@ -304,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
 		restorer = (unsigned long __force)
 			ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
 	} else {
-		/* Signal frames without vectors registers are short ! */
-		__u16 __user *svc = (void __user *) frame + frame_size - 2;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+		restorer = VDSO32_SYMBOL(current, sigreturn);
         }
 
 	/* Set up registers for signal handler */
@@ -371,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
 		restorer = (unsigned long __force)
 			ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
 	} else {
-		__u16 __user *svc = &frame->svc_insn;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+		restorer = VDSO32_SYMBOL(current, rt_sigreturn);
 	}
 
 	/* Create siginfo on the signal stack */
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index c2cf79d353cf..fb84e3fc1686 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -33,6 +33,8 @@
 #include <asm/switch_to.h>
 #include "entry.h"
 
+int __bootdata(is_full_image);
+
 static void __init reset_tod_clock(void)
 {
 	union tod_clock clk;
@@ -279,7 +281,7 @@ static void __init setup_boot_command_line(void)
 
 static void __init check_image_bootable(void)
 {
-	if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
+	if (is_full_image)
 		return;
 
 	sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 3e8c6669373a..5a2f70cbd3a9 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -14,7 +14,6 @@
 #include <asm/alternative-asm.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
-#include <asm/ctl_reg.h>
 #include <asm/dwarf.h>
 #include <asm/errno.h>
 #include <asm/ptrace.h>
@@ -129,6 +128,24 @@ _LPP_OFFSET	= __LC_LPP
 		    "jnz .+8; .long 0xb2e8d000", 82
 	.endm
 
+	/*
+	 * The CHKSTG macro jumps to the provided label in case the
+	 * machine check interruption code reports one of unrecoverable
+	 * storage errors:
+	 * - Storage error uncorrected
+	 * - Storage key error uncorrected
+	 * - Storage degradation with Failing-storage-address validity
+	 */
+	.macro CHKSTG errlabel
+	TSTMSK	__LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
+	jnz	\errlabel
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
+	jz	oklabel\@
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
+	jnz	\errlabel
+oklabel\@:
+	.endm
+
 #if IS_ENABLED(CONFIG_KVM)
 	/*
 	 * The OUTSIDE macro jumps to the provided label in case the value
@@ -148,6 +165,13 @@ _LPP_OFFSET	= __LC_LPP
 	clgr	%r14,%r13
 	jhe	\outside_label
 	.endm
+
+	.macro SIEEXIT
+	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
+	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
+	larl	%r9,sie_exit			# skip forward to sie_exit
+	.endm
 #endif
 
 	GEN_BR_THUNK %r14
@@ -235,7 +259,6 @@ ENTRY(sie64a)
 # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
 # Other instructions between sie64a and .Lsie_done should not cause program
 # interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
 .Lrewind_pad6:
 	nopr	7
 .Lrewind_pad4:
@@ -341,10 +364,7 @@ ENTRY(pgm_check_handler)
 #if IS_ENABLED(CONFIG_KVM)
 	# cleanup critical section for program checks in sie64a
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,1f
-	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	larl	%r9,sie_exit			# skip forward to sie_exit
+	SIEEXIT
 	lghi	%r10,_PIF_GUEST_FAULT
 #endif
 1:	tmhh	%r8,0x4000		# PER bit set in old PSW ?
@@ -410,7 +430,8 @@ ENTRY(\name)
 	jnz	1f
 #if IS_ENABLED(CONFIG_KVM)
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,0f
-	brasl	%r14,.Lcleanup_sie
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 #endif
 0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
@@ -484,8 +505,6 @@ ENTRY(mcck_int_handler)
 	BPOFF
 	la	%r1,4095		# validate r1
 	spt	__LC_CPU_TIMER_SAVE_AREA-4095(%r1)	# validate cpu timer
-	sckc	__LC_CLOCK_COMPARATOR			# validate comparator
-	lam	%a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
 	lmg	%r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
 	lg	%r12,__LC_CURRENT
 	lmg	%r8,%r9,__LC_MCK_OLD_PSW
@@ -496,41 +515,7 @@ ENTRY(mcck_int_handler)
 	la	%r14,4095
 	lctlg	%c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
 	ptlb
-	lg	%r11,__LC_MCESAD-4095(%r14) # extended machine check save area
-	nill	%r11,0xfc00		# MCESA_ORIGIN_MASK
-	TSTMSK	__LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
-	jno	0f
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_GS_VALID
-	jno	0f
-	.insn	 rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
-0:	l	%r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_FC_VALID
-	jo	0f
-	sr	%r14,%r14
-0:	sfpc	%r14
-	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
-	jo	0f
-	lghi	%r14,__LC_FPREGS_SAVE_AREA
-	ld	%f0,0(%r14)
-	ld	%f1,8(%r14)
-	ld	%f2,16(%r14)
-	ld	%f3,24(%r14)
-	ld	%f4,32(%r14)
-	ld	%f5,40(%r14)
-	ld	%f6,48(%r14)
-	ld	%f7,56(%r14)
-	ld	%f8,64(%r14)
-	ld	%f9,72(%r14)
-	ld	%f10,80(%r14)
-	ld	%f11,88(%r14)
-	ld	%f12,96(%r14)
-	ld	%f13,104(%r14)
-	ld	%f14,112(%r14)
-	ld	%f15,120(%r14)
-	j	1f
-0:	VLM	%v0,%v15,0,%r11
-	VLM	%v16,%v31,256,%r11
-1:	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
+	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
 	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
 	jo	3f
@@ -546,24 +531,29 @@ ENTRY(mcck_int_handler)
 3:	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
 	jno	.Lmcck_panic
 	tmhh	%r8,0x0001		# interrupting from user ?
-	jnz	4f
+	jnz	6f
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
 	jno	.Lmcck_panic
-4:	ssm	__LC_PGM_NEW_PSW	# turn dat on, keep irqs off
-	tmhh	%r8,0x0001			# interrupting from user ?
-	jnz	.Lmcck_user
 #if IS_ENABLED(CONFIG_KVM)
-	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
-	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,5f
+	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,6f
+	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,4f
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-5:	brasl	%r14,.Lcleanup_sie
-#endif
+	j	5f
+4:	CHKSTG	.Lmcck_panic
+5:	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 	j	.Lmcck_stack
-.Lmcck_user:
+#endif
+6:	CHKSTG	.Lmcck_panic
+	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	tmhh	%r8,0x0001		# interrupting from user ?
+	jz	.Lmcck_stack
 	BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
 .Lmcck_stack:
 	lg	%r15,__LC_MCCK_STACK
-.Lmcck_skip:
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stctg	%c1,%c1,__PT_CR1(%r11)
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
@@ -605,8 +595,33 @@ ENTRY(mcck_int_handler)
 	b	__LC_RETURN_MCCK_LPSWE
 
 .Lmcck_panic:
-	lg	%r15,__LC_NODAT_STACK
-	j	.Lmcck_skip
+	/*
+	 * Iterate over all possible CPU addresses in the range 0..0xffff
+	 * and stop each CPU using signal processor. Use compare and swap
+	 * to allow just one CPU-stopper and prevent concurrent CPUs from
+	 * stopping each other while leaving the others running.
+	 */
+	lhi	%r5,0
+	lhi	%r6,1
+	larl	%r7,.Lstop_lock
+	cs	%r5,%r6,0(%r7)		# single CPU-stopper only
+	jnz	4f
+	larl	%r7,.Lthis_cpu
+	stap	0(%r7)			# this CPU address
+	lh	%r4,0(%r7)
+	nilh	%r4,0
+	lhi	%r0,1
+	sll	%r0,16			# CPU counter
+	lhi	%r3,0			# next CPU address
+0:	cr	%r3,%r4
+	je	2f
+1:	sigp	%r1,%r3,SIGP_STOP	# stop next CPU
+	brc	SIGP_CC_BUSY,1b
+2:	ahi	%r3,1
+	brct	%r0,0b
+3:	sigp	%r1,%r4,SIGP_STOP	# stop this CPU
+	brc	SIGP_CC_BUSY,3b
+4:	j	4b
 ENDPROC(mcck_int_handler)
 
 #
@@ -657,15 +672,11 @@ ENTRY(stack_overflow)
 ENDPROC(stack_overflow)
 #endif
 
-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_sie:
-	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
-	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	larl	%r9,sie_exit			# skip forward to sie_exit
-	BR_EX	%r14,%r13
-#endif
+	.section .data, "aw"
+		.align	4
+.Lstop_lock:	.long	0
+.Lthis_cpu:	.short	0
+.Lstosm_tmp:	.byte	0
 	.section .rodata, "a"
 #define SYSCALL(esame,emu)	.quad __s390x_ ## esame
 	.globl	sys_call_table
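The new .Lmcck_panic sequence is easier to follow as C. In the sketch below, sigp_stop() is a hypothetical stand-in for issuing the SIGP stop order and returning its condition code; stop_lock and the spinning mirror the .data words and branches added above:

/*
 * Illustrative C rendering of the .Lmcck_panic stop-all-CPUs protocol.
 * sigp_stop() is hypothetical; SIGP_CC_BUSY is the real condition code
 * tested by the assembly.
 */
static int stop_lock;

static void mcck_panic_sketch(unsigned short this_cpu_addr)
{
	unsigned int addr;

	/* compare and swap: exactly one CPU becomes the stopper */
	if (!__sync_bool_compare_and_swap(&stop_lock, 0, 1))
		for (;;)
			;			/* lost the race: spin */
	for (addr = 0; addr <= 0xffff; addr++) {
		if (addr == this_cpu_addr)
			continue;		/* stop this CPU last */
		while (sigp_stop(addr) == SIGP_CC_BUSY)
			;			/* retry while busy */
	}
	while (sigp_stop(this_cpu_addr) == SIGP_CC_BUSY)
		;				/* finally stop ourselves */
	for (;;)
		;
}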
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index c0df4060d28d..234d085257eb 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -110,15 +110,17 @@ static int on_async_stack(void)
 {
 	unsigned long frame = current_frame_address();
 
-	return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER));
+	return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
 }
 
 static void do_irq_async(struct pt_regs *regs, int irq)
 {
-	if (on_async_stack())
+	if (on_async_stack()) {
 		do_IRQ(regs, irq);
-	else
-		CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq);
+	} else {
+		call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+			      struct pt_regs *, regs, int, irq);
+	}
 }
 
 static int irq_pending(struct pt_regs *regs)
@@ -265,24 +267,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
 	return from < NR_IRQS_BASE ? NR_IRQS_BASE : from;
 }
 
-/*
- * Switch to the asynchronous interrupt stack for softirq execution.
- */
-void do_softirq_own_stack(void)
-{
-	unsigned long old, new;
-
-	old = current_stack_pointer();
-	/* Check against async. stack address range. */
-	new = S390_lowcore.async_stack;
-	if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
-		CALL_ON_STACK(__do_softirq, new, 0);
-	} else {
-		/* We are already on the async stack. */
-		__do_softirq();
-	}
-}
-
 /*
  * ext_int_hash[index] is the list head for all external interrupts that hash
  * to this index.
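The rewritten on_async_stack() relies on stacks being THREAD_SIZE aligned: two addresses lie on the same stack exactly when they agree in every bit above the in-stack offset, which is what the XOR-and-mask expression tests. A self-contained illustration (DEMO_THREAD_SIZE is an assumed power-of-two size; the kernel uses THREAD_SIZE):

/*
 * Stand-alone demo of the XOR/mask stack test used by the new
 * on_async_stack() above. DEMO_THREAD_SIZE is an assumption.
 */
#define DEMO_THREAD_SIZE (16UL * 4096)

static int same_stack(unsigned long a, unsigned long b)
{
	return ((a ^ b) & ~(DEMO_THREAD_SIZE - 1)) == 0;
}

/*
 * same_stack(0x10000, 0x1ffff) == 1	(same 64 KiB region)
 * same_stack(0x10000, 0x20000) == 0	(different regions)
 */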
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 528bb31815c3..52d056a5f89f 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -92,11 +92,6 @@ static void copy_instruction(struct kprobe *p)
 }
 NOKPROBE_SYMBOL(copy_instruction);
 
-static inline int is_kernel_addr(void *addr)
-{
-	return addr < (void *)_end;
-}
-
 static int s390_get_insn_slot(struct kprobe *p)
 {
 	/*
@@ -105,7 +100,7 @@ static int s390_get_insn_slot(struct kprobe *p)
 	 * field can be patched and executed within the insn slot.
 	 */
 	p->ainsn.insn = NULL;
-	if (is_kernel_addr(p->addr))
+	if (is_kernel((unsigned long)p->addr))
 		p->ainsn.insn = get_s390_insn_slot();
 	else if (is_module_addr(p->addr))
 		p->ainsn.insn = get_insn_slot();
@@ -117,7 +112,7 @@ static void s390_free_insn_slot(struct kprobe *p)
 {
 	if (!p->ainsn.insn)
 		return;
-	if (is_kernel_addr(p->addr))
+	if (is_kernel((unsigned long)p->addr))
 		free_s390_insn_slot(p->ainsn.insn, 0);
 	else
 		free_insn_slot(p->ainsn.insn, 0);
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index d91989c7bd6a..1005a6935fbe 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -132,7 +132,8 @@ static bool kdump_csum_valid(struct kimage *image)
 	int rc;
 
 	preempt_disable();
-	rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
+	rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump,
+			   unsigned long, (unsigned long)image);
 	preempt_enable();
 	return rc == 0;
 #else
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 11f8c296f60d..20f8e1868853 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -189,12 +189,16 @@ void noinstr s390_handle_mcck(void)
  * returns 0 if all required registers are available
  * returns 1 otherwise
  */
-static int notrace s390_check_registers(union mci mci, int umode)
+static int notrace s390_validate_registers(union mci mci, int umode)
 {
+	struct mcesa *mcesa;
+	void *fpt_save_area;
 	union ctlreg2 cr2;
 	int kill_task;
+	u64 zero;
 
 	kill_task = 0;
+	zero = 0;
 
 	if (!mci.gr) {
 		/*
@@ -205,14 +209,6 @@ static int notrace s390_check_registers(union mci mci, int umode)
 			s390_handle_damage();
 		kill_task = 1;
 	}
-	/* Check control registers */
-	if (!mci.cr) {
-		/*
-		 * Control registers have unknown contents.
-		 * Can't recover and therefore stopping machine.
-		 */
-		s390_handle_damage();
-	}
 	if (!mci.fp) {
 		/*
 		 * Floating point registers can't be restored. If the
@@ -225,35 +221,89 @@ static int notrace s390_check_registers(union mci mci, int umode)
 		if (!test_cpu_flag(CIF_FPU))
 			kill_task = 1;
 	}
+	fpt_save_area = &S390_lowcore.floating_pt_save_area;
 	if (!mci.fc) {
 		/*
 		 * Floating point control register can't be restored.
 		 * If the kernel currently uses the floating pointer
 		 * registers and needs the FPC register the system is
 		 * stopped. If the process has its floating pointer
-		 * registers loaded it is terminated.
+		 * registers loaded it is terminated. Otherwise the
+		 * FPC is just validated.
 		 */
 		if (S390_lowcore.fpu_flags & KERNEL_FPC)
 			s390_handle_damage();
+		asm volatile(
+			"	lfpc	%0\n"
+			:
+			: "Q" (zero));
 		if (!test_cpu_flag(CIF_FPU))
 			kill_task = 1;
+	} else {
+		asm volatile(
+			"	lfpc	%0\n"
+			:
+			: "Q" (S390_lowcore.fpt_creg_save_area));
 	}
 
-	if (MACHINE_HAS_VX) {
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+	if (!MACHINE_HAS_VX) {
+		/* Validate floating point registers */
+		asm volatile(
+			"	ld	0,0(%0)\n"
+			"	ld	1,8(%0)\n"
+			"	ld	2,16(%0)\n"
+			"	ld	3,24(%0)\n"
+			"	ld	4,32(%0)\n"
+			"	ld	5,40(%0)\n"
+			"	ld	6,48(%0)\n"
+			"	ld	7,56(%0)\n"
+			"	ld	8,64(%0)\n"
+			"	ld	9,72(%0)\n"
+			"	ld	10,80(%0)\n"
+			"	ld	11,88(%0)\n"
+			"	ld	12,96(%0)\n"
+			"	ld	13,104(%0)\n"
+			"	ld	14,112(%0)\n"
+			"	ld	15,120(%0)\n"
+			:
+			: "a" (fpt_save_area)
+			: "memory");
+	} else {
+		/* Validate vector registers */
+		union ctlreg0 cr0;
+
 		if (!mci.vr) {
 			/*
 			 * Vector registers can't be restored. If the kernel
 			 * currently uses vector registers the system is
 			 * stopped. If the process has its vector registers
-			 * loaded it is terminated.
+			 * loaded it is terminated. Otherwise just validate
+			 * the registers.
 			 */
 			if (S390_lowcore.fpu_flags & KERNEL_VXR)
 				s390_handle_damage();
 			if (!test_cpu_flag(CIF_FPU))
 				kill_task = 1;
 		}
+		cr0.val = S390_lowcore.cregs_save_area[0];
+		cr0.afp = cr0.vx = 1;
+		__ctl_load(cr0.val, 0, 0);
+		asm volatile(
+			"	la	1,%0\n"
+			"	.word	0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+			"	.word	0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+			:
+			: "Q" (*(struct vx_array *)mcesa->vector_save_area)
+			: "1");
+		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 	}
-	/* Check if access registers are valid */
+	/* Validate access registers */
+	asm volatile(
+		"	lam	0,15,0(%0)\n"
+		:
+		: "a" (&S390_lowcore.access_regs_save_area)
+		: "memory");
 	if (!mci.ar) {
 		/*
 		 * Access registers have unknown contents.
@@ -261,7 +311,7 @@ static int notrace s390_check_registers(union mci mci, int umode)
 		 */
 		kill_task = 1;
 	}
-	/* Check guarded storage registers */
+	/* Validate guarded storage registers */
 	cr2.val = S390_lowcore.cregs_save_area[2];
 	if (cr2.gse) {
 		if (!mci.gs) {
@@ -271,31 +321,26 @@ static int notrace s390_check_registers(union mci mci, int umode)
 			 * It has to be terminated.
 			 */
 			kill_task = 1;
+		} else {
+			load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
 		}
 	}
-	/* Check if old PSW is valid */
-	if (!mci.wp) {
-		/*
-		 * Can't tell if we come from user or kernel mode
-		 * -> stopping machine.
-		 */
-		s390_handle_damage();
-	}
-	/* Check for invalid kernel instruction address */
-	if (!mci.ia && !umode) {
-		/*
-		 * The instruction address got lost while running
-		 * in the kernel -> stopping machine.
-		 */
-		s390_handle_damage();
-	}
+	/*
+	 * The getcpu vdso syscall reads CPU number from the programmable
+	 * field of the TOD clock. Disregard the TOD programmable register
+	 * validity bit and load the CPU number into the TOD programmable
+	 * field unconditionally.
+	 */
+	set_tod_programmable_field(raw_smp_processor_id());
+	/* Validate clock comparator register */
+	set_clock_comparator(S390_lowcore.clock_comparator);
 
 	if (!mci.ms || !mci.pm || !mci.ia)
 		kill_task = 1;
 
 	return kill_task;
 }
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);
 
 /*
  * Backup the guest's machine check info to its description block
@@ -353,11 +398,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 	mci.val = S390_lowcore.mcck_interruption_code;
 	mcck = this_cpu_ptr(&cpu_mcck);
 
-	if (mci.sd) {
-		/* System damage -> stopping machine */
-		s390_handle_damage();
-	}
-
 	/*
 	 * Reinject the instruction processing damages' machine checks
 	 * including Delayed Access Exception into the guest
@@ -398,7 +438,7 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 			s390_handle_damage();
 		}
 	}
-	if (s390_check_registers(mci, user_mode(regs))) {
+	if (s390_validate_registers(mci, user_mode(regs))) {
 		/*
 		 * Couldn't restore all register contents for the
 		 * user space process -> mark task for termination.
@@ -428,21 +468,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
 		mcck_pending = 1;
 	}
 
-	/*
-	 * Reinject storage related machine checks into the guest if they
-	 * happen when the guest is running.
-	 */
-	if (!test_cpu_flag(CIF_MCCK_GUEST)) {
-		if (mci.se)
-			/* Storage error uncorrected */
-			s390_handle_damage();
-		if (mci.ke)
-			/* Storage key-error uncorrected */
-			s390_handle_damage();
-		if (mci.ds && mci.fa)
-			/* Storage degradation */
-			s390_handle_damage();
-	}
 	if (mci.cp) {
 		/* Channel report word pending */
 		mcck->channel_report = 1;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1b7a0525fbed..975a00c8c564 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Facility
  *
- *  Copyright IBM Corp. 2012, 2019
+ *  Copyright IBM Corp. 2012, 2021
  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ *	       Thomas Richter <tmricht@linux.ibm.com>
  */
 #define KMSG_COMPONENT	"cpum_cf"
 #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
@@ -14,7 +15,223 @@
 #include <linux/notifier.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/miscdevice.h>
+
 #include <asm/cpu_mcf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+static unsigned int cfdiag_cpu_speed;	/* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+						/* interval in seconds */
+
+/* Counter sets are stored as data stream in a page sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a one byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a three byte reserved value (must be zero) to make the header the same
+ *   size as a counter value.
+ * All counter values are eight bytes in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference of both counter sets are calculated and stored in the event
+ * sample data area.
+ */
+struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
+	unsigned int def:16;	/* 0-15  Data Entry Format */
+	unsigned int set:16;	/* 16-31 Counter set identifier */
+	unsigned int ctr:16;	/* 32-47 Number of stored counters */
+	unsigned int res1:16;	/* 48-63 Reserved */
+};
+
+struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
+	/* 0 - 7 */
+	union {
+		struct {
+			unsigned int clock_base:1;	/* TOD clock base set */
+			unsigned int speed:1;		/* CPU speed set */
+			/* Measurement alerts */
+			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
+			unsigned int caca:1;	/* Counter auth. change alert */
+			unsigned int lcda:1;	/* Loss of counter data alert */
+		};
+		unsigned long flags;	/* 0-63    All indicators */
+	};
+	/* 8 - 15 */
+	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
+	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
+	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
+	/* 16 - 23 */
+	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
+	/* 24 - 55 */
+	union {
+		struct {
+			unsigned long progusage1;
+			unsigned long progusage2;
+			unsigned long progusage3;
+			unsigned long tod_base;
+		};
+		unsigned long progusage[4];
+	};
+	/* 56 - 63 */
+	unsigned int mach_type:16;		/* Machine type */
+	unsigned int res1:16;			/* Reserved */
+	unsigned int res2:32;			/* Reserved */
+};
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cpuid cpuid;
+
+	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
+	te->csvn = cpuhw->info.csvn;
+
+	get_cpu_id(&cpuid);			/* Machine type */
+	te->mach_type = cpuid.machine;
+	te->cpu_speed = cfdiag_cpu_speed;
+	if (te->cpu_speed)
+		te->speed = 1;
+	te->clock_base = 1;			/* Save clock base */
+	te->tod_base = tod_clock_base.tod;
+	te->timestamp = get_tod_clock_fast();
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version numbers determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with a header containing the counter set number and
+ * the number of eight-byte counters.
+ *
+ * The function returns the number of bytes occupied by this counter set
+ * including the header.
+ * If there is no counter in the counter set, this counter set is useless and
+ * zero is returned in this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on the error_ok value this is
+ * ok, for example when called from the cpumf_pmu_start() callback function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+			       size_t room, bool error_ok)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t ctrset_size, need = 0;
+	int rc = 3;				/* Assume write failure */
+
+	ctrdata->def = CF_DIAG_CTRSET_DEF;
+	ctrdata->set = ctrset;
+	ctrdata->res1 = 0;
+	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
+
+	if (ctrset_size) {			/* Save data */
+		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+		if (need <= room) {
+			rc = ctr_stcctm(ctrset, ctrset_size,
+					(u64 *)(ctrdata + 1));
+		}
+		if (rc != 3 || error_ok)
+			ctrdata->ctr = ctrset_size;
+		else
+			need = 0;
+	}
+
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
+			    " need %zd rc %d\n", __func__, ctrset, ctrset_size,
+			    cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
+	return need;
+}
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 bytes hold an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+			    bool error_ok)
+{
+	struct cf_trailer_entry *trailer;
+	size_t offset = 0, done;
+	int i;
+
+	memset(data, 0, sz);
+	sz -= sizeof(*trailer);		/* Always room for trailer */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		struct cf_ctrset_entry *ctrdata = data + offset;
+
+		if (!(auth & cpumf_ctr_ctl[i]))
+			continue;	/* Counter set not authorized */
+
+		done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
+		offset += done;
+	}
+	trailer = data + offset;
+	cfdiag_trailer(trailer);
+	return offset + sizeof(*trailer);
+}
+
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+	for (; --counters >= 0; ++pstart, ++pstop)
+		if (*pstop >= *pstart)
+			*pstop -= *pstart;
+		else
+			*pstop = *pstart - *pstop + 1;
+}
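
As a quick worked example of the delta computation, the loop above can be rendered stand-alone (an editorial sketch, not part of the commit; the wrapped-counter branch is kept verbatim from the kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-alone rendering of cfdiag_diffctrset(); illustration only. */
	static void diffctrset(uint64_t *pstart, uint64_t *pstop, int counters)
	{
		for (; --counters >= 0; ++pstart, ++pstop)
			if (*pstop >= *pstart)
				*pstop -= *pstart;	/* Normal case: stop - start */
			else
				*pstop = *pstart - *pstop + 1;	/* Counter wrapped */
	}

	int main(void)
	{
		uint64_t start[3] = { 100, 4000, 7 };	/* Values at schedule-in */
		uint64_t stop[3]  = { 250, 4096, 7 };	/* Values at schedule-out */

		diffctrset(start, stop, 3);
		printf("%llu %llu %llu\n",	/* Prints the deltas: 150 96 0 */
		       (unsigned long long)stop[0],
		       (unsigned long long)stop[1],
		       (unsigned long long)stop[2]);
		return 0;
	}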
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+	struct cf_trailer_entry *trailer_start, *trailer_stop;
+	struct cf_ctrset_entry *ctrstart, *ctrstop;
+	size_t offset = 0;
+
+	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
+	do {
+		ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+		ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
+			pr_err_once("cpum_cf_diag counter set compare error "
+				    "in set %i\n", ctrstart->set);
+			return 0;
+		}
+		auth &= ~cpumf_ctr_ctl[ctrstart->set];
+		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
+			cfdiag_diffctrset((u64 *)(ctrstart + 1),
+					  (u64 *)(ctrstop + 1), ctrstart->ctr);
+			offset += ctrstart->ctr * sizeof(u64) +
+							sizeof(*ctrstart);
+		}
+	} while (ctrstart->def && auth);
+
+	/* Save time_stamp from start of event in stop's trailer */
+	trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+	trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+	trailer_stop->progusage[0] = trailer_start->timestamp;
+
+	return 1;
+}
 
 static enum cpumf_ctr_set get_counter_set(u64 event)
 {
@@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
 	return set;
 }
 
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const struct hw_perf_event *hwc,
+				enum cpumf_ctr_set set)
 {
 	struct cpu_cf_events *cpuhw;
 	int err = 0;
@@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 	cpuhw = &get_cpu_var(cpu_cf_events);
 
 	/* check required version for counter sets */
-	switch (hwc->config_base) {
+	switch (set) {
 	case CPUMF_CTR_SET_BASIC:
 	case CPUMF_CTR_SET_USER:
 		if (cpuhw->info.cfvn < 1)
@@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 		      (cpuhw->info.act_ctl & mtdiag_ctl)))
 			err = -EOPNOTSUPP;
 		break;
+	case CPUMF_CTR_SET_MAX:
+		err = -EOPNOTSUPP;
 	}
 
 	put_cpu_var(cpu_cf_events);
@@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 static int validate_ctr_auth(const struct hw_perf_event *hwc)
 {
 	struct cpu_cf_events *cpuhw;
-	u64 ctrs_state;
 	int err = 0;
 
 	cpuhw = &get_cpu_var(cpu_cf_events);
@@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
 	 * return with -ENOENT in order to fall back to other
 	 * PMUs that might suffice the event request.
 	 */
-	ctrs_state = cpumf_ctr_ctl[hwc->config_base];
-	if (!(ctrs_state & cpuhw->info.auth_ctl))
+	if (!(hwc->config_base & cpuhw->info.auth_ctl))
 		err = -ENOENT;
 
 	put_cpu_var(cpu_cf_events);
@@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu)
 	if (cpuhw->flags & PMU_F_ENABLED)
 		return;
 
-	err = lcctl(cpuhw->state);
+	err = lcctl(cpuhw->state | cpuhw->dev_state);
 	if (err) {
 		pr_err("Enabling the performance measuring unit "
 		       "failed with rc=%x\n", err);
@@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu)
 		return;
 
 	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+	inactive |= cpuhw->dev_state;
 	err = lcctl(inactive);
 	if (err) {
 		pr_err("Disabling the performance measuring unit "
@@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = {
 	[PERF_COUNT_HW_BUS_CYCLES]	    = -1,
 };
 
+static void cpumf_hw_inuse(void)
+{
+	mutex_lock(&pmc_reserve_mutex);
+	if (atomic_inc_return(&num_events) == 1)
+		__kernel_cpumcf_begin();
+	mutex_unlock(&pmc_reserve_mutex);
+}
+
 static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 		/*
 		 * Use the hardware perf event structure to store the
 		 * counter number in the 'config' member and the counter
-		 * set number in the 'config_base'.  The counter set number
-		 * is then later used to enable/disable the counter(s).
+		 * set number in the 'config_base' as bit mask.
+		 * It is later used to enable/disable the counter(s).
 		 */
 		hwc->config = ev;
-		hwc->config_base = set;
+		hwc->config_base = cpumf_ctr_ctl[set];
 		break;
 	case CPUMF_CTR_SET_MAX:
 		/* The counter could not be associated to a counter set */
@@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 	}
 
 	/* Initialize for using the CPU-measurement counter facility */
-	if (!atomic_inc_not_zero(&num_events)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			atomic_inc(&num_events);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-	if (err)
-		return err;
+	cpumf_hw_inuse();
 	event->destroy = hw_perf_event_destroy;
 
 	/* Finally, validate version and authorization of the counter set */
 	err = validate_ctr_auth(hwc);
 	if (!err)
-		err = validate_ctr_version(hwc);
+		err = validate_ctr_version(hwc, set);
 
 	return err;
 }
@@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED))
 		return;
@@ -376,29 +595,92 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
 	 * needs to be synchronized.  At this point, the counter set can be in
 	 * the inactive or disabled state.
 	 */
-	hw_perf_event_reset(event);
+	if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+		cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+					      sizeof(cpuhw->start),
+					      hwc->config_base, true);
+	} else {
+		hw_perf_event_reset(event);
+	}
 
-	/* increment refcount for this counter set */
-	atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+	/* Increment refcount for counter sets */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if ((hwc->config_base & cpumf_ctr_ctl[i]))
+			atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create perf event sample with the counter sets as raw data.	The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+			      struct cpu_cf_events *cpuhw)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+
+	if (event->attr.sample_type & PERF_SAMPLE_CPU)
+		data.cpu_entry.cpu = event->cpu;
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = cpuhw->usedss;
+		raw.frag.data = cpuhw->stop;
+		raw.size = raw.frag.size;
+		data.raw = &raw;
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s event %#llx sample_type %#llx raw %d ov %d\n",
+			    __func__, event->hw.config,
+			    event->attr.sample_type, raw.size, overflow);
+	if (overflow)
+		event->pmu->stop(event, 0);
+
+	perf_event_update_userpage(event);
+	return overflow;
 }
 
 static void cpumf_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED)) {
 		/* Decrement reference count for this counter set and if this
 		 * is the last used counter in the set, clear activation
 		 * control and set the counter set state to inactive.
 		 */
-		if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
-			ctr_set_stop(&cpuhw->state, hwc->config_base);
+		for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+			if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+				continue;
+			if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+				ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+		}
 		hwc->state |= PERF_HES_STOPPED;
 	}
 
 	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		hw_perf_event_update(event);
+		if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+			local64_inc(&event->count);
+			cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+						      sizeof(cpuhw->stop),
+						      event->hw.config_base,
+						      false);
+			if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+				cfdiag_push_sample(event, cpuhw);
+		} else
+			hw_perf_event_update(event);
 		hwc->state |= PERF_HES_UPTODATE;
 	}
 }
@@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
 static void cpumf_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int i;
 
 	cpumf_pmu_stop(event, PERF_EF_UPDATE);
 
@@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
 	 * clear enable control and resets all counters in a set.  Therefore,
 	 * cpumf_pmu_start() always has to reenable a counter set.
 	 */
-	if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
-		ctr_set_disable(&cpuhw->state, event->hw.config_base);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if (!atomic_read(&cpuhw->ctr_set[i]))
+			ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
 }
 
 /* Performance monitoring unit for s390x */
@@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = {
 	.read	      = cpumf_pmu_read,
 };
 
+static int cfset_init(void);
 static int __init cpumf_pmu_init(void)
 {
 	int rc;
@@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void)
 	if (!kernel_cpumcf_avail())
 		return -ENODEV;
 
+	/* Setup s390dbf facility */
+	cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+	if (!cf_dbg) {
+		pr_err("Registration of s390dbf(cpum_cf) failed\n");
+		return -ENOMEM;
+	}
+	debug_register_view(cf_dbg, &debug_sprintf_view);
+
 	cpumf_pmu.attr_groups = cpumf_cf_event_group();
 	rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
-	if (rc)
+	if (rc) {
+		debug_unregister_view(cf_dbg, &debug_sprintf_view);
+		debug_unregister(cf_dbg);
 		pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+	} else if (stccm_avail()) {	/* Setup counter set device */
+		cfset_init();
+	}
+	return rc;
+}
+
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter sets via normal file operations.
+ */
+
+static atomic_t cfset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
+static DEFINE_MUTEX(cfset_ctrset_mutex); /* Synchronize access to hardware */
+struct cfset_call_on_cpu_parm {		/* Parm struct for smp_call_on_cpu */
+	unsigned int sets;		/* Counter set bit mask */
+	atomic_t cpus_ack;		/* # CPUs successfully executed func */
+};
+
+static struct cfset_request {		/* CPUs and counter set bit mask */
+	unsigned long ctrset;		/* Bit mask of counter set to read */
+	cpumask_t mask;			/* CPU mask to read from */
+} cfset_request;
+
+static void cfset_ctrset_clear(void)
+{
+	cpumask_clear(&cfset_request.mask);
+	cfset_request.ctrset = 0;
+}
+
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device
+ * access path is currently in use.
+ * The cpu_cf_events::dev_state is used to denote counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
+ *
+ * The cpu_cf_events::state is used by the perf_event_open() SVC and remains
+ * unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_disable() and their callbacks
+ * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable the per process
+ * CPU Measurement counter facility.
+ * The XXX_enable() and XXX_disable() functions are used to turn off the
+ * x86 performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counter sets during scheduling.
+ *
+ * Concurrent access by the perf_event_open() SVC and the /dev/hwctr
+ * device is allowed. The perf_event_open() SVC interface makes a lot of
+ * effort to only run the counters while the calling process is actively
+ * scheduled to run.
+ * When the /dev/hwctr interface is used at the same time, the counter sets
+ * keep running, even when the process is scheduled off a CPU.
+ * However, this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value is recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
+ */
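
To make the or'ing of the two state words concrete, here is a stand-alone editorial sketch. The shift values and the 0x02 control bit of the basic set are assumptions mirroring CPUMF_LCCTL_ENABLE_SHIFT, CPUMF_LCCTL_ACTCTL_SHIFT and cpumf_ctr_ctl[] from asm/cpu_mcf.h:

	#include <stdint.h>
	#include <stdio.h>

	#define ENABLE_SHIFT	16	/* Assumed: CPUMF_LCCTL_ENABLE_SHIFT */
	#define ACT_SHIFT	32	/* Assumed: CPUMF_LCCTL_ACTCTL_SHIFT */

	static void set_enable(uint64_t *state, uint64_t sets)
	{
		*state |= sets << ENABLE_SHIFT;	/* Mirrors ctr_set_enable() */
	}

	static void set_start(uint64_t *state, uint64_t sets)
	{
		*state |= sets << ACT_SHIFT;	/* Mirrors ctr_set_start() */
	}

	int main(void)
	{
		uint64_t basic = 0x02;	/* Assumed ctl bit of the basic set */
		uint64_t perf_state = 0, dev_state = 0;

		set_enable(&perf_state, basic);	/* perf_event_open() user */
		set_start(&perf_state, basic);
		set_enable(&dev_state, basic);	/* /dev/hwctr user */
		set_start(&dev_state, basic);

		/* cpumf_pmu_enable() hands both users or'ed to lcctl() */
		printf("lcctl(%#llx)\n",
		       (unsigned long long)(perf_state | dev_state));
		/* cfset_ioctl_off() drops dev_state; perf sets survive */
		printf("lcctl(%#llx)\n", (unsigned long long)perf_state);
		return 0;
	}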
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->dev_state = 0;
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_dec(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->flags &= ~PMU_F_IN_USE;
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->flags |= PMU_F_IN_USE;
+	ctr_set_enable(&cpuhw->dev_state, p->sets);
+	ctr_set_start(&cpuhw->dev_state, p->sets);
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_inc(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->dev_state | cpuhw->state);	/* Start counter sets */
+	if (!rc)
+		atomic_inc(&p->cpus_ack);
+	else
+		pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+static void cfset_release_cpu(void *p)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int rc;
+
+	debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
+			    __func__, cpuhw->state, cpuhw->dev_state);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->dev_state = 0;
+}
+
+/* The release function is also called when the application gets terminated
+ * without issuing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+	on_each_cpu(cfset_release_cpu, NULL, 1);
+	hw_perf_event_destroy(NULL);
+	cfset_ctrset_clear();
+	atomic_set(&cfset_opencnt, 0);
+	return 0;
+}
+
+static int cfset_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* Only one user space program can open /dev/hwctr */
+	if (atomic_xchg(&cfset_opencnt, 1))
+		return -EBUSY;
+
+	cpumf_hw_inuse();
+	file->private_data = NULL;
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
+static int cfset_all_stop(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+	};
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+	free_cpumask_var(mask);
+	return 0;
+}
+
+static int cfset_all_start(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t mask;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
+	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+		on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+		rc = -EIO;
+		debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
+	}
+	free_cpumask_var(mask);
+	return rc;
+}
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cfset_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
+	size_t bytes = 0;
+	int i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (!(sets & cpumf_ctr_ctl[i]))
+			continue;
+		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+	put_cpu_ptr(&cpu_cf_events);
+	return bytes;
+}
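
As a worked example of the space calculation above (assuming, hypothetically, that the basic counter set reports 6 counters, the problem-state set reports 2, and the s390_ctrset_* fields have their u32/u64 uapi sizes): the counter payload per CPU is (6*8 + 8) + (2*8 + 8) = 80 bytes, every CPU additionally contributes the 8-byte cpu_nr/no_sets pair, and with nr_cpu_ids = 8 the total becomes 8 + 8 * (80 + 8) = 712 bytes.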
+
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *ctrset_read;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	ctrset_read = (struct s390_ctrset_read __user *)arg;
+	uptr = ctrset_read->data;
+	for_each_cpu(cpu, mask) {
+		struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
+		struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+		ctrset_cpudata = uptr;
+		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
+		rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
+		rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
+				   cpuhw->used);
+		if (rc)
+			return -EFAULT;
+		uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
+		cond_resched();
+	}
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &ctrset_read->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
+			    uptr - (void __user *)ctrset_read->data);
+	return 0;
+}
+
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				int ctrset_size, size_t room)
+{
+	size_t need = 0;
+	int rc = -1;
+
+	need = sizeof(*p) + sizeof(u64) * ctrset_size;
+	if (need <= room) {
+		p->set = cpumf_ctr_ctl[ctrset];
+		p->no_cnts = ctrset_size;
+		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+		if (rc == 3)		/* Nothing stored */
+			need = 0;
+	}
+	return need;
+}
+
+/* Read all counter sets. */
+static void cfset_cpu_read(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int set, set_size;
+	size_t space;
+
+	/* No data saved yet */
+	cpuhw->used = 0;
+	cpuhw->sets = 0;
+	memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+	/* Scan the counter sets */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+						 cpuhw->used;
+
+		if (!(p->sets & cpumf_ctr_ctl[set]))
+			continue;	/* Counter set not in list */
+		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
+		space = sizeof(cpuhw->data) - cpuhw->used;
+		space = cfset_cpuset_read(sp, set, set_size, space);
+		if (space) {
+			cpuhw->used += space;
+			cpuhw->sets += 1;
+		}
+	}
+	debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
+			    cpuhw->sets, cpuhw->used);
+}
+
+static int cfset_all_read(unsigned long arg)
+{
+	struct cfset_call_on_cpu_parm p;
+	cpumask_var_t mask;
+	int rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	p.sets = cfset_request.ctrset;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+	rc = cfset_all_copy(arg, mask);
+	free_cpumask_var(mask);
 	return rc;
 }
-subsys_initcall(cpumf_pmu_init);
+
+static long cfset_ioctl_read(unsigned long arg)
+{
+	struct s390_ctrset_read read;
+	int ret = 0;
+
+	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+		return -EFAULT;
+	ret = cfset_all_read(arg);
+	return ret;
+}
+
+static long cfset_ioctl_stop(void)
+{
+	int ret = -ENXIO;
+
+	if (cfset_request.ctrset) {
+		ret = cfset_all_stop();
+		cfset_ctrset_clear();
+	}
+	return ret;
+}
+
+static long cfset_ioctl_start(unsigned long arg)
+{
+	struct s390_ctrset_start __user *ustart;
+	struct s390_ctrset_start start;
+	void __user *umask;
+	unsigned int len;
+	int ret = 0;
+	size_t need;
+
+	if (cfset_request.ctrset)
+		return -EBUSY;
+	ustart = (struct s390_ctrset_start __user *)arg;
+	if (copy_from_user(&start, ustart, sizeof(start)))
+		return -EFAULT;
+	if (start.version != S390_HWCTR_START_VERSION)
+		return -EINVAL;
+	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+		return -EINVAL;		/* Invalid counter set */
+	if (!start.counter_sets)
+		return -EINVAL;		/* No counter set at all? */
+	cpumask_clear(&cfset_request.mask);
+	len = min_t(u64, start.cpumask_len, cpumask_size());
+	umask = (void __user *)start.cpumask;
+	if (copy_from_user(&cfset_request.mask, umask, len))
+		return -EFAULT;
+	if (cpumask_empty(&cfset_request.mask))
+		return -EINVAL;
+	need = cfset_needspace(start.counter_sets);
+	if (put_user(need, &ustart->data_bytes))
+		ret = -EFAULT;
+	if (ret)
+		goto out;
+	cfset_request.ctrset = start.counter_sets;
+	ret = cfset_all_start();
+out:
+	if (ret)
+		cfset_ctrset_clear();
+	debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n",
+			    __func__, cfset_request.ctrset, need, ret);
+	return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ *    counter sets keep running until explicitly stopped. Returns the number
+ *    of bytes needed to store the counter values. If another S390_HWCTR_START
+ *    ioctl subcommand is issued without a previous S390_HWCTR_STOP
+ *    subcommand, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from the CPU list given
+ *    with the S390_HWCTR_START subcommand.
+ * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
+ *    previous S390_HWCTR_START subcommand.
+ * A usage sketch follows cfset_ioctl() below.
+ */
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	get_online_cpus();
+	mutex_lock(&cfset_ctrset_mutex);
+	switch (cmd) {
+	case S390_HWCTR_START:
+		ret = cfset_ioctl_start(arg);
+		break;
+	case S390_HWCTR_STOP:
+		ret = cfset_ioctl_stop();
+		break;
+	case S390_HWCTR_READ:
+		ret = cfset_ioctl_read(arg);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	put_online_cpus();
+	return ret;
+}
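
The full user-space flow then looks roughly as follows. This is an editorial sketch, not part of the commit; it assumes the uapi definitions from asm/hwctrset.h (the S390_HWCTR_* ioctls and struct s390_ctrset_start) and the assumed 0x02 basic-set bit used above:

	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <asm/hwctrset.h>

	int main(void)
	{
		__u64 cpumask = 1;	/* Read CPU 0 only */
		struct s390_ctrset_start start = {
			.version = S390_HWCTR_START_VERSION,
			.cpumask_len = sizeof(cpumask),
			.cpumask = &cpumask,
			.counter_sets = 0x02,	/* Assumed basic-set bit */
		};
		void *buf = NULL;
		int fd = open("/dev/hwctr", O_RDWR);

		if (fd < 0)
			return 1;
		/* START validates the request and reports the needed size */
		if (ioctl(fd, S390_HWCTR_START, &start) == 0) {
			buf = malloc(start.data_bytes);
			/* READ fills the buffer with per-CPU counter data */
			if (buf && ioctl(fd, S390_HWCTR_READ, buf) == 0)
				printf("%llu bytes of counter data read\n",
				       (unsigned long long)start.data_bytes);
			ioctl(fd, S390_HWCTR_STOP);
		}
		free(buf);
		close(fd);
		return 0;
	}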
+
+static const struct file_operations cfset_fops = {
+	.owner = THIS_MODULE,
+	.open = cfset_open,
+	.release = cfset_release,
+	.unlocked_ioctl	= cfset_ioctl,
+	.compat_ioctl = cfset_ioctl,
+	.llseek = no_llseek
+};
+
+static struct miscdevice cfset_dev = {
+	.name	= S390_HWCTR_DEVICE,
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &cfset_fops,
+};
+
+int cfset_online_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_on(&p);
+		cpumask_set_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+int cfset_offline_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_off(&p);
+		cpumask_clear_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+static void cfdiag_read(struct perf_event *event)
+{
+	debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
+			    event->attr.config, local64_read(&event->count));
+}
+
+static int get_authctrsets(void)
+{
+	struct cpu_cf_events *cpuhw;
+	unsigned long auth = 0;
+	enum cpumf_ctr_set i;
+
+	cpuhw = &get_cpu_var(cpu_cf_events);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+			auth |= cpumf_ctr_ctl[i];
+	}
+	put_cpu_var(cpu_cf_events);
+	return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets results in a specification exception (and panic).
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = 0;
+
+	/* Set sample_period to indicate sampling */
+	event->hw.config = attr->config;
+	event->hw.sample_period = attr->sample_period;
+	local64_set(&event->hw.period_left, event->hw.sample_period);
+	local64_set(&event->count, 0);
+	event->hw.last_period = event->hw.sample_period;
+
+	/* Add all authorized counter sets to config_base. The
+	 * hardware init function is either called per-cpu or just once
+	 * for all CPUs (event->cpu == -1). This depends on whether
+	 * counting is started for all CPUs or on a per-workload basis,
+	 * where the perf event moves from one CPU to another CPU.
+	 * Checking the authorization on any CPU is fine as the hardware
+	 * applies the same authorization settings to all CPUs.
+	 */
+	event->hw.config_base = get_authctrsets();
+
+	/* No authorized counter sets, nothing to count/sample */
+	if (!event->hw.config_base)
+		err = -EINVAL;
+
+	debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
+			    __func__, err, event->hw.config_base);
+	return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = -ENOENT;
+
+	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+	    event->attr.type != event->pmu->type)
+		goto out;
+
+	/* Raw events are used to access counters directly,
+	 * hence do not permit excludes.
+	 * This event is useless without PERF_SAMPLE_RAW to return counter set
+	 * values as raw data.
+	 */
+	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* Initialize for using the CPU-measurement counter facility */
+	cpumf_hw_inuse();
+	event->destroy = hw_perf_event_destroy;
+
+	err = cfdiag_event_init2(event);
+	if (unlikely(err))
+		event->destroy(event);
+out:
+	return err;
+}
+
+/* Create the cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. The target
+ * is the complete counter sets attached as raw data to the artificial event.
+ * This makes complete counter sets available when a process is scheduled
+ * out; they contain the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
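
For comparison with the /dev/hwctr path, the same counter data can be requested through the perf interface. A hedged editorial sketch follows; the 0xBC000 config value is an assumption mirroring PERF_EVENT_CPUM_CF_DIAG and should be verified against the sysfs events file on the target system:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		FILE *f;
		int type, fd;

		/* The dynamic PMU type is exported via sysfs */
		f = fopen("/sys/bus/event_source/devices/cpum_cf_diag/type",
			  "r");
		if (!f || fscanf(f, "%d", &type) != 1)
			return 1;
		fclose(f);

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;
		attr.config = 0xBC000;	/* Assumed PERF_EVENT_CPUM_CF_DIAG */
		/* cfdiag_event_init() insists on RAW or CPU sample data */
		attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CPU;
		attr.sample_period = 1;

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;
		/* Counter set deltas arrive as PERF_SAMPLE_RAW records in
		 * the event's mmap'ed ring buffer at schedule-out time. */
		close(fd);
		return 0;
	}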
+
+static struct attribute *cfdiag_events_attr[] = {
+	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+	NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+	.name = "events",
+	.attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+	.name = "format",
+	.attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+	&cfdiag_events_group,
+	&cfdiag_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable callback functions. They do not
+ * have a pointer to the perf_event structure as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always the first parameter) via
+ * the 'config' member.
+ */
+static struct pmu cf_diag = {
+	.task_ctx_nr  = perf_sw_context,
+	.event_init   = cfdiag_event_init,
+	.pmu_enable   = cpumf_pmu_enable,
+	.pmu_disable  = cpumf_pmu_disable,
+	.add	      = cpumf_pmu_add,
+	.del	      = cpumf_pmu_del,
+	.start	      = cpumf_pmu_start,
+	.stop	      = cpumf_pmu_stop,
+	.read	      = cfdiag_read,
+
+	.attr_groups  = cfdiag_attr_groups
+};
+
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+	size_t max_size = sizeof(struct cf_trailer_entry);
+	enum cpumf_ctr_set i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		size_t size = cpum_cf_ctrset_size(i, info);
+
+		if (size)
+			max_size += size * sizeof(u64) +
+				    sizeof(struct cf_ctrset_entry);
+	}
+	return max_size;
+}
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second. */
+static void cfdiag_get_cpu_speed(void)
+{
+	if (cpum_sf_avail()) {			/* Sampling facility first */
+		struct hws_qsi_info_block si;
+
+		memset(&si, 0, sizeof(si));
+		if (!qsi(&si)) {
+			cfdiag_cpu_speed = si.cpu_speed;
+			return;
+		}
+	}
+
+	/* Fallback: extract the CPU speed from the static part of the CPU
+	 * attributes. Used in case the CPU Measurement Sampling Facility is
+	 * turned off.
+	 */
+	if (test_facility(34)) {
+		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+
+		if (mhz != -1UL)
+			cfdiag_cpu_speed = mhz & 0xffffffff;
+	}
+}
+
+static int cfset_init(void)
+{
+	struct cpumf_ctr_info info;
+	size_t need;
+	int rc;
+
+	if (qctri(&info))
+		return -ENODEV;
+
+	cfdiag_get_cpu_speed();
+	/* Make sure the counter set data fits into predefined buffer. */
+	need = cfdiag_maxsize(&info);
+	if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+		       need);
+		return -ENOMEM;
+	}
+
+	rc = misc_register(&cfset_dev);
+	if (rc) {
+		pr_err("Registration of /dev/%s failed rc=%i\n",
+		       cfset_dev.name, rc);
+		goto out;
+	}
+
+	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+	if (rc) {
+		misc_deregister(&cfset_dev);
+		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+		       rc);
+	}
+out:
+	return rc;
+}
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
index 2300fbaac556..30f0242de4a5 100644
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ b/arch/s390/kernel/perf_cpum_cf_common.c
@@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
 	},
 	.alert = ATOMIC64_INIT(0),
 	.state = 0,
+	.dev_state = 0,
 	.flags = 0,
+	.used = 0,
+	.usedss = 0,
+	.sets = 0
 };
 /* Indicator whether the CPU-Measurement Counter Facility Support is ready */
 static bool cpum_cf_initalized;
@@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void)
 }
 EXPORT_SYMBOL(kernel_cpumcf_avail);
 
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
 /* Initialize the CPU-measurement counter facility */
 int __kernel_cpumcf_begin(void)
 {
 	int flags = PMC_INIT;
-	int err = 0;
-
-	spin_lock(&cpumcf_owner_lock);
-	if (cpumcf_owner)
-		err = -EBUSY;
-	else
-		cpumcf_owner = __builtin_return_address(0);
-	spin_unlock(&cpumcf_owner_lock);
-	if (err)
-		return err;
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
@@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void)
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
-	spin_lock(&cpumcf_owner_lock);
-	cpumcf_owner = NULL;
-	spin_unlock(&cpumcf_owner_lock);
 }
 EXPORT_SYMBOL(__kernel_cpumcf_end);
 
@@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags)
 
 static int cpum_cf_online_cpu(unsigned int cpu)
 {
-	return cpum_cf_setup(cpu, PMC_INIT);
+	cpum_cf_setup(cpu, PMC_INIT);
+	return cfset_online_cpu(cpu);
 }
 
 static int cpum_cf_offline_cpu(unsigned int cpu)
 {
+	cfset_offline_cpu(cpu);
 	return cpum_cf_setup(cpu, PMC_RELEASE);
 }
 
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index 08c985c1097c..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- *  Copyright IBM Corp. 2019, 2021
- *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- *	       Thomas Richter <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT	"cpum_cf_diag"
-#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-#include <linux/miscdevice.h>
-#include <linux/mutex.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#include <asm/hwctrset.h>
-
-#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
-						/* interval in seconds */
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd {			/* Counter set data per CPU */
-	size_t used;			/* Bytes used in data/start */
-	unsigned char start[PAGE_SIZE];	/* Counter set at event start */
-	unsigned char data[PAGE_SIZE];	/* Counter set at event delete */
-	unsigned int sets;		/* # Counter set saved in data */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- *   size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
-	unsigned int def:16;	/* 0-15  Data Entry Format */
-	unsigned int set:16;	/* 16-31 Counter set identifier */
-	unsigned int ctr:16;	/* 32-47 Number of stored counters */
-	unsigned int res1:16;	/* 48-63 Reserved */
-};
-
-struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
-	/* 0 - 7 */
-	union {
-		struct {
-			unsigned int clock_base:1;	/* TOD clock base set */
-			unsigned int speed:1;		/* CPU speed set */
-			/* Measurement alerts */
-			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
-			unsigned int caca:1;	/* Counter auth. change alert */
-			unsigned int lcda:1;	/* Loss of counter data alert */
-		};
-		unsigned long flags;	/* 0-63    All indicators */
-	};
-	/* 8 - 15 */
-	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
-	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
-	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
-	/* 16 - 23 */
-	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
-	/* 24 - 55 */
-	union {
-		struct {
-			unsigned long progusage1;
-			unsigned long progusage2;
-			unsigned long progusage3;
-			unsigned long tod_base;
-		};
-		unsigned long progusage[4];
-	};
-	/* 56 - 63 */
-	unsigned int mach_type:16;		/* Machine type */
-	unsigned int res1:16;			/* Reserved */
-	unsigned int res2:32;			/* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cpuid cpuid;
-
-	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
-	te->csvn = cpuhw->info.csvn;
-
-	get_cpu_id(&cpuid);			/* Machine type */
-	te->mach_type = cpuid.machine;
-	te->cpu_speed = cf_diag_cpu_speed;
-	if (te->cpu_speed)
-		te->speed = 1;
-	te->clock_base = 1;			/* Save clock base */
-	te->tod_base = tod_clock_base.tod;
-	te->timestamp = get_tod_clock_fast();
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (cpuhw->flags & PMU_F_ENABLED)
-		return;
-
-	err = lcctl(cpuhw->state);
-	if (err) {
-		pr_err("Enabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	u64 inactive;
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (!(cpuhw->flags & PMU_F_ENABLED))
-		return;
-
-	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
-	err = lcctl(inactive);
-	if (err) {
-		pr_err("Disabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-/* Used to avoid races in calling reserve/release_cpumf_hardware */
-static DEFINE_MUTEX(cf_diag_reserve_mutex);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d cf_diag_events %d\n",
-			    __func__, event, smp_processor_id(),
-			    atomic_read(&cf_diag_events));
-	if (atomic_dec_return(&cf_diag_events) == 0)
-		__kernel_cpumcf_end();
-}
-
-static int get_authctrsets(void)
-{
-	struct cpu_cf_events *cpuhw;
-	unsigned long auth = 0;
-	enum cpumf_ctr_set i;
-
-	cpuhw = &get_cpu_var(cpu_cf_events);
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
-			auth |= cpumf_ctr_ctl[i];
-	}
-	put_cpu_var(cpu_cf_events);
-	return auth;
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
-			    event, event->cpu);
-
-	event->hw.config = attr->config;
-
-	/* Add all authorized counter sets to config_base. The
-	 * the hardware init function is either called per-cpu or just once
-	 * for all CPUS (event->cpu == -1).  This depends on the whether
-	 * counting is started for all CPUs or on a per workload base where
-	 * the perf event moves from one CPU to another CPU.
-	 * Checking the authorization on any CPU is fine as the hardware
-	 * applies the same authorization settings to all CPUs.
-	 */
-	event->hw.config_base = get_authctrsets();
-
-	/* No authorized counter sets, nothing to count/sample */
-	if (!event->hw.config_base) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/* Set sample_period to indicate sampling */
-	event->hw.sample_period = attr->sample_period;
-	local64_set(&event->hw.period_left, event->hw.sample_period);
-	event->hw.last_period  = event->hw.sample_period;
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
-			    __func__, err, event->hw.config_base);
-	return err;
-}
-
-/* Return 0 if the CPU-measurement counter facility is currently free
- * and an error otherwise.
- */
-static int cf_diag_perf_event_inuse(void)
-{
-	int err = 0;
-
-	if (!atomic_inc_not_zero(&cf_diag_events)) {
-		mutex_lock(&cf_diag_reserve_mutex);
-		if (atomic_read(&cf_diag_events) == 0 &&
-		    __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			err = atomic_inc_return(&cf_diag_events);
-		mutex_unlock(&cf_diag_reserve_mutex);
-	}
-	return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = -ENOENT;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d config %#llx type:%u "
-			    "sample_type %#llx cf_diag_events %d\n", __func__,
-			    event, event->cpu, attr->config, event->pmu->type,
-			    attr->sample_type, atomic_read(&cf_diag_events));
-
-	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
-	    event->attr.type != event->pmu->type)
-		goto out;
-
-	/* Raw events are used to access counters directly,
-	 * hence do not permit excludes.
- * This event is useless without PERF_SAMPLE_RAW to return counter set
-	 * values as raw data.
-	 */
-	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
-	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	/* Initialize for using the CPU-measurement counter facility */
-	err = cf_diag_perf_event_inuse();
-	if (err < 0)
-		goto out;
-	event->destroy = cf_diag_perf_event_destroy;
-
-	err = __hw_perf_event_init(event);
-	if (unlikely(err))
-		event->destroy(event);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independent of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
-	size_t max_size = sizeof(struct cf_trailer_entry);
-	enum cpumf_ctr_set i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		size_t size = cpum_cf_ctrset_size(i, info);
-
-		if (size)
-			max_size += size * sizeof(u64) +
-				    sizeof(struct cf_ctrset_entry);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
-			    max_size);
-
-	return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
-				size_t room)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	size_t ctrset_size, need = 0;
-	int rc = 3;				/* Assume write failure */
-
-	ctrdata->def = CF_DIAG_CTRSET_DEF;
-	ctrdata->set = ctrset;
-	ctrdata->res1 = 0;
-	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
-
-	if (ctrset_size) {			/* Save data */
-		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
-		if (need <= room)
-			rc = ctr_stcctm(ctrset, ctrset_size,
-					(u64 *)(ctrdata + 1));
-		if (rc != 3)
-			ctrdata->ctr = ctrset_size;
-		else
-			need = 0;
-	}
-
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
-			    " need %zd rc %d\n",
-			    __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
-			    cpuhw->info.csvn, need, rc);
-	return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer;
-	size_t offset = 0, done;
-	int i;
-
-	memset(data, 0, sz);
-	sz -= sizeof(*trailer);			/* Always room for trailer */
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		struct cf_ctrset_entry *ctrdata = data + offset;
-
-		if (!(auth & cpumf_ctr_ctl[i]))
-			continue;	/* Counter set not authorized */
-
-		done = cf_diag_getctrset(ctrdata, i, sz - offset);
-		offset += done;
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s ctrset %d offset %zu done %zu\n",
-				     __func__, i, offset, done);
-	}
-	trailer = data + offset;
-	cf_diag_trailer(trailer);
-	return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
-	for (; --counters >= 0; ++pstart, ++pstop)
-		if (*pstop >= *pstart)
-			*pstop -= *pstart;
-		else
-			*pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer_start, *trailer_stop;
-	struct cf_ctrset_entry *ctrstart, *ctrstop;
-	size_t offset = 0;
-
-	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
-	do {
-		ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
-		ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
-		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
-			pr_err("cpum_cf_diag counter set compare error "
-				"in set %i\n", ctrstart->set);
-			return 0;
-		}
-		auth &= ~cpumf_ctr_ctl[ctrstart->set];
-		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
-			cf_diag_diffctrset((u64 *)(ctrstart + 1),
-					  (u64 *)(ctrstop + 1), ctrstart->ctr);
-			offset += ctrstart->ctr * sizeof(u64) +
-				  sizeof(*ctrstart);
-		}
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s set %d ctr %d offset %zu auth %lx\n",
-				    __func__, ctrstart->set, ctrstart->ctr,
-				    offset, auth);
-	} while (ctrstart->def && auth);
-
-	/* Save time_stamp from start of event in stop's trailer */
-	trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
-	trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
-	trailer_stop->progusage[0] = trailer_start->timestamp;
-
-	return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data.	The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
-			       struct cf_diag_csd *csd)
-{
-	struct perf_sample_data data;
-	struct perf_raw_record raw;
-	struct pt_regs regs;
-	int overflow;
-
-	/* Setup perf sample */
-	perf_sample_data_init(&data, 0, event->hw.last_period);
-	memset(&regs, 0, sizeof(regs));
-	memset(&raw, 0, sizeof(raw));
-
-	if (event->attr.sample_type & PERF_SAMPLE_CPU)
-		data.cpu_entry.cpu = event->cpu;
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.frag.size = csd->used;
-		raw.frag.data = csd->data;
-		raw.size = csd->used;
-		data.raw = &raw;
-	}
-
-	overflow = perf_event_overflow(event, &data, &regs);
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s event %p cpu %d sample_type %#llx raw %d "
-			    "ov %d\n", __func__, event, event->cpu,
-			    event->attr.sample_type, raw.size, overflow);
-	if (overflow)
-		event->pmu->stop(event, 0);
-
-	perf_event_update_userpage(event);
-	return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-		return;
-
-	/* (Re-)enable and activate all counter sets */
-	lcctl(0);		/* Reset counter sets */
-	hwc->state = 0;
-	ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
-	lcctl(cpuhw->state);	/* Enable counter sets */
-	csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
-				   event->hw.config_base);
-	ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
-	/* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-
-	/* Deactivate all counter sets */
-	ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
-	local64_inc(&event->count);
-	csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
-				   event->hw.config_base);
-	if (cf_diag_diffctr(csd, event->hw.config_base))
-		cf_diag_push_sample(event, csd);
-	hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x cpuhw %p\n",
-			    __func__, event, event->cpu, flags, cpuhw);
-
-	if (cpuhw->flags & PMU_F_IN_USE) {
-		err = -EAGAIN;
-		goto out;
-	}
-
-	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-	cpuhw->flags |= PMU_F_IN_USE;
-	if (flags & PERF_EF_START)
-		cf_diag_start(event, PERF_EF_RELOAD);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x\n",
-			   __func__, event, event->cpu, flags);
-
-	cf_diag_stop(event, PERF_EF_UPDATE);
-	ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
-	ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
-	cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-/* Default counter set events and format attribute groups */
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
-	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
-	NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
-	&format_attr_event.attr,
-	NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
-	.name = "events",
-	.attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
-	.name = "format",
-	.attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
-	&cf_diag_events_group,
-	&cf_diag_format_group,
-	NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
-	.task_ctx_nr  = perf_sw_context,
-	.pmu_enable   = cf_diag_enable,
-	.pmu_disable  = cf_diag_disable,
-	.event_init   = cf_diag_event_init,
-	.add	      = cf_diag_add,
-	.del	      = cf_diag_del,
-	.start	      = cf_diag_start,
-	.stop	      = cf_diag_stop,
-	.read	      = cf_diag_read,
-
-	.attr_groups  = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
-	if (cpum_sf_avail()) {			/* Sampling facility first */
-		struct hws_qsi_info_block si;
-
-		memset(&si, 0, sizeof(si));
-		if (!qsi(&si)) {
-			cf_diag_cpu_speed = si.cpu_speed;
-			return;
-		}
-	}
-
-	if (test_facility(34)) {		/* CPU speed extract static part */
-		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
-		if (mhz != -1UL)
-			cf_diag_cpu_speed = mhz & 0xffffffff;
-	}
-}
-
-/* Code to create device and file I/O operations */
-static atomic_t ctrset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
-
-static int cf_diag_open(struct inode *inode, struct file *file)
-{
-	int err = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (atomic_xchg(&ctrset_opencnt, 1))
-		return -EBUSY;
-
-	/* Avoid concurrent access with perf_event_open() system call */
-	mutex_lock(&cf_diag_reserve_mutex);
-	if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
-		err = -EBUSY;
-	mutex_unlock(&cf_diag_reserve_mutex);
-	if (err) {
-		atomic_set(&ctrset_opencnt, 0);
-		return err;
-	}
-	file->private_data = NULL;
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	/* nonseekable_open() never fails */
-	return nonseekable_open(inode, file);
-}
-
-/* Variables for ioctl() interface support */
-static DEFINE_MUTEX(cf_diag_ctrset_mutex);
-static struct cf_diag_ctrset {
-	unsigned long ctrset;		/* Bit mask of counter set to read */
-	cpumask_t mask;			/* CPU mask to read from */
-} cf_diag_ctrset;
-
-static void cf_diag_ctrset_clear(void)
-{
-	cpumask_clear(&cf_diag_ctrset.mask);
-	cf_diag_ctrset.ctrset = 0;
-}
-
-static void cf_diag_release_cpu(void *p)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
-			    smp_processor_id());
-	lcctl(0);		/* Reset counter sets */
-	cpuhw->state = 0;	/* Save state in CPU hardware state */
-}
-
-/* The release function is also called when the application is terminated
- * without issuing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
- * Since only one application is allowed to open the device, simply stop all
- * CPU counter sets.
- */
-static int cf_diag_release(struct inode *inode, struct file *file)
-{
-	on_each_cpu(cf_diag_release_cpu, NULL, 1);
-	cf_diag_ctrset_clear();
-	atomic_set(&ctrset_opencnt, 0);
-	__kernel_cpumcf_end();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	return 0;
-}
-
-struct cf_diag_call_on_cpu_parm {	/* Parm struct for smp_call_on_cpu */
-	unsigned int sets;		/* Counter set bit mask */
-	atomic_t cpus_ack;		/* # CPUs successfully executed func */
-};
-
-static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
-{
-	struct s390_ctrset_read __user *ctrset_read;
-	unsigned int cpu, cpus, rc;
-	void __user *uptr;
-
-	ctrset_read = (struct s390_ctrset_read __user *)arg;
-	uptr = ctrset_read->data;
-	for_each_cpu(cpu, mask) {
-		struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
-		struct s390_ctrset_cpudata __user *ctrset_cpudata;
-
-		ctrset_cpudata = uptr;
-		debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
-				    __func__, cpu, csd->used);
-		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
-		rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
-		rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
-		if (rc)
-			return -EFAULT;
-		uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
-		cond_resched();
-	}
-	cpus = cpumask_weight(mask);
-	if (put_user(cpus, &ctrset_read->no_cpus))
-		return -EFAULT;
-	debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
-			    __func__, uptr - (void __user *)ctrset_read->data);
-	return 0;
-}
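
cf_diag_all_copy() serializes the per-CPU snapshots into the caller's buffer: an s390_ctrset_read header (no_cpus) followed by one variable-length s390_ctrset_cpudata record per CPU, each containing s390_ctrset_setdata records. A hedged userspace sketch of walking that layout, assuming the s390_ctrset_* structures from the asm/hwctrset.h uapi header:

    #include <stdio.h>
    #include <asm/hwctrset.h>	/* s390_ctrset_* layouts (uapi), assumed */

    /* Sketch: walk the buffer filled in by cf_diag_all_copy(). */
    static void walk_read_buffer(struct s390_ctrset_read *rd)
    {
    	char *p = (char *)rd->data;
    	unsigned int i, j;

    	for (i = 0; i < rd->no_cpus; i++) {
    		struct s390_ctrset_cpudata *cd = (void *)p;

    		p = (char *)cd->data;
    		for (j = 0; j < cd->no_sets; j++) {
    			struct s390_ctrset_setdata *sd = (void *)p;

    			printf("cpu %u set %#x: %u counters\n",
    			       cd->cpu_nr, sd->set, sd->no_cnts);
    			/* set header plus one u64 value per counter */
    			p += sizeof(*sd) + sd->no_cnts * sizeof(__u64);
    		}
    	}
    }
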
-
-static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
-				  int ctrset_size, size_t room)
-{
-	size_t need = 0;
-	int rc = -1;
-
-	need = sizeof(*p) + sizeof(u64) * ctrset_size;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s room %zd need %zd set %#x set_size %d\n",
-			    __func__, room, need, ctrset, ctrset_size);
-	if (need <= room) {
-		p->set = cpumf_ctr_ctl[ctrset];
-		p->no_cnts = ctrset_size;
-		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
-		if (rc == 3)		/* Nothing stored */
-			need = 0;
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
-			    need, rc);
-	return need;
-}
-
-/* Read all counter sets. Since the perf_event_open() system call with
- * event cpum_cf_diag/.../ is blocked when this interface is active, reuse
- * the perf_event_open() data buffer to store the counter sets.
- */
-static void cf_diag_cpu_read(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int set, set_size;
-	size_t space;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-	/* No data saved yet */
-	csd->used = 0;
-	csd->sets = 0;
-	memset(csd->data, 0, sizeof(csd->data));
-
-	/* Scan the counter sets */
-	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
-		struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
-
-		if (!(p->sets & cpumf_ctr_ctl[set]))
-			continue;	/* Counter set not in list */
-		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
-		space = sizeof(csd->data) - csd->used;
-		space = cf_diag_cpuset_read(sp, set, set_size, space);
-		if (space) {
-			csd->used += space;
-			csd->sets += 1;
-		}
-		debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
-				    __func__, sp, space);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
-			    csd->sets, csd->used);
-}
-
-static int cf_diag_all_read(unsigned long arg)
-{
-	struct cf_diag_call_on_cpu_parm p;
-	cpumask_var_t mask;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	p.sets = cf_diag_ctrset.ctrset;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
-	rc = cf_diag_all_copy(arg, mask);
-	free_cpumask_var(mask);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
-	return rc;
-}
-
-/* Stop all counter sets via ioctl interface */
-static void cf_diag_ioctl_off(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	ctr_set_multiple_disable(&cpuhw->state, p->sets);
-	ctr_set_multiple_stop(&cpuhw->state, p->sets);
-	rc = lcctl(cpuhw->state);		/* Stop counter sets */
-	if (!cpuhw->state)
-		cpuhw->flags &= ~PMU_F_IN_USE;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s rc %d flags %#x state %#llx\n", __func__,
-			     rc, cpuhw->flags, cpuhw->state);
-}
-
-/* Start counter sets on particular CPU */
-static void cf_diag_ioctl_on(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	if (!(cpuhw->flags & PMU_F_IN_USE))
-		cpuhw->state = 0;
-	cpuhw->flags |= PMU_F_IN_USE;
-	rc = lcctl(cpuhw->state);		/* Reset unused counter sets */
-	ctr_set_multiple_enable(&cpuhw->state, p->sets);
-	ctr_set_multiple_start(&cpuhw->state, p->sets);
-	rc |= lcctl(cpuhw->state);		/* Start counter sets */
-	if (!rc)
-		atomic_inc(&p->cpus_ack);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
-			    __func__, rc, cpuhw->state);
-}
-
-static int cf_diag_all_stop(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-	};
-	cpumask_var_t mask;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-	free_cpumask_var(mask);
-	return 0;
-}
-
-static int cf_diag_all_start(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-		.cpus_ack = ATOMIC_INIT(0),
-	};
-	cpumask_var_t mask;
-	int rc = 0;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
-	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
-		on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-		rc = -EIO;
-	}
-	free_cpumask_var(mask);
-	return rc;
-}
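
cf_diag_all_start() gets all-or-nothing semantics from an acknowledgment counter: each CPU that starts its sets successfully bumps cpus_ack, and if the final count differs from the mask weight the sets are switched off again everywhere. The pattern in isolation; per_cpu_enable()/force_disable() are hypothetical stand-ins for the lcctl() work:

    #include <linux/atomic.h>
    #include <linux/cpumask.h>
    #include <linux/smp.h>

    struct ack_parm {
    	atomic_t cpus_ack;		/* # CPUs that succeeded */
    };

    static int per_cpu_enable(void) { return 0; }	/* hypothetical */

    static void try_enable(void *parm)
    {
    	struct ack_parm *p = parm;

    	if (!per_cpu_enable())
    		atomic_inc(&p->cpus_ack);
    }

    static void force_disable(void *parm) { /* undo per_cpu_enable() */ }

    static int enable_all(const cpumask_t *mask)
    {
    	struct ack_parm p = { .cpus_ack = ATOMIC_INIT(0) };

    	on_each_cpu_mask(mask, try_enable, &p, 1);
    	/* all-or-nothing: roll back unless every CPU acknowledged */
    	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
    		on_each_cpu_mask(mask, force_disable, &p, 1);
    		return -EIO;
    	}
    	return 0;
    }
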
-
-/* Return the maximum required space for all possible CPUs in case a
- * CPU is onlined during the START, READ, STOP cycle.
- * To find out the size of the counter sets, any one CPU will do; they
- * all have the same counter sets.
- */
-static size_t cf_diag_needspace(unsigned int sets)
-{
-	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
-	size_t bytes = 0;
-	int i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (!(sets & cpumf_ctr_ctl[i]))
-			continue;
-		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
-	}
-	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
-		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
-		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
-	debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
-			    bytes);
-	put_cpu_ptr(&cpu_cf_events);
-	return bytes;
-}
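
The sizing above is deliberately pessimistic: per requested set, a set header (set + no_cnts) plus one u64 per counter; that per-CPU payload plus the cpu_nr/no_sets header is then multiplied by nr_cpu_ids rather than the current online count, so a CPU onlined between START and READ still fits. The same arithmetic as a worked example with hypothetical set sizes:

    #include <stddef.h>
    #include <stdint.h>

    /* Worked example, assuming a basic set of 6 counters and a user set
     * of 2 counters; mirrors the arithmetic in cf_diag_needspace(). */
    static size_t worst_case_bytes(unsigned int nr_cpu_ids)
    {
    	size_t set_hdr = 2 * sizeof(uint32_t);		/* set + no_cnts */
    	size_t per_cpu = (set_hdr + 6 * sizeof(uint64_t)) +
    			 (set_hdr + 2 * sizeof(uint64_t));
    	size_t cpu_hdr = 2 * sizeof(uint32_t);		/* cpu_nr + no_sets */

    	/* no_cpus header + one record per *possible* CPU */
    	return sizeof(uint64_t) + nr_cpu_ids * (per_cpu + cpu_hdr);
    }
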
-
-static long cf_diag_ioctl_read(unsigned long arg)
-{
-	struct s390_ctrset_read read;
-	int ret = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
-		return -EFAULT;
-	ret = cf_diag_all_read(arg);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_stop(void)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	ret = cf_diag_all_stop();
-	cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_start(unsigned long arg)
-{
-	struct s390_ctrset_start __user *ustart;
-	struct s390_ctrset_start start;
-	void __user *umask;
-	unsigned int len;
-	int ret = 0;
-	size_t need;
-
-	if (cf_diag_ctrset.ctrset)
-		return -EBUSY;
-	ustart = (struct s390_ctrset_start __user *)arg;
-	if (copy_from_user(&start, ustart, sizeof(start)))
-		return -EFAULT;
-	if (start.version != S390_HWCTR_START_VERSION)
-		return -EINVAL;
-	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
-		return -EINVAL;		/* Invalid counter set */
-	if (!start.counter_sets)
-		return -EINVAL;		/* No counter set at all? */
-	cpumask_clear(&cf_diag_ctrset.mask);
-	len = min_t(u64, start.cpumask_len, cpumask_size());
-	umask = (void __user *)start.cpumask;
-	if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
-		return -EFAULT;
-	if (cpumask_empty(&cf_diag_ctrset.mask))
-		return -EINVAL;
-	need = cf_diag_needspace(start.counter_sets);
-	if (put_user(need, &ustart->data_bytes))
-		ret = -EFAULT;
-	if (ret)
-		goto out;
-	cf_diag_ctrset.ctrset = start.counter_sets;
-	ret = cf_diag_all_start();
-out:
-	if (ret)
-		cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
-			    __func__, cf_diag_ctrset.ctrset, need, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
-			    cmd, arg);
-	get_online_cpus();
-	mutex_lock(&cf_diag_ctrset_mutex);
-	switch (cmd) {
-	case S390_HWCTR_START:
-		ret = cf_diag_ioctl_start(arg);
-		break;
-	case S390_HWCTR_STOP:
-		ret = cf_diag_ioctl_stop();
-		break;
-	case S390_HWCTR_READ:
-		ret = cf_diag_ioctl_read(arg);
-		break;
-	default:
-		ret = -ENOTTY;
-		break;
-	}
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	put_online_cpus();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static const struct file_operations cf_diag_fops = {
-	.owner = THIS_MODULE,
-	.open = cf_diag_open,
-	.release = cf_diag_release,
-	.unlocked_ioctl	= cf_diag_ioctl,
-	.compat_ioctl = cf_diag_ioctl,
-	.llseek = no_llseek
-};
-
-static struct miscdevice cf_diag_dev = {
-	.name	= S390_HWCTR_DEVICE,
-	.minor	= MISC_DYNAMIC_MINOR,
-	.fops	= &cf_diag_fops,
-};
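
Together, the file operations and misc device above form the /dev/hwctr interface. A hedged userspace sketch of the intended START/READ/STOP cycle, assuming the s390_ctrset_start layout and S390_HWCTR_* constants from the asm/hwctrset.h uapi header; the 0x02 counter-set bit is assumed here to be the basic set's control bit per cpumf_ctr_ctl[]:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <asm/hwctrset.h>

    int main(void)
    {
    	__u64 cpumask[16];			/* room for 1024 CPUs */
    	struct s390_ctrset_start start;
    	struct s390_ctrset_read *rd;
    	int fd;

    	fd = open("/dev/" S390_HWCTR_DEVICE, O_RDWR);
    	if (fd < 0)
    		return 1;
    	memset(&start, 0, sizeof(start));
    	memset(cpumask, 0xff, sizeof(cpumask));	/* all CPUs */
    	start.version = S390_HWCTR_START_VERSION;
    	start.counter_sets = 0x02;		/* basic set (assumed bit) */
    	start.cpumask_len = sizeof(cpumask);
    	start.cpumask = cpumask;
    	if (ioctl(fd, S390_HWCTR_START, &start))	/* fills data_bytes */
    		goto out;
    	rd = malloc(start.data_bytes);
    	if (rd && !ioctl(fd, S390_HWCTR_READ, rd))
    		printf("snapshots from %llu CPUs\n",
    		       (unsigned long long)rd->no_cpus);
    	free(rd);
    	ioctl(fd, S390_HWCTR_STOP);
    out:
    	close(fd);
    	return 0;
    }
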
-
-static int cf_diag_online_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_on(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-static int cf_diag_offline_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_off(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
-	struct cpumf_ctr_info info;
-	size_t need;
-	int rc;
-
-	if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
-		return -ENODEV;
-	cf_diag_get_cpu_speed();
-
-	/* Make sure the counter set data fits into predefined buffer. */
-	need = cf_diag_ctrset_maxsize(&info);
-	if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
-		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
-		       need);
-		return -ENOMEM;
-	}
-
-	rc = misc_register(&cf_diag_dev);
-	if (rc) {
-		pr_err("Registration of /dev/" S390_HWCTR_DEVICE
-		       " failed rc=%d\n", rc);
-		goto out;
-	}
-
-	/* Setup s390dbf facility */
-	cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
-	if (!cf_diag_dbg) {
-		pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
-		rc = -ENOMEM;
-		goto out_dbf;
-	}
-	debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
-	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
-	if (rc) {
-		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
-		       rc);
-		goto out_perf;
-	}
-	rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
-				       "perf/s390/cfd:online",
-				       cf_diag_online_cpu, cf_diag_offline_cpu);
-	if (!rc)
-		goto out;
-
-	pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
-	       rc);
-	perf_pmu_unregister(&cf_diag);
-out_perf:
-	debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
-	debug_unregister(cf_diag_dbg);
-out_dbf:
-	misc_deregister(&cf_diag_dev);
-out:
-	return rc;
-}
-device_initcall(cf_diag_init);
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 7ae5dde9c54d..350e94d0cac2 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -166,6 +166,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 			p->thread.acrs[1] = (unsigned int)tls;
 		}
 	}
+	/*
+	 * s390 stores the svc return address in arch_data when calling
+	 * sigreturn()/restart_syscall() via vdso. 1 means no valid address
+	 * stored.
+	 */
+	p->restart_block.arch_data = 1;
 	return 0;
 }
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 5486d82470b5..ff0f9e838916 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -354,7 +354,7 @@ void __init arch_call_rest_init(void)
 	set_task_stack_end_magic(current);
 	stack += STACK_INIT_OFFSET;
 	S390_lowcore.kernel_stack = stack;
-	CALL_ON_STACK_NORETURN(rest_init, stack);
+	call_on_stack_noreturn(rest_init, stack);
 }
 
 static void __init setup_lowcore_dat_off(void)
@@ -442,6 +442,7 @@ static void __init setup_lowcore_dat_off(void)
 	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 	lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
 	lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+	lc->preempt_count = PREEMPT_DISABLED;
 
 	set_prefix((u32)(unsigned long) lc);
 	lowcore_ptr[0] = lc;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 080e7aed181f..78ef53b29958 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -32,6 +32,7 @@
 #include <linux/uaccess.h>
 #include <asm/lowcore.h>
 #include <asm/switch_to.h>
+#include <asm/vdso.h>
 #include "entry.h"
 
 /*
@@ -171,7 +172,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
 	fpregs_load(&user_sregs.fpregs, &current->thread.fpu);
 
 	clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
-	clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
 	return 0;
 }
 
@@ -334,15 +334,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
 
 	/* Set up to return from userspace.  If provided, use a stub
 	   already in userspace.  */
-	if (ka->sa.sa_flags & SA_RESTORER) {
+	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ka->sa.sa_restorer;
-	} else {
-		/* Signal frame without vector registers are short ! */
-		__u16 __user *svc = (void __user *) frame + frame_size - 2;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long) svc;
-	}
+	else
+		restorer = VDSO64_SYMBOL(current, sigreturn);
 
 	/* Set up registers for signal handler */
 	regs->gprs[14] = restorer;
@@ -397,14 +392,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	/* Set up to return from userspace.  If provided, use a stub
 	   already in userspace.  */
-	if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+	if (ksig->ka.sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ksig->ka.sa.sa_restorer;
-	} else {
-		__u16 __user *svc = &frame->svc_insn;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long) svc;
-	}
+	else
+		restorer = VDSO64_SYMBOL(current, rt_sigreturn);
 
 	/* Create siginfo on the signal stack */
 	if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -501,7 +492,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
 		}
 		/* No longer in a system call */
 		clear_pt_regs_flag(regs, PIF_SYSCALL);
-		clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
+
 		rseq_signal_deliver(&ksig, regs);
 		if (is_compat_task())
 			handle_signal32(&ksig, oldset, regs);
@@ -517,14 +508,20 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
 		switch (regs->gprs[2]) {
 		case -ERESTART_RESTARTBLOCK:
 			/* Restart with sys_restart_syscall */
-			regs->int_code = __NR_restart_syscall;
-			fallthrough;
+			regs->gprs[2] = regs->orig_gpr2;
+			current->restart_block.arch_data = regs->psw.addr;
+			if (is_compat_task())
+				regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
+			else
+				regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
+			if (test_thread_flag(TIF_SINGLE_STEP))
+				clear_thread_flag(TIF_PER_TRAP);
+			break;
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
-			/* Restart system call with magic TIF bit. */
 			regs->gprs[2] = regs->orig_gpr2;
-			set_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
+			regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
 			if (test_thread_flag(TIF_SINGLE_STEP))
 				clear_thread_flag(TIF_PER_TRAP);
 			break;
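
The replacement for the old magic-TIF restart is visible in the -ERESTARTNOHAND/-ERESTARTSYS/-ERESTARTNOINTR branch: restore the syscall argument register and rewind the PSW so the very same svc instruction executes again on return to userspace. The instruction length comes from the upper halfword of int_code, hence the ">> 16". A simplified sketch of what __rewind_psw() does; the real helper also masks the result by the PSW addressing mode:

    /* Simplified model of __rewind_psw(): step back over the interrupted
     * instruction so it is re-executed; ilc is its length in bytes. */
    static unsigned long rewind_addr(unsigned long addr, unsigned long ilc)
    {
    	return addr - ilc;
    }
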
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index ff42d3aa0f00..8984711f72ed 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -210,6 +210,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 	lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
 	lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+	lc->preempt_count = PREEMPT_DISABLED;
 	if (nmi_alloc_per_cpu(lc))
 		goto out;
 	lowcore_ptr[cpu] = lc;
@@ -300,24 +301,28 @@ static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
 	pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
 }
 
+typedef void (pcpu_delegate_fn)(void *);
+
 /*
  * Call function via PSW restart on pcpu and stop the current cpu.
  */
-static void __pcpu_delegate(void (*func)(void*), void *data)
+static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
 {
 	func(data);	/* should not return */
 }
 
 static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
-						void (*func)(void *),
+						pcpu_delegate_fn *func,
 						void *data, unsigned long stack)
 {
 	struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
 	unsigned long source_cpu = stap();
 
 	__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
-	if (pcpu->address == source_cpu)
-		CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
+	if (pcpu->address == source_cpu) {
+		call_on_stack(2, stack, void, __pcpu_delegate,
+			      pcpu_delegate_fn *, func, void *, data);
+	}
 	/* Stop target cpu (if func returns this stops the current cpu). */
 	pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
 	/* Restart func on the target cpu and stop the current cpu. */
@@ -898,7 +903,7 @@ static void __no_sanitize_address smp_start_secondary(void *cpuvoid)
 	S390_lowcore.restart_source = -1UL;
 	__ctl_load(S390_lowcore.cregs_save_area, 0, 15);
 	__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
-	CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
+	call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack);
 }
 
 /* Upping and downing of CPUs */
diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c
index 76f7916cc30f..8fe2d23b64f4 100644
--- a/arch/s390/kernel/syscall.c
+++ b/arch/s390/kernel/syscall.c
@@ -108,7 +108,7 @@ SYSCALL_DEFINE0(ni_syscall)
 	return -ENOSYS;
 }
 
-void do_syscall(struct pt_regs *regs)
+static void do_syscall(struct pt_regs *regs)
 {
 	unsigned long nr;
 
@@ -121,6 +121,10 @@ void do_syscall(struct pt_regs *regs)
 
 	regs->gprs[2] = nr;
 
+	if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
+		regs->psw.addr = current->restart_block.arch_data;
+		current->restart_block.arch_data = 1;
+	}
 	nr = syscall_enter_from_user_mode_work(regs, nr);
 
 	/*
@@ -130,13 +134,16 @@ void do_syscall(struct pt_regs *regs)
 	 * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
 	 * and if set, the syscall will be skipped.
 	 */
-	if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) {
-		regs->gprs[2] = -ENOSYS;
-		if (likely(nr < NR_syscalls))
-			regs->gprs[2] = current->thread.sys_call_table[nr](regs);
-	} else {
-		clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET);
-	}
+
+	if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+		goto out;
+	regs->gprs[2] = -ENOSYS;
+	if (unlikely(nr >= NR_syscalls))
+		goto out;
+	do {
+		regs->gprs[2] = current->thread.sys_call_table[nr](regs);
+	} while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
+out:
 	syscall_exit_to_user_mode_work(regs);
 }
 
@@ -154,13 +161,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 	if (per_trap)
 		set_thread_flag(TIF_PER_TRAP);
 
-	for (;;) {
-		regs->flags = 0;
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		do_syscall(regs);
-		if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART))
-			break;
-		local_irq_enable();
-	}
+	regs->flags = 0;
+	set_pt_regs_flag(regs, PIF_SYSCALL);
+	do_syscall(regs);
 	exit_to_user_mode();
 }
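
Taken together with the copy_thread() and signal.c hunks, restart_block.arch_data forms a small state machine for sys_restart_syscall(): -ERESTART_RESTARTBLOCK saves the interrupted address and points the PSW at the vdso restart_syscall stub; when that stub re-enters the kernel, do_syscall() restores the saved address and resets the field. A hedged summary plus the sentinel test in isolation:

    /* Lifecycle of restart_block.arch_data in this patch (sketch):
     *   copy_thread():           arch_data = 1 (nothing stored)
     *   -ERESTART_RESTARTBLOCK:  arch_data = interrupted psw.addr,
     *                            psw.addr  = vdso restart_syscall stub
     *   do_syscall(restart):     psw.addr  = arch_data, arch_data = 1
     * Bit 0 doubles as the "empty" marker because s390 instructions are
     * halfword aligned, so real addresses are always even. */
    static inline int restart_addr_stored(unsigned long arch_data)
    {
    	return !(arch_data & 1);
    }
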
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 019c5748b607..76947275fe8b 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -277,6 +277,8 @@ static void __init test_monitor_call(void)
 {
 	int val = 1;
 
+	if (!IS_ENABLED(CONFIG_BUG))
+		return;
 	asm volatile(
 		"	mc	0,0\n"
 		"0:	xgr	%0,%0\n"
@@ -299,10 +301,9 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
 	unsigned long last_break = S390_lowcore.breaking_event_addr;
-	unsigned int trapnr, syscall_redirect = 0;
+	unsigned int trapnr;
 	irqentry_state_t state;
 
-	add_random_kstack_offset();
 	regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc;
 	regs->int_parm_long = S390_lowcore.trans_exc_code;
 
@@ -344,18 +345,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
 	trapnr = regs->int_code & PGM_INT_CODE_MASK;
 	if (trapnr)
 		pgm_check_table[trapnr](regs);
-	syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL);
 out:
 	local_irq_disable();
 	irqentry_exit(regs, state);
-
-	if (syscall_redirect) {
-		enter_from_user_mode(regs);
-		local_irq_enable();
-		regs->orig_gpr2 = regs->gprs[2];
-		do_syscall(regs);
-		exit_to_user_mode();
-	}
 }
 
 /*
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 6be2167943bb..aeb0a15bcbb7 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -358,6 +358,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
 static struct kobj_attribute uv_query_facilities_attr =
 	__ATTR(facilities, 0444, uv_query_facilities, NULL);
 
+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+					    struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+	__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
 static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *page)
 {
@@ -390,6 +399,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr =
 
 static struct attribute *uv_query_attrs[] = {
 	&uv_query_facilities_attr.attr,
+	&uv_query_feature_indications_attr.attr,
 	&uv_query_max_guest_cpus_attr.attr,
 	&uv_query_max_guest_vms_attr.attr,
 	&uv_query_max_guest_addr_attr.attr,
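
uv_query_feature_indications() exports the field captured in boot/uv.c earlier in this series. A small userspace sketch reading it back; the path assumes the attribute lands under the existing /sys/firmware/uv/query/ directory alongside facilities:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long feat;
    	FILE *f = fopen("/sys/firmware/uv/query/feature_indications", "r");

    	if (!f)
    		return 1;
    	if (fscanf(f, "%lx", &feat) == 1)
    		printf("UV feature indications: %#lx\n", feat);
    	fclose(f);
    	return 0;
    }
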
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 8c4e07d533c8..99694260cac9 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -20,7 +20,7 @@
 #include <asm/vdso.h>
 
 extern char vdso64_start[], vdso64_end[];
-static unsigned int vdso_pages;
+extern char vdso32_start[], vdso32_end[];
 
 static struct vm_special_mapping vvar_mapping;
 
@@ -37,18 +37,6 @@ enum vvar_pages {
 	VVAR_NR_PAGES,
 };
 
-unsigned int __read_mostly vdso_enabled = 1;
-
-static int __init vdso_setup(char *str)
-{
-	bool enabled;
-
-	if (!kstrtobool(str, &enabled))
-		vdso_enabled = enabled;
-	return 1;
-}
-__setup("vdso=", vdso_setup);
-
 #ifdef CONFIG_TIME_NS
 struct vdso_data *arch_get_vdso_data(void *vvar_page)
 {
@@ -155,7 +143,12 @@ static struct vm_special_mapping vvar_mapping = {
 	.fault = vvar_fault,
 };
 
-static struct vm_special_mapping vdso_mapping = {
+static struct vm_special_mapping vdso64_mapping = {
+	.name = "[vdso]",
+	.mremap = vdso_mremap,
+};
+
+static struct vm_special_mapping vdso32_mapping = {
 	.name = "[vdso]",
 	.mremap = vdso_mremap,
 };
@@ -171,16 +164,22 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
 	unsigned long vdso_text_len, vdso_mapping_len;
 	unsigned long vvar_start, vdso_text_start;
+	struct vm_special_mapping *vdso_mapping;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int rc;
 
 	BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
-	if (!vdso_enabled || is_compat_task())
-		return 0;
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
-	vdso_text_len = vdso_pages << PAGE_SHIFT;
+
+	if (is_compat_task()) {
+		vdso_text_len = vdso32_end - vdso32_start;
+		vdso_mapping = &vdso32_mapping;
+	} else {
+		vdso_text_len = vdso64_end - vdso64_start;
+		vdso_mapping = &vdso64_mapping;
+	}
 	vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE;
 	vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
 	rc = vvar_start;
@@ -198,7 +197,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
 				       VM_READ|VM_EXEC|
 				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-				       &vdso_mapping);
+				       vdso_mapping);
 	if (IS_ERR(vma)) {
 		do_munmap(mm, vvar_start, PAGE_SIZE, NULL);
 		rc = PTR_ERR(vma);
@@ -211,21 +210,25 @@ out:
 	return rc;
 }
 
-static int __init vdso_init(void)
+static struct page ** __init vdso_setup_pages(void *start, void *end)
 {
-	struct page **pages;
+	int pages = (end - start) >> PAGE_SHIFT;
+	struct page **pagelist;
 	int i;
 
-	vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT;
-	pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		vdso_enabled = 0;
-		return -ENOMEM;
-	}
-	for (i = 0; i < vdso_pages; i++)
-		pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE);
-	pages[vdso_pages] = NULL;
-	vdso_mapping.pages = pages;
+	pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+	if (!pagelist)
+		panic("%s: Cannot allocate page list for VDSO", __func__);
+	for (i = 0; i < pages; i++)
+		pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
+	return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+	vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
+	if (IS_ENABLED(CONFIG_COMPAT))
+		vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
 	return 0;
 }
 arch_initcall(vdso_init);
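
With both mappings installed, the VDSO64_SYMBOL()/VDSO32_SYMBOL() lookups used in signal.c can turn a stub name into a user address: the per-mm vdso base plus an offset generated at build time (see the vdso Makefile hunks below). A sketch of the presumed shape of the 64-bit macro, an assumption based on the generated header names rather than the actual asm/vdso.h text:

    /* Assumed definition sketch: runtime stub address = mapping base +
     * link-time offset from include/generated/vdso64-offsets.h. */
    #define VDSO64_SYMBOL(tsk, name) \
    	((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
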
diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore
new file mode 100644
index 000000000000..5167384843b9
--- /dev/null
+++ b/arch/s390/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
new file mode 100644
index 000000000000..b2349a3f4fa3
--- /dev/null
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso
+
+KCOV_INSTRUMENT := n
+ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
+ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+
+include $(srctree)/lib/vdso/Makefile
+obj-vdso32 = vdso_user_wrapper-32.o note-32.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+KBUILD_AFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m31 -s
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
+
+LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \
+	--hash-style=both --build-id=sha1 -melf_s390 -T
+
+$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+
+obj-y += vdso32_wrapper.o
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling, ubsan and kasan for VDSO code
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+	$(call if_changed,ld)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+$(obj-vdso32): %-32.o: %.S FORCE
+	$(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32as = VDSO32A $@
+      cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+      cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso_install: vdso32.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+	cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+	$(call if_changed,vdsosym)
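
The vdsosym rule feeds nm output for the unstripped vdso through gen_vdso_offsets.sh, producing one #define per exported __kernel_compat_* symbol. Illustratively, the generated header would contain lines like the following; the offsets here are made up, the real values depend on the link:

    /* include/generated/vdso32-offsets.h -- illustrative values only */
    #define vdso32_offset_restart_syscall	0x420
    #define vdso32_offset_rt_sigreturn	0x428
    #define vdso32_offset_sigreturn		0x430
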
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..9c4f951e227d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_compat_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 000000000000..db19d0680a0a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..bff50b6acd6d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 31-bit compat vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+	PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+	. = VDSO_LBASE + SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+
+	. = ALIGN(16);
+	.text		: {
+		*(.text .stub .text.* .gnu.linkonce.t.*)
+	} :text
+	PROVIDE(__etext = .);
+	PROVIDE(_etext = .);
+	PROVIDE(etext = .);
+
+	/*
+	 * Other stuff is appended to the text segment:
+	 */
+	.rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+	.rodata1	: { *(.rodata1) }
+
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+	.gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+	.rela.dyn ALIGN(8) : { *(.rela.dyn) }
+	.got ALIGN(8)	: { *(.got .toc) }
+
+	_end = .;
+	PROVIDE(end = .);
+
+	/*
+	 * Stabs debugging sections are here too.
+	 */
+	.stab	       0 : { *(.stab) }
+	.stabstr       0 : { *(.stabstr) }
+	.stab.excl     0 : { *(.stab.excl) }
+	.stab.exclstr  0 : { *(.stab.exclstr) }
+	.stab.index    0 : { *(.stab.index) }
+	.stab.indexstr 0 : { *(.stab.indexstr) }
+	.comment       0 : { *(.comment) }
+
+	/*
+	 * DWARF debug sections.
+	 * Symbols in the DWARF debugging sections are relative to the
+	 * beginning of the section so we begin them at 0.
+	 */
+	/* DWARF 1 */
+	.debug		0 : { *(.debug) }
+	.line		0 : { *(.line) }
+	/* GNU DWARF 1 extensions */
+	.debug_srcinfo	0 : { *(.debug_srcinfo) }
+	.debug_sfnames	0 : { *(.debug_sfnames) }
+	/* DWARF 1.1 and DWARF 2 */
+	.debug_aranges	0 : { *(.debug_aranges) }
+	.debug_pubnames 0 : { *(.debug_pubnames) }
+	/* DWARF 2 */
+	.debug_info	0 : { *(.debug_info .gnu.linkonce.wi.*) }
+	.debug_abbrev	0 : { *(.debug_abbrev) }
+	.debug_line	0 : { *(.debug_line) }
+	.debug_frame	0 : { *(.debug_frame) }
+	.debug_str	0 : { *(.debug_str) }
+	.debug_loc	0 : { *(.debug_loc) }
+	.debug_macinfo	0 : { *(.debug_macinfo) }
+	/* SGI/MIPS DWARF 2 extensions */
+	.debug_weaknames 0 : { *(.debug_weaknames) }
+	.debug_funcnames 0 : { *(.debug_funcnames) }
+	.debug_typenames 0 : { *(.debug_typenames) }
+	.debug_varnames  0 : { *(.debug_varnames) }
+	/* DWARF 3 */
+	.debug_pubtypes 0 : { *(.debug_pubtypes) }
+	.debug_ranges	0 : { *(.debug_ranges) }
+	.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+
+	/DISCARD/	: {
+		*(.note.GNU-stack)
+		*(.branch_lt)
+		*(.data .data.* .gnu.linkonce.d.* .sdata*)
+		*(.bss .sbss .dynbss .dynsbss)
+	}
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME	0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD FILEHDR PHDRS FLAGS(5);	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC FLAGS(4);		/* PF_R */
+	note		PT_NOTE FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		/*
+		 * Has to be there for the kernel to find
+		 */
+		__kernel_compat_restart_syscall;
+		__kernel_compat_rt_sigreturn;
+		__kernel_compat_sigreturn;
+	local: *;
+	};
+}
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 000000000000..de2fb930471a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.globl vdso32_start, vdso32_end
+	.balign PAGE_SIZE
+vdso32_start:
+	.incbin "arch/s390/kernel/vdso32/vdso32.so"
+	.balign PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
new file mode 100644
index 000000000000..3f42f27f978c
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/unistd.h>
+#include <asm/dwarf.h>
+
+.macro vdso_syscall func,syscall
+	.globl __kernel_compat_\func
+	.type  __kernel_compat_\func,@function
+	.align 8
+__kernel_compat_\func:
+	CFI_STARTPROC
+	svc	\syscall
+	/* Make sure we notice when a syscall returns, which shouldn't happen */
+	.word	0
+	CFI_ENDPROC
+	.size	__kernel_compat_\func,.-__kernel_compat_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index a6e0fb6b91d6..2a2092ce19f1 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -74,3 +74,11 @@ vdso64.so: $(obj)/vdso64.so.dbg
 	$(call cmd,vdso_install)
 
 vdso_install: vdso64.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+	cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
+	$(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..37f05cb38dad
--- /dev/null
+++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 518f1ea405f4..d4fb336d747b 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -17,7 +17,7 @@ SECTIONS
 #ifdef CONFIG_TIME_NS
 	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
 #endif
-	. = VDSO64_LBASE + SIZEOF_HEADERS;
+	. = VDSO_LBASE + SIZEOF_HEADERS;
 
 	.hash		: { *(.hash) }			:text
 	.gnu.hash	: { *(.gnu.hash) }
@@ -137,6 +137,9 @@ VERSION
 		__kernel_clock_gettime;
 		__kernel_clock_getres;
 		__kernel_getcpu;
+		__kernel_restart_syscall;
+		__kernel_rt_sigreturn;
+		__kernel_sigreturn;
 	local: *;
 	};
 }
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
index f773505c7e63..97f0c0a669a5 100644
--- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -37,3 +37,20 @@ vdso_func gettimeofday
 vdso_func clock_getres
 vdso_func clock_gettime
 vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+	.globl __kernel_\func
+	.type  __kernel_\func,@function
+	.align 8
+__kernel_\func:
+	CFI_STARTPROC
+	svc	\syscall
+	/* Make sure we notice when a syscall returns, which shouldn't happen */
+	.word	0
+	CFI_ENDPROC
+	.size	__kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
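
Each vdso_syscall stub is just an svc followed by a guard word; if the syscall ever returned, the zero halfword would raise an operation exception instead of running off into arbitrary bytes. A rough C analogue for intuition only; the kernel needs the real stubs to be these exact instructions at known offsets:

    #include <unistd.h>
    #include <sys/syscall.h>

    /* Behaves like __kernel_sigreturn: enter the kernel, trap on return. */
    static _Noreturn void sigreturn_stub(void)
    {
    	syscall(__NR_sigreturn);	/* does not return on success */
    	__builtin_trap();		/* mirrors the ".word 0" guard */
    }
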
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index ec5b76bde4d8..cfcdf76d6a95 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -162,7 +162,7 @@ char *strcat(char *dest, const char *src)
 		"	jo	0b\n"
 		"1:	mvst	%[dummy],%[src]\n"
 		"	jo	1b\n"
-		: [dummy] "=&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
+		: [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
 		:
 		: "cc", "memory", "0");
 	return ret;
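
The strcat() change turns the dummy operand from write-only ("=&a") into read-write ("+&a"): the asm seeds the mvst destination from dummy's initial value, so declaring it write-only allowed the compiler to treat the initialization as dead. A minimal generic demonstration of the two constraint flavors, using plain "r" instead of the s390 address-register constraint "a":

    #include <stdio.h>

    int main(void)
    {
    	unsigned long dummy = 42;

    	/* "+&r": the asm both reads and writes dummy, so the compiler
    	 * must materialize the initial 42. With "=&r" (write-only) the
    	 * initialization could legally be optimized away. */
    	asm volatile("" : "+&r" (dummy));
    	printf("%lu\n", dummy);
    	return 0;
    }
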
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index 2f32802f79ce..ecf327d743a0 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -120,7 +120,7 @@ static struct unwindme *unwindme;
 #define UWM_REGS		0x2	/* Pass regs to test_unwind(). */
 #define UWM_SP			0x4	/* Pass sp to test_unwind(). */
 #define UWM_CALLER		0x8	/* Unwind starting from caller. */
-#define UWM_SWITCH_STACK	0x10	/* Use CALL_ON_STACK. */
+#define UWM_SWITCH_STACK	0x10	/* Use call_on_stack. */
 #define UWM_IRQ			0x20	/* Unwind from irq context. */
 #define UWM_PGM			0x40	/* Unwind from program check handler. */
 
@@ -211,7 +211,8 @@ static noinline int unwindme_func2(struct unwindme *u)
 	if (u->flags & UWM_SWITCH_STACK) {
 		local_irq_save(flags);
 		local_mcck_disable();
-		rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u);
+		rc = call_on_stack(1, S390_lowcore.nodat_stack,
+				   int, unwindme_func3, struct unwindme *, u);
 		local_mcck_enable();
 		local_irq_restore(flags);
 		return rc;
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 67606d932825..7ec8b1fa0f08 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -224,7 +224,7 @@ static inline unsigned long copy_in_user_mvcos(void __user *to, const void __use
 		EX_TABLE(0b,3b)
 		: "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
 		: [spec] "d" (0x810081UL)
-		: "cc", "memory");
+		: "cc", "memory", "0");
 	return size;
 }
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 8ae3dc5783fd..e33c43b38afe 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -285,26 +285,6 @@ static noinline void do_sigbus(struct pt_regs *regs)
 			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
-static noinline int signal_return(struct pt_regs *regs)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-	if (rc)
-		return rc;
-	if (instruction == 0x0a77) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x00040077;
-		return 0;
-	} else if (instruction == 0x0aad) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x000400ad;
-		return 0;
-	}
-	return -EACCES;
-}
-
 static noinline void do_fault_error(struct pt_regs *regs, int access,
 					vm_fault_t fault)
 {
@@ -312,9 +292,6 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
 
 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-		if (access == VM_EXEC && signal_return(regs) == 0)
-			break;
-		fallthrough;
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (user_mode(regs)) {
@@ -792,6 +769,32 @@ void do_secure_storage_access(struct pt_regs *regs)
 	struct page *page;
 	int rc;
 
+	/*
+	 * Bit 61 tells us whether the address is valid; if it is not,
+	 * we have a major problem and must either stop the kernel or
+	 * send a SIGSEGV to the process. Unfortunately bit 61 is not
+	 * reliable without the misc UV feature, so we need to check
+	 * for that as well.
+	 */
+	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+	    !test_bit_inv(61, &regs->int_parm_long)) {
+		/*
+		 * When this happens, userspace did something that it
+		 * was not supposed to do, e.g. branching into secure
+		 * memory. Trigger a segmentation fault.
+		 */
+		if (user_mode(regs)) {
+			send_sig(SIGSEGV, current, 0);
+			return;
+		}
+
+		/*
+		 * The kernel should never run into this case and we
+		 * have no way out of this situation.
+		 */
+		panic("Unexpected PGM 0x3d with TEID bit 61=0");
+	}
+
 	switch (get_fault_type(regs)) {
 	case USER_FAULT:
 		mm = current->mm;
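
Note that test_bit_inv() uses s390's MSB-first bit numbering, so "bit 61" of the 64-bit TEID word corresponds to mask 1UL << (63 - 61) = 0x4. A small illustrative helper, not the kernel's implementation:

    #include <stdbool.h>

    /* MSB-first ("inverted") bit test: bit 0 is the most significant bit
     * of the 64-bit word. */
    static inline bool test_bit_inv64(unsigned int nr, unsigned long word)
    {
    	return (word >> (63 - nr)) & 1;		/* nr = 61 -> mask 0x4 */
    }
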
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 1f1f906344ff..a0f54bd5e98a 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -125,12 +125,18 @@ static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
  */
 int memcpy_real(void *dest, void *src, size_t count)
 {
+	unsigned long _dest  = (unsigned long)dest;
+	unsigned long _src   = (unsigned long)src;
+	unsigned long _count = (unsigned long)count;
 	int rc;
 
 	if (S390_lowcore.nodat_stack != 0) {
 		preempt_disable();
-		rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
-				   dest, src, count);
+		rc = call_on_stack(3, S390_lowcore.nodat_stack,
+				   unsigned long, _memcpy_real,
+				   unsigned long, _dest,
+				   unsigned long, _src,
+				   unsigned long, _count);
 		preempt_enable();
 		return rc;
 	}
@@ -139,8 +145,7 @@ int memcpy_real(void *dest, void *src, size_t count)
 	 * not set up yet. Just call _memcpy_real on the early boot
 	 * stack
 	 */
-	return _memcpy_real((unsigned long) dest,(unsigned long) src,
-			    (unsigned long) count);
+	return _memcpy_real(_dest, _src, _count);
 }
 
 /*