diff options
-rw-r--r-- | 0000_README | 4 | ||||
-rw-r--r-- | 1199_linux-4.19.200.patch | 893 |
2 files changed, 897 insertions, 0 deletions
diff --git a/0000_README b/0000_README index f1619e02..58e78594 100644 --- a/0000_README +++ b/0000_README @@ -835,6 +835,10 @@ Patch: 1198_linux-4.19.199.patch From: https://www.kernel.org Desc: Linux 4.19.199 +Patch: 1199_linux-4.19.200.patch +From: https://www.kernel.org +Desc: Linux 4.19.200 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1199_linux-4.19.200.patch b/1199_linux-4.19.200.patch new file mode 100644 index 00000000..da6c8c42 --- /dev/null +++ b/1199_linux-4.19.200.patch @@ -0,0 +1,893 @@ +diff --git a/Makefile b/Makefile +index f3ad63a089a18..a4ea351c4e5d6 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 19 +-SUBLEVEL = 199 ++SUBLEVEL = 200 + EXTRAVERSION = + NAME = "People's Front" + +diff --git a/arch/arm/boot/dts/versatile-ab.dts b/arch/arm/boot/dts/versatile-ab.dts +index 6f4f60ba5429c..990b7ef1800e4 100644 +--- a/arch/arm/boot/dts/versatile-ab.dts ++++ b/arch/arm/boot/dts/versatile-ab.dts +@@ -192,16 +192,15 @@ + #size-cells = <1>; + ranges; + +- vic: intc@10140000 { ++ vic: interrupt-controller@10140000 { + compatible = "arm,versatile-vic"; + interrupt-controller; + #interrupt-cells = <1>; + reg = <0x10140000 0x1000>; +- clear-mask = <0xffffffff>; + valid-mask = <0xffffffff>; + }; + +- sic: intc@10003000 { ++ sic: interrupt-controller@10003000 { + compatible = "arm,versatile-sic"; + interrupt-controller; + #interrupt-cells = <1>; +diff --git a/arch/arm/boot/dts/versatile-pb.dts b/arch/arm/boot/dts/versatile-pb.dts +index 06a0fdf24026c..e7e751a858d81 100644 +--- a/arch/arm/boot/dts/versatile-pb.dts ++++ b/arch/arm/boot/dts/versatile-pb.dts +@@ -7,7 +7,7 @@ + + amba { + /* The Versatile PB is using more SIC IRQ lines than the AB */ +- sic: intc@10003000 { ++ sic: interrupt-controller@10003000 { + clear-mask = <0xffffffff>; + /* + * Valid interrupt lines mask according to +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 43fb4e296d8de..9cfc669b4a243 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -416,8 +416,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { + queue: +- if (has_error && !is_protmode(vcpu)) +- has_error = false; + if (reinject) { + /* + * On vmentry, vcpu->arch.exception.pending is only +@@ -7114,6 +7112,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + } + ++static void kvm_inject_exception(struct kvm_vcpu *vcpu) ++{ ++ if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) ++ vcpu->arch.exception.error_code = false; ++ kvm_x86_ops->queue_exception(vcpu); ++} ++ + static int inject_pending_event(struct kvm_vcpu *vcpu) + { + int r; +@@ -7121,7 +7126,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) + /* try to reinject previous events if any */ + + if (vcpu->arch.exception.injected) +- kvm_x86_ops->queue_exception(vcpu); ++ kvm_inject_exception(vcpu); + /* + * Do not inject an NMI or interrupt if there is a pending + * exception. Exceptions and interrupts are recognized at +@@ -7175,7 +7180,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) + kvm_update_dr7(vcpu); + } + +- kvm_x86_ops->queue_exception(vcpu); ++ kvm_inject_exception(vcpu); + } + + /* Don't consider new event if we re-injected an event */ +diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c +index effc4c17e0fb9..af5139eb96b5d 100644 +--- a/drivers/firmware/arm_scmi/driver.c ++++ b/drivers/firmware/arm_scmi/driver.c +@@ -48,7 +48,6 @@ enum scmi_error_codes { + SCMI_ERR_GENERIC = -8, /* Generic Error */ + SCMI_ERR_HARDWARE = -9, /* Hardware Error */ + SCMI_ERR_PROTOCOL = -10,/* Protocol Error */ +- SCMI_ERR_MAX + }; + + /* List of all SCMI devices active in system */ +@@ -168,8 +167,10 @@ static const int scmi_linux_errmap[] = { + + static inline int scmi_to_linux_errno(int errno) + { +- if (errno < SCMI_SUCCESS && errno > SCMI_ERR_MAX) +- return scmi_linux_errmap[-errno]; ++ int err_idx = -errno; ++ ++ if (err_idx >= SCMI_SUCCESS && err_idx < ARRAY_SIZE(scmi_linux_errmap)) ++ return scmi_linux_errmap[err_idx]; + return -EIO; + } + +@@ -628,8 +629,9 @@ static int scmi_xfer_info_init(struct scmi_info *sinfo) + struct scmi_xfers_info *info = &sinfo->minfo; + + /* Pre-allocated messages, no more than what hdr.seq can support */ +- if (WARN_ON(desc->max_msg >= MSG_TOKEN_MAX)) { +- dev_err(dev, "Maximum message of %d exceeds supported %ld\n", ++ if (WARN_ON(!desc->max_msg || desc->max_msg > MSG_TOKEN_MAX)) { ++ dev_err(dev, ++ "Invalid maximum messages %d, not in range [1 - %lu]\n", + desc->max_msg, MSG_TOKEN_MAX); + return -EINVAL; + } +diff --git a/drivers/iio/dac/ds4424.c b/drivers/iio/dac/ds4424.c +index 714a97f913199..ae9be792693bf 100644 +--- a/drivers/iio/dac/ds4424.c ++++ b/drivers/iio/dac/ds4424.c +@@ -236,12 +236,6 @@ static int ds4424_probe(struct i2c_client *client, + indio_dev->dev.of_node = client->dev.of_node; + indio_dev->dev.parent = &client->dev; + +- if (!client->dev.of_node) { +- dev_err(&client->dev, +- "Not found DT.\n"); +- return -ENODEV; +- } +- + data->vcc_reg = devm_regulator_get(&client->dev, "vcc"); + if (IS_ERR(data->vcc_reg)) { + dev_err(&client->dev, +diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c +index 5a14f518cd979..61955a7c838b4 100644 +--- a/fs/cifs/smb2ops.c ++++ b/fs/cifs/smb2ops.c +@@ -386,8 +386,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, + p = buf; + while (bytes_left >= sizeof(*p)) { + info->speed = le64_to_cpu(p->LinkSpeed); +- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE); +- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE); ++ info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; ++ info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); +diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c +index 4af318fbda774..ef9498a6e88ac 100644 +--- a/fs/hfs/bfind.c ++++ b/fs/hfs/bfind.c +@@ -25,7 +25,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) + fd->key = ptr + tree->max_key_len + 2; + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); +- mutex_lock(&tree->tree_lock); ++ switch (tree->cnid) { ++ case HFS_CAT_CNID: ++ mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); ++ break; ++ case HFS_EXT_CNID: ++ mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); ++ break; ++ case HFS_ATTR_CNID: ++ mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); ++ break; ++ default: ++ return -EINVAL; ++ } + return 0; + } + +diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c +index b63a4df7327b6..c0a73a6ffb28b 100644 +--- a/fs/hfs/bnode.c ++++ b/fs/hfs/bnode.c +@@ -15,16 +15,31 @@ + + #include "btree.h" + +-void hfs_bnode_read(struct hfs_bnode *node, void *buf, +- int off, int len) ++void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) + { + struct page *page; ++ int pagenum; ++ int bytes_read; ++ int bytes_to_read; ++ void *vaddr; + + off += node->page_offset; +- page = node->page[0]; ++ pagenum = off >> PAGE_SHIFT; ++ off &= ~PAGE_MASK; /* compute page offset for the first page */ + +- memcpy(buf, kmap(page) + off, len); +- kunmap(page); ++ for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) { ++ if (pagenum >= node->tree->pages_per_bnode) ++ break; ++ page = node->page[pagenum]; ++ bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); ++ ++ vaddr = kmap_atomic(page); ++ memcpy(buf + bytes_read, vaddr + off, bytes_to_read); ++ kunmap_atomic(vaddr); ++ ++ pagenum++; ++ off = 0; /* page offset only applies to the first page */ ++ } + } + + u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h +index dcc2aab1b2c43..25ac9a8bb57a7 100644 +--- a/fs/hfs/btree.h ++++ b/fs/hfs/btree.h +@@ -13,6 +13,13 @@ typedef int (*btree_keycmp)(const btree_key *, const btree_key *); + + #define NODE_HASH_SIZE 256 + ++/* B-tree mutex nested subclasses */ ++enum hfs_btree_mutex_classes { ++ CATALOG_BTREE_MUTEX, ++ EXTENTS_BTREE_MUTEX, ++ ATTR_BTREE_MUTEX, ++}; ++ + /* A HFS BTree held in memory */ + struct hfs_btree { + struct super_block *sb; +diff --git a/fs/hfs/super.c b/fs/hfs/super.c +index 173876782f73f..77b6f35a4aa93 100644 +--- a/fs/hfs/super.c ++++ b/fs/hfs/super.c +@@ -427,14 +427,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; +- goto bail; ++ goto bail_hfs_find; + } + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } +- if (res) { +- hfs_find_exit(&fd); +- goto bail_no_root; +- } ++ if (res) ++ goto bail_hfs_find; + res = -EINVAL; + root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); +@@ -450,6 +448,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) + /* everything's okay */ + return 0; + ++bail_hfs_find: ++ hfs_find_exit(&fd); + bail_no_root: + pr_err("get root inode failed\n"); + bail: +diff --git a/include/net/af_unix.h b/include/net/af_unix.h +index a5ba41b3b8673..7ec1cdb66be8d 100644 +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -10,6 +10,7 @@ + + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); ++void unix_destruct_scm(struct sk_buff *skb); + void unix_gc(void); + void wait_for_unix_gc(void); + struct sock *unix_get_socket(struct file *filp); +diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h +index cf8f792743ec2..c76a5e9894dac 100644 +--- a/include/net/busy_poll.h ++++ b/include/net/busy_poll.h +@@ -48,7 +48,7 @@ static inline bool net_busy_loop_on(void) + + static inline bool sk_can_busy_loop(const struct sock *sk) + { +- return sk->sk_ll_usec && !signal_pending(current); ++ return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); + } + + bool sk_busy_loop_end(void *p, unsigned long start_time); +diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h +index 48d74674d5e95..bc22e44ffcdf7 100644 +--- a/include/net/sctp/constants.h ++++ b/include/net/sctp/constants.h +@@ -348,8 +348,7 @@ enum { + #define SCTP_SCOPE_POLICY_MAX SCTP_SCOPE_POLICY_LINK + + /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>, +- * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24, +- * 192.88.99.0/24. ++ * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24. + * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP + * addresses. + */ +@@ -357,7 +356,6 @@ enum { + ((htonl(INADDR_BROADCAST) == a) || \ + ipv4_is_multicast(a) || \ + ipv4_is_zeronet(a) || \ +- ipv4_is_test_198(a) || \ + ipv4_is_anycast_6to4(a)) + + /* Flags used for the bind address copy functions. */ +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index f278e2f584fd2..1573d1bf63007 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3498,15 +3498,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work) + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; +- bool is_last; ++ bool is_last = false; + +- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) +- return; ++ /* ++ * when @pwq is not linked, it doesn't hold any reference to the ++ * @wq, and @wq is invalid to access. ++ */ ++ if (!list_empty(&pwq->pwqs_node)) { ++ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) ++ return; + +- mutex_lock(&wq->mutex); +- list_del_rcu(&pwq->pwqs_node); +- is_last = list_empty(&wq->pwqs); +- mutex_unlock(&wq->mutex); ++ mutex_lock(&wq->mutex); ++ list_del_rcu(&pwq->pwqs_node); ++ is_last = list_empty(&wq->pwqs); ++ mutex_unlock(&wq->mutex); ++ } + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); +diff --git a/net/802/garp.c b/net/802/garp.c +index 7f50d47470bd4..8e19f51833d6f 100644 +--- a/net/802/garp.c ++++ b/net/802/garp.c +@@ -206,6 +206,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr + kfree(attr); + } + ++static void garp_attr_destroy_all(struct garp_applicant *app) ++{ ++ struct rb_node *node, *next; ++ struct garp_attr *attr; ++ ++ for (node = rb_first(&app->gid); ++ next = node ? rb_next(node) : NULL, node != NULL; ++ node = next) { ++ attr = rb_entry(node, struct garp_attr, node); ++ garp_attr_destroy(app, attr); ++ } ++} ++ + static int garp_pdu_init(struct garp_applicant *app) + { + struct sk_buff *skb; +@@ -612,6 +625,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl + + spin_lock_bh(&app->lock); + garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); ++ garp_attr_destroy_all(app); + garp_pdu_queue(app); + spin_unlock_bh(&app->lock); + +diff --git a/net/802/mrp.c b/net/802/mrp.c +index a808dd5bbb27a..32f87d458f054 100644 +--- a/net/802/mrp.c ++++ b/net/802/mrp.c +@@ -295,6 +295,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) + kfree(attr); + } + ++static void mrp_attr_destroy_all(struct mrp_applicant *app) ++{ ++ struct rb_node *node, *next; ++ struct mrp_attr *attr; ++ ++ for (node = rb_first(&app->mad); ++ next = node ? rb_next(node) : NULL, node != NULL; ++ node = next) { ++ attr = rb_entry(node, struct mrp_attr, node); ++ mrp_attr_destroy(app, attr); ++ } ++} ++ + static int mrp_pdu_init(struct mrp_applicant *app) + { + struct sk_buff *skb; +@@ -898,6 +911,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) + + spin_lock_bh(&app->lock); + mrp_mad_event(app, MRP_EVENT_TX); ++ mrp_attr_destroy_all(app); + mrp_pdu_queue(app); + spin_unlock_bh(&app->lock); + +diff --git a/net/Makefile b/net/Makefile +index bdaf53925acd5..449fc0b221f83 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -18,7 +18,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ + obj-$(CONFIG_INET) += ipv4/ + obj-$(CONFIG_TLS) += tls/ + obj-$(CONFIG_XFRM) += xfrm/ +-obj-$(CONFIG_UNIX) += unix/ ++obj-$(CONFIG_UNIX_SCM) += unix/ + obj-$(CONFIG_NET) += ipv6/ + obj-$(CONFIG_BPFILTER) += bpfilter/ + obj-$(CONFIG_PACKET) += packet/ +diff --git a/net/core/sock.c b/net/core/sock.c +index e6cbe137cb6fc..956af38aa0d6e 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -989,7 +989,7 @@ set_rcvbuf: + if (val < 0) + ret = -EINVAL; + else +- sk->sk_ll_usec = val; ++ WRITE_ONCE(sk->sk_ll_usec, val); + } + break; + #endif +diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c +index dd51256582556..7207a9769f1a9 100644 +--- a/net/sctp/protocol.c ++++ b/net/sctp/protocol.c +@@ -412,7 +412,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) + retval = SCTP_SCOPE_LINK; + } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || + ipv4_is_private_172(addr->v4.sin_addr.s_addr) || +- ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { ++ ipv4_is_private_192(addr->v4.sin_addr.s_addr) || ++ ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_PRIVATE; + } else { + retval = SCTP_SCOPE_GLOBAL; +diff --git a/net/unix/Kconfig b/net/unix/Kconfig +index 8b31ab85d050f..3b9e450656a4d 100644 +--- a/net/unix/Kconfig ++++ b/net/unix/Kconfig +@@ -19,6 +19,11 @@ config UNIX + + Say Y unless you know what you are doing. + ++config UNIX_SCM ++ bool ++ depends on UNIX ++ default y ++ + config UNIX_DIAG + tristate "UNIX: socket monitoring interface" + depends on UNIX +diff --git a/net/unix/Makefile b/net/unix/Makefile +index ffd0a275c3a79..54e58cc4f9450 100644 +--- a/net/unix/Makefile ++++ b/net/unix/Makefile +@@ -10,3 +10,5 @@ unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o + + obj-$(CONFIG_UNIX_DIAG) += unix_diag.o + unix_diag-y := diag.o ++ ++obj-$(CONFIG_UNIX_SCM) += scm.o +diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c +index 53fe5ada5a83a..98c253afa0db2 100644 +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -119,6 +119,8 @@ + #include <linux/freezer.h> + #include <linux/file.h> + ++#include "scm.h" ++ + struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; + EXPORT_SYMBOL_GPL(unix_socket_table); + DEFINE_SPINLOCK(unix_table_lock); +@@ -1515,65 +1517,51 @@ out: + return err; + } + +-static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +-{ +- int i; +- +- scm->fp = UNIXCB(skb).fp; +- UNIXCB(skb).fp = NULL; +- +- for (i = scm->fp->count-1; i >= 0; i--) +- unix_notinflight(scm->fp->user, scm->fp->fp[i]); +-} +- +-static void unix_destruct_scm(struct sk_buff *skb) +-{ +- struct scm_cookie scm; +- memset(&scm, 0, sizeof(scm)); +- scm.pid = UNIXCB(skb).pid; +- if (UNIXCB(skb).fp) +- unix_detach_fds(&scm, skb); +- +- /* Alas, it calls VFS */ +- /* So fscking what? fput() had been SMP-safe since the last Summer */ +- scm_destroy(&scm); +- sock_wfree(skb); +-} +- +-/* +- * The "user->unix_inflight" variable is protected by the garbage +- * collection lock, and we just read it locklessly here. If you go +- * over the limit, there might be a tiny race in actually noticing +- * it across threads. Tough. +- */ +-static inline bool too_many_unix_fds(struct task_struct *p) +-{ +- struct user_struct *user = current_user(); +- +- if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) +- return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); +- return false; +-} +- +-static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) + { +- int i; +- +- if (too_many_unix_fds(current)) +- return -ETOOMANYREFS; ++ scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* +- * Need to duplicate file references for the sake of garbage +- * collection. Otherwise a socket in the fps might become a +- * candidate for GC while the skb is not yet queued. ++ * Garbage collection of unix sockets starts by selecting a set of ++ * candidate sockets which have reference only from being in flight ++ * (total_refs == inflight_refs). This condition is checked once during ++ * the candidate collection phase, and candidates are marked as such, so ++ * that non-candidates can later be ignored. While inflight_refs is ++ * protected by unix_gc_lock, total_refs (file count) is not, hence this ++ * is an instantaneous decision. ++ * ++ * Once a candidate, however, the socket must not be reinstalled into a ++ * file descriptor while the garbage collection is in progress. ++ * ++ * If the above conditions are met, then the directed graph of ++ * candidates (*) does not change while unix_gc_lock is held. ++ * ++ * Any operations that changes the file count through file descriptors ++ * (dup, close, sendmsg) does not change the graph since candidates are ++ * not installed in fds. ++ * ++ * Dequeing a candidate via recvmsg would install it into an fd, but ++ * that takes unix_gc_lock to decrement the inflight count, so it's ++ * serialized with garbage collection. ++ * ++ * MSG_PEEK is special in that it does not change the inflight count, ++ * yet does install the socket into an fd. The following lock/unlock ++ * pair is to ensure serialization with garbage collection. It must be ++ * done between incrementing the file count and installing the file into ++ * an fd. ++ * ++ * If garbage collection starts after the barrier provided by the ++ * lock/unlock, then it will see the elevated refcount and not mark this ++ * as a candidate. If a garbage collection is already in progress ++ * before the file count was incremented, then the lock/unlock pair will ++ * ensure that garbage collection is finished before progressing to ++ * installing the fd. ++ * ++ * (*) A -> B where B is on the queue of A or B is on the queue of C ++ * which is on the queue of listening socket A. + */ +- UNIXCB(skb).fp = scm_fp_dup(scm->fp); +- if (!UNIXCB(skb).fp) +- return -ENOMEM; +- +- for (i = scm->fp->count - 1; i >= 0; i--) +- unix_inflight(scm->fp->user, scm->fp->fp[i]); +- return 0; ++ spin_lock(&unix_gc_lock); ++ spin_unlock(&unix_gc_lock); + } + + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) +@@ -2201,7 +2189,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + sk_peek_offset_fwd(sk, size); + + if (UNIXCB(skb).fp) +- scm.fp = scm_fp_dup(UNIXCB(skb).fp); ++ unix_peek_fds(&scm, skb); + } + err = (flags & MSG_TRUNC) ? skb->len - skip : size; + +@@ -2442,7 +2430,7 @@ unlock: + /* It is questionable, see note in unix_dgram_recvmsg. + */ + if (UNIXCB(skb).fp) +- scm.fp = scm_fp_dup(UNIXCB(skb).fp); ++ unix_peek_fds(&scm, skb); + + sk_peek_offset_fwd(sk, chunk); + +diff --git a/net/unix/garbage.c b/net/unix/garbage.c +index c36757e728442..8bbe1b8e4ff7f 100644 +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -86,77 +86,13 @@ + #include <net/scm.h> + #include <net/tcp_states.h> + ++#include "scm.h" ++ + /* Internal data structures and random procedures: */ + +-static LIST_HEAD(gc_inflight_list); + static LIST_HEAD(gc_candidates); +-static DEFINE_SPINLOCK(unix_gc_lock); + static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + +-unsigned int unix_tot_inflight; +- +-struct sock *unix_get_socket(struct file *filp) +-{ +- struct sock *u_sock = NULL; +- struct inode *inode = file_inode(filp); +- +- /* Socket ? */ +- if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { +- struct socket *sock = SOCKET_I(inode); +- struct sock *s = sock->sk; +- +- /* PF_UNIX ? */ +- if (s && sock->ops && sock->ops->family == PF_UNIX) +- u_sock = s; +- } +- return u_sock; +-} +- +-/* Keep the number of times in flight count for the file +- * descriptor if it is for an AF_UNIX socket. +- */ +- +-void unix_inflight(struct user_struct *user, struct file *fp) +-{ +- struct sock *s = unix_get_socket(fp); +- +- spin_lock(&unix_gc_lock); +- +- if (s) { +- struct unix_sock *u = unix_sk(s); +- +- if (atomic_long_inc_return(&u->inflight) == 1) { +- BUG_ON(!list_empty(&u->link)); +- list_add_tail(&u->link, &gc_inflight_list); +- } else { +- BUG_ON(list_empty(&u->link)); +- } +- unix_tot_inflight++; +- } +- user->unix_inflight++; +- spin_unlock(&unix_gc_lock); +-} +- +-void unix_notinflight(struct user_struct *user, struct file *fp) +-{ +- struct sock *s = unix_get_socket(fp); +- +- spin_lock(&unix_gc_lock); +- +- if (s) { +- struct unix_sock *u = unix_sk(s); +- +- BUG_ON(!atomic_long_read(&u->inflight)); +- BUG_ON(list_empty(&u->link)); +- +- if (atomic_long_dec_and_test(&u->inflight)) +- list_del_init(&u->link); +- unix_tot_inflight--; +- } +- user->unix_inflight--; +- spin_unlock(&unix_gc_lock); +-} +- + static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), + struct sk_buff_head *hitlist) + { +diff --git a/net/unix/scm.c b/net/unix/scm.c +new file mode 100644 +index 0000000000000..83413ade79838 +--- /dev/null ++++ b/net/unix/scm.c +@@ -0,0 +1,148 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/socket.h> ++#include <linux/net.h> ++#include <linux/fs.h> ++#include <net/af_unix.h> ++#include <net/scm.h> ++#include <linux/init.h> ++ ++#include "scm.h" ++ ++unsigned int unix_tot_inflight; ++EXPORT_SYMBOL(unix_tot_inflight); ++ ++LIST_HEAD(gc_inflight_list); ++EXPORT_SYMBOL(gc_inflight_list); ++ ++DEFINE_SPINLOCK(unix_gc_lock); ++EXPORT_SYMBOL(unix_gc_lock); ++ ++struct sock *unix_get_socket(struct file *filp) ++{ ++ struct sock *u_sock = NULL; ++ struct inode *inode = file_inode(filp); ++ ++ /* Socket ? */ ++ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { ++ struct socket *sock = SOCKET_I(inode); ++ struct sock *s = sock->sk; ++ ++ /* PF_UNIX ? */ ++ if (s && sock->ops && sock->ops->family == PF_UNIX) ++ u_sock = s; ++ } ++ return u_sock; ++} ++EXPORT_SYMBOL(unix_get_socket); ++ ++/* Keep the number of times in flight count for the file ++ * descriptor if it is for an AF_UNIX socket. ++ */ ++void unix_inflight(struct user_struct *user, struct file *fp) ++{ ++ struct sock *s = unix_get_socket(fp); ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (s) { ++ struct unix_sock *u = unix_sk(s); ++ ++ if (atomic_long_inc_return(&u->inflight) == 1) { ++ BUG_ON(!list_empty(&u->link)); ++ list_add_tail(&u->link, &gc_inflight_list); ++ } else { ++ BUG_ON(list_empty(&u->link)); ++ } ++ unix_tot_inflight++; ++ } ++ user->unix_inflight++; ++ spin_unlock(&unix_gc_lock); ++} ++ ++void unix_notinflight(struct user_struct *user, struct file *fp) ++{ ++ struct sock *s = unix_get_socket(fp); ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (s) { ++ struct unix_sock *u = unix_sk(s); ++ ++ BUG_ON(!atomic_long_read(&u->inflight)); ++ BUG_ON(list_empty(&u->link)); ++ ++ if (atomic_long_dec_and_test(&u->inflight)) ++ list_del_init(&u->link); ++ unix_tot_inflight--; ++ } ++ user->unix_inflight--; ++ spin_unlock(&unix_gc_lock); ++} ++ ++/* ++ * The "user->unix_inflight" variable is protected by the garbage ++ * collection lock, and we just read it locklessly here. If you go ++ * over the limit, there might be a tiny race in actually noticing ++ * it across threads. Tough. ++ */ ++static inline bool too_many_unix_fds(struct task_struct *p) ++{ ++ struct user_struct *user = current_user(); ++ ++ if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) ++ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); ++ return false; ++} ++ ++int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++{ ++ int i; ++ ++ if (too_many_unix_fds(current)) ++ return -ETOOMANYREFS; ++ ++ /* ++ * Need to duplicate file references for the sake of garbage ++ * collection. Otherwise a socket in the fps might become a ++ * candidate for GC while the skb is not yet queued. ++ */ ++ UNIXCB(skb).fp = scm_fp_dup(scm->fp); ++ if (!UNIXCB(skb).fp) ++ return -ENOMEM; ++ ++ for (i = scm->fp->count - 1; i >= 0; i--) ++ unix_inflight(scm->fp->user, scm->fp->fp[i]); ++ return 0; ++} ++EXPORT_SYMBOL(unix_attach_fds); ++ ++void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++{ ++ int i; ++ ++ scm->fp = UNIXCB(skb).fp; ++ UNIXCB(skb).fp = NULL; ++ ++ for (i = scm->fp->count-1; i >= 0; i--) ++ unix_notinflight(scm->fp->user, scm->fp->fp[i]); ++} ++EXPORT_SYMBOL(unix_detach_fds); ++ ++void unix_destruct_scm(struct sk_buff *skb) ++{ ++ struct scm_cookie scm; ++ ++ memset(&scm, 0, sizeof(scm)); ++ scm.pid = UNIXCB(skb).pid; ++ if (UNIXCB(skb).fp) ++ unix_detach_fds(&scm, skb); ++ ++ /* Alas, it calls VFS */ ++ /* So fscking what? fput() had been SMP-safe since the last Summer */ ++ scm_destroy(&scm); ++ sock_wfree(skb); ++} ++EXPORT_SYMBOL(unix_destruct_scm); +diff --git a/net/unix/scm.h b/net/unix/scm.h +new file mode 100644 +index 0000000000000..5a255a477f160 +--- /dev/null ++++ b/net/unix/scm.h +@@ -0,0 +1,10 @@ ++#ifndef NET_UNIX_SCM_H ++#define NET_UNIX_SCM_H ++ ++extern struct list_head gc_inflight_list; ++extern spinlock_t unix_gc_lock; ++ ++int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); ++void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); ++ ++#endif +diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c +index 16d42b2de424e..1963440f67251 100644 +--- a/tools/testing/selftests/vm/userfaultfd.c ++++ b/tools/testing/selftests/vm/userfaultfd.c +@@ -131,7 +131,7 @@ static void anon_allocate_area(void **alloc_area) + { + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +- if (*alloc_area == MAP_FAILED) ++ if (*alloc_area == MAP_FAILED) { + fprintf(stderr, "mmap of anonymous memory failed"); + *alloc_area = NULL; + } |