Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Implement per-PMU context rescheduling to significantly improve
   single-PMU performance, and related cleanups/fixes (Peter Zijlstra
   and Namhyung Kim)

 - Fix ancient bug resulting in a lot of events being dropped
   erroneously at higher sampling frequencies (Luo Gengkun)

 - uprobes enhancements:

     - Implement RCU-protected hot path optimizations for better
       performance:

         "For baseline vs SRCU, peak througput increased from 3.7 M/s
          (million uprobe triggerings per second) up to about 8 M/s. For
          uretprobes it's a bit more modest with bump from 2.4 M/s to
          5 M/s.

          For SRCU vs RCU Tasks Trace, peak throughput for uprobes
          increases further from 8 M/s to 10.3 M/s (+28%!), and for
          uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more
          work to do on uretprobes side.

          Even single-thread (no contention) performance is slightly
          better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055
          M/s to 2.174 M/s (+5.8%) for uretprobes."

          (Andrii Nakryiko et al)

     - Document mmap_lock, don't abuse get_user_pages_remote() (Oleg
       Nesterov)

     - Cleanups & fixes to prepare for future work:
        - Remove uprobe_register_refctr()
	- Simplify error handling for alloc_uprobe()
        - Make uprobe_register() return struct uprobe *
        - Fold __uprobe_unregister() into uprobe_unregister()
        - Shift put_uprobe() from delete_uprobe() to uprobe_unregister()
        - BPF: Fix use-after-free in bpf_uprobe_multi_link_attach()
          (Oleg Nesterov)

 - New feature & ABI extension: allow events to use PERF_SAMPLE READ
   with inheritance, enabling sample based profiling of a group of
   counters over a hierarchy of processes or threads (Ben Gainey)

 - Intel uncore & power events updates:

      - Add Arrow Lake and Lunar Lake support
      - Add PERF_EV_CAP_READ_SCOPE
      - Clean up and enhance cpumask and hotplug support
        (Kan Liang)

      - Add LNL uncore iMC freerunning support
      - Use D0:F0 as a default device
        (Zhenyu Wang)

 - Intel PT: fix AUX snapshot handling race (Adrian Hunter)

 - Misc fixes and cleanups (James Clark, Jiri Olsa, Oleg Nesterov and
   Peter Zijlstra)

* tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
  dmaengine: idxd: Clean up cpumask and hotplug for perfmon
  iommu/vt-d: Clean up cpumask and hotplug for perfmon
  perf/x86/intel/cstate: Clean up cpumask and hotplug
  perf: Add PERF_EV_CAP_READ_SCOPE
  perf: Generic hotplug support for a PMU with a scope
  uprobes: perform lockless SRCU-protected uprobes_tree lookup
  rbtree: provide rb_find_rcu() / rb_find_add_rcu()
  perf/uprobe: split uprobe_unregister()
  uprobes: travers uprobe's consumer list locklessly under SRCU protection
  uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks
  uprobes: protected uprobe lifetime with SRCU
  uprobes: revamp uprobe refcounting and lifetime management
  bpf: Fix use-after-free in bpf_uprobe_multi_link_attach()
  perf/core: Fix small negative period being ignored
  perf: Really fix event_function_call() locking
  perf: Optimize __pmu_ctx_sched_out()
  perf: Add context time freeze
  perf: Fix event_function_call() locking
  perf: Extract a few helpers
  perf: Optimize context reschedule for single PMU cases
  ...
This commit is contained in:
Linus Torvalds
2024-09-18 15:03:58 +02:00
21 changed files with 1146 additions and 857 deletions
+425 -161
View File
File diff suppressed because it is too large Load Diff
+280 -225
View File
@@ -40,6 +40,9 @@ static struct rb_root uprobes_tree = RB_ROOT;
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
DEFINE_STATIC_SRCU(uprobes_srcu);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -57,8 +60,9 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
struct rcu_head rcu;
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
@@ -109,6 +113,11 @@ struct xol_area {
unsigned long vaddr; /* Page(s) of instruction slots */
};
static void uprobe_warn(struct task_struct *t, const char *msg)
{
pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
}
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -453,7 +462,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
* Called with mm->mmap_lock held for write.
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -587,25 +596,63 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
*(uprobe_opcode_t *)&auprobe->insn);
}
/* uprobe should have guaranteed positive refcount */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
/*
* uprobe should have guaranteed lifetime, which can be either of:
* - caller already has refcount taken (and wants an extra one);
* - uprobe is RCU protected and won't be freed until after grace period;
* - we are holding uprobes_treelock (for read or write, doesn't matter).
*/
static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
{
if (refcount_inc_not_zero(&uprobe->ref))
return uprobe;
return NULL;
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
static void uprobe_free_rcu(struct rcu_head *rcu)
{
struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
kfree(uprobe);
}
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
kfree(uprobe);
if (!refcount_dec_and_test(&uprobe->ref))
return;
write_lock(&uprobes_treelock);
if (uprobe_is_active(uprobe)) {
write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_seqcount_end(&uprobes_seqcount);
}
write_unlock(&uprobes_treelock);
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
@@ -647,62 +694,86 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
/*
* Assumes being inside RCU protected region.
* No refcount is taken on returned uprobe.
*/
static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
struct rb_node *node;
unsigned int seq;
if (node)
return get_uprobe(__node_2_uprobe(node));
lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
do {
seq = read_seqcount_begin(&uprobes_seqcount);
node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
/*
* Lockless RB-tree lookups can result only in false negatives.
* If the element is found, it is correct and can be returned
* under RCU protection. If we find nothing, we need to
* validate that seqcount didn't change. If it did, we have to
* try again as we might have missed the element (false
* negative). If seqcount is unchanged, search truly failed.
*/
if (node)
return __node_2_uprobe(node);
} while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL;
}
/*
* Find a uprobe corresponding to a given inode:offset
* Acquires uprobes_treelock
* Attempt to insert a new uprobe into uprobes_tree.
*
* If uprobe already exists (for given inode+offset), we just increment
* refcount of previously existing uprobe.
*
* If not, a provided new instance of uprobe is inserted into the tree (with
* assumed initial refcount == 1).
*
* In any case, we return a uprobe instance that ends up being in uprobes_tree.
* Caller has to clean up new uprobe instance, if it ended up not being
* inserted into the tree.
*
* We assume that uprobes_treelock is held for writing.
*/
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *uprobe;
struct rb_node *node;
again:
node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node) {
struct uprobe *u = __node_2_uprobe(node);
read_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
read_unlock(&uprobes_treelock);
if (!try_get_uprobe(u)) {
rb_erase(node, &uprobes_tree);
RB_CLEAR_NODE(&u->rb_node);
goto again;
}
return u;
}
return uprobe;
}
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node)
return get_uprobe(__node_2_uprobe(node));
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
return NULL;
}
/*
* Acquire uprobes_treelock.
* Matching uprobe already exists in rbtree;
* increment (access refcount) and return the matching uprobe.
*
* No matching uprobe; insert the uprobe in rb_tree;
* get a double refcount (access + creation) and return NULL.
* Acquire uprobes_treelock and insert uprobe into uprobes_tree
* (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
write_lock(&uprobes_treelock);
write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
return u;
@@ -725,18 +796,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
return NULL;
return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
RB_CLEAR_NODE(&uprobe->rb_node);
refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
if (cur_uprobe != uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
@@ -753,32 +827,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
list_add_rcu(&uc->cons_node, &uprobe->consumers);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
* Return true if the @uc is deleted successfully
* or return false.
* Should never be called with consumer that's not part of @uprobe->consumers.
*/
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
struct uprobe_consumer **con;
bool ret = false;
down_write(&uprobe->consumer_rwsem);
for (con = &uprobe->consumers; *con; con = &(*con)->next) {
if (*con == uc) {
*con = uc->next;
ret = true;
break;
}
}
list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -863,21 +924,20 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
static inline bool consumer_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
return !uc->filter || uc->filter(uc, ctx, mm);
return !uc->filter || uc->filter(uc, mm);
}
static bool filter_chain(struct uprobe *uprobe,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
ret = consumer_filter(uc, ctx, mm);
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -921,27 +981,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
* See find_active_uprobe().
*/
static void delete_uprobe(struct uprobe *uprobe)
{
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
write_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
}
struct map_info {
struct map_info *next;
struct mm_struct *mm;
@@ -1046,7 +1085,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
/*
* We take mmap_lock for writing to avoid the race with
* find_active_uprobe_rcu() which takes mmap_lock for reading.
* Thus this install_breakpoint() can not make
* is_trap_at_addr() true right after find_uprobe_rcu()
* returns NULL in find_active_uprobe_rcu().
*/
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
@@ -1059,12 +1104,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new,
UPROBE_FILTER_REGISTER, mm))
if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe,
UPROBE_FILTER_UNREGISTER, mm))
if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
@@ -1079,152 +1122,140 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
/**
* uprobe_unregister_nosync - unregister an already registered probe.
* @uprobe: uprobe to remove
* @uc: identify which probe if multiple probes are colocated.
*/
void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
if (!uprobe->consumers && !err)
delete_uprobe(uprobe);
}
/*
* uprobe_unregister - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
* @uc: identify which probe if multiple probes are colocated.
*/
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
__uprobe_unregister(uprobe, uc);
consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
up_write(&uprobe->register_rwsem);
/* TODO : cant unregister? schedule a worker thread */
if (unlikely(err)) {
uprobe_warn(current, "unregister, leaking uprobe");
return;
}
put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
/*
* __uprobe_register - register a probe
void uprobe_unregister_sync(void)
{
/*
* Now that handler_chain() and handle_uretprobe_chain() iterate over
* uprobe->consumers list under RCU protection without holding
* uprobe->register_rwsem, we need to wait for RCU grace period to
* make sure that we can't call into just unregistered
* uprobe_consumer's callbacks anymore. If we don't do that, fast and
* unlucky enough caller can free consumer's memory and cause
* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
*/
synchronize_srcu(&uprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
/**
* uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
* @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
* Apart from the access refcount, __uprobe_register() takes a creation
* Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
* unregisters. Caller of __uprobe_register() is required to keep @inode
* unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
* Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
static int __uprobe_register(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
struct uprobe *uprobe_register(struct inode *inode,
loff_t offset, loff_t ref_ctr_offset,
struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->read_folio &&
!shmem_mapping(inode->i_mapping))
return -EIO;
return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
return -EINVAL;
return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
return -EINVAL;
return ERR_PTR(-EINVAL);
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
return -ENOMEM;
if (IS_ERR(uprobe))
return PTR_ERR(uprobe);
return uprobe;
/*
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
}
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
if (unlikely(ret == -EAGAIN))
goto retry;
return ret;
}
if (ret) {
uprobe_unregister_nosync(uprobe, uc);
/*
* Registration might have partially succeeded, so we can have
* this consumer being called right at this time. We need to
* sync here. It's ok, it's unlikely slow path.
*/
uprobe_unregister_sync();
return ERR_PTR(ret);
}
int uprobe_register(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, 0, uc);
return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_register_refctr(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);
/*
* uprobe_apply - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
/**
* uprobe_apply - add or remove the breakpoints according to @uc->filter
* @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
* Return: 0 on success or negative error code.
*/
int uprobe_apply(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc, bool add)
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return ret;
int ret = -ENOENT, srcu_idx;
down_write(&uprobe->register_rwsem);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
srcu_idx = srcu_read_lock(&uprobes_srcu);
list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
if (con == uc) {
ret = register_for_each_vma(uprobe, add ? uc : NULL);
break;
}
}
srcu_read_unlock(&uprobes_srcu, srcu_idx);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
return ret;
}
@@ -1305,15 +1336,17 @@ static void build_probe_list(struct inode *inode,
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
/* if uprobe went away, it's safe to ignore it */
if (try_get_uprobe(u))
list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
/* if uprobe went away, it's safe to ignore it */
if (try_get_uprobe(u))
list_add(&u->pending_list, head);
}
}
read_unlock(&uprobes_treelock);
@@ -1384,7 +1417,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1770,6 +1803,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
/*
* uprobe's refcnt has to be positive at this point, kept by
* utask->return_instances items; return_instances can't be
* removed right now, as task is blocked due to duping; so
* get_uprobe() is safe to use here.
*/
get_uprobe(n->uprobe);
n->next = NULL;
@@ -1781,12 +1820,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return 0;
}
static void uprobe_warn(struct task_struct *t, const char *msg)
{
pr_warn("uprobe: %s:%d failed to %s\n",
current->comm, current->pid, msg);
}
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
@@ -1883,9 +1916,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
/* we need to bump refcount to store uprobe in utask */
if (!try_get_uprobe(uprobe))
return;
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
return;
goto fail;
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
@@ -1912,8 +1949,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
ri->uprobe = get_uprobe(uprobe);
ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1924,8 +1960,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
utask->return_instances = ri;
return;
fail:
fail:
kfree(ri);
put_uprobe(uprobe);
}
/* Prepare to single-step probed instruction out of line. */
@@ -1940,9 +1977,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
if (!utask)
return -ENOMEM;
if (!try_get_uprobe(uprobe))
return -EINVAL;
xol_vaddr = xol_get_insn_slot(uprobe);
if (!xol_vaddr)
return -ENOMEM;
if (!xol_vaddr) {
err = -ENOMEM;
goto err_out;
}
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
@@ -1950,12 +1992,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
return err;
goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
err_out:
put_uprobe(uprobe);
return err;
}
/*
@@ -2028,13 +2073,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
/*
* The NULL 'tsk' here ensures that any faults that occur here
* will not be accounted to the task. 'mm' *is* current->mm,
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
@@ -2045,7 +2084,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
@@ -2058,7 +2098,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -2079,9 +2119,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
bool has_consumers = false;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
current->utask->auprobe = &uprobe->arch;
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
int rc = 0;
if (uc->handler) {
@@ -2094,16 +2137,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
need_prep = true;
remove &= rc;
has_consumers = true;
}
current->utask->auprobe = NULL;
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
if (remove && has_consumers) {
down_read(&uprobe->register_rwsem);
/* re-check that removal is still required, this time under lock */
if (!filter_chain(uprobe, current->mm)) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
up_read(&uprobe->register_rwsem);
}
up_read(&uprobe->register_rwsem);
}
static void
@@ -2111,13 +2162,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
int srcu_idx;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
srcu_idx = srcu_read_lock(&uprobes_srcu);
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
up_read(&uprobe->register_rwsem);
srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2202,13 +2255,15 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
int is_swbp;
int is_swbp, srcu_idx;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
srcu_idx = srcu_read_lock(&uprobes_srcu);
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -2224,7 +2279,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
return;
goto out;
}
/* change it in advance for ->handler() and restart */
@@ -2259,12 +2314,12 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
if (pre_ssout(uprobe, regs, bp_vaddr))
goto out;
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
/*
+21 -17
View File
@@ -3160,6 +3160,7 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
@@ -3178,15 +3179,15 @@ struct bpf_uprobe_multi_run_ctx {
struct bpf_uprobe *uprobe;
};
static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
u32 cnt)
static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
for (i = 0; i < cnt; i++) {
uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
&uprobes[i].consumer);
}
for (i = 0; i < cnt; i++)
uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
if (cnt)
uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3194,7 +3195,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
@@ -3322,8 +3323,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
struct mm_struct *mm)
uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
@@ -3480,22 +3480,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
goto error_free;
uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
uprobes[i].offset,
uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (IS_ERR(uprobes[i].uprobe)) {
err = PTR_ERR(uprobes[i].uprobe);
link->cnt = i;
goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
goto error_free;
goto error_unregister;
return bpf_link_settle(&link_primer);
error_unregister:
bpf_uprobe_unregister(uprobes, link->cnt);
error_free:
kvfree(uprobes);
kfree(link);
+20 -24
View File
@@ -58,8 +58,8 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
char *filename;
struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
unsigned long nhit;
@@ -1078,43 +1078,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
int ret;
struct inode *inode = d_real_inode(tu->path.dentry);
struct uprobe *uprobe;
tu->consumer.filter = filter;
tu->inode = d_real_inode(tu->path.dentry);
uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
if (IS_ERR(uprobe))
return PTR_ERR(uprobe);
if (tu->ref_ctr_offset)
ret = uprobe_register_refctr(tu->inode, tu->offset,
tu->ref_ctr_offset, &tu->consumer);
else
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
tu->inode = NULL;
return ret;
tu->uprobe = uprobe;
return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
if (!tu->inode)
if (!tu->uprobe)
continue;
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->inode = NULL;
uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
sync = true;
tu->uprobe = NULL;
}
if (sync)
uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
@@ -1310,7 +1307,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1334,7 +1331,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
@@ -1344,8 +1341,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
@@ -1431,7 +1427,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer **ucbp)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))